author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d  /kernel
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile  53
-rw-r--r--  kernel/acct.c  561
-rw-r--r--  kernel/audit.c  839
-rw-r--r--  kernel/auditsc.c  1015
-rw-r--r--  kernel/capability.c  220
-rw-r--r--  kernel/compat.c  860
-rw-r--r--  kernel/configs.c  118
-rw-r--r--  kernel/cpu.c  193
-rw-r--r--  kernel/cpuset.c  1564
-rw-r--r--  kernel/dma.c  158
-rw-r--r--  kernel/exec_domain.c  209
-rw-r--r--  kernel/exit.c  1527
-rw-r--r--  kernel/extable.c  67
-rw-r--r--  kernel/fork.c  1274
-rw-r--r--  kernel/futex.c  798
-rw-r--r--  kernel/intermodule.c  182
-rw-r--r--  kernel/irq/Makefile  5
-rw-r--r--  kernel/irq/autoprobe.c  189
-rw-r--r--  kernel/irq/handle.c  193
-rw-r--r--  kernel/irq/internals.h  18
-rw-r--r--  kernel/irq/manage.c  349
-rw-r--r--  kernel/irq/proc.c  159
-rw-r--r--  kernel/irq/spurious.c  96
-rw-r--r--  kernel/itimer.c  241
-rw-r--r--  kernel/kallsyms.c  411
-rw-r--r--  kernel/kfifo.c  168
-rw-r--r--  kernel/kmod.c  256
-rw-r--r--  kernel/kprobes.c  157
-rw-r--r--  kernel/ksysfs.c  57
-rw-r--r--  kernel/kthread.c  202
-rw-r--r--  kernel/module.c  2108
-rw-r--r--  kernel/panic.c  157
-rw-r--r--  kernel/params.c  721
-rw-r--r--  kernel/pid.c  292
-rw-r--r--  kernel/posix-cpu-timers.c  1559
-rw-r--r--  kernel/posix-timers.c  1584
-rw-r--r--  kernel/power/Kconfig  74
-rw-r--r--  kernel/power/Makefile  11
-rw-r--r--  kernel/power/console.c  58
-rw-r--r--  kernel/power/disk.c  431
-rw-r--r--  kernel/power/main.c  269
-rw-r--r--  kernel/power/pm.c  265
-rw-r--r--  kernel/power/power.h  52
-rw-r--r--  kernel/power/poweroff.c  45
-rw-r--r--  kernel/power/process.c  121
-rw-r--r--  kernel/power/smp.c  85
-rw-r--r--  kernel/power/swsusp.c  1433
-rw-r--r--  kernel/printk.c  996
-rw-r--r--  kernel/profile.c  563
-rw-r--r--  kernel/ptrace.c  389
-rw-r--r--  kernel/rcupdate.c  470
-rw-r--r--  kernel/resource.c  551
-rw-r--r--  kernel/sched.c  5004
-rw-r--r--  kernel/seccomp.c  56
-rw-r--r--  kernel/signal.c  2662
-rw-r--r--  kernel/softirq.c  496
-rw-r--r--  kernel/spinlock.c  371
-rw-r--r--  kernel/stop_machine.c  212
-rw-r--r--  kernel/sys.c  1725
-rw-r--r--  kernel/sys_ni.c  86
-rw-r--r--  kernel/sysctl.c  2337
-rw-r--r--  kernel/time.c  599
-rw-r--r--  kernel/timer.c  1611
-rw-r--r--  kernel/uid16.c  196
-rw-r--r--  kernel/user.c  189
-rw-r--r--  kernel/wait.c  246
-rw-r--r--  kernel/workqueue.c  555
67 files changed, 40718 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 000000000000..eb88b446c2cc
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,53 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
6 exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o intermodule.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
11
12obj-$(CONFIG_FUTEX) += futex.o
13obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
14obj-$(CONFIG_SMP) += cpu.o spinlock.o
15obj-$(CONFIG_UID16) += uid16.o
16obj-$(CONFIG_MODULES) += module.o
17obj-$(CONFIG_KALLSYMS) += kallsyms.o
18obj-$(CONFIG_PM) += power/
19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
20obj-$(CONFIG_COMPAT) += compat.o
21obj-$(CONFIG_CPUSETS) += cpuset.o
22obj-$(CONFIG_IKCONFIG) += configs.o
23obj-$(CONFIG_IKCONFIG_PROC) += configs.o
24obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
25obj-$(CONFIG_AUDIT) += audit.o
26obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
27obj-$(CONFIG_KPROBES) += kprobes.o
28obj-$(CONFIG_SYSFS) += ksysfs.o
29obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
30obj-$(CONFIG_SECCOMP) += seccomp.o
31
32ifneq ($(CONFIG_IA64),y)
33# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
34# needed for x86 only. Why this used to be enabled for all architectures is beyond
35# me. I suspect most platforms don't need this, but until we know that for sure
36# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
37# to get a correct value for the wait-channel (WCHAN in ps). --davidm
38CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
39endif
40
41$(obj)/configs.o: $(obj)/config_data.h
42
43# config_data.h contains the same information as ikconfig.h but gzipped.
44# Info from config_data can be extracted from /proc/config*
45targets += config_data.gz
46$(obj)/config_data.gz: .config FORCE
47 $(call if_changed,gzip)
48
49quiet_cmd_ikconfiggz = IKCFG $@
50 cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
51targets += config_data.h
52$(obj)/config_data.h: $(obj)/config_data.gz FORCE
53 $(call if_changed,ikconfiggz)
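
The config_data rules above embed the gzipped .config into the kernel image; with CONFIG_IKCONFIG_PROC set, the same data is exposed at run time as /proc/config.gz. As a hedged illustration (not part of this commit), a small user-space program can read that embedded configuration back with zlib:

/* Sketch: dump the kernel configuration embedded by CONFIG_IKCONFIG.
 * Assumes CONFIG_IKCONFIG_PROC is enabled and zlib is available (link with -lz). */
#include <stdio.h>
#include <zlib.h>

int main(void)
{
	char line[1024];
	gzFile f = gzopen("/proc/config.gz", "rb");	/* gzip stream derived from config_data.gz */

	if (!f) {
		perror("gzopen /proc/config.gz");
		return 1;
	}
	while (gzgets(f, line, sizeof(line)))		/* decompress and print line by line */
		fputs(line, stdout);
	gzclose(f);
	return 0;
}

The same bytes could also be pulled out of the kernel binary between the MAGIC_START/MAGIC_END markers generated by cmd_ikconfiggz; /proc/config.gz is just the convenient interface.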
diff --git a/kernel/acct.c b/kernel/acct.c
new file mode 100644
index 000000000000..4168f631868e
--- /dev/null
+++ b/kernel/acct.c
@@ -0,0 +1,561 @@
1/*
2 * linux/kernel/acct.c
3 *
4 * BSD Process Accounting for Linux
5 *
6 * Author: Marco van Wieringen <mvw@planets.elm.net>
7 *
8 * Some code based on ideas and code from:
9 * Thomas K. Dyas <tdyas@eden.rutgers.edu>
10 *
11 * This file implements BSD-style process accounting. Whenever any
12 * process exits, an accounting record of type "struct acct" is
13 * written to the file specified with the acct() system call. It is
14 * up to user-level programs to do useful things with the accounting
15 * log. The kernel just provides the raw accounting information.
16 *
17 * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
18 *
19 * Plugged two leaks. 1) It didn't return acct_file into the free_filps if
20 * the file happened to be read-only. 2) If the accounting was suspended
21 * due to the lack of space it happily allowed to reopen it and completely
22 * lost the old acct_file. 3/10/98, Al Viro.
23 *
24 * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
25 * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
26 *
27 * Fixed a nasty interaction with sys_umount(). If the accounting
28 * was suspended we failed to stop it on umount(). Messy.
29 * Another one: remount to readonly didn't stop accounting.
30 * Question: what should we do if we have CAP_SYS_ADMIN but not
31 * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
32 * unless we are messing with the root. In that case we are getting a
33 * real mess with do_remount_sb(). 9/11/98, AV.
34 *
35 * Fixed a bunch of races (and pair of leaks). Probably not the best way,
36 * but this one obviously doesn't introduce deadlocks. Later. BTW, found
37 * one race (and leak) in BSD implementation.
38 * OK, that's better. ANOTHER race and leak in BSD variant. There always
39 * is one more bug... 10/11/98, AV.
40 *
41 * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
42 * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
43 * a struct file opened for write. Fixed. 2/6/2000, AV.
44 */
45
46#include <linux/config.h>
47#include <linux/mm.h>
48#include <linux/slab.h>
49#include <linux/acct.h>
50#include <linux/file.h>
51#include <linux/tty.h>
52#include <linux/security.h>
53#include <linux/vfs.h>
54#include <linux/jiffies.h>
55#include <linux/times.h>
56#include <linux/syscalls.h>
57#include <asm/uaccess.h>
58#include <asm/div64.h>
59#include <linux/blkdev.h> /* sector_div */
60
61/*
62 * These constants control the free-space thresholds at which the process
63 * accounting system is suspended and resumed, and the time delay between
64 * each check.
65 * Turned into sysctl-controllable parameters. AV, 12/11/98
66 */
67
68int acct_parm[3] = {4, 2, 30};
69#define RESUME (acct_parm[0]) /* >foo% free space - resume */
70#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
71#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
72
73/*
74 * External references and all of the globals.
75 */
76static void do_acct_process(long, struct file *);
77
78/*
79 * This structure is used so that all the data protected by lock
80 * can be placed in the same cache line as the lock. This primes
81 * the cache line to have the data after getting the lock.
82 */
83struct acct_glbs {
84 spinlock_t lock;
85 volatile int active;
86 volatile int needcheck;
87 struct file *file;
88 struct timer_list timer;
89};
90
91static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
92
93/*
94 * Called whenever the timer says to check the free space.
95 */
96static void acct_timeout(unsigned long unused)
97{
98 acct_globals.needcheck = 1;
99}
100
101/*
102 * Check the amount of free space and suspend/resume accordingly.
103 */
104static int check_free_space(struct file *file)
105{
106 struct kstatfs sbuf;
107 int res;
108 int act;
109 sector_t resume;
110 sector_t suspend;
111
112 spin_lock(&acct_globals.lock);
113 res = acct_globals.active;
114 if (!file || !acct_globals.needcheck)
115 goto out;
116 spin_unlock(&acct_globals.lock);
117
118 /* May block */
119 if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
120 return res;
121 suspend = sbuf.f_blocks * SUSPEND;
122 resume = sbuf.f_blocks * RESUME;
123
124 sector_div(suspend, 100);
125 sector_div(resume, 100);
126
127 if (sbuf.f_bavail <= suspend)
128 act = -1;
129 else if (sbuf.f_bavail >= resume)
130 act = 1;
131 else
132 act = 0;
133
134 /*
135 * If some joker switched acct_globals.file under us we'd better be
136 * silent and _not_ touch anything.
137 */
138 spin_lock(&acct_globals.lock);
139 if (file != acct_globals.file) {
140 if (act)
141 res = act>0;
142 goto out;
143 }
144
145 if (acct_globals.active) {
146 if (act < 0) {
147 acct_globals.active = 0;
148 printk(KERN_INFO "Process accounting paused\n");
149 }
150 } else {
151 if (act > 0) {
152 acct_globals.active = 1;
153 printk(KERN_INFO "Process accounting resumed\n");
154 }
155 }
156
157 del_timer(&acct_globals.timer);
158 acct_globals.needcheck = 0;
159 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
160 add_timer(&acct_globals.timer);
161 res = acct_globals.active;
162out:
163 spin_unlock(&acct_globals.lock);
164 return res;
165}
166
167/*
168 * Close the old accounting file (if currently open) and then replace
169 * it with file (if non-NULL).
170 *
171 * NOTE: acct_globals.lock MUST be held on entry and exit.
172 */
173static void acct_file_reopen(struct file *file)
174{
175 struct file *old_acct = NULL;
176
177 if (acct_globals.file) {
178 old_acct = acct_globals.file;
179 del_timer(&acct_globals.timer);
180 acct_globals.active = 0;
181 acct_globals.needcheck = 0;
182 acct_globals.file = NULL;
183 }
184 if (file) {
185 acct_globals.file = file;
186 acct_globals.needcheck = 0;
187 acct_globals.active = 1;
188 /* It's been deleted if it was used before so this is safe */
189 init_timer(&acct_globals.timer);
190 acct_globals.timer.function = acct_timeout;
191 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
192 add_timer(&acct_globals.timer);
193 }
194 if (old_acct) {
195 spin_unlock(&acct_globals.lock);
196 do_acct_process(0, old_acct);
197 filp_close(old_acct, NULL);
198 spin_lock(&acct_globals.lock);
199 }
200}
201
202/*
203 * sys_acct() is the only system call needed to implement process
204 * accounting. It takes the name of the file where accounting records
205 * should be written. If the filename is NULL, accounting will be
206 * shut down.
207 */
208asmlinkage long sys_acct(const char __user *name)
209{
210 struct file *file = NULL;
211 char *tmp;
212 int error;
213
214 if (!capable(CAP_SYS_PACCT))
215 return -EPERM;
216
217 if (name) {
218 tmp = getname(name);
219 if (IS_ERR(tmp)) {
220 return (PTR_ERR(tmp));
221 }
222 /* Difference from BSD - they don't do O_APPEND */
223 file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
224 putname(tmp);
225 if (IS_ERR(file)) {
226 return (PTR_ERR(file));
227 }
228 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
229 filp_close(file, NULL);
230 return (-EACCES);
231 }
232
233 if (!file->f_op->write) {
234 filp_close(file, NULL);
235 return (-EIO);
236 }
237 }
238
239 error = security_acct(file);
240 if (error) {
241 if (file)
242 filp_close(file, NULL);
243 return error;
244 }
245
246 spin_lock(&acct_globals.lock);
247 acct_file_reopen(file);
248 spin_unlock(&acct_globals.lock);
249
250 return (0);
251}
252
253/*
254 * If the accounting is turned on for a file in the filesystem pointed
255 * to by sb, turn accounting off.
256 */
257void acct_auto_close(struct super_block *sb)
258{
259 spin_lock(&acct_globals.lock);
260 if (acct_globals.file &&
261 acct_globals.file->f_dentry->d_inode->i_sb == sb) {
262 acct_file_reopen((struct file *)NULL);
263 }
264 spin_unlock(&acct_globals.lock);
265}
266
267/*
268 * encode an unsigned long into a comp_t
269 *
270 * This routine has been adopted from the encode_comp_t() function in
271 * the kern_acct.c file of the FreeBSD operating system. The encoding
272 * is a 13-bit fraction with a 3-bit (base 8) exponent.
273 */
274
275#define MANTSIZE 13 /* 13 bit mantissa. */
276#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
277#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
278
279static comp_t encode_comp_t(unsigned long value)
280{
281 int exp, rnd;
282
283 exp = rnd = 0;
284 while (value > MAXFRACT) {
285 rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
286 value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
287 exp++;
288 }
289
290 /*
291 * If we need to round up, do it (and handle overflow correctly).
292 */
293 if (rnd && (++value > MAXFRACT)) {
294 value >>= EXPSIZE;
295 exp++;
296 }
297
298 /*
299 * Clean it up and polish it off.
300 */
301 exp <<= MANTSIZE; /* Shift the exponent into place */
302 exp += value; /* and add on the mantissa. */
303 return exp;
304}
305
306#if ACCT_VERSION==1 || ACCT_VERSION==2
307/*
308 * encode an u64 into a comp2_t (24 bits)
309 *
310 * Format: 5 bit base 2 exponent, 20 bits mantissa.
311 * The leading bit of the mantissa is not stored, but implied for
312 * non-zero exponents.
313 * Largest encodable value is 50 bits.
314 */
315
316#define MANTSIZE2 20 /* 20 bit mantissa. */
317#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
318#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
319#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
320
321static comp2_t encode_comp2_t(u64 value)
322{
323 int exp, rnd;
324
325 exp = (value > (MAXFRACT2>>1));
326 rnd = 0;
327 while (value > MAXFRACT2) {
328 rnd = value & 1;
329 value >>= 1;
330 exp++;
331 }
332
333 /*
334 * If we need to round up, do it (and handle overflow correctly).
335 */
336 if (rnd && (++value > MAXFRACT2)) {
337 value >>= 1;
338 exp++;
339 }
340
341 if (exp > MAXEXP2) {
342 /* Overflow. Return largest representable number instead. */
343 return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
344 } else {
345 return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
346 }
347}
348#endif
349
350#if ACCT_VERSION==3
351/*
352 * encode an u64 into a 32 bit IEEE float
353 */
354static u32 encode_float(u64 value)
355{
356 unsigned exp = 190;
357 unsigned u;
358
359 if (value==0) return 0;
360 while ((s64)value > 0){
361 value <<= 1;
362 exp--;
363 }
364 u = (u32)(value >> 40) & 0x7fffffu;
365 return u | (exp << 23);
366}
367#endif
368
369/*
370 * Write an accounting entry for an exiting process
371 *
372 * The acct_process() call is the workhorse of the process
373 * accounting system. The struct acct is built here and then written
374 * into the accounting file. This function should only be called from
375 * do_exit().
376 */
377
378/*
379 * do_acct_process does all actual work. Caller holds the reference to file.
380 */
381static void do_acct_process(long exitcode, struct file *file)
382{
383 acct_t ac;
384 mm_segment_t fs;
385 unsigned long vsize;
386 unsigned long flim;
387 u64 elapsed;
388 u64 run_time;
389 struct timespec uptime;
390
391 /*
392 * First check to see if there is enough free space to continue
393 * the process accounting system.
394 */
395 if (!check_free_space(file))
396 return;
397
398 /*
399 * Fill the accounting struct with the needed info as recorded
400 * by the different kernel functions.
401 */
402 memset((caddr_t)&ac, 0, sizeof(acct_t));
403
404 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
405 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
406
407 /* calculate run_time in nsec*/
408 do_posix_clock_monotonic_gettime(&uptime);
409 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
410 run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
411 + current->start_time.tv_nsec;
412 /* convert nsec -> AHZ */
413 elapsed = nsec_to_AHZ(run_time);
414#if ACCT_VERSION==3
415 ac.ac_etime = encode_float(elapsed);
416#else
417 ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
418 (unsigned long) elapsed : (unsigned long) -1l);
419#endif
420#if ACCT_VERSION==1 || ACCT_VERSION==2
421 {
422 /* new enlarged etime field */
423 comp2_t etime = encode_comp2_t(elapsed);
424 ac.ac_etime_hi = etime >> 16;
425 ac.ac_etime_lo = (u16) etime;
426 }
427#endif
428 do_div(elapsed, AHZ);
429 ac.ac_btime = xtime.tv_sec - elapsed;
430 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
431 current->signal->utime +
432 current->group_leader->utime));
433 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
434 current->signal->stime +
435 current->group_leader->stime));
436 /* we really need to bite the bullet and change layout */
437 ac.ac_uid = current->uid;
438 ac.ac_gid = current->gid;
439#if ACCT_VERSION==2
440 ac.ac_ahz = AHZ;
441#endif
442#if ACCT_VERSION==1 || ACCT_VERSION==2
443 /* backward-compatible 16 bit fields */
444 ac.ac_uid16 = current->uid;
445 ac.ac_gid16 = current->gid;
446#endif
447#if ACCT_VERSION==3
448 ac.ac_pid = current->tgid;
449 ac.ac_ppid = current->parent->tgid;
450#endif
451
452 read_lock(&tasklist_lock); /* pin current->signal */
453 ac.ac_tty = current->signal->tty ?
454 old_encode_dev(tty_devnum(current->signal->tty)) : 0;
455 read_unlock(&tasklist_lock);
456
457 ac.ac_flag = 0;
458 if (current->flags & PF_FORKNOEXEC)
459 ac.ac_flag |= AFORK;
460 if (current->flags & PF_SUPERPRIV)
461 ac.ac_flag |= ASU;
462 if (current->flags & PF_DUMPCORE)
463 ac.ac_flag |= ACORE;
464 if (current->flags & PF_SIGNALED)
465 ac.ac_flag |= AXSIG;
466
467 vsize = 0;
468 if (current->mm) {
469 struct vm_area_struct *vma;
470 down_read(&current->mm->mmap_sem);
471 vma = current->mm->mmap;
472 while (vma) {
473 vsize += vma->vm_end - vma->vm_start;
474 vma = vma->vm_next;
475 }
476 up_read(&current->mm->mmap_sem);
477 }
478 vsize = vsize / 1024;
479 ac.ac_mem = encode_comp_t(vsize);
480 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
481 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
482 ac.ac_minflt = encode_comp_t(current->signal->min_flt +
483 current->group_leader->min_flt);
484 ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
485 current->group_leader->maj_flt);
486 ac.ac_swaps = encode_comp_t(0);
487 ac.ac_exitcode = exitcode;
488
489 /*
490 * Kernel segment override to datasegment and write it
491 * to the accounting file.
492 */
493 fs = get_fs();
494 set_fs(KERNEL_DS);
495 /*
496 * Accounting records are not subject to resource limits.
497 */
498 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
499 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
500 file->f_op->write(file, (char *)&ac,
501 sizeof(acct_t), &file->f_pos);
502 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
503 set_fs(fs);
504}
505
506/*
507 * acct_process - now just a wrapper around do_acct_process
508 */
509void acct_process(long exitcode)
510{
511 struct file *file = NULL;
512
513 /*
514 * accelerate the common fastpath:
515 */
516 if (!acct_globals.file)
517 return;
518
519 spin_lock(&acct_globals.lock);
520 file = acct_globals.file;
521 if (unlikely(!file)) {
522 spin_unlock(&acct_globals.lock);
523 return;
524 }
525 get_file(file);
526 spin_unlock(&acct_globals.lock);
527
528 do_acct_process(exitcode, file);
529 fput(file);
530}
531
532
533/*
534 * acct_update_integrals
535 * - update mm integral fields in task_struct
536 */
537void acct_update_integrals(struct task_struct *tsk)
538{
539 if (likely(tsk->mm)) {
540 long delta = tsk->stime - tsk->acct_stimexpd;
541
542 if (delta == 0)
543 return;
544 tsk->acct_stimexpd = tsk->stime;
545 tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
546 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
547 }
548}
549
550/*
551 * acct_clear_integrals
552 * - clear the mm integral fields in task_struct
553 */
554void acct_clear_integrals(struct task_struct *tsk)
555{
556 if (tsk) {
557 tsk->acct_stimexpd = 0;
558 tsk->acct_rss_mem1 = 0;
559 tsk->acct_vm_mem1 = 0;
560 }
561}
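
Every time field that do_acct_process() writes above is stored as a comp_t, the 13-bit mantissa / 3-bit base-8 exponent format produced by encode_comp_t(). The header comment notes that interpreting the log is left to user-level programs; decoding a comp_t there is just a few shifts. A stand-alone sketch (illustrative only, not code from this commit):

/* Illustration of the comp_t format written by kernel/acct.c:
 * bits 0-12 hold the mantissa, bits 13-15 hold a base-8 exponent,
 * so the decoded value is mantissa << (3 * exponent). */
#include <stdio.h>
#include <stdint.h>

static unsigned long decode_comp_t(uint16_t c)
{
	unsigned long value = c & 0x1fff;	/* low 13 bits: mantissa */
	int exp = (c >> 13) & 0x7;		/* high 3 bits: exponent */

	while (exp-- > 0)
		value <<= 3;			/* each exponent step multiplies by 8 */
	return value;
}

int main(void)
{
	uint16_t sample = (2 << 13) | 1563;	/* exponent 2, mantissa 1563 */

	printf("%lu\n", decode_comp_t(sample));	/* prints 100032 */
	return 0;
}

For example, encode_comp_t(100000) rounds to exponent 2, mantissa 1563, which decodes back to 100032: the format trades precision for range, topping out at 8191 * 8^7.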
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..0f84dd7af2c8
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,839 @@
1/* audit.c -- Auditing support -*- linux-c -*-
2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
3 * System-call specific features have moved to auditsc.c
4 *
5 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
6 * All Rights Reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
23 *
24 * Goals: 1) Integrate fully with SELinux.
25 * 2) Minimal run-time overhead:
26 * a) Minimal when syscall auditing is disabled (audit_enable=0).
27 * b) Small when syscall auditing is enabled and no audit record
28 * is generated (defer as much work as possible to record
29 * generation time):
30 * i) context is allocated,
31 * ii) names from getname are stored without a copy, and
32 * iii) inode information stored from path_lookup.
33 * 3) Ability to disable syscall auditing at boot time (audit=0).
34 * 4) Usable by other parts of the kernel (if audit_log* is called,
35 * then a syscall record will be generated automatically for the
36 * current syscall).
37 * 5) Netlink interface to user-space.
38 * 6) Support low-overhead kernel-based filtering to minimize the
39 * information that must be passed to user-space.
40 *
41 * Example user-space utilities: http://people.redhat.com/faith/audit/
42 */
43
44#include <linux/init.h>
45#include <asm/atomic.h>
46#include <asm/types.h>
47#include <linux/mm.h>
48#include <linux/module.h>
49
50#include <linux/audit.h>
51
52#include <net/sock.h>
53#include <linux/skbuff.h>
54#include <linux/netlink.h>
55
56/* No auditing will take place until audit_initialized != 0.
57 * (Initialization happens after skb_init is called.) */
58static int audit_initialized;
59
60/* No syscall auditing will take place unless audit_enabled != 0. */
61int audit_enabled;
62
63/* Default state when kernel boots without any parameters. */
64static int audit_default;
65
66/* If auditing cannot proceed, audit_failure selects what happens. */
67static int audit_failure = AUDIT_FAIL_PRINTK;
68
69/* If audit records are to be written to the netlink socket, audit_pid
70 * contains the (non-zero) pid. */
71static int audit_pid;
72
73/* If audit_rate_limit is non-zero, limit the rate of sending audit records
74 * to that number per second. This prevents DoS attacks, but results in
75 * audit records being dropped. */
76static int audit_rate_limit;
77
78/* Number of outstanding audit_buffers allowed. */
79static int audit_backlog_limit = 64;
80static atomic_t audit_backlog = ATOMIC_INIT(0);
81
82/* Records can be lost in several ways:
83 0) [suppressed in audit_alloc]
84 1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
85 2) out of memory in audit_log_move [alloc_skb]
86 3) suppressed due to audit_rate_limit
87 4) suppressed due to audit_backlog_limit
88*/
89static atomic_t audit_lost = ATOMIC_INIT(0);
90
91/* The netlink socket. */
92static struct sock *audit_sock;
93
94/* There are two lists of audit buffers. The txlist contains audit
95 * buffers that cannot be sent immediately to the netlink device because
96 * we are in an irq context (these are sent later in a tasklet).
97 *
98 * The second list is a list of pre-allocated audit buffers (if more
99 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
100 * being placed on the freelist). */
101static DEFINE_SPINLOCK(audit_txlist_lock);
102static DEFINE_SPINLOCK(audit_freelist_lock);
103static int audit_freelist_count = 0;
104static LIST_HEAD(audit_txlist);
105static LIST_HEAD(audit_freelist);
106
107/* There are three lists of rules -- one to search at task creation
108 * time, one to search at syscall entry time, and another to search at
109 * syscall exit time. */
110static LIST_HEAD(audit_tsklist);
111static LIST_HEAD(audit_entlist);
112static LIST_HEAD(audit_extlist);
113
114/* The netlink socket is only to be read by 1 CPU, which lets us assume
115 * that list additions and deletions never happen simultaneously in
116 * auditsc.c */
117static DECLARE_MUTEX(audit_netlink_sem);
118
119/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
120 * audit records. Since printk uses a 1024 byte buffer, this buffer
121 * should be at least that large. */
122#define AUDIT_BUFSIZ 1024
123
124/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
125 * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
126#define AUDIT_MAXFREE (2*NR_CPUS)
127
128/* The audit_buffer is used when formatting an audit record. The caller
129 * locks briefly to get the record off the freelist or to allocate the
130 * buffer, and locks briefly to send the buffer to the netlink layer or
131 * to place it on a transmit queue. Multiple audit_buffers can be in
132 * use simultaneously. */
133struct audit_buffer {
134 struct list_head list;
135 struct sk_buff_head sklist; /* formatted skbs ready to send */
136 struct audit_context *ctx; /* NULL or associated context */
137 int len; /* used area of tmp */
138 char tmp[AUDIT_BUFSIZ];
139
140 /* Pointer to header and contents */
141 struct nlmsghdr *nlh;
142 int total;
143 int type;
144 int pid;
145 int count; /* Times requeued */
146};
147
148void audit_set_type(struct audit_buffer *ab, int type)
149{
150 ab->type = type;
151}
152
153struct audit_entry {
154 struct list_head list;
155 struct audit_rule rule;
156};
157
158static void audit_log_end_irq(struct audit_buffer *ab);
159static void audit_log_end_fast(struct audit_buffer *ab);
160
161static void audit_panic(const char *message)
162{
163 switch (audit_failure)
164 {
165 case AUDIT_FAIL_SILENT:
166 break;
167 case AUDIT_FAIL_PRINTK:
168 printk(KERN_ERR "audit: %s\n", message);
169 break;
170 case AUDIT_FAIL_PANIC:
171 panic("audit: %s\n", message);
172 break;
173 }
174}
175
176static inline int audit_rate_check(void)
177{
178 static unsigned long last_check = 0;
179 static int messages = 0;
180 static DEFINE_SPINLOCK(lock);
181 unsigned long flags;
182 unsigned long now;
183 unsigned long elapsed;
184 int retval = 0;
185
186 if (!audit_rate_limit) return 1;
187
188 spin_lock_irqsave(&lock, flags);
189 if (++messages < audit_rate_limit) {
190 retval = 1;
191 } else {
192 now = jiffies;
193 elapsed = now - last_check;
194 if (elapsed > HZ) {
195 last_check = now;
196 messages = 0;
197 retval = 1;
198 }
199 }
200 spin_unlock_irqrestore(&lock, flags);
201
202 return retval;
203}
204
205/* Emit at least 1 message per second, even if audit_rate_check is
206 * throttling. */
207void audit_log_lost(const char *message)
208{
209 static unsigned long last_msg = 0;
210 static DEFINE_SPINLOCK(lock);
211 unsigned long flags;
212 unsigned long now;
213 int print;
214
215 atomic_inc(&audit_lost);
216
217 print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
218
219 if (!print) {
220 spin_lock_irqsave(&lock, flags);
221 now = jiffies;
222 if (now - last_msg > HZ) {
223 print = 1;
224 last_msg = now;
225 }
226 spin_unlock_irqrestore(&lock, flags);
227 }
228
229 if (print) {
230 printk(KERN_WARNING
231 "audit: audit_lost=%d audit_backlog=%d"
232 " audit_rate_limit=%d audit_backlog_limit=%d\n",
233 atomic_read(&audit_lost),
234 atomic_read(&audit_backlog),
235 audit_rate_limit,
236 audit_backlog_limit);
237 audit_panic(message);
238 }
239
240}
241
242static int audit_set_rate_limit(int limit)
243{
244 int old = audit_rate_limit;
245 audit_rate_limit = limit;
246 audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
247 audit_rate_limit, old);
248 return old;
249}
250
251static int audit_set_backlog_limit(int limit)
252{
253 int old = audit_backlog_limit;
254 audit_backlog_limit = limit;
255 audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
256 audit_backlog_limit, old);
257 return old;
258}
259
260static int audit_set_enabled(int state)
261{
262 int old = audit_enabled;
263 if (state != 0 && state != 1)
264 return -EINVAL;
265 audit_enabled = state;
266 audit_log(current->audit_context, "audit_enabled=%d old=%d",
267 audit_enabled, old);
268 return old;
269}
270
271static int audit_set_failure(int state)
272{
273 int old = audit_failure;
274 if (state != AUDIT_FAIL_SILENT
275 && state != AUDIT_FAIL_PRINTK
276 && state != AUDIT_FAIL_PANIC)
277 return -EINVAL;
278 audit_failure = state;
279 audit_log(current->audit_context, "audit_failure=%d old=%d",
280 audit_failure, old);
281 return old;
282}
283
284#ifdef CONFIG_NET
285void audit_send_reply(int pid, int seq, int type, int done, int multi,
286 void *payload, int size)
287{
288 struct sk_buff *skb;
289 struct nlmsghdr *nlh;
290 int len = NLMSG_SPACE(size);
291 void *data;
292 int flags = multi ? NLM_F_MULTI : 0;
293 int t = done ? NLMSG_DONE : type;
294
295 skb = alloc_skb(len, GFP_KERNEL);
296 if (!skb)
297 goto nlmsg_failure;
298
299 nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
300 nlh->nlmsg_flags = flags;
301 data = NLMSG_DATA(nlh);
302 memcpy(data, payload, size);
303 netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
304 return;
305
306nlmsg_failure: /* Used by NLMSG_PUT */
307 if (skb)
308 kfree_skb(skb);
309}
310
311/*
312 * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
313 * control messages.
314 */
315static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
316{
317 int err = 0;
318
319 switch (msg_type) {
320 case AUDIT_GET:
321 case AUDIT_LIST:
322 case AUDIT_SET:
323 case AUDIT_ADD:
324 case AUDIT_DEL:
325 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
326 err = -EPERM;
327 break;
328 case AUDIT_USER:
329 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
330 err = -EPERM;
331 break;
332 default: /* bad msg */
333 err = -EINVAL;
334 }
335
336 return err;
337}
338
339static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
340{
341 u32 uid, pid, seq;
342 void *data;
343 struct audit_status *status_get, status_set;
344 int err;
345 struct audit_buffer *ab;
346 u16 msg_type = nlh->nlmsg_type;
347
348 err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
349 if (err)
350 return err;
351
352 pid = NETLINK_CREDS(skb)->pid;
353 uid = NETLINK_CREDS(skb)->uid;
354 seq = nlh->nlmsg_seq;
355 data = NLMSG_DATA(nlh);
356
357 switch (msg_type) {
358 case AUDIT_GET:
359 status_set.enabled = audit_enabled;
360 status_set.failure = audit_failure;
361 status_set.pid = audit_pid;
362 status_set.rate_limit = audit_rate_limit;
363 status_set.backlog_limit = audit_backlog_limit;
364 status_set.lost = atomic_read(&audit_lost);
365 status_set.backlog = atomic_read(&audit_backlog);
366 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
367 &status_set, sizeof(status_set));
368 break;
369 case AUDIT_SET:
370 if (nlh->nlmsg_len < sizeof(struct audit_status))
371 return -EINVAL;
372 status_get = (struct audit_status *)data;
373 if (status_get->mask & AUDIT_STATUS_ENABLED) {
374 err = audit_set_enabled(status_get->enabled);
375 if (err < 0) return err;
376 }
377 if (status_get->mask & AUDIT_STATUS_FAILURE) {
378 err = audit_set_failure(status_get->failure);
379 if (err < 0) return err;
380 }
381 if (status_get->mask & AUDIT_STATUS_PID) {
382 int old = audit_pid;
383 audit_pid = status_get->pid;
384 audit_log(current->audit_context,
385 "audit_pid=%d old=%d", audit_pid, old);
386 }
387 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
388 audit_set_rate_limit(status_get->rate_limit);
389 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
390 audit_set_backlog_limit(status_get->backlog_limit);
391 break;
392 case AUDIT_USER:
393 ab = audit_log_start(NULL);
394 if (!ab)
395 break; /* audit_panic has been called */
396 audit_log_format(ab,
397 "user pid=%d uid=%d length=%d msg='%.1024s'",
398 pid, uid,
399 (int)(nlh->nlmsg_len
400 - ((char *)data - (char *)nlh)),
401 (char *)data);
402 ab->type = AUDIT_USER;
403 ab->pid = pid;
404 audit_log_end(ab);
405 break;
406 case AUDIT_ADD:
407 case AUDIT_DEL:
408 if (nlh->nlmsg_len < sizeof(struct audit_rule))
409 return -EINVAL;
410 /* fallthrough */
411 case AUDIT_LIST:
412#ifdef CONFIG_AUDITSYSCALL
413 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
414 uid, seq, data);
415#else
416 err = -EOPNOTSUPP;
417#endif
418 break;
419 default:
420 err = -EINVAL;
421 break;
422 }
423
424 return err < 0 ? err : 0;
425}
426
427/* Get message from skb (based on rtnetlink_rcv_skb). Each message is
428 * processed by audit_receive_msg. Malformed skbs with wrong length are
429 * discarded silently. */
430static int audit_receive_skb(struct sk_buff *skb)
431{
432 int err;
433 struct nlmsghdr *nlh;
434 u32 rlen;
435
436 while (skb->len >= NLMSG_SPACE(0)) {
437 nlh = (struct nlmsghdr *)skb->data;
438 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
439 return 0;
440 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
441 if (rlen > skb->len)
442 rlen = skb->len;
443 if ((err = audit_receive_msg(skb, nlh))) {
444 netlink_ack(skb, nlh, err);
445 } else if (nlh->nlmsg_flags & NLM_F_ACK)
446 netlink_ack(skb, nlh, 0);
447 skb_pull(skb, rlen);
448 }
449 return 0;
450}
451
452/* Receive messages from netlink socket. */
453static void audit_receive(struct sock *sk, int length)
454{
455 struct sk_buff *skb;
456
457 if (down_trylock(&audit_netlink_sem))
458 return;
459
460 /* FIXME: this must not cause starvation */
461 while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
462 if (audit_receive_skb(skb) && skb->len)
463 skb_queue_head(&sk->sk_receive_queue, skb);
464 else
465 kfree_skb(skb);
466 }
467 up(&audit_netlink_sem);
468}
469
470/* Move data from tmp buffer into an skb. This is an extra copy, and
471 * that is unfortunate. However, the copy will only occur when a record
472 * is being written to user space, which is already a high-overhead
473 * operation. (Elimination of the copy is possible, for example, by
474 * writing directly into a pre-allocated skb, at the cost of wasting
475 * memory.) */
476static void audit_log_move(struct audit_buffer *ab)
477{
478 struct sk_buff *skb;
479 char *start;
480 int extra = ab->nlh ? 0 : NLMSG_SPACE(0);
481
482 /* possible resubmission */
483 if (ab->len == 0)
484 return;
485
486 skb = skb_peek(&ab->sklist);
487 if (!skb || skb_tailroom(skb) <= ab->len + extra) {
488 skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
489 if (!skb) {
490 ab->len = 0; /* Lose information in ab->tmp */
491 audit_log_lost("out of memory in audit_log_move");
492 return;
493 }
494 __skb_queue_tail(&ab->sklist, skb);
495 if (!ab->nlh)
496 ab->nlh = (struct nlmsghdr *)skb_put(skb,
497 NLMSG_SPACE(0));
498 }
499 start = skb_put(skb, ab->len);
500 memcpy(start, ab->tmp, ab->len);
501 ab->len = 0;
502}
503
504/* Iterate over the skbuff in the audit_buffer, sending their contents
505 * to user space. */
506static inline int audit_log_drain(struct audit_buffer *ab)
507{
508 struct sk_buff *skb;
509
510 while ((skb = skb_dequeue(&ab->sklist))) {
511 int retval = 0;
512
513 if (audit_pid) {
514 if (ab->nlh) {
515 ab->nlh->nlmsg_len = ab->total;
516 ab->nlh->nlmsg_type = ab->type;
517 ab->nlh->nlmsg_flags = 0;
518 ab->nlh->nlmsg_seq = 0;
519 ab->nlh->nlmsg_pid = ab->pid;
520 }
521 skb_get(skb); /* because netlink_* frees */
522 retval = netlink_unicast(audit_sock, skb, audit_pid,
523 MSG_DONTWAIT);
524 }
525 if (retval == -EAGAIN && ab->count < 5) {
526 ++ab->count;
527 skb_queue_tail(&ab->sklist, skb);
528 audit_log_end_irq(ab);
529 return 1;
530 }
531 if (retval < 0) {
532 if (retval == -ECONNREFUSED) {
533 printk(KERN_ERR
534 "audit: *NO* daemon at audit_pid=%d\n",
535 audit_pid);
536 audit_pid = 0;
537 } else
538 audit_log_lost("netlink socket too busy");
539 }
540 if (!audit_pid) { /* No daemon */
541 int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
542 int len = skb->len - offset;
543 printk(KERN_ERR "%*.*s\n",
544 len, len, skb->data + offset);
545 }
546 kfree_skb(skb);
547 ab->nlh = NULL;
548 }
549 return 0;
550}
551
552/* Initialize audit support at boot time. */
553static int __init audit_init(void)
554{
555 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
556 audit_default ? "enabled" : "disabled");
557 audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
558 if (!audit_sock)
559 audit_panic("cannot initialize netlink socket");
560
561 audit_initialized = 1;
562 audit_enabled = audit_default;
563 audit_log(NULL, "initialized");
564 return 0;
565}
566
567#else
568/* Without CONFIG_NET, we have no skbuffs. For now, print what we have
569 * in the buffer. */
570static void audit_log_move(struct audit_buffer *ab)
571{
572 printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
573 ab->len = 0;
574}
575
576static inline int audit_log_drain(struct audit_buffer *ab)
577{
578 return 0;
579}
580
581/* Initialize audit support at boot time. */
582int __init audit_init(void)
583{
584 printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
585 audit_sock = NULL;
586 audit_pid = 0;
587
588 audit_initialized = 1;
589 audit_enabled = audit_default;
590 audit_log(NULL, "initialized");
591 return 0;
592}
593#endif
594
595__initcall(audit_init);
596
597/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
598static int __init audit_enable(char *str)
599{
600 audit_default = !!simple_strtol(str, NULL, 0);
601 printk(KERN_INFO "audit: %s%s\n",
602 audit_default ? "enabled" : "disabled",
603 audit_initialized ? "" : " (after initialization)");
604 if (audit_initialized)
605 audit_enabled = audit_default;
606 return 0;
607}
608
609__setup("audit=", audit_enable);
610
611
612/* Obtain an audit buffer. This routine does locking to obtain the
613 * audit buffer, but then no locking is required for calls to
614 * audit_log_*format. If the tsk is a task that is currently in a
615 * syscall, then the syscall is marked as auditable and an audit record
616 * will be written at syscall exit. If there is no associated task, tsk
617 * should be NULL. */
618struct audit_buffer *audit_log_start(struct audit_context *ctx)
619{
620 struct audit_buffer *ab = NULL;
621 unsigned long flags;
622 struct timespec t;
623 int serial = 0;
624
625 if (!audit_initialized)
626 return NULL;
627
628 if (audit_backlog_limit
629 && atomic_read(&audit_backlog) > audit_backlog_limit) {
630 if (audit_rate_check())
631 printk(KERN_WARNING
632 "audit: audit_backlog=%d > "
633 "audit_backlog_limit=%d\n",
634 atomic_read(&audit_backlog),
635 audit_backlog_limit);
636 audit_log_lost("backlog limit exceeded");
637 return NULL;
638 }
639
640 spin_lock_irqsave(&audit_freelist_lock, flags);
641 if (!list_empty(&audit_freelist)) {
642 ab = list_entry(audit_freelist.next,
643 struct audit_buffer, list);
644 list_del(&ab->list);
645 --audit_freelist_count;
646 }
647 spin_unlock_irqrestore(&audit_freelist_lock, flags);
648
649 if (!ab)
650 ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
651 if (!ab) {
652 audit_log_lost("out of memory in audit_log_start");
653 return NULL;
654 }
655
656 atomic_inc(&audit_backlog);
657 skb_queue_head_init(&ab->sklist);
658
659 ab->ctx = ctx;
660 ab->len = 0;
661 ab->nlh = NULL;
662 ab->total = 0;
663 ab->type = AUDIT_KERNEL;
664 ab->pid = 0;
665 ab->count = 0;
666
667#ifdef CONFIG_AUDITSYSCALL
668 if (ab->ctx)
669 audit_get_stamp(ab->ctx, &t, &serial);
670 else
671#endif
672 t = CURRENT_TIME;
673
674 audit_log_format(ab, "audit(%lu.%03lu:%u): ",
675 t.tv_sec, t.tv_nsec/1000000, serial);
676 return ab;
677}
678
679
680/* Format an audit message into the audit buffer. If there isn't enough
681 * room in the audit buffer, more room will be allocated and vsnprintf
682 * will be called a second time. Currently, we assume that a printk
683 * can't format a message larger than 1024 bytes, so we don't either. */
684static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
685 va_list args)
686{
687 int len, avail;
688
689 if (!ab)
690 return;
691
692 avail = sizeof(ab->tmp) - ab->len;
693 if (avail <= 0) {
694 audit_log_move(ab);
695 avail = sizeof(ab->tmp) - ab->len;
696 }
697 len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
698 if (len >= avail) {
699 /* The printk buffer is 1024 bytes long, so if we get
700 * here and AUDIT_BUFSIZ is at least 1024, then we can
701 * log everything that printk could have logged. */
702 audit_log_move(ab);
703 avail = sizeof(ab->tmp) - ab->len;
704 len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
705 }
706 ab->len += (len < avail) ? len : avail;
707 ab->total += (len < avail) ? len : avail;
708}
709
710/* Format a message into the audit buffer. All the work is done in
711 * audit_log_vformat. */
712void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
713{
714 va_list args;
715
716 if (!ab)
717 return;
718 va_start(args, fmt);
719 audit_log_vformat(ab, fmt, args);
720 va_end(args);
721}
722
723/* This is a helper-function to print the d_path without using a static
724 * buffer or allocating another buffer in addition to the one in
725 * audit_buffer. */
726void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
727 struct dentry *dentry, struct vfsmount *vfsmnt)
728{
729 char *p;
730 int len, avail;
731
732 if (prefix) audit_log_format(ab, " %s", prefix);
733
734 if (ab->len > 128)
735 audit_log_move(ab);
736 avail = sizeof(ab->tmp) - ab->len;
737 p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
738 if (IS_ERR(p)) {
739 /* FIXME: can we save some information here? */
740 audit_log_format(ab, "<toolong>");
741 } else {
742 /* path isn't at start of buffer */
743 len = (ab->tmp + sizeof(ab->tmp) - 1) - p;
744 memmove(ab->tmp + ab->len, p, len);
745 ab->len += len;
746 ab->total += len;
747 }
748}
749
750/* Remove queued messages from the audit_txlist and send them to userspace. */
751static void audit_tasklet_handler(unsigned long arg)
752{
753 LIST_HEAD(list);
754 struct audit_buffer *ab;
755 unsigned long flags;
756
757 spin_lock_irqsave(&audit_txlist_lock, flags);
758 list_splice_init(&audit_txlist, &list);
759 spin_unlock_irqrestore(&audit_txlist_lock, flags);
760
761 while (!list_empty(&list)) {
762 ab = list_entry(list.next, struct audit_buffer, list);
763 list_del(&ab->list);
764 audit_log_end_fast(ab);
765 }
766}
767
768static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
769
770/* The netlink_* functions cannot be called inside an irq context, so
771 * the audit buffer is placed on a queue and a tasklet is scheduled to
772 * remove them from the queue outside the irq context. May be called in
773 * any context. */
774static void audit_log_end_irq(struct audit_buffer *ab)
775{
776 unsigned long flags;
777
778 if (!ab)
779 return;
780 spin_lock_irqsave(&audit_txlist_lock, flags);
781 list_add_tail(&ab->list, &audit_txlist);
782 spin_unlock_irqrestore(&audit_txlist_lock, flags);
783
784 tasklet_schedule(&audit_tasklet);
785}
786
787/* Send the message in the audit buffer directly to user space. May not
788 * be called in an irq context. */
789static void audit_log_end_fast(struct audit_buffer *ab)
790{
791 unsigned long flags;
792
793 BUG_ON(in_irq());
794 if (!ab)
795 return;
796 if (!audit_rate_check()) {
797 audit_log_lost("rate limit exceeded");
798 } else {
799 audit_log_move(ab);
800 if (audit_log_drain(ab))
801 return;
802 }
803
804 atomic_dec(&audit_backlog);
805 spin_lock_irqsave(&audit_freelist_lock, flags);
806 if (++audit_freelist_count > AUDIT_MAXFREE)
807 kfree(ab);
808 else
809 list_add(&ab->list, &audit_freelist);
810 spin_unlock_irqrestore(&audit_freelist_lock, flags);
811}
812
813/* Send or queue the message in the audit buffer, depending on the
814 * current context. (A convenience function that may be called in any
815 * context.) */
816void audit_log_end(struct audit_buffer *ab)
817{
818 if (in_irq())
819 audit_log_end_irq(ab);
820 else
821 audit_log_end_fast(ab);
822}
823
824/* Log an audit record. This is a convenience function that calls
825 * audit_log_start, audit_log_vformat, and audit_log_end. It may be
826 * called in any context. */
827void audit_log(struct audit_context *ctx, const char *fmt, ...)
828{
829 struct audit_buffer *ab;
830 va_list args;
831
832 ab = audit_log_start(ctx);
833 if (ab) {
834 va_start(args, fmt);
835 audit_log_vformat(ab, fmt, args);
836 va_end(args);
837 audit_log_end(ab);
838 }
839}
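
The user-space side of all this is a NETLINK_AUDIT socket: audit_receive_msg() above services AUDIT_GET, AUDIT_SET, AUDIT_ADD/AUDIT_DEL and AUDIT_USER requests and answers through audit_send_reply(). As a hedged sketch of the simplest client (it needs CAP_AUDIT_CONTROL, uses the struct audit_status layout from <linux/audit.h>, and is not part of this commit), an AUDIT_GET query looks roughly like this:

/* Minimal AUDIT_GET client sketch: query the audit subsystem status over
 * NETLINK_AUDIT. Requires CAP_AUDIT_CONTROL; error handling kept minimal. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/audit.h>

int main(void)
{
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	struct nlmsghdr req;
	char buf[8192];
	int fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (fd < 0) {
		perror("socket(NETLINK_AUDIT)");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlmsg_len = NLMSG_LENGTH(0);	/* AUDIT_GET carries no payload */
	req.nlmsg_type = AUDIT_GET;
	req.nlmsg_flags = NLM_F_REQUEST;
	req.nlmsg_seq = 1;

	if (sendto(fd, &req, req.nlmsg_len, 0,
		   (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("sendto");
		return 1;
	}

	/* The kernel replies with an AUDIT_GET message built by audit_send_reply(). */
	if (recv(fd, buf, sizeof(buf), 0) > 0) {
		struct nlmsghdr *rep = (struct nlmsghdr *)buf;

		if (rep->nlmsg_type == AUDIT_GET) {
			struct audit_status *s = NLMSG_DATA(rep);

			printf("enabled=%u pid=%u backlog=%u lost=%u\n",
			       s->enabled, s->pid, s->backlog, s->lost);
		}
	}
	close(fd);
	return 0;
}

The reply payload is the same struct audit_status that the AUDIT_GET case fills in above; a real audit daemon would additionally register its pid via AUDIT_SET so that records flow to it instead of the printk fallback in audit_log_drain().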
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000000..8c454852d6a5
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,1015 @@
1/* auditsc.c -- System-call auditing support -*- linux-c -*-
2 * Handles all system-call specific auditing features.
3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * All Rights Reserved.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
22 *
23 * Many of the ideas implemented here are from Stephen C. Tweedie,
24 * especially the idea of avoiding a copy by using getname.
25 *
26 * The method for actual interception of syscall entry and exit (not in
27 * this file -- see entry.S) is based on a GPL'd patch written by
28 * okir@suse.de and Copyright 2003 SuSE Linux AG.
29 *
30 */
31
32#include <linux/init.h>
33#include <asm/atomic.h>
34#include <asm/types.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37
38#include <linux/audit.h>
39#include <linux/personality.h>
40#include <linux/time.h>
41#include <asm/unistd.h>
42
43/* 0 = no checking
44 1 = put_count checking
45 2 = verbose put_count checking
46*/
47#define AUDIT_DEBUG 0
48
49/* No syscall auditing will take place unless audit_enabled != 0. */
50extern int audit_enabled;
51
52/* AUDIT_NAMES is the number of slots we reserve in the audit_context
53 * for saving names from getname(). */
54#define AUDIT_NAMES 20
55
56/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the
57 * audit_context from being used for nameless inodes from
58 * path_lookup. */
59#define AUDIT_NAMES_RESERVED 7
60
61/* At task start time, the audit_state is set in the audit_context using
62 a per-task filter. At syscall entry, the audit_state is augmented by
63 the syscall filter. */
64enum audit_state {
65 AUDIT_DISABLED, /* Do not create per-task audit_context.
66 * No syscall-specific audit records can
67 * be generated. */
68 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
69 * but don't necessarily fill it in at
70 * syscall entry time (i.e., filter
71 * instead). */
72 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
73 * and always fill it in at syscall
74 * entry time. This makes a full
75 * syscall record available if some
76 * other part of the kernel decides it
77 * should be recorded. */
78 AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
79 * always fill it in at syscall entry
80 * time, and always write out the audit
81 * record at syscall exit time. */
82};
83
84/* When fs/namei.c:getname() is called, we store the pointer in name and
85 * we don't let putname() free it (instead we free all of the saved
86 * pointers at syscall exit time).
87 *
88 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
89struct audit_names {
90 const char *name;
91 unsigned long ino;
92 dev_t dev;
93 umode_t mode;
94 uid_t uid;
95 gid_t gid;
96 dev_t rdev;
97};
98
99struct audit_aux_data {
100 struct audit_aux_data *next;
101 int type;
102};
103
104#define AUDIT_AUX_IPCPERM 0
105
106struct audit_aux_data_ipcctl {
107 struct audit_aux_data d;
108 struct ipc_perm p;
109 unsigned long qbytes;
110 uid_t uid;
111 gid_t gid;
112 mode_t mode;
113};
114
115
116/* The per-task audit context. */
117struct audit_context {
118 int in_syscall; /* 1 if task is in a syscall */
119 enum audit_state state;
120 unsigned int serial; /* serial number for record */
121 struct timespec ctime; /* time of syscall entry */
122 uid_t loginuid; /* login uid (identity) */
123 int major; /* syscall number */
124 unsigned long argv[4]; /* syscall arguments */
125 int return_valid; /* return code is valid */
126 int return_code;/* syscall return code */
127 int auditable; /* 1 if record should be written */
128 int name_count;
129 struct audit_names names[AUDIT_NAMES];
130 struct audit_context *previous; /* For nested syscalls */
131 struct audit_aux_data *aux;
132
133 /* Save things to print about task_struct */
134 pid_t pid;
135 uid_t uid, euid, suid, fsuid;
136 gid_t gid, egid, sgid, fsgid;
137 unsigned long personality;
138
139#if AUDIT_DEBUG
140 int put_count;
141 int ino_count;
142#endif
143};
144
145 /* Public API */
146/* There are three lists of rules -- one to search at task creation
147 * time, one to search at syscall entry time, and another to search at
148 * syscall exit time. */
149static LIST_HEAD(audit_tsklist);
150static LIST_HEAD(audit_entlist);
151static LIST_HEAD(audit_extlist);
152
153struct audit_entry {
154 struct list_head list;
155 struct rcu_head rcu;
156 struct audit_rule rule;
157};
158
159/* Check to see if two rules are identical. It is called from
160 * audit_del_rule during AUDIT_DEL. */
161static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
162{
163 int i;
164
165 if (a->flags != b->flags)
166 return 1;
167
168 if (a->action != b->action)
169 return 1;
170
171 if (a->field_count != b->field_count)
172 return 1;
173
174 for (i = 0; i < a->field_count; i++) {
175 if (a->fields[i] != b->fields[i]
176 || a->values[i] != b->values[i])
177 return 1;
178 }
179
180 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
181 if (a->mask[i] != b->mask[i])
182 return 1;
183
184 return 0;
185}
186
187/* Note that audit_add_rule and audit_del_rule are called via
188 * audit_receive() in audit.c, and are protected by
189 * audit_netlink_sem. */
190static inline int audit_add_rule(struct audit_entry *entry,
191 struct list_head *list)
192{
193 if (entry->rule.flags & AUDIT_PREPEND) {
194 entry->rule.flags &= ~AUDIT_PREPEND;
195 list_add_rcu(&entry->list, list);
196 } else {
197 list_add_tail_rcu(&entry->list, list);
198 }
199 return 0;
200}
201
202static void audit_free_rule(struct rcu_head *head)
203{
204 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
205 kfree(e);
206}
207
208/* Note that audit_add_rule and audit_del_rule are called via
209 * audit_receive() in audit.c, and are protected by
210 * audit_netlink_sem. */
211static inline int audit_del_rule(struct audit_rule *rule,
212 struct list_head *list)
213{
214 struct audit_entry *e;
215
216 /* Do not use the _rcu iterator here, since this is the only
217 * deletion routine. */
218 list_for_each_entry(e, list, list) {
219 if (!audit_compare_rule(rule, &e->rule)) {
220 list_del_rcu(&e->list);
221 call_rcu(&e->rcu, audit_free_rule);
222 return 0;
223 }
224 }
225 return -EFAULT; /* No matching rule */
226}
227
228#ifdef CONFIG_NET
229/* Copy rule from user-space to kernel-space. Called during
230 * AUDIT_ADD. */
231static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
232{
233 int i;
234
235 if (s->action != AUDIT_NEVER
236 && s->action != AUDIT_POSSIBLE
237 && s->action != AUDIT_ALWAYS)
238 return -1;
239 if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
240 return -1;
241
242 d->flags = s->flags;
243 d->action = s->action;
244 d->field_count = s->field_count;
245 for (i = 0; i < d->field_count; i++) {
246 d->fields[i] = s->fields[i];
247 d->values[i] = s->values[i];
248 }
249 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
250 return 0;
251}
252
253int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
254{
255 u32 flags;
256 struct audit_entry *entry;
257 int err = 0;
258
259 switch (type) {
260 case AUDIT_LIST:
261 /* The *_rcu iterators are not needed here because we are
262 always called with audit_netlink_sem held. */
263 list_for_each_entry(entry, &audit_tsklist, list)
264 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
265 &entry->rule, sizeof(entry->rule));
266 list_for_each_entry(entry, &audit_entlist, list)
267 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
268 &entry->rule, sizeof(entry->rule));
269 list_for_each_entry(entry, &audit_extlist, list)
270 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
271 &entry->rule, sizeof(entry->rule));
272 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
273 break;
274 case AUDIT_ADD:
275 if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
276 return -ENOMEM;
277 if (audit_copy_rule(&entry->rule, data)) {
278 kfree(entry);
279 return -EINVAL;
280 }
281 flags = entry->rule.flags;
282 if (!err && (flags & AUDIT_PER_TASK))
283 err = audit_add_rule(entry, &audit_tsklist);
284 if (!err && (flags & AUDIT_AT_ENTRY))
285 err = audit_add_rule(entry, &audit_entlist);
286 if (!err && (flags & AUDIT_AT_EXIT))
287 err = audit_add_rule(entry, &audit_extlist);
288 break;
289 case AUDIT_DEL:
290 flags =((struct audit_rule *)data)->flags;
291 if (!err && (flags & AUDIT_PER_TASK))
292 err = audit_del_rule(data, &audit_tsklist);
293 if (!err && (flags & AUDIT_AT_ENTRY))
294 err = audit_del_rule(data, &audit_entlist);
295 if (!err && (flags & AUDIT_AT_EXIT))
296 err = audit_del_rule(data, &audit_extlist);
297 break;
298 default:
299 return -EINVAL;
300 }
301
302 return err;
303}
304#endif
305
306/* Compare a task_struct with an audit_rule. Return 1 on match, 0
307 * otherwise. */
308static int audit_filter_rules(struct task_struct *tsk,
309 struct audit_rule *rule,
310 struct audit_context *ctx,
311 enum audit_state *state)
312{
313 int i, j;
314
315 for (i = 0; i < rule->field_count; i++) {
316 u32 field = rule->fields[i] & ~AUDIT_NEGATE;
317 u32 value = rule->values[i];
318 int result = 0;
319
320 switch (field) {
321 case AUDIT_PID:
322 result = (tsk->pid == value);
323 break;
324 case AUDIT_UID:
325 result = (tsk->uid == value);
326 break;
327 case AUDIT_EUID:
328 result = (tsk->euid == value);
329 break;
330 case AUDIT_SUID:
331 result = (tsk->suid == value);
332 break;
333 case AUDIT_FSUID:
334 result = (tsk->fsuid == value);
335 break;
336 case AUDIT_GID:
337 result = (tsk->gid == value);
338 break;
339 case AUDIT_EGID:
340 result = (tsk->egid == value);
341 break;
342 case AUDIT_SGID:
343 result = (tsk->sgid == value);
344 break;
345 case AUDIT_FSGID:
346 result = (tsk->fsgid == value);
347 break;
348 case AUDIT_PERS:
349 result = (tsk->personality == value);
350 break;
351
352 case AUDIT_EXIT:
353 if (ctx && ctx->return_valid)
354 result = (ctx->return_code == value);
355 break;
356 case AUDIT_SUCCESS:
357 if (ctx && ctx->return_valid)
358 result = (ctx->return_code >= 0);
359 break;
360 case AUDIT_DEVMAJOR:
361 if (ctx) {
362 for (j = 0; j < ctx->name_count; j++) {
363 if (MAJOR(ctx->names[j].dev)==value) {
364 ++result;
365 break;
366 }
367 }
368 }
369 break;
370 case AUDIT_DEVMINOR:
371 if (ctx) {
372 for (j = 0; j < ctx->name_count; j++) {
373 if (MINOR(ctx->names[j].dev)==value) {
374 ++result;
375 break;
376 }
377 }
378 }
379 break;
380 case AUDIT_INODE:
381 if (ctx) {
382 for (j = 0; j < ctx->name_count; j++) {
383 if (ctx->names[j].ino == value) {
384 ++result;
385 break;
386 }
387 }
388 }
389 break;
390 case AUDIT_LOGINUID:
391 result = 0;
392 if (ctx)
393 result = (ctx->loginuid == value);
394 break;
395 case AUDIT_ARG0:
396 case AUDIT_ARG1:
397 case AUDIT_ARG2:
398 case AUDIT_ARG3:
399 if (ctx)
400 result = (ctx->argv[field-AUDIT_ARG0]==value);
401 break;
402 }
403
404 if (rule->fields[i] & AUDIT_NEGATE)
405 result = !result;
406 if (!result)
407 return 0;
408 }
409 switch (rule->action) {
410 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
411 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
412 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
413 }
414 return 1;
415}
416
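A rule in the scheme above is just an array of (field, value) pairs: the AUDIT_NEGATE bit in a field inverts its comparison, and the rule as a whole matches only if every pair does. The same logic, reduced to a standalone user-space sketch with made-up stand-ins (mini_rule, mini_task, F_PID, F_UID, NEGATE) rather than the kernel's audit_rule and AUDIT_* constants:

#include <stdio.h>

#define NEGATE   0x80000000u          /* stand-in for AUDIT_NEGATE */
#define F_PID    0                    /* stand-in field identifiers */
#define F_UID    1
#define MAX_FIELDS 4

struct mini_rule {
	int      field_count;
	unsigned fields[MAX_FIELDS];  /* field id, possibly ORed with NEGATE */
	unsigned values[MAX_FIELDS];
};

struct mini_task {
	unsigned pid, uid;
};

/* Return 1 if every (field, value) pair matches the task, 0 otherwise. */
static int rule_matches(const struct mini_task *tsk, const struct mini_rule *rule)
{
	int i;

	for (i = 0; i < rule->field_count; i++) {
		unsigned field = rule->fields[i] & ~NEGATE;
		unsigned value = rule->values[i];
		int result = 0;

		switch (field) {
		case F_PID: result = (tsk->pid == value); break;
		case F_UID: result = (tsk->uid == value); break;
		}
		if (rule->fields[i] & NEGATE)
			result = !result;
		if (!result)
			return 0;       /* one failed pair rejects the rule */
	}
	return 1;                       /* all pairs matched */
}

int main(void)
{
	struct mini_task tsk = { .pid = 42, .uid = 1000 };
	/* match any task with uid 1000 whose pid is NOT 1 */
	struct mini_rule rule = {
		.field_count = 2,
		.fields = { F_UID, F_PID | NEGATE },
		.values = { 1000, 1 },
	};

	printf("matches: %d\n", rule_matches(&tsk, &rule));
	return 0;
}
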
417/* At process creation time, we can determine if system-call auditing is
418 * completely disabled for this task. Since we only have the task
419 * structure at this point, we can only check uid and gid.
420 */
421static enum audit_state audit_filter_task(struct task_struct *tsk)
422{
423 struct audit_entry *e;
424 enum audit_state state;
425
426 rcu_read_lock();
427 list_for_each_entry_rcu(e, &audit_tsklist, list) {
428 if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
429 rcu_read_unlock();
430 return state;
431 }
432 }
433 rcu_read_unlock();
434 return AUDIT_BUILD_CONTEXT;
435}
436
437/* At syscall entry and exit time, this filter is called if the
438 * audit_state is not low enough that auditing cannot take place, but is
439 * also not high enough that we already know we have to write an audit
440 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
441 */
442static enum audit_state audit_filter_syscall(struct task_struct *tsk,
443 struct audit_context *ctx,
444 struct list_head *list)
445{
446 struct audit_entry *e;
447 enum audit_state state;
448 int word = AUDIT_WORD(ctx->major);
449 int bit = AUDIT_BIT(ctx->major);
450
451 rcu_read_lock();
452 list_for_each_entry_rcu(e, list, list) {
453 if ((e->rule.mask[word] & bit) == bit
454 && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
455 rcu_read_unlock();
456 return state;
457 }
458 }
459 rcu_read_unlock();
460 return AUDIT_BUILD_CONTEXT;
461}
462
463/* This should be called with task_lock() held. */
464static inline struct audit_context *audit_get_context(struct task_struct *tsk,
465 int return_valid,
466 int return_code)
467{
468 struct audit_context *context = tsk->audit_context;
469
470 if (likely(!context))
471 return NULL;
472 context->return_valid = return_valid;
473 context->return_code = return_code;
474
475 if (context->in_syscall && !context->auditable) {
476 enum audit_state state;
477 state = audit_filter_syscall(tsk, context, &audit_extlist);
478 if (state == AUDIT_RECORD_CONTEXT)
479 context->auditable = 1;
480 }
481
482 context->pid = tsk->pid;
483 context->uid = tsk->uid;
484 context->gid = tsk->gid;
485 context->euid = tsk->euid;
486 context->suid = tsk->suid;
487 context->fsuid = tsk->fsuid;
488 context->egid = tsk->egid;
489 context->sgid = tsk->sgid;
490 context->fsgid = tsk->fsgid;
491 context->personality = tsk->personality;
492 tsk->audit_context = NULL;
493 return context;
494}
495
496static inline void audit_free_names(struct audit_context *context)
497{
498 int i;
499
500#if AUDIT_DEBUG == 2
501 if (context->auditable
502 ||context->put_count + context->ino_count != context->name_count) {
503 printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
504 " name_count=%d put_count=%d"
505 " ino_count=%d [NOT freeing]\n",
506 __LINE__,
507 context->serial, context->major, context->in_syscall,
508 context->name_count, context->put_count,
509 context->ino_count);
510 for (i = 0; i < context->name_count; i++)
511 printk(KERN_ERR "names[%d] = %p = %s\n", i,
512 context->names[i].name,
513 context->names[i].name);
514 dump_stack();
515 return;
516 }
517#endif
518#if AUDIT_DEBUG
519 context->put_count = 0;
520 context->ino_count = 0;
521#endif
522
523 for (i = 0; i < context->name_count; i++)
524 if (context->names[i].name)
525 __putname(context->names[i].name);
526 context->name_count = 0;
527}
528
529static inline void audit_free_aux(struct audit_context *context)
530{
531 struct audit_aux_data *aux;
532
533 while ((aux = context->aux)) {
534 context->aux = aux->next;
535 kfree(aux);
536 }
537}
538
539static inline void audit_zero_context(struct audit_context *context,
540 enum audit_state state)
541{
542 uid_t loginuid = context->loginuid;
543
544 memset(context, 0, sizeof(*context));
545 context->state = state;
546 context->loginuid = loginuid;
547}
548
549static inline struct audit_context *audit_alloc_context(enum audit_state state)
550{
551 struct audit_context *context;
552
553 if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
554 return NULL;
555 audit_zero_context(context, state);
556 return context;
557}
558
559/* Filter on the task information and allocate a per-task audit context
560 * if necessary. Doing so turns on system call auditing for the
561 * specified task. This is called from copy_process, so no lock is
562 * needed. */
563int audit_alloc(struct task_struct *tsk)
564{
565 struct audit_context *context;
566 enum audit_state state;
567
568 if (likely(!audit_enabled))
569 return 0; /* Return if not auditing. */
570
571 state = audit_filter_task(tsk);
572 if (likely(state == AUDIT_DISABLED))
573 return 0;
574
575 if (!(context = audit_alloc_context(state))) {
576 audit_log_lost("out of memory in audit_alloc");
577 return -ENOMEM;
578 }
579
580 /* Preserve login uid */
581 context->loginuid = -1;
582 if (current->audit_context)
583 context->loginuid = current->audit_context->loginuid;
584
585 tsk->audit_context = context;
586 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
587 return 0;
588}
589
590static inline void audit_free_context(struct audit_context *context)
591{
592 struct audit_context *previous;
593 int count = 0;
594
595 do {
596 previous = context->previous;
597 if (previous || (count && count < 10)) {
598 ++count;
599 printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
600 " freeing multiple contexts (%d)\n",
601 context->serial, context->major,
602 context->name_count, count);
603 }
604 audit_free_names(context);
605 audit_free_aux(context);
606 kfree(context);
607 context = previous;
608 } while (context);
609 if (count >= 10)
610 printk(KERN_ERR "audit: freed %d contexts\n", count);
611}
612
613static void audit_log_exit(struct audit_context *context)
614{
615 int i;
616 struct audit_buffer *ab;
617
618 ab = audit_log_start(context);
619 if (!ab)
620 return; /* audit_panic has been called */
621 audit_log_format(ab, "syscall=%d", context->major);
622 if (context->personality != PER_LINUX)
623 audit_log_format(ab, " per=%lx", context->personality);
624 if (context->return_valid)
625 audit_log_format(ab, " exit=%d", context->return_code);
626 audit_log_format(ab,
627 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
628 " pid=%d loginuid=%d uid=%d gid=%d"
629 " euid=%d suid=%d fsuid=%d"
630 " egid=%d sgid=%d fsgid=%d",
631 context->argv[0],
632 context->argv[1],
633 context->argv[2],
634 context->argv[3],
635 context->name_count,
636 context->pid,
637 context->loginuid,
638 context->uid,
639 context->gid,
640 context->euid, context->suid, context->fsuid,
641 context->egid, context->sgid, context->fsgid);
642 audit_log_end(ab);
643 while (context->aux) {
644 struct audit_aux_data *aux;
645
646 ab = audit_log_start(context);
647 if (!ab)
648 continue; /* audit_panic has been called */
649
650 aux = context->aux;
651 context->aux = aux->next;
652
653 audit_log_format(ab, "auxitem=%d", aux->type);
654 switch (aux->type) {
655 case AUDIT_AUX_IPCPERM: {
656 struct audit_aux_data_ipcctl *axi = (void *)aux;
657 audit_log_format(ab,
658 " qbytes=%lx uid=%d gid=%d mode=%x",
659 axi->qbytes, axi->uid, axi->gid, axi->mode);
660 }
661 }
662 audit_log_end(ab);
663 kfree(aux);
664 }
665
666 for (i = 0; i < context->name_count; i++) {
667 ab = audit_log_start(context);
668 if (!ab)
669 continue; /* audit_panic has been called */
670 audit_log_format(ab, "item=%d", i);
671 if (context->names[i].name)
672 audit_log_format(ab, " name=%s",
673 context->names[i].name);
674 if (context->names[i].ino != (unsigned long)-1)
675 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
676 " uid=%d gid=%d rdev=%02x:%02x",
677 context->names[i].ino,
678 MAJOR(context->names[i].dev),
679 MINOR(context->names[i].dev),
680 context->names[i].mode,
681 context->names[i].uid,
682 context->names[i].gid,
683 MAJOR(context->names[i].rdev),
684 MINOR(context->names[i].rdev));
685 audit_log_end(ab);
686 }
687}
688
689/* Free a per-task audit context. Called from copy_process and
690 * __put_task_struct. */
691void audit_free(struct task_struct *tsk)
692{
693 struct audit_context *context;
694
695 task_lock(tsk);
696 context = audit_get_context(tsk, 0, 0);
697 task_unlock(tsk);
698
699 if (likely(!context))
700 return;
701
702 /* Check for system calls that do not go through the exit
703 * function (e.g., exit_group), then free context block. */
704 if (context->in_syscall && context->auditable)
705 audit_log_exit(context);
706
707 audit_free_context(context);
708}
709
710/* Compute a serial number for the audit record. Audit records are
711 * written to user-space as soon as they are generated, so a complete
712 * audit record may be written in several pieces. The timestamp of the
713 * record and this serial number are used by the user-space daemon to
714 * determine which pieces belong to the same audit record. The
715 * (timestamp,serial) tuple is unique for each syscall and is live from
716 * syscall entry to syscall exit.
717 *
718 * Atomic values are only guaranteed to be 24-bit, so we count down.
719 *
720 * NOTE: Another possibility is to store the formatted records off the
721 * audit context (for those records that have a context), and emit them
722 * all at syscall exit. However, this could delay the reporting of
723 * significant errors until syscall exit (or never, if the system
724 * halts). */
725static inline unsigned int audit_serial(void)
726{
727 static atomic_t serial = ATOMIC_INIT(0xffffff);
728 unsigned int a, b;
729
730 do {
731 a = atomic_read(&serial);
732 if (atomic_dec_and_test(&serial))
733 atomic_set(&serial, 0xffffff);
734 b = atomic_read(&serial);
735 } while (b != a - 1);
736
737 return 0xffffff - b;
738}
739
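As the comment notes, atomic_t was only guaranteed to hold 24 bits on every supported architecture, so the counter starts at 0xffffff, counts down, and the externally visible serial is 0xffffff minus the current value, wrapping when it reaches zero. A user-space sketch of the same trick with C11 atomics (next_serial is an illustrative name, not kernel API):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint serial = 0xffffff;   /* counts down, like the kernel version */

/* Returns an increasing serial in the range 0..0xffffff, wrapping around. */
static unsigned int next_serial(void)
{
	unsigned int a, b;

	do {
		a = atomic_load(&serial);
		/* atomic_dec_and_test() equivalent: decrement, reset on zero */
		if (atomic_fetch_sub(&serial, 1) == 1)
			atomic_store(&serial, 0xffffff);
		b = atomic_load(&serial);
	} while (b != a - 1);           /* retry if another thread raced us */

	return 0xffffff - b;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("serial %u\n", next_serial());
	return 0;
}
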
740/* Fill in audit context at syscall entry. This only happens if the
741 * audit context was created when the task was created and the state or
742 * filters demand the audit context be built. If the state from the
743 * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
744 * then the record will be written at syscall exit time (otherwise, it
745 * will only be written if another part of the kernel requests that it
746 * be written). */
747void audit_syscall_entry(struct task_struct *tsk, int major,
748 unsigned long a1, unsigned long a2,
749 unsigned long a3, unsigned long a4)
750{
751 struct audit_context *context = tsk->audit_context;
752 enum audit_state state;
753
754 BUG_ON(!context);
755
756 /* This happens only on certain architectures that make system
757 * calls in kernel_thread via the entry.S interface, instead of
758 * with direct calls. (If you are porting to a new
759 * architecture, hitting this condition can indicate that you
760 * got the _exit/_leave calls backward in entry.S.)
761 *
762 * i386 no
763 * x86_64 no
764 * ppc64 yes (see arch/ppc64/kernel/misc.S)
765 *
766 * This also happens with vm86 emulation in a non-nested manner
767 * (entries without exits), so this case must be caught.
768 */
769 if (context->in_syscall) {
770 struct audit_context *newctx;
771
772#if defined(__NR_vm86) && defined(__NR_vm86old)
773 /* vm86 mode should only be entered once */
774 if (major == __NR_vm86 || major == __NR_vm86old)
775 return;
776#endif
777#if AUDIT_DEBUG
778 printk(KERN_ERR
779 "audit(:%d) pid=%d in syscall=%d;"
780 " entering syscall=%d\n",
781 context->serial, tsk->pid, context->major, major);
782#endif
783 newctx = audit_alloc_context(context->state);
784 if (newctx) {
785 newctx->previous = context;
786 context = newctx;
787 tsk->audit_context = newctx;
788 } else {
789 /* If we can't alloc a new context, the best we
790 * can do is to leak memory (any pending putname
791 * will be lost). The only other alternative is
792 * to abandon auditing. */
793 audit_zero_context(context, context->state);
794 }
795 }
796 BUG_ON(context->in_syscall || context->name_count);
797
798 if (!audit_enabled)
799 return;
800
801 context->major = major;
802 context->argv[0] = a1;
803 context->argv[1] = a2;
804 context->argv[2] = a3;
805 context->argv[3] = a4;
806
807 state = context->state;
808 if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
809 state = audit_filter_syscall(tsk, context, &audit_entlist);
810 if (likely(state == AUDIT_DISABLED))
811 return;
812
813 context->serial = audit_serial();
814 context->ctime = CURRENT_TIME;
815 context->in_syscall = 1;
816 context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
817}
818
819/* Tear down after system call. If the audit context has been marked as
820 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
821 * filtering, or because some other part of the kernel wrote an audit
822 * message), then write out the syscall information. In all cases,
823 * free the names stored from getname(). */
824void audit_syscall_exit(struct task_struct *tsk, int return_code)
825{
826 struct audit_context *context;
827
828 get_task_struct(tsk);
829 task_lock(tsk);
830 context = audit_get_context(tsk, 1, return_code);
831 task_unlock(tsk);
832
833 /* Not having a context here is ok, since the parent may have
834 * called __put_task_struct. */
835 if (likely(!context))
836 return;
837
838 if (context->in_syscall && context->auditable)
839 audit_log_exit(context);
840
841 context->in_syscall = 0;
842 context->auditable = 0;
843 if (context->previous) {
844 struct audit_context *new_context = context->previous;
845 context->previous = NULL;
846 audit_free_context(context);
847 tsk->audit_context = new_context;
848 } else {
849 audit_free_names(context);
850 audit_free_aux(context);
851 audit_zero_context(context, context->state);
852 tsk->audit_context = context;
853 }
854 put_task_struct(tsk);
855}
856
857/* Add a name to the list. Called from fs/namei.c:getname(). */
858void audit_getname(const char *name)
859{
860 struct audit_context *context = current->audit_context;
861
862 if (!context || IS_ERR(name) || !name)
863 return;
864
865 if (!context->in_syscall) {
866#if AUDIT_DEBUG == 2
867 printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
868 __FILE__, __LINE__, context->serial, name);
869 dump_stack();
870#endif
871 return;
872 }
873 BUG_ON(context->name_count >= AUDIT_NAMES);
874 context->names[context->name_count].name = name;
875 context->names[context->name_count].ino = (unsigned long)-1;
876 ++context->name_count;
877}
878
879/* Intercept a putname request. Called from
880 * include/linux/fs.h:putname(). If we have stored the name from
881 * getname in the audit context, then we delay the putname until syscall
882 * exit. */
883void audit_putname(const char *name)
884{
885 struct audit_context *context = current->audit_context;
886
887 BUG_ON(!context);
888 if (!context->in_syscall) {
889#if AUDIT_DEBUG == 2
890 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
891 __FILE__, __LINE__, context->serial, name);
892 if (context->name_count) {
893 int i;
894 for (i = 0; i < context->name_count; i++)
895 printk(KERN_ERR "name[%d] = %p = %s\n", i,
896 context->names[i].name,
897 context->names[i].name);
898 }
899#endif
900 __putname(name);
901 }
902#if AUDIT_DEBUG
903 else {
904 ++context->put_count;
905 if (context->put_count > context->name_count) {
906 printk(KERN_ERR "%s:%d(:%d): major=%d"
907 " in_syscall=%d putname(%p) name_count=%d"
908 " put_count=%d\n",
909 __FILE__, __LINE__,
910 context->serial, context->major,
911 context->in_syscall, name, context->name_count,
912 context->put_count);
913 dump_stack();
914 }
915 }
916#endif
917}
918
919/* Store the inode and device from a lookup. Called from
920 * fs/namei.c:path_lookup(). */
921void audit_inode(const char *name, const struct inode *inode)
922{
923 int idx;
924 struct audit_context *context = current->audit_context;
925
926 if (!context->in_syscall)
927 return;
928 if (context->name_count
929 && context->names[context->name_count-1].name
930 && context->names[context->name_count-1].name == name)
931 idx = context->name_count - 1;
932 else if (context->name_count > 1
933 && context->names[context->name_count-2].name
934 && context->names[context->name_count-2].name == name)
935 idx = context->name_count - 2;
936 else {
937 /* FIXME: how much do we care about inodes that have no
938 * associated name? */
939 if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED)
940 return;
941 idx = context->name_count++;
942 context->names[idx].name = NULL;
943#if AUDIT_DEBUG
944 ++context->ino_count;
945#endif
946 }
947 context->names[idx].ino = inode->i_ino;
948 context->names[idx].dev = inode->i_sb->s_dev;
949 context->names[idx].mode = inode->i_mode;
950 context->names[idx].uid = inode->i_uid;
951 context->names[idx].gid = inode->i_gid;
952 context->names[idx].rdev = inode->i_rdev;
953}
954
955void audit_get_stamp(struct audit_context *ctx,
956 struct timespec *t, int *serial)
957{
958 if (ctx) {
959 t->tv_sec = ctx->ctime.tv_sec;
960 t->tv_nsec = ctx->ctime.tv_nsec;
961 *serial = ctx->serial;
962 ctx->auditable = 1;
963 } else {
964 *t = CURRENT_TIME;
965 *serial = 0;
966 }
967}
968
969extern int audit_set_type(struct audit_buffer *ab, int type);
970
971int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid)
972{
973 if (ctx) {
974 struct audit_buffer *ab;
975
976 ab = audit_log_start(NULL);
977 if (ab) {
978 audit_log_format(ab, "login pid=%d uid=%u "
979 "old loginuid=%u new loginuid=%u",
980 ctx->pid, ctx->uid, ctx->loginuid, loginuid);
981 audit_set_type(ab, AUDIT_LOGIN);
982 audit_log_end(ab);
983 }
984 ctx->loginuid = loginuid;
985 }
986 return 0;
987}
988
989uid_t audit_get_loginuid(struct audit_context *ctx)
990{
991 return ctx ? ctx->loginuid : -1;
992}
993
994int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
995{
996 struct audit_aux_data_ipcctl *ax;
997 struct audit_context *context = current->audit_context;
998
999 if (likely(!context))
1000 return 0;
1001
1002 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
1003 if (!ax)
1004 return -ENOMEM;
1005
1006 ax->qbytes = qbytes;
1007 ax->uid = uid;
1008 ax->gid = gid;
1009 ax->mode = mode;
1010
1011 ax->d.type = AUDIT_AUX_IPCPERM;
1012 ax->d.next = context->aux;
1013 context->aux = (void *)ax;
1014 return 0;
1015}
diff --git a/kernel/capability.c b/kernel/capability.c
new file mode 100644
index 000000000000..64db1ee820c2
--- /dev/null
+++ b/kernel/capability.c
@@ -0,0 +1,220 @@
1/*
2 * linux/kernel/capability.c
3 *
4 * Copyright (C) 1997 Andrew Main <zefram@fysh.org>
5 *
6 * Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com>
7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
8 */
9
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <linux/security.h>
13#include <linux/syscalls.h>
14#include <asm/uaccess.h>
15
16unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
17kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
18
19EXPORT_SYMBOL(securebits);
20EXPORT_SYMBOL(cap_bset);
21
22/*
23 * This lock protects task->cap_* for all tasks including current.
24 * Locking rule: acquire this prior to tasklist_lock.
25 */
26static DEFINE_SPINLOCK(task_capability_lock);
27
28/*
29 * For sys_getproccap() and sys_setproccap(), any of the three
30 * capability set pointers may be NULL -- indicating that that set is
31 * uninteresting and/or not to be changed.
32 */
33
34/*
35 * sys_capget - get the capabilities of a given process.
36 */
37asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
38{
39 int ret = 0;
40 pid_t pid;
41 __u32 version;
42 task_t *target;
43 struct __user_cap_data_struct data;
44
45 if (get_user(version, &header->version))
46 return -EFAULT;
47
48 if (version != _LINUX_CAPABILITY_VERSION) {
49 if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
50 return -EFAULT;
51 return -EINVAL;
52 }
53
54 if (get_user(pid, &header->pid))
55 return -EFAULT;
56
57 if (pid < 0)
58 return -EINVAL;
59
60 spin_lock(&task_capability_lock);
61 read_lock(&tasklist_lock);
62
63 if (pid && pid != current->pid) {
64 target = find_task_by_pid(pid);
65 if (!target) {
66 ret = -ESRCH;
67 goto out;
68 }
69 } else
70 target = current;
71
72 ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
73
74out:
75 read_unlock(&tasklist_lock);
76 spin_unlock(&task_capability_lock);
77
78 if (!ret && copy_to_user(dataptr, &data, sizeof data))
79 return -EFAULT;
80
81 return ret;
82}
83
84/*
85 * cap_set_pg - set capabilities for all processes in a given process
86 * group. We call this holding task_capability_lock and tasklist_lock.
87 */
88static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
89 kernel_cap_t *inheritable,
90 kernel_cap_t *permitted)
91{
92 task_t *g, *target;
93 int ret = -EPERM;
94 int found = 0;
95
96 do_each_task_pid(pgrp, PIDTYPE_PGID, g) {
97 target = g;
98 while_each_thread(g, target) {
99 if (!security_capset_check(target, effective,
100 inheritable,
101 permitted)) {
102 security_capset_set(target, effective,
103 inheritable,
104 permitted);
105 ret = 0;
106 }
107 found = 1;
108 }
109 } while_each_task_pid(pgrp, PIDTYPE_PGID, g);
110
111 if (!found)
112 ret = 0;
113 return ret;
114}
115
116/*
117 * cap_set_all - set capabilities for all processes other than init
118 * and self. We call this holding task_capability_lock and tasklist_lock.
119 */
120static inline int cap_set_all(kernel_cap_t *effective,
121 kernel_cap_t *inheritable,
122 kernel_cap_t *permitted)
123{
124 task_t *g, *target;
125 int ret = -EPERM;
126 int found = 0;
127
128 do_each_thread(g, target) {
129 if (target == current || target->pid == 1)
130 continue;
131 found = 1;
132 if (security_capset_check(target, effective, inheritable,
133 permitted))
134 continue;
135 ret = 0;
136 security_capset_set(target, effective, inheritable, permitted);
137 } while_each_thread(g, target);
138
139 if (!found)
140 ret = 0;
141 return ret;
142}
143
144/*
145 * sys_capset - set capabilities for a given process, all processes, or all
146 * processes in a given process group.
147 *
148 * The restrictions on setting capabilities are specified as:
149 *
150 * [pid is for the 'target' task. 'current' is the calling task.]
151 *
152 * I: any raised capabilities must be a subset of the (old current) permitted
153 * P: any raised capabilities must be a subset of the (old current) permitted
154 * E: must be set to a subset of (new target) permitted
155 */
156asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
157{
158 kernel_cap_t inheritable, permitted, effective;
159 __u32 version;
160 task_t *target;
161 int ret;
162 pid_t pid;
163
164 if (get_user(version, &header->version))
165 return -EFAULT;
166
167 if (version != _LINUX_CAPABILITY_VERSION) {
168 if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
169 return -EFAULT;
170 return -EINVAL;
171 }
172
173 if (get_user(pid, &header->pid))
174 return -EFAULT;
175
176 if (pid && pid != current->pid && !capable(CAP_SETPCAP))
177 return -EPERM;
178
179 if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
180 copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
181 copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
182 return -EFAULT;
183
184 spin_lock(&task_capability_lock);
185 read_lock(&tasklist_lock);
186
187 if (pid > 0 && pid != current->pid) {
188 target = find_task_by_pid(pid);
189 if (!target) {
190 ret = -ESRCH;
191 goto out;
192 }
193 } else
194 target = current;
195
196 ret = 0;
197
198 /* having verified that the proposed changes are legal,
199 we now put them into effect. */
200 if (pid < 0) {
201 if (pid == -1) /* all procs other than current and init */
202 ret = cap_set_all(&effective, &inheritable, &permitted);
203
204 else /* all procs in process group */
205 ret = cap_set_pg(-pid, &effective, &inheritable,
206 &permitted);
207 } else {
208 ret = security_capset_check(target, &effective, &inheritable,
209 &permitted);
210 if (!ret)
211 security_capset_set(target, &effective, &inheritable,
212 &permitted);
213 }
214
215out:
216 read_unlock(&tasklist_lock);
217 spin_unlock(&task_capability_lock);
218
219 return ret;
220}
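sys_capget()/sys_capset() above back the capget(2)/capset(2) syscalls (libcap's cap_get_proc() and friends sit on top of them). A hedged user-space sketch of reading the current process's capability sets, with the v1 ABI structures declared locally (cap_header, cap_data and CAP_VERSION_1 are local stand-ins, not header names) instead of depending on a particular <linux/capability.h>:

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Minimal local copies of the capget ABI structures (see the kernel's
 * include/linux/capability.h); layout assumed to match the v1 ABI. */
struct cap_header { uint32_t version; int pid; };
struct cap_data   { uint32_t effective, permitted, inheritable; };

#define CAP_VERSION_1 0x19980330      /* value of _LINUX_CAPABILITY_VERSION in this tree */

int main(void)
{
	struct cap_header hdr = { .version = CAP_VERSION_1, .pid = 0 }; /* 0 = current */
	struct cap_data   data;

	if (syscall(SYS_capget, &hdr, &data) < 0) {
		perror("capget");
		return 1;
	}
	printf("effective=%#x permitted=%#x inheritable=%#x\n",
	       data.effective, data.permitted, data.inheritable);
	return 0;
}
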
diff --git a/kernel/compat.c b/kernel/compat.c
new file mode 100644
index 000000000000..dad10656bf14
--- /dev/null
+++ b/kernel/compat.c
@@ -0,0 +1,860 @@
1/*
2 * linux/kernel/compat.c
3 *
4 * Kernel compatibility routines for e.g. 32 bit syscall support
5 * on 64 bit kernels.
6 *
7 * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/linkage.h>
15#include <linux/compat.h>
16#include <linux/errno.h>
17#include <linux/time.h>
18#include <linux/signal.h>
19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
20#include <linux/futex.h> /* for FUTEX_WAIT */
21#include <linux/syscalls.h>
22#include <linux/unistd.h>
23#include <linux/security.h>
24
25#include <asm/uaccess.h>
26#include <asm/bug.h>
27
28int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
29{
30 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
31 __get_user(ts->tv_sec, &cts->tv_sec) ||
32 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
33}
34
35int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
36{
37 return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
38 __put_user(ts->tv_sec, &cts->tv_sec) ||
39 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
40}
41
42static long compat_nanosleep_restart(struct restart_block *restart)
43{
44 unsigned long expire = restart->arg0, now = jiffies;
45 struct compat_timespec __user *rmtp;
46
47 /* Did it expire while we handled signals? */
48 if (!time_after(expire, now))
49 return 0;
50
51 current->state = TASK_INTERRUPTIBLE;
52 expire = schedule_timeout(expire - now);
53 if (expire == 0)
54 return 0;
55
56 rmtp = (struct compat_timespec __user *)restart->arg1;
57 if (rmtp) {
58 struct compat_timespec ct;
59 struct timespec t;
60
61 jiffies_to_timespec(expire, &t);
62 ct.tv_sec = t.tv_sec;
63 ct.tv_nsec = t.tv_nsec;
64 if (copy_to_user(rmtp, &ct, sizeof(ct)))
65 return -EFAULT;
66 }
67 /* The 'restart' block is already filled in */
68 return -ERESTART_RESTARTBLOCK;
69}
70
71asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
72 struct compat_timespec __user *rmtp)
73{
74 struct timespec t;
75 struct restart_block *restart;
76 unsigned long expire;
77
78 if (get_compat_timespec(&t, rqtp))
79 return -EFAULT;
80
81 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
82 return -EINVAL;
83
84 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
85 current->state = TASK_INTERRUPTIBLE;
86 expire = schedule_timeout(expire);
87 if (expire == 0)
88 return 0;
89
90 if (rmtp) {
91 jiffies_to_timespec(expire, &t);
92 if (put_compat_timespec(&t, rmtp))
93 return -EFAULT;
94 }
95 restart = &current_thread_info()->restart_block;
96 restart->fn = compat_nanosleep_restart;
97 restart->arg0 = jiffies + expire;
98 restart->arg1 = (unsigned long) rmtp;
99 return -ERESTART_RESTARTBLOCK;
100}
101
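compat_sys_nanosleep() preserves the ordinary nanosleep(2) contract: if a signal interrupts the sleep, the remaining time is written back through rmtp so the sleep can be resumed. A short native illustration of that rqtp/rmtp pair (no compat structures involved):

#include <errno.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec req = { .tv_sec = 2, .tv_nsec = 0 };
	struct timespec rem = { 0 };

	/* If a signal arrives mid-sleep, nanosleep() returns -1/EINTR and
	 * fills 'rem' with the time that was still left to sleep. */
	if (nanosleep(&req, &rem) == -1 && errno == EINTR)
		printf("interrupted, %ld.%09ld s remaining\n",
		       (long)rem.tv_sec, rem.tv_nsec);
	else
		printf("slept the full interval\n");
	return 0;
}
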
102static inline long get_compat_itimerval(struct itimerval *o,
103 struct compat_itimerval __user *i)
104{
105 return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
106 (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
107 __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
108 __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
109 __get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
110}
111
112static inline long put_compat_itimerval(struct compat_itimerval __user *o,
113 struct itimerval *i)
114{
115 return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
116 (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
117 __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
118 __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
119 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
120}
121
122asmlinkage long compat_sys_getitimer(int which,
123 struct compat_itimerval __user *it)
124{
125 struct itimerval kit;
126 int error;
127
128 error = do_getitimer(which, &kit);
129 if (!error && put_compat_itimerval(it, &kit))
130 error = -EFAULT;
131 return error;
132}
133
134asmlinkage long compat_sys_setitimer(int which,
135 struct compat_itimerval __user *in,
136 struct compat_itimerval __user *out)
137{
138 struct itimerval kin, kout;
139 int error;
140
141 if (in) {
142 if (get_compat_itimerval(&kin, in))
143 return -EFAULT;
144 } else
145 memset(&kin, 0, sizeof(kin));
146
147 error = do_setitimer(which, &kin, out ? &kout : NULL);
148 if (error || !out)
149 return error;
150 if (put_compat_itimerval(out, &kout))
151 return -EFAULT;
152 return 0;
153}
154
155asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
156{
157 /*
158 * In the SMP world we might just be unlucky and have one of
159 * the times increment as we use it. Since the value is an
160 * atomically safe type this is just fine. Conceptually it's
161 * as if the syscall took an instant longer to occur.
162 */
163 if (tbuf) {
164 struct compat_tms tmp;
165 struct task_struct *tsk = current;
166 struct task_struct *t;
167 cputime_t utime, stime, cutime, cstime;
168
169 read_lock(&tasklist_lock);
170 utime = tsk->signal->utime;
171 stime = tsk->signal->stime;
172 t = tsk;
173 do {
174 utime = cputime_add(utime, t->utime);
175 stime = cputime_add(stime, t->stime);
176 t = next_thread(t);
177 } while (t != tsk);
178
179 /*
180 * While we have tasklist_lock read-locked, no dying thread
181 * can be updating current->signal->[us]time. Instead,
182 * we got their counts included in the live thread loop.
183 * However, another thread can come in right now and
184 * do a wait call that updates current->signal->c[us]time.
185 * To make sure we always see that pair updated atomically,
186 * we take the siglock around fetching them.
187 */
188 spin_lock_irq(&tsk->sighand->siglock);
189 cutime = tsk->signal->cutime;
190 cstime = tsk->signal->cstime;
191 spin_unlock_irq(&tsk->sighand->siglock);
192 read_unlock(&tasklist_lock);
193
194 tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
195 tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
196 tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
197 tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
198 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
199 return -EFAULT;
200 }
201 return compat_jiffies_to_clock_t(jiffies);
202}
203
204/*
205 * Assumption: old_sigset_t and compat_old_sigset_t are both
206 * types that can be passed to put_user()/get_user().
207 */
208
209asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
210{
211 old_sigset_t s;
212 long ret;
213 mm_segment_t old_fs = get_fs();
214
215 set_fs(KERNEL_DS);
216 ret = sys_sigpending((old_sigset_t __user *) &s);
217 set_fs(old_fs);
218 if (ret == 0)
219 ret = put_user(s, set);
220 return ret;
221}
222
223asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
224 compat_old_sigset_t __user *oset)
225{
226 old_sigset_t s;
227 long ret;
228 mm_segment_t old_fs;
229
230 if (set && get_user(s, set))
231 return -EFAULT;
232 old_fs = get_fs();
233 set_fs(KERNEL_DS);
234 ret = sys_sigprocmask(how,
235 set ? (old_sigset_t __user *) &s : NULL,
236 oset ? (old_sigset_t __user *) &s : NULL);
237 set_fs(old_fs);
238 if (ret == 0)
239 if (oset)
240 ret = put_user(s, oset);
241 return ret;
242}
243
244#ifdef CONFIG_FUTEX
245asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
246 struct compat_timespec __user *utime, u32 __user *uaddr2,
247 int val3)
248{
249 struct timespec t;
250 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
251 int val2 = 0;
252
253 if ((op == FUTEX_WAIT) && utime) {
254 if (get_compat_timespec(&t, utime))
255 return -EFAULT;
256 timeout = timespec_to_jiffies(&t) + 1;
257 }
258 if (op >= FUTEX_REQUEUE)
259 val2 = (int) (unsigned long) utime;
260
261 return do_futex((unsigned long)uaddr, op, val, timeout,
262 (unsigned long)uaddr2, val2, val3);
263}
264#endif
265
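compat_sys_futex() only converts the 32-bit timespec into a jiffies timeout before handing everything to do_futex(); the interface it forwards is the ordinary futex(2) call. A minimal native sketch of the FUTEX_WAIT path that the conversion feeds, issued as a raw syscall since glibc provides no futex() wrapper:

#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	int futex_word = 0;
	struct timespec timeout = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };

	/* FUTEX_WAIT: sleep while *uaddr == val (here 0), for at most 100 ms.
	 * Nobody wakes us, so the call should time out with ETIMEDOUT. */
	long ret = syscall(SYS_futex, &futex_word, FUTEX_WAIT, 0,
			   &timeout, NULL, 0);
	if (ret == -1 && errno == ETIMEDOUT)
		printf("futex wait timed out as expected\n");
	else
		printf("futex returned %ld (errno %d)\n", ret, errno);
	return 0;
}
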
266asmlinkage long compat_sys_setrlimit(unsigned int resource,
267 struct compat_rlimit __user *rlim)
268{
269 struct rlimit r;
270 int ret;
271 mm_segment_t old_fs = get_fs ();
272
273 if (resource >= RLIM_NLIMITS)
274 return -EINVAL;
275
276 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
277 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
278 __get_user(r.rlim_max, &rlim->rlim_max))
279 return -EFAULT;
280
281 if (r.rlim_cur == COMPAT_RLIM_INFINITY)
282 r.rlim_cur = RLIM_INFINITY;
283 if (r.rlim_max == COMPAT_RLIM_INFINITY)
284 r.rlim_max = RLIM_INFINITY;
285 set_fs(KERNEL_DS);
286 ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
287 set_fs(old_fs);
288 return ret;
289}
290
291#ifdef COMPAT_RLIM_OLD_INFINITY
292
293asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
294 struct compat_rlimit __user *rlim)
295{
296 struct rlimit r;
297 int ret;
298 mm_segment_t old_fs = get_fs();
299
300 set_fs(KERNEL_DS);
301 ret = sys_old_getrlimit(resource, &r);
302 set_fs(old_fs);
303
304 if (!ret) {
305 if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
306 r.rlim_cur = COMPAT_RLIM_INFINITY;
307 if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
308 r.rlim_max = COMPAT_RLIM_INFINITY;
309
310 if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
311 __put_user(r.rlim_cur, &rlim->rlim_cur) ||
312 __put_user(r.rlim_max, &rlim->rlim_max))
313 return -EFAULT;
314 }
315 return ret;
316}
317
318#endif
319
320asmlinkage long compat_sys_getrlimit (unsigned int resource,
321 struct compat_rlimit __user *rlim)
322{
323 struct rlimit r;
324 int ret;
325 mm_segment_t old_fs = get_fs();
326
327 set_fs(KERNEL_DS);
328 ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
329 set_fs(old_fs);
330 if (!ret) {
331 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
332 r.rlim_cur = COMPAT_RLIM_INFINITY;
333 if (r.rlim_max > COMPAT_RLIM_INFINITY)
334 r.rlim_max = COMPAT_RLIM_INFINITY;
335
336 if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
337 __put_user(r.rlim_cur, &rlim->rlim_cur) ||
338 __put_user(r.rlim_max, &rlim->rlim_max))
339 return -EFAULT;
340 }
341 return ret;
342}
343
344int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
345{
346 if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
347 __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
348 __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
349 __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
350 __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
351 __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
352 __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
353 __put_user(r->ru_idrss, &ru->ru_idrss) ||
354 __put_user(r->ru_isrss, &ru->ru_isrss) ||
355 __put_user(r->ru_minflt, &ru->ru_minflt) ||
356 __put_user(r->ru_majflt, &ru->ru_majflt) ||
357 __put_user(r->ru_nswap, &ru->ru_nswap) ||
358 __put_user(r->ru_inblock, &ru->ru_inblock) ||
359 __put_user(r->ru_oublock, &ru->ru_oublock) ||
360 __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
361 __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
362 __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
363 __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
364 __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
365 return -EFAULT;
366 return 0;
367}
368
369asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
370{
371 struct rusage r;
372 int ret;
373 mm_segment_t old_fs = get_fs();
374
375 set_fs(KERNEL_DS);
376 ret = sys_getrusage(who, (struct rusage __user *) &r);
377 set_fs(old_fs);
378
379 if (ret)
380 return ret;
381
382 if (put_compat_rusage(&r, ru))
383 return -EFAULT;
384
385 return 0;
386}
387
388asmlinkage long
389compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
390 struct compat_rusage __user *ru)
391{
392 if (!ru) {
393 return sys_wait4(pid, stat_addr, options, NULL);
394 } else {
395 struct rusage r;
396 int ret;
397 unsigned int status;
398 mm_segment_t old_fs = get_fs();
399
400 set_fs (KERNEL_DS);
401 ret = sys_wait4(pid,
402 (stat_addr ?
403 (unsigned int __user *) &status : NULL),
404 options, (struct rusage __user *) &r);
405 set_fs (old_fs);
406
407 if (ret > 0) {
408 if (put_compat_rusage(&r, ru))
409 return -EFAULT;
410 if (stat_addr && put_user(status, stat_addr))
411 return -EFAULT;
412 }
413 return ret;
414 }
415}
416
417asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
418 struct compat_siginfo __user *uinfo, int options,
419 struct compat_rusage __user *uru)
420{
421 siginfo_t info;
422 struct rusage ru;
423 long ret;
424 mm_segment_t old_fs = get_fs();
425
426 memset(&info, 0, sizeof(info));
427
428 set_fs(KERNEL_DS);
429 ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
430 uru ? (struct rusage __user *)&ru : NULL);
431 set_fs(old_fs);
432
433 if ((ret < 0) || (info.si_signo == 0))
434 return ret;
435
436 if (uru) {
437 ret = put_compat_rusage(&ru, uru);
438 if (ret)
439 return ret;
440 }
441
442 BUG_ON(info.si_code & __SI_MASK);
443 info.si_code |= __SI_CHLD;
444 return copy_siginfo_to_user32(uinfo, &info);
445}
446
447static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
448 unsigned len, cpumask_t *new_mask)
449{
450 unsigned long *k;
451
452 if (len < sizeof(cpumask_t))
453 memset(new_mask, 0, sizeof(cpumask_t));
454 else if (len > sizeof(cpumask_t))
455 len = sizeof(cpumask_t);
456
457 k = cpus_addr(*new_mask);
458 return compat_get_bitmap(k, user_mask_ptr, len * 8);
459}
460
461asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
462 unsigned int len,
463 compat_ulong_t __user *user_mask_ptr)
464{
465 cpumask_t new_mask;
466 int retval;
467
468 retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask);
469 if (retval)
470 return retval;
471
472 return sched_setaffinity(pid, new_mask);
473}
474
475asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
476 compat_ulong_t __user *user_mask_ptr)
477{
478 int ret;
479 cpumask_t mask;
480 unsigned long *k;
481 unsigned int min_length = sizeof(cpumask_t);
482
483 if (NR_CPUS <= BITS_PER_COMPAT_LONG)
484 min_length = sizeof(compat_ulong_t);
485
486 if (len < min_length)
487 return -EINVAL;
488
489 ret = sched_getaffinity(pid, &mask);
490 if (ret < 0)
491 return ret;
492
493 k = cpus_addr(mask);
494 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
495 if (ret)
496 return ret;
497
498 return min_length;
499}
500
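The two wrappers above just repack the user-supplied CPU bitmap into a cpumask_t and forward to sched_setaffinity()/sched_getaffinity(); from user space this is the regular affinity API. A short native example pinning the calling process to CPU 0 using glibc's cpu_set_t macros (not the compat path itself):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);                       /* allow CPU 0 only */
	if (sched_setaffinity(0, sizeof(set), &set) == -1) {   /* pid 0 = this process */
		perror("sched_setaffinity");
		return 1;
	}

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("running on CPU 0 only: %s\n",
	       CPU_ISSET(0, &set) && CPU_COUNT(&set) == 1 ? "yes" : "no");
	return 0;
}
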
501static int get_compat_itimerspec(struct itimerspec *dst,
502 struct compat_itimerspec __user *src)
503{
504 if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
505 get_compat_timespec(&dst->it_value, &src->it_value))
506 return -EFAULT;
507 return 0;
508}
509
510static int put_compat_itimerspec(struct compat_itimerspec __user *dst,
511 struct itimerspec *src)
512{
513 if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
514 put_compat_timespec(&src->it_value, &dst->it_value))
515 return -EFAULT;
516 return 0;
517}
518
519long compat_sys_timer_settime(timer_t timer_id, int flags,
520 struct compat_itimerspec __user *new,
521 struct compat_itimerspec __user *old)
522{
523 long err;
524 mm_segment_t oldfs;
525 struct itimerspec newts, oldts;
526
527 if (!new)
528 return -EINVAL;
529 if (get_compat_itimerspec(&newts, new))
530 return -EFAULT;
531 oldfs = get_fs();
532 set_fs(KERNEL_DS);
533 err = sys_timer_settime(timer_id, flags,
534 (struct itimerspec __user *) &newts,
535 (struct itimerspec __user *) &oldts);
536 set_fs(oldfs);
537 if (!err && old && put_compat_itimerspec(old, &oldts))
538 return -EFAULT;
539 return err;
540}
541
542long compat_sys_timer_gettime(timer_t timer_id,
543 struct compat_itimerspec __user *setting)
544{
545 long err;
546 mm_segment_t oldfs;
547 struct itimerspec ts;
548
549 oldfs = get_fs();
550 set_fs(KERNEL_DS);
551 err = sys_timer_gettime(timer_id,
552 (struct itimerspec __user *) &ts);
553 set_fs(oldfs);
554 if (!err && put_compat_itimerspec(setting, &ts))
555 return -EFAULT;
556 return err;
557}
558
559long compat_sys_clock_settime(clockid_t which_clock,
560 struct compat_timespec __user *tp)
561{
562 long err;
563 mm_segment_t oldfs;
564 struct timespec ts;
565
566 if (get_compat_timespec(&ts, tp))
567 return -EFAULT;
568 oldfs = get_fs();
569 set_fs(KERNEL_DS);
570 err = sys_clock_settime(which_clock,
571 (struct timespec __user *) &ts);
572 set_fs(oldfs);
573 return err;
574}
575
576long compat_sys_clock_gettime(clockid_t which_clock,
577 struct compat_timespec __user *tp)
578{
579 long err;
580 mm_segment_t oldfs;
581 struct timespec ts;
582
583 oldfs = get_fs();
584 set_fs(KERNEL_DS);
585 err = sys_clock_gettime(which_clock,
586 (struct timespec __user *) &ts);
587 set_fs(oldfs);
588 if (!err && put_compat_timespec(&ts, tp))
589 return -EFAULT;
590 return err;
591}
592
593long compat_sys_clock_getres(clockid_t which_clock,
594 struct compat_timespec __user *tp)
595{
596 long err;
597 mm_segment_t oldfs;
598 struct timespec ts;
599
600 oldfs = get_fs();
601 set_fs(KERNEL_DS);
602 err = sys_clock_getres(which_clock,
603 (struct timespec __user *) &ts);
604 set_fs(oldfs);
605 if (!err && tp && put_compat_timespec(&ts, tp))
606 return -EFAULT;
607 return err;
608}
609
610long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
611 struct compat_timespec __user *rqtp,
612 struct compat_timespec __user *rmtp)
613{
614 long err;
615 mm_segment_t oldfs;
616 struct timespec in, out;
617
618 if (get_compat_timespec(&in, rqtp))
619 return -EFAULT;
620
621 oldfs = get_fs();
622 set_fs(KERNEL_DS);
623 err = sys_clock_nanosleep(which_clock, flags,
624 (struct timespec __user *) &in,
625 (struct timespec __user *) &out);
626 set_fs(oldfs);
627 if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
628 put_compat_timespec(&out, rmtp))
629 return -EFAULT;
630 return err;
631}
632
633/*
634 * We currently only need the following fields from the sigevent
635 * structure: sigev_value, sigev_signo, sigev_notify and (sometimes
636 * sigev_notify_thread_id). The others are handled in user mode.
637 * We also assume that copying sigev_value.sival_int is sufficient
638 * to keep all the bits of sigev_value.sival_ptr intact.
639 */
640int get_compat_sigevent(struct sigevent *event,
641 const struct compat_sigevent __user *u_event)
642{
643	memset(event, 0, sizeof(*event));
644 return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
645 __get_user(event->sigev_value.sival_int,
646 &u_event->sigev_value.sival_int) ||
647 __get_user(event->sigev_signo, &u_event->sigev_signo) ||
648 __get_user(event->sigev_notify, &u_event->sigev_notify) ||
649 __get_user(event->sigev_notify_thread_id,
650 &u_event->sigev_notify_thread_id))
651 ? -EFAULT : 0;
652}
653
654/* timer_create is architecture specific because it needs sigevent conversion */
655
656long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
657 unsigned long bitmap_size)
658{
659 int i, j;
660 unsigned long m;
661 compat_ulong_t um;
662 unsigned long nr_compat_longs;
663
664 /* align bitmap up to nearest compat_long_t boundary */
665 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
666
667 if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
668 return -EFAULT;
669
670 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
671
672 for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
673 m = 0;
674
675 for (j = 0; j < sizeof(m)/sizeof(um); j++) {
676 /*
677			 * We don't want to read past the end of the userspace
678 * bitmap. We must however ensure the end of the
679 * kernel bitmap is zeroed.
680 */
681 if (nr_compat_longs-- > 0) {
682 if (__get_user(um, umask))
683 return -EFAULT;
684 } else {
685 um = 0;
686 }
687
688 umask++;
689 m |= (long)um << (j * BITS_PER_COMPAT_LONG);
690 }
691 *mask++ = m;
692 }
693
694 return 0;
695}
696
697long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
698 unsigned long bitmap_size)
699{
700 int i, j;
701 unsigned long m;
702 compat_ulong_t um;
703 unsigned long nr_compat_longs;
704
705 /* align bitmap up to nearest compat_long_t boundary */
706 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
707
708 if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
709 return -EFAULT;
710
711 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
712
713 for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
714 m = *mask++;
715
716 for (j = 0; j < sizeof(m)/sizeof(um); j++) {
717 um = m;
718
719 /*
720			 * We don't want to write past the end of the userspace
721 * bitmap.
722 */
723 if (nr_compat_longs-- > 0) {
724 if (__put_user(um, umask))
725 return -EFAULT;
726 }
727
728 umask++;
729 m >>= 4*sizeof(um);
730 m >>= 4*sizeof(um);
731 }
732 }
733
734 return 0;
735}
736
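compat_get_bitmap() and compat_put_bitmap() are essentially word-size converters: they splice pairs of 32-bit compat words into native unsigned longs (and back out again), zero-filling whatever lies beyond the length the caller supplied. The same packing with the __get_user()/__put_user() plumbing stripped away, as a standalone sketch (pack_bitmap is an illustrative name):

#include <stdint.h>
#include <stdio.h>

/* Pack an array of 32-bit "compat" words into 64-bit words,
 * zero-filling once the source runs out (as compat_get_bitmap does). */
static void pack_bitmap(uint64_t *dst, size_t dst_words,
			const uint32_t *src, size_t src_words)
{
	for (size_t i = 0; i < dst_words; i++) {
		uint64_t m = 0;
		for (size_t j = 0; j < 2; j++) {        /* two 32-bit halves per word */
			uint32_t um = 0;
			if (src_words > 0) {            /* don't read past the source */
				um = *src++;
				src_words--;
			}
			m |= (uint64_t)um << (j * 32);
		}
		dst[i] = m;
	}
}

int main(void)
{
	uint32_t compat[3] = { 0x000000ff, 0xdeadbeef, 0x00000001 };
	uint64_t native[2];

	pack_bitmap(native, 2, compat, 3);
	printf("%#018llx %#018llx\n",
	       (unsigned long long)native[0], (unsigned long long)native[1]);
	/* expected: 0xdeadbeef000000ff 0x0000000000000001 */
	return 0;
}
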
737void
738sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
739{
740 switch (_NSIG_WORDS) {
741#if defined (__COMPAT_ENDIAN_SWAP__)
742 case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
743 case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
744 case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
745 case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
746#else
747 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
748 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
749 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
750 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
751#endif
752 }
753}
754
755asmlinkage long
756compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
757 struct compat_siginfo __user *uinfo,
758 struct compat_timespec __user *uts, compat_size_t sigsetsize)
759{
760 compat_sigset_t s32;
761 sigset_t s;
762 int sig;
763 struct timespec t;
764 siginfo_t info;
765 long ret, timeout = 0;
766
767 if (sigsetsize != sizeof(sigset_t))
768 return -EINVAL;
769
770 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
771 return -EFAULT;
772 sigset_from_compat(&s, &s32);
773	sigdelsetmask(&s, sigmask(SIGKILL)|sigmask(SIGSTOP));
774 signotset(&s);
775
776 if (uts) {
777 if (get_compat_timespec (&t, uts))
778 return -EFAULT;
779 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
780 || t.tv_sec < 0)
781 return -EINVAL;
782 }
783
784 spin_lock_irq(&current->sighand->siglock);
785 sig = dequeue_signal(current, &s, &info);
786 if (!sig) {
787 timeout = MAX_SCHEDULE_TIMEOUT;
788 if (uts)
789 timeout = timespec_to_jiffies(&t)
790 +(t.tv_sec || t.tv_nsec);
791 if (timeout) {
792 current->real_blocked = current->blocked;
793 sigandsets(&current->blocked, &current->blocked, &s);
794
795 recalc_sigpending();
796 spin_unlock_irq(&current->sighand->siglock);
797
798 current->state = TASK_INTERRUPTIBLE;
799 timeout = schedule_timeout(timeout);
800
801 spin_lock_irq(&current->sighand->siglock);
802 sig = dequeue_signal(current, &s, &info);
803 current->blocked = current->real_blocked;
804 siginitset(&current->real_blocked, 0);
805 recalc_sigpending();
806 }
807 }
808 spin_unlock_irq(&current->sighand->siglock);
809
810 if (sig) {
811 ret = sig;
812 if (uinfo) {
813 if (copy_siginfo_to_user32(uinfo, &info))
814 ret = -EFAULT;
815 }
816	} else {
817		ret = timeout ? -EINTR : -EAGAIN;
818 }
819 return ret;
820
821}
822
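compat_sys_rt_sigtimedwait() is the 32-bit entry point for sigtimedwait(2): block a set of signals, then synchronously wait for one of them with an optional timeout and receive its siginfo instead of running a handler. A short native usage example of the semantics the wrapper converts:

#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec timeout = { .tv_sec = 2, .tv_nsec = 0 };

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	/* Block SIGUSR1 so it stays pending for sigtimedwait() to collect. */
	if (sigprocmask(SIG_BLOCK, &set, NULL) == -1) {
		perror("sigprocmask");
		return 1;
	}

	printf("send SIGUSR1 to pid %d within 2 seconds...\n", getpid());
	int sig = sigtimedwait(&set, &info, &timeout);
	if (sig == SIGUSR1)
		printf("got SIGUSR1 from pid %d\n", (int)info.si_pid);
	else if (sig == -1 && errno == EAGAIN)
		printf("timed out\n");
	else
		perror("sigtimedwait");
	return 0;
}
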
823#ifdef __ARCH_WANT_COMPAT_SYS_TIME
824
825/* compat_time_t is a 32 bit "long" and needs to get converted. */
826
827asmlinkage long compat_sys_time(compat_time_t __user * tloc)
828{
829 compat_time_t i;
830 struct timeval tv;
831
832 do_gettimeofday(&tv);
833 i = tv.tv_sec;
834
835 if (tloc) {
836 if (put_user(i,tloc))
837 i = -EFAULT;
838 }
839 return i;
840}
841
842asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
843{
844 struct timespec tv;
845 int err;
846
847 if (get_user(tv.tv_sec, tptr))
848 return -EFAULT;
849
850 tv.tv_nsec = 0;
851
852 err = security_settime(&tv, NULL);
853 if (err)
854 return err;
855
856 do_settimeofday(&tv);
857 return 0;
858}
859
860#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
diff --git a/kernel/configs.c b/kernel/configs.c
new file mode 100644
index 000000000000..986f7af31e0a
--- /dev/null
+++ b/kernel/configs.c
@@ -0,0 +1,118 @@
1/*
2 * kernel/configs.c
3 * Echo the kernel .config file used to build the kernel
4 *
5 * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
6 * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
7 * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
8 * Copyright (C) 2002 Hewlett-Packard Company
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 */
25
26#include <linux/config.h>
27#include <linux/kernel.h>
28#include <linux/module.h>
29#include <linux/proc_fs.h>
30#include <linux/seq_file.h>
31#include <linux/init.h>
32#include <asm/uaccess.h>
33
34/**************************************************/
35/* the actual current config file */
36
37/*
38 * Define kernel_config_data and kernel_config_data_size, which contains the
39 * wrapped and compressed configuration file. The file is first compressed
40 * with gzip and then bounded by two eight byte magic numbers to allow
41 * extraction from a binary kernel image:
42 *
43 * IKCFG_ST
44 * <image>
45 * IKCFG_ED
46 */
47#define MAGIC_START "IKCFG_ST"
48#define MAGIC_END "IKCFG_ED"
49#include "config_data.h"
50
51
52#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
53#define kernel_config_data_size \
54 (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
55
56#ifdef CONFIG_IKCONFIG_PROC
57
58/**************************************************/
59/* globals and useful constants */
60
61static ssize_t
62ikconfig_read_current(struct file *file, char __user *buf,
63 size_t len, loff_t * offset)
64{
65 loff_t pos = *offset;
66 ssize_t count;
67
68 if (pos >= kernel_config_data_size)
69 return 0;
70
71 count = min(len, (size_t)(kernel_config_data_size - pos));
72 if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
73 return -EFAULT;
74
75 *offset += count;
76 return count;
77}
78
79static struct file_operations ikconfig_file_ops = {
80 .owner = THIS_MODULE,
81 .read = ikconfig_read_current,
82};
83
84/***************************************************/
85/* ikconfig_init: start up everything we need to */
86
87static int __init ikconfig_init(void)
88{
89 struct proc_dir_entry *entry;
90
91 /* create the current config file */
92 entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
93 &proc_root);
94 if (!entry)
95 return -ENOMEM;
96
97 entry->proc_fops = &ikconfig_file_ops;
98 entry->size = kernel_config_data_size;
99
100 return 0;
101}
102
103/***************************************************/
104/* ikconfig_cleanup: clean up our mess */
105
106static void __exit ikconfig_cleanup(void)
107{
108 remove_proc_entry("config.gz", &proc_root);
109}
110
111module_init(ikconfig_init);
112module_exit(ikconfig_cleanup);
113
114MODULE_LICENSE("GPL");
115MODULE_AUTHOR("Randy Dunlap");
116MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
117
118#endif /* CONFIG_IKCONFIG_PROC */
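The comment near the top of configs.c documents the wrapping scheme: the gzip-compressed .config is embedded in the kernel image between the literal 8-byte markers IKCFG_ST and IKCFG_ED, precisely so it can be recovered from a binary even when /proc/config.gz is unavailable (the in-tree scripts/extract-ikconfig helper automates this). A hedged standalone sketch that pulls the payload out of an uncompressed image, assuming the markers occur exactly once:

#define _GNU_SOURCE           /* for memmem() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <vmlinux> <out.config.gz>\n", argv[0]);
		return 1;
	}

	FILE *in = fopen(argv[1], "rb");
	if (!in) { perror("fopen"); return 1; }

	fseek(in, 0, SEEK_END);
	long size = ftell(in);
	rewind(in);

	char *buf = malloc(size);
	if (!buf || fread(buf, 1, size, in) != (size_t)size) {
		fprintf(stderr, "read failed\n");
		return 1;
	}
	fclose(in);

	/* Locate the gzip stream between the two 8-byte magic markers
	 * (assumes an uncompressed image containing them exactly once). */
	char *start = memmem(buf, size, "IKCFG_ST", 8);
	char *end   = start ? memmem(start, size - (start - buf), "IKCFG_ED", 8) : NULL;
	if (!start || !end) {
		fprintf(stderr, "markers not found (image may be compressed)\n");
		return 1;
	}
	start += 8;                       /* skip past IKCFG_ST */

	FILE *out = fopen(argv[2], "wb");
	if (!out) { perror("fopen"); return 1; }
	fwrite(start, 1, end - start, out);
	fclose(out);
	printf("wrote %ld bytes; run: gunzip -c %s\n", (long)(end - start), argv[2]);
	free(buf);
	return 0;
}
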
diff --git a/kernel/cpu.c b/kernel/cpu.c
new file mode 100644
index 000000000000..628f4ccda127
--- /dev/null
+++ b/kernel/cpu.c
@@ -0,0 +1,193 @@
1/* CPU control.
2 * (C) 2001, 2002, 2003, 2004 Rusty Russell
3 *
4 * This code is licenced under the GPL.
5 */
6#include <linux/proc_fs.h>
7#include <linux/smp.h>
8#include <linux/init.h>
9#include <linux/notifier.h>
10#include <linux/sched.h>
11#include <linux/unistd.h>
12#include <linux/cpu.h>
13#include <linux/module.h>
14#include <linux/kthread.h>
15#include <linux/stop_machine.h>
16#include <asm/semaphore.h>
17
18/* This protects CPUs going up and down... */
19DECLARE_MUTEX(cpucontrol);
20
21static struct notifier_block *cpu_chain;
22
23/* Need to know about CPUs going up/down? */
24int register_cpu_notifier(struct notifier_block *nb)
25{
26 int ret;
27
28 if ((ret = down_interruptible(&cpucontrol)) != 0)
29 return ret;
30 ret = notifier_chain_register(&cpu_chain, nb);
31 up(&cpucontrol);
32 return ret;
33}
34EXPORT_SYMBOL(register_cpu_notifier);
35
36void unregister_cpu_notifier(struct notifier_block *nb)
37{
38 down(&cpucontrol);
39 notifier_chain_unregister(&cpu_chain, nb);
40 up(&cpucontrol);
41}
42EXPORT_SYMBOL(unregister_cpu_notifier);
43
44#ifdef CONFIG_HOTPLUG_CPU
45static inline void check_for_tasks(int cpu)
46{
47 struct task_struct *p;
48
49 write_lock_irq(&tasklist_lock);
50 for_each_process(p) {
51 if (task_cpu(p) == cpu &&
52 (!cputime_eq(p->utime, cputime_zero) ||
53 !cputime_eq(p->stime, cputime_zero)))
54			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
55				"(state = %ld, flags = %lx)\n",
56 p->comm, p->pid, cpu, p->state, p->flags);
57 }
58 write_unlock_irq(&tasklist_lock);
59}
60
61/* Take this CPU down. */
62static int take_cpu_down(void *unused)
63{
64 int err;
65
66 /* Take offline: makes arch_cpu_down somewhat easier. */
67 cpu_clear(smp_processor_id(), cpu_online_map);
68
69 /* Ensure this CPU doesn't handle any more interrupts. */
70 err = __cpu_disable();
71 if (err < 0)
72 cpu_set(smp_processor_id(), cpu_online_map);
73 else
74 /* Force idle task to run as soon as we yield: it should
75 immediately notice cpu is offline and die quickly. */
76 sched_idle_next();
77
78 return err;
79}
80
81int cpu_down(unsigned int cpu)
82{
83 int err;
84 struct task_struct *p;
85 cpumask_t old_allowed, tmp;
86
87 if ((err = lock_cpu_hotplug_interruptible()) != 0)
88 return err;
89
90 if (num_online_cpus() == 1) {
91 err = -EBUSY;
92 goto out;
93 }
94
95 if (!cpu_online(cpu)) {
96 err = -EINVAL;
97 goto out;
98 }
99
100 err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
101 (void *)(long)cpu);
102 if (err == NOTIFY_BAD) {
103 printk("%s: attempt to take down CPU %u failed\n",
104 __FUNCTION__, cpu);
105 err = -EINVAL;
106 goto out;
107 }
108
109 /* Ensure that we are not runnable on dying cpu */
110 old_allowed = current->cpus_allowed;
111 tmp = CPU_MASK_ALL;
112 cpu_clear(cpu, tmp);
113 set_cpus_allowed(current, tmp);
114
115 p = __stop_machine_run(take_cpu_down, NULL, cpu);
116 if (IS_ERR(p)) {
117 /* CPU didn't die: tell everyone. Can't complain. */
118 if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
119 (void *)(long)cpu) == NOTIFY_BAD)
120 BUG();
121
122 err = PTR_ERR(p);
123 goto out_allowed;
124 }
125
126 if (cpu_online(cpu))
127 goto out_thread;
128
129 /* Wait for it to sleep (leaving idle task). */
130 while (!idle_cpu(cpu))
131 yield();
132
133 /* This actually kills the CPU. */
134 __cpu_die(cpu);
135
136 /* Move it here so it can run. */
137 kthread_bind(p, get_cpu());
138 put_cpu();
139
140 /* CPU is completely dead: tell everyone. Too late to complain. */
141 if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
142 == NOTIFY_BAD)
143 BUG();
144
145 check_for_tasks(cpu);
146
147out_thread:
148 err = kthread_stop(p);
149out_allowed:
150 set_cpus_allowed(current, old_allowed);
151out:
152 unlock_cpu_hotplug();
153 return err;
154}
155#endif /*CONFIG_HOTPLUG_CPU*/
156
157int __devinit cpu_up(unsigned int cpu)
158{
159 int ret;
160 void *hcpu = (void *)(long)cpu;
161
162 if ((ret = down_interruptible(&cpucontrol)) != 0)
163 return ret;
164
165 if (cpu_online(cpu) || !cpu_present(cpu)) {
166 ret = -EINVAL;
167 goto out;
168 }
169 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
170 if (ret == NOTIFY_BAD) {
171 printk("%s: attempt to bring up CPU %u failed\n",
172 __FUNCTION__, cpu);
173 ret = -EINVAL;
174 goto out_notify;
175 }
176
177 /* Arch-specific enabling code. */
178 ret = __cpu_up(cpu);
179 if (ret != 0)
180 goto out_notify;
181 if (!cpu_online(cpu))
182 BUG();
183
184 /* Now call notifier in preparation. */
185 notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
186
187out_notify:
188 if (ret != 0)
189 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
190out:
191 up(&cpucontrol);
192 return ret;
193}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
new file mode 100644
index 000000000000..69792bbe2281
--- /dev/null
+++ b/kernel/cpuset.c
@@ -0,0 +1,1564 @@
1/*
2 * kernel/cpuset.c
3 *
4 * Processor and Memory placement constraints for sets of tasks.
5 *
6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004 Silicon Graphics, Inc.
8 *
9 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel
11 * Portions Copyright (c) 2004 Silicon Graphics, Inc.
12 *
13 * 2003-10-10 Written by Simon Derr <simon.derr@bull.net>
14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson <pj@sgi.com>
16 *
17 * This file is subject to the terms and conditions of the GNU General Public
18 * License. See the file COPYING in the main directory of the Linux
19 * distribution for more details.
20 */
21
22#include <linux/config.h>
23#include <linux/cpu.h>
24#include <linux/cpumask.h>
25#include <linux/cpuset.h>
26#include <linux/err.h>
27#include <linux/errno.h>
28#include <linux/file.h>
29#include <linux/fs.h>
30#include <linux/init.h>
31#include <linux/interrupt.h>
32#include <linux/kernel.h>
33#include <linux/kmod.h>
34#include <linux/list.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37#include <linux/mount.h>
38#include <linux/namei.h>
39#include <linux/pagemap.h>
40#include <linux/proc_fs.h>
41#include <linux/sched.h>
42#include <linux/seq_file.h>
43#include <linux/slab.h>
44#include <linux/smp_lock.h>
45#include <linux/spinlock.h>
46#include <linux/stat.h>
47#include <linux/string.h>
48#include <linux/time.h>
49#include <linux/backing-dev.h>
50#include <linux/sort.h>
51
52#include <asm/uaccess.h>
53#include <asm/atomic.h>
54#include <asm/semaphore.h>
55
56#define CPUSET_SUPER_MAGIC 0x27e0eb
57
58struct cpuset {
59 unsigned long flags; /* "unsigned long" so bitops work */
60 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
61 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
62
63 atomic_t count; /* count tasks using this cpuset */
64
65 /*
66	 * We link our 'sibling' struct into our parent's 'children'.
67 * Our children link their 'sibling' into our 'children'.
68 */
69	struct list_head sibling;	/* my parent's children */
70 struct list_head children; /* my children */
71
72 struct cpuset *parent; /* my parent */
73 struct dentry *dentry; /* cpuset fs entry */
74
75 /*
76 * Copy of global cpuset_mems_generation as of the most
77 * recent time this cpuset changed its mems_allowed.
78 */
79 int mems_generation;
80};
81
82/* bits in struct cpuset flags field */
83typedef enum {
84 CS_CPU_EXCLUSIVE,
85 CS_MEM_EXCLUSIVE,
86 CS_REMOVED,
87 CS_NOTIFY_ON_RELEASE
88} cpuset_flagbits_t;
89
90/* convenient tests for these bits */
91static inline int is_cpu_exclusive(const struct cpuset *cs)
92{
93 return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
94}
95
96static inline int is_mem_exclusive(const struct cpuset *cs)
97{
98 return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
99}
100
101static inline int is_removed(const struct cpuset *cs)
102{
103 return !!test_bit(CS_REMOVED, &cs->flags);
104}
105
106static inline int notify_on_release(const struct cpuset *cs)
107{
108 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
109}
110
111/*
112 * Increment this atomic integer every time any cpuset changes its
113 * mems_allowed value. Users of cpusets can track this generation
114 * number, and avoid having to lock and reload mems_allowed unless
115 * the cpuset they're using changes generation.
116 *
117 * A single, global generation is needed because attach_task() could
118 * reattach a task to a different cpuset, which must not have its
119 * generation numbers aliased with those of that task's previous cpuset.
120 *
121 * Generations are needed for mems_allowed because one task cannot
122 * modify another's memory placement. So we must enable every task,
123 * on every visit to __alloc_pages(), to efficiently check whether
124 * its current->cpuset->mems_allowed has changed, requiring an update
125 * of its current->mems_allowed.
126 */
127static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
128
129static struct cpuset top_cpuset = {
130 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
131 .cpus_allowed = CPU_MASK_ALL,
132 .mems_allowed = NODE_MASK_ALL,
133 .count = ATOMIC_INIT(0),
134 .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
135 .children = LIST_HEAD_INIT(top_cpuset.children),
136 .parent = NULL,
137 .dentry = NULL,
138 .mems_generation = 0,
139};
140
141static struct vfsmount *cpuset_mount;
142static struct super_block *cpuset_sb = NULL;
143
144/*
145 * cpuset_sem should be held by anyone who is depending on the children
146 * or sibling lists of any cpuset, or performing non-atomic operations
147 * on the flags or *_allowed values of a cpuset, such as raising the
148 * CS_REMOVED flag bit iff it is not already raised, or reading and
149 * conditionally modifying the *_allowed values. One kernel global
150 * cpuset semaphore should be sufficient - these things don't change
151 * that much.
152 *
153 * The code that modifies cpusets holds cpuset_sem across the entire
154 * operation, from cpuset_common_file_write() down, single threading
155 * all cpuset modifications (except for counter manipulations from
156 * fork and exit) across the system. This presumes that cpuset
157 * modifications are rare - better kept simple and safe, even if slow.
158 *
159 * The code that reads cpusets, such as in cpuset_common_file_read()
160 * and below, only holds cpuset_sem across small pieces of code, such
161 * as when reading out possibly multi-word cpumasks and nodemasks, as
162 * the risks are less, and the desire for performance a little greater.
163 * The proc_cpuset_show() routine needs to hold cpuset_sem to ensure
164 * that no cs->dentry is NULL, as it walks up the cpuset tree to root.
165 *
166 * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
167 * (usually) grab cpuset_sem. These are the two most performance
168 * critical pieces of code here. The exception occurs on exit(),
169 * if the last task using a cpuset exits, and the cpuset was marked
170 * notify_on_release. In that case, the cpuset_sem is taken, the
171 * path to the released cpuset calculated, and a usermode call made
172 * to /sbin/cpuset_release_agent with the name of the cpuset (path
173 * relative to the root of cpuset file system) as the argument.
174 *
175 * A cpuset can only be deleted if both its 'count' of using tasks is
176 * zero, and its list of 'children' cpusets is empty. Since all tasks
177 * in the system use _some_ cpuset, and since there is always at least
178 * one task in the system (init, pid == 1), therefore, top_cpuset
179 * always has either children cpusets and/or using tasks. So no need
180 * for any special hack to ensure that top_cpuset cannot be deleted.
181 */
182
183static DECLARE_MUTEX(cpuset_sem);
184
185/*
186 * A couple of forward declarations required, due to cyclic reference loop:
187 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
188 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
189 */
190
191static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
192static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
193
194static struct backing_dev_info cpuset_backing_dev_info = {
195 .ra_pages = 0, /* No readahead */
196 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
197};
198
199static struct inode *cpuset_new_inode(mode_t mode)
200{
201 struct inode *inode = new_inode(cpuset_sb);
202
203 if (inode) {
204 inode->i_mode = mode;
205 inode->i_uid = current->fsuid;
206 inode->i_gid = current->fsgid;
207 inode->i_blksize = PAGE_CACHE_SIZE;
208 inode->i_blocks = 0;
209 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
210 inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
211 }
212 return inode;
213}
214
215static void cpuset_diput(struct dentry *dentry, struct inode *inode)
216{
217 /* is dentry a directory ? if so, kfree() associated cpuset */
218 if (S_ISDIR(inode->i_mode)) {
219 struct cpuset *cs = dentry->d_fsdata;
220 BUG_ON(!(is_removed(cs)));
221 kfree(cs);
222 }
223 iput(inode);
224}
225
226static struct dentry_operations cpuset_dops = {
227 .d_iput = cpuset_diput,
228};
229
230static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
231{
232 struct qstr qstr;
233 struct dentry *d;
234
235 qstr.name = name;
236 qstr.len = strlen(name);
237 qstr.hash = full_name_hash(name, qstr.len);
238 d = lookup_hash(&qstr, parent);
239 if (!IS_ERR(d))
240 d->d_op = &cpuset_dops;
241 return d;
242}
243
244static void remove_dir(struct dentry *d)
245{
246 struct dentry *parent = dget(d->d_parent);
247
248 d_delete(d);
249 simple_rmdir(parent->d_inode, d);
250 dput(parent);
251}
252
253/*
254 * NOTE : the dentry must have been dget()'ed
255 */
256static void cpuset_d_remove_dir(struct dentry *dentry)
257{
258 struct list_head *node;
259
260 spin_lock(&dcache_lock);
261 node = dentry->d_subdirs.next;
262 while (node != &dentry->d_subdirs) {
263 struct dentry *d = list_entry(node, struct dentry, d_child);
264 list_del_init(node);
265 if (d->d_inode) {
266 d = dget_locked(d);
267 spin_unlock(&dcache_lock);
268 d_delete(d);
269 simple_unlink(dentry->d_inode, d);
270 dput(d);
271 spin_lock(&dcache_lock);
272 }
273 node = dentry->d_subdirs.next;
274 }
275 list_del_init(&dentry->d_child);
276 spin_unlock(&dcache_lock);
277 remove_dir(dentry);
278}
279
280static struct super_operations cpuset_ops = {
281 .statfs = simple_statfs,
282 .drop_inode = generic_delete_inode,
283};
284
285static int cpuset_fill_super(struct super_block *sb, void *unused_data,
286 int unused_silent)
287{
288 struct inode *inode;
289 struct dentry *root;
290
291 sb->s_blocksize = PAGE_CACHE_SIZE;
292 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
293 sb->s_magic = CPUSET_SUPER_MAGIC;
294 sb->s_op = &cpuset_ops;
295 cpuset_sb = sb;
296
297 inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
298 if (inode) {
299 inode->i_op = &simple_dir_inode_operations;
300 inode->i_fop = &simple_dir_operations;
301 /* directories start off with i_nlink == 2 (for "." entry) */
302 inode->i_nlink++;
303 } else {
304 return -ENOMEM;
305 }
306
307 root = d_alloc_root(inode);
308 if (!root) {
309 iput(inode);
310 return -ENOMEM;
311 }
312 sb->s_root = root;
313 return 0;
314}
315
316static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
317 int flags, const char *unused_dev_name,
318 void *data)
319{
320 return get_sb_single(fs_type, flags, data, cpuset_fill_super);
321}
322
323static struct file_system_type cpuset_fs_type = {
324 .name = "cpuset",
325 .get_sb = cpuset_get_sb,
326 .kill_sb = kill_litter_super,
327};
328
329/* struct cftype:
330 *
331 * The files in the cpuset filesystem mostly have very simple read/write
332 * handling; a common function takes care of it. Nevertheless, some cases
333 * (reading 'tasks') are special, so this structure is defined for every
334 * kind of file.
335 *
336 *
337 * When reading/writing to a file:
338 * - the cpuset to use in file->f_dentry->d_parent->d_fsdata
339 * - the 'cftype' of the file is file->f_dentry->d_fsdata
340 */
341
342struct cftype {
343 char *name;
344 int private;
345 int (*open) (struct inode *inode, struct file *file);
346 ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
347 loff_t *ppos);
348 int (*write) (struct file *file, const char __user *buf, size_t nbytes,
349 loff_t *ppos);
350 int (*release) (struct inode *inode, struct file *file);
351};
352
353static inline struct cpuset *__d_cs(struct dentry *dentry)
354{
355 return dentry->d_fsdata;
356}
357
358static inline struct cftype *__d_cft(struct dentry *dentry)
359{
360 return dentry->d_fsdata;
361}
362
363/*
364 * Call with cpuset_sem held. Writes path of cpuset into buf.
365 * Returns 0 on success, -errno on error.
366 */
367
368static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
369{
370 char *start;
371
372 start = buf + buflen;
373
374 *--start = '\0';
375 for (;;) {
376 int len = cs->dentry->d_name.len;
377 if ((start -= len) < buf)
378 return -ENAMETOOLONG;
379 memcpy(start, cs->dentry->d_name.name, len);
380 cs = cs->parent;
381 if (!cs)
382 break;
383 if (!cs->parent)
384 continue;
385 if (--start < buf)
386 return -ENAMETOOLONG;
387 *start = '/';
388 }
389 memmove(buf, start, buf + buflen - start);
390 return 0;
391}
392
393/*
394 * Notify userspace when a cpuset is released, by running
395 * /sbin/cpuset_release_agent with the name of the cpuset (path
396 * relative to the root of cpuset file system) as the argument.
397 *
398 * Most likely, this user command will try to rmdir this cpuset.
399 *
400 * This races with the possibility that some other task will be
401 * attached to this cpuset before it is removed, or that some other
402 * user task will 'mkdir' a child cpuset of this cpuset. That's ok.
403 * The presumed 'rmdir' will fail quietly if this cpuset is no longer
404 * unused, and this cpuset will be reprieved from its death sentence,
405 * to continue to serve a useful existence. Next time it's released,
406 * we will get notified again, if it still has 'notify_on_release' set.
407 *
408 * Note final arg to call_usermodehelper() is 0 - that means
409 * don't wait. Since we are holding the global cpuset_sem here,
410 * and we are asking another thread (started from keventd) to rmdir a
411 * cpuset, we can't wait - or we'd deadlock with the removing thread
412 * on cpuset_sem.
413 */
414
415static int cpuset_release_agent(char *cpuset_str)
416{
417 char *argv[3], *envp[3];
418 int i;
419
420 i = 0;
421 argv[i++] = "/sbin/cpuset_release_agent";
422 argv[i++] = cpuset_str;
423 argv[i] = NULL;
424
425 i = 0;
426 /* minimal command environment */
427 envp[i++] = "HOME=/";
428 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
429 envp[i] = NULL;
430
431 return call_usermodehelper(argv[0], argv, envp, 0);
432}
433
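/*
 * Illustrative sketch (userspace, not kernel code): a minimal
 * /sbin/cpuset_release_agent along the lines described above.  It is
 * handed the cpuset's path relative to the cpuset filesystem root as
 * argv[1] and simply tries to rmdir it.  The /dev/cpuset mount point is
 * an assumption; the filesystem may be mounted anywhere.
 */
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[4096];

	if (argc != 2)
		return 1;
	snprintf(path, sizeof(path), "/dev/cpuset%s", argv[1]);
	/* Fails quietly if the cpuset picked up tasks or children again. */
	return rmdir(path) ? 1 : 0;
}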
434/*
435 * Either cs->count of using tasks transitioned to zero, or the
436 * cs->children list of child cpusets just became empty. If this
437 * cs is notify_on_release() and now both the user count is zero and
438 * the list of children is empty, send notice to user land.
439 */
440
441static void check_for_release(struct cpuset *cs)
442{
443 if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
444 list_empty(&cs->children)) {
445 char *buf;
446
447 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
448 if (!buf)
449 return;
450 if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
451 goto out;
452 cpuset_release_agent(buf);
453out:
454 kfree(buf);
455 }
456}
457
458/*
459 * Return in *pmask the portion of a cpuset's cpus_allowed that
460 * are online. If none are online, walk up the cpuset hierarchy
461 * until we find one that does have some online cpus. If we get
462 * all the way to the top and still haven't found any online cpus,
463 * return cpu_online_map. Or if passed a NULL cs from an exit'ing
464 * task, return cpu_online_map.
465 *
466 * One way or another, we guarantee to return some non-empty subset
467 * of cpu_online_map.
468 *
469 * Call with cpuset_sem held.
470 */
471
472static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
473{
474 while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
475 cs = cs->parent;
476 if (cs)
477 cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
478 else
479 *pmask = cpu_online_map;
480 BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
481}
482
483/*
484 * Return in *pmask the portion of a cpuset's mems_allowed that
485 * are online. If none are online, walk up the cpuset hierarchy
486 * until we find one that does have some online mems. If we get
487 * all the way to the top and still haven't found any online mems,
488 * return node_online_map.
489 *
490 * One way or another, we guarantee to return some non-empty subset
491 * of node_online_map.
492 *
493 * Call with cpuset_sem held.
494 */
495
496static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
497{
498 while (cs && !nodes_intersects(cs->mems_allowed, node_online_map))
499 cs = cs->parent;
500 if (cs)
501 nodes_and(*pmask, cs->mems_allowed, node_online_map);
502 else
503 *pmask = node_online_map;
504 BUG_ON(!nodes_intersects(*pmask, node_online_map));
505}
506
507/*
508 * Refresh the current task's mems_allowed and mems_generation from
509 * the current task's cpuset. Call with cpuset_sem held.
510 *
511 * Be sure to call refresh_mems() on any cpuset operation which
512 * (1) holds cpuset_sem, and (2) might possibly alloc memory.
513 * Call after obtaining cpuset_sem lock, before any possible
514 * allocation. Otherwise one risks trying to allocate memory
515 * while the task cpuset_mems_generation is not the same as
516 * the mems_generation in its cpuset, which would deadlock on
517 * cpuset_sem in cpuset_update_current_mems_allowed().
518 *
519 * Since we hold cpuset_sem, once refresh_mems() is called, the
520 * test (current->cpuset_mems_generation != cs->mems_generation)
521 * in cpuset_update_current_mems_allowed() will remain false,
522 * until we drop cpuset_sem. Anyone else who would change our
523 * cpuset's mems_generation needs to lock cpuset_sem first.
524 */
525
526static void refresh_mems(void)
527{
528 struct cpuset *cs = current->cpuset;
529
530 if (current->cpuset_mems_generation != cs->mems_generation) {
531 guarantee_online_mems(cs, &current->mems_allowed);
532 current->cpuset_mems_generation = cs->mems_generation;
533 }
534}
535
536/*
537 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
538 *
539 * One cpuset is a subset of another if all its allowed CPUs and
540 * Memory Nodes are a subset of the other, and its exclusive flags
541 * are only set if the other's are set.
542 */
543
544static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
545{
546 return cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
547 nodes_subset(p->mems_allowed, q->mems_allowed) &&
548 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
549 is_mem_exclusive(p) <= is_mem_exclusive(q);
550}
551
552/*
553 * validate_change() - Used to validate that any proposed cpuset change
554 * follows the structural rules for cpusets.
555 *
556 * If we replaced the flag and mask values of the current cpuset
557 * (cur) with those values in the trial cpuset (trial), would
558 * our various subset and exclusive rules still be valid? Presumes
559 * cpuset_sem held.
560 *
561 * 'cur' is the address of an actual, in-use cpuset. Operations
562 * such as list traversal that depend on the actual address of the
563 * cpuset in the list must use cur below, not trial.
564 *
565 * 'trial' is the address of bulk structure copy of cur, with
566 * perhaps one or more of the fields cpus_allowed, mems_allowed,
567 * or flags changed to new, trial values.
568 *
569 * Return 0 if valid, -errno if not.
570 */
571
572static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
573{
574 struct cpuset *c, *par;
575
576 /* Each of our child cpusets must be a subset of us */
577 list_for_each_entry(c, &cur->children, sibling) {
578 if (!is_cpuset_subset(c, trial))
579 return -EBUSY;
580 }
581
582 /* Remaining checks don't apply to root cpuset */
583 if ((par = cur->parent) == NULL)
584 return 0;
585
586 /* We must be a subset of our parent cpuset */
587 if (!is_cpuset_subset(trial, par))
588 return -EACCES;
589
590 /* If either I or some sibling (!= me) is exclusive, we can't overlap */
591 list_for_each_entry(c, &par->children, sibling) {
592 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
593 c != cur &&
594 cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
595 return -EINVAL;
596 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
597 c != cur &&
598 nodes_intersects(trial->mems_allowed, c->mems_allowed))
599 return -EINVAL;
600 }
601
602 return 0;
603}
604
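/*
 * A concrete illustration of the rules above, with hypothetical cpuset
 * names: given a parent whose cpus are 0-3 and a cpu_exclusive sibling A
 * holding cpus 0-1, writing "1-2" to sibling B's cpus file ends up in
 * validate_change(), which returns -EINVAL because the trial mask
 * intersects the exclusive sibling.  Writing "4-5" (assuming those cpus
 * are online) would instead fail the is_cpuset_subset() test against the
 * parent and return -EACCES.
 */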
605static int update_cpumask(struct cpuset *cs, char *buf)
606{
607 struct cpuset trialcs;
608 int retval;
609
610 trialcs = *cs;
611 retval = cpulist_parse(buf, trialcs.cpus_allowed);
612 if (retval < 0)
613 return retval;
614 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
615 if (cpus_empty(trialcs.cpus_allowed))
616 return -ENOSPC;
617 retval = validate_change(cs, &trialcs);
618 if (retval == 0)
619 cs->cpus_allowed = trialcs.cpus_allowed;
620 return retval;
621}
622
623static int update_nodemask(struct cpuset *cs, char *buf)
624{
625 struct cpuset trialcs;
626 int retval;
627
628 trialcs = *cs;
629 retval = nodelist_parse(buf, trialcs.mems_allowed);
630 if (retval < 0)
631 return retval;
632 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
633 if (nodes_empty(trialcs.mems_allowed))
634 return -ENOSPC;
635 retval = validate_change(cs, &trialcs);
636 if (retval == 0) {
637 cs->mems_allowed = trialcs.mems_allowed;
638 atomic_inc(&cpuset_mems_generation);
639 cs->mems_generation = atomic_read(&cpuset_mems_generation);
640 }
641 return retval;
642}
643
644/*
645 * update_flag - read a 0 or a 1 in a file and update associated flag
646 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
647 * CS_NOTIFY_ON_RELEASE)
648 * cs: the cpuset to update
649 * buf: the buffer where we read the 0 or 1
650 */
651
652static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
653{
654 int turning_on;
655 struct cpuset trialcs;
656 int err;
657
658 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
659
660 trialcs = *cs;
661 if (turning_on)
662 set_bit(bit, &trialcs.flags);
663 else
664 clear_bit(bit, &trialcs.flags);
665
666 err = validate_change(cs, &trialcs);
667 if (err == 0) {
668 if (turning_on)
669 set_bit(bit, &cs->flags);
670 else
671 clear_bit(bit, &cs->flags);
672 }
673 return err;
674}
675
676static int attach_task(struct cpuset *cs, char *buf)
677{
678 pid_t pid;
679 struct task_struct *tsk;
680 struct cpuset *oldcs;
681 cpumask_t cpus;
682
683 if (sscanf(buf, "%d", &pid) != 1)
684 return -EIO;
685 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
686 return -ENOSPC;
687
688 if (pid) {
689 read_lock(&tasklist_lock);
690
691 tsk = find_task_by_pid(pid);
692 if (!tsk) {
693 read_unlock(&tasklist_lock);
694 return -ESRCH;
695 }
696
697 get_task_struct(tsk);
698 read_unlock(&tasklist_lock);
699
700 if ((current->euid) && (current->euid != tsk->uid)
701 && (current->euid != tsk->suid)) {
702 put_task_struct(tsk);
703 return -EACCES;
704 }
705 } else {
706 tsk = current;
707 get_task_struct(tsk);
708 }
709
710 task_lock(tsk);
711 oldcs = tsk->cpuset;
712 if (!oldcs) {
713 task_unlock(tsk);
714 put_task_struct(tsk);
715 return -ESRCH;
716 }
717 atomic_inc(&cs->count);
718 tsk->cpuset = cs;
719 task_unlock(tsk);
720
721 guarantee_online_cpus(cs, &cpus);
722 set_cpus_allowed(tsk, cpus);
723
724 put_task_struct(tsk);
725 if (atomic_dec_and_test(&oldcs->count))
726 check_for_release(oldcs);
727 return 0;
728}
729
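/*
 * Illustrative sketch (userspace, not kernel code) of driving the
 * handlers above.  The cpus and mems files accept the same list syntax
 * that cpulist_parse()/nodelist_parse() understand (e.g. "1-2"), and the
 * tasks file takes a decimal pid, with 0 meaning the writing task
 * itself.  The /dev/cpuset mount point and the "example" name are
 * assumptions.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	mkdir("/dev/cpuset/example", 0755);		/* cpuset_mkdir()    */
	write_file("/dev/cpuset/example/cpus", "1-2");	/* update_cpumask()  */
	write_file("/dev/cpuset/example/mems", "0");	/* update_nodemask() */
	write_file("/dev/cpuset/example/tasks", "0");	/* attach_task()     */
	return 0;
}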
730/* The various types of files and directories in a cpuset file system */
731
732typedef enum {
733 FILE_ROOT,
734 FILE_DIR,
735 FILE_CPULIST,
736 FILE_MEMLIST,
737 FILE_CPU_EXCLUSIVE,
738 FILE_MEM_EXCLUSIVE,
739 FILE_NOTIFY_ON_RELEASE,
740 FILE_TASKLIST,
741} cpuset_filetype_t;
742
743static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
744 size_t nbytes, loff_t *unused_ppos)
745{
746 struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
747 struct cftype *cft = __d_cft(file->f_dentry);
748 cpuset_filetype_t type = cft->private;
749 char *buffer;
750 int retval = 0;
751
752 /* Crude upper limit on largest legitimate cpulist user might write. */
753 if (nbytes > 100 + 6 * NR_CPUS)
754 return -E2BIG;
755
756 /* +1 for nul-terminator */
757 if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
758 return -ENOMEM;
759
760 if (copy_from_user(buffer, userbuf, nbytes)) {
761 retval = -EFAULT;
762 goto out1;
763 }
764 buffer[nbytes] = 0; /* nul-terminate */
765
766 down(&cpuset_sem);
767
768 if (is_removed(cs)) {
769 retval = -ENODEV;
770 goto out2;
771 }
772
773 switch (type) {
774 case FILE_CPULIST:
775 retval = update_cpumask(cs, buffer);
776 break;
777 case FILE_MEMLIST:
778 retval = update_nodemask(cs, buffer);
779 break;
780 case FILE_CPU_EXCLUSIVE:
781 retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
782 break;
783 case FILE_MEM_EXCLUSIVE:
784 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
785 break;
786 case FILE_NOTIFY_ON_RELEASE:
787 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
788 break;
789 case FILE_TASKLIST:
790 retval = attach_task(cs, buffer);
791 break;
792 default:
793 retval = -EINVAL;
794 goto out2;
795 }
796
797 if (retval == 0)
798 retval = nbytes;
799out2:
800 up(&cpuset_sem);
801out1:
802 kfree(buffer);
803 return retval;
804}
805
806static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
807 size_t nbytes, loff_t *ppos)
808{
809 ssize_t retval = 0;
810 struct cftype *cft = __d_cft(file->f_dentry);
811 if (!cft)
812 return -ENODEV;
813
814 /* special function ? */
815 if (cft->write)
816 retval = cft->write(file, buf, nbytes, ppos);
817 else
818 retval = cpuset_common_file_write(file, buf, nbytes, ppos);
819
820 return retval;
821}
822
823/*
824 * These ascii lists should be read in a single call, by using a user
825 * buffer large enough to hold the entire map. If read in smaller
826 * chunks, there is no guarantee of atomicity. Since the display format
827 * used, list of ranges of sequential numbers, is variable length,
828 * and since these maps can change value dynamically, one could read
829 * gibberish by doing partial reads while a list was changing.
830 * A single large read to a buffer that crosses a page boundary is
831 * ok, because the result being copied to user land is not recomputed
832 * across a page fault.
833 */
834
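/*
 * Illustrative sketch (userspace, not kernel code): reading one of these
 * lists with a single large read(), as the note above recommends, so the
 * returned range list is internally consistent.  The path reuses the
 * hypothetical /dev/cpuset/example cpuset from the earlier sketch.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];			/* big enough for the whole list */
	int fd = open("/dev/cpuset/example/cpus", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);	/* one call, not a loop */
	if (n > 0) {
		buf[n] = '\0';
		printf("cpus: %s", buf);	/* e.g. "1-2\n" */
	}
	close(fd);
	return n > 0 ? 0 : 1;
}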
835static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
836{
837 cpumask_t mask;
838
839 down(&cpuset_sem);
840 mask = cs->cpus_allowed;
841 up(&cpuset_sem);
842
843 return cpulist_scnprintf(page, PAGE_SIZE, mask);
844}
845
846static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
847{
848 nodemask_t mask;
849
850 down(&cpuset_sem);
851 mask = cs->mems_allowed;
852 up(&cpuset_sem);
853
854 return nodelist_scnprintf(page, PAGE_SIZE, mask);
855}
856
857static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
858 size_t nbytes, loff_t *ppos)
859{
860 struct cftype *cft = __d_cft(file->f_dentry);
861 struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
862 cpuset_filetype_t type = cft->private;
863 char *page;
864 ssize_t retval = 0;
865 char *s;
866 char *start;
867 size_t n;
868
869 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
870 return -ENOMEM;
871
872 s = page;
873
874 switch (type) {
875 case FILE_CPULIST:
876 s += cpuset_sprintf_cpulist(s, cs);
877 break;
878 case FILE_MEMLIST:
879 s += cpuset_sprintf_memlist(s, cs);
880 break;
881 case FILE_CPU_EXCLUSIVE:
882 *s++ = is_cpu_exclusive(cs) ? '1' : '0';
883 break;
884 case FILE_MEM_EXCLUSIVE:
885 *s++ = is_mem_exclusive(cs) ? '1' : '0';
886 break;
887 case FILE_NOTIFY_ON_RELEASE:
888 *s++ = notify_on_release(cs) ? '1' : '0';
889 break;
890 default:
891 retval = -EINVAL;
892 goto out;
893 }
894 *s++ = '\n';
895 *s = '\0';
896
897 start = page + *ppos;
898 n = s - start;
899 retval = n - copy_to_user(buf, start, min(n, nbytes));
900 *ppos += retval;
901out:
902 free_page((unsigned long)page);
903 return retval;
904}
905
906static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes,
907 loff_t *ppos)
908{
909 ssize_t retval = 0;
910 struct cftype *cft = __d_cft(file->f_dentry);
911 if (!cft)
912 return -ENODEV;
913
914 /* special function ? */
915 if (cft->read)
916 retval = cft->read(file, buf, nbytes, ppos);
917 else
918 retval = cpuset_common_file_read(file, buf, nbytes, ppos);
919
920 return retval;
921}
922
923static int cpuset_file_open(struct inode *inode, struct file *file)
924{
925 int err;
926 struct cftype *cft;
927
928 err = generic_file_open(inode, file);
929 if (err)
930 return err;
931
932 cft = __d_cft(file->f_dentry);
933 if (!cft)
934 return -ENODEV;
935 if (cft->open)
936 err = cft->open(inode, file);
937 else
938 err = 0;
939
940 return err;
941}
942
943static int cpuset_file_release(struct inode *inode, struct file *file)
944{
945 struct cftype *cft = __d_cft(file->f_dentry);
946 if (cft->release)
947 return cft->release(inode, file);
948 return 0;
949}
950
951static struct file_operations cpuset_file_operations = {
952 .read = cpuset_file_read,
953 .write = cpuset_file_write,
954 .llseek = generic_file_llseek,
955 .open = cpuset_file_open,
956 .release = cpuset_file_release,
957};
958
959static struct inode_operations cpuset_dir_inode_operations = {
960 .lookup = simple_lookup,
961 .mkdir = cpuset_mkdir,
962 .rmdir = cpuset_rmdir,
963};
964
965static int cpuset_create_file(struct dentry *dentry, int mode)
966{
967 struct inode *inode;
968
969 if (!dentry)
970 return -ENOENT;
971 if (dentry->d_inode)
972 return -EEXIST;
973
974 inode = cpuset_new_inode(mode);
975 if (!inode)
976 return -ENOMEM;
977
978 if (S_ISDIR(mode)) {
979 inode->i_op = &cpuset_dir_inode_operations;
980 inode->i_fop = &simple_dir_operations;
981
982 /* start off with i_nlink == 2 (for "." entry) */
983 inode->i_nlink++;
984 } else if (S_ISREG(mode)) {
985 inode->i_size = 0;
986 inode->i_fop = &cpuset_file_operations;
987 }
988
989 d_instantiate(dentry, inode);
990 dget(dentry); /* Extra count - pin the dentry in core */
991 return 0;
992}
993
994/*
995 * cpuset_create_dir - create a directory for an object.
996 * cs: the cpuset we create the directory for.
997 *	It must have a valid ->parent field,
998 *	and we are going to fill its ->dentry field.
999 * name: The name to give to the cpuset directory. Will be copied.
1000 * mode: mode to set on new directory.
1001 */
1002
1003static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
1004{
1005 struct dentry *dentry = NULL;
1006 struct dentry *parent;
1007 int error = 0;
1008
1009 parent = cs->parent->dentry;
1010 dentry = cpuset_get_dentry(parent, name);
1011 if (IS_ERR(dentry))
1012 return PTR_ERR(dentry);
1013 error = cpuset_create_file(dentry, S_IFDIR | mode);
1014 if (!error) {
1015 dentry->d_fsdata = cs;
1016 parent->d_inode->i_nlink++;
1017 cs->dentry = dentry;
1018 }
1019 dput(dentry);
1020
1021 return error;
1022}
1023
1024static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1025{
1026 struct dentry *dentry;
1027 int error;
1028
1029 down(&dir->d_inode->i_sem);
1030 dentry = cpuset_get_dentry(dir, cft->name);
1031 if (!IS_ERR(dentry)) {
1032 error = cpuset_create_file(dentry, 0644 | S_IFREG);
1033 if (!error)
1034 dentry->d_fsdata = (void *)cft;
1035 dput(dentry);
1036 } else
1037 error = PTR_ERR(dentry);
1038 up(&dir->d_inode->i_sem);
1039 return error;
1040}
1041
1042/*
1043 * Stuff for reading the 'tasks' file.
1044 *
1045 * Reading this file can return large amounts of data if a cpuset has
1046 * *lots* of attached tasks. So it may need several calls to read(),
1047 * but we cannot guarantee that the information we produce is correct
1048 * unless we produce it entirely atomically.
1049 *
1050 * Upon tasks file open(), a struct ctr_struct is allocated, that
1051 * will have a pointer to an array (also allocated here). The struct
1052 * ctr_struct * is stored in file->private_data. Its resources will
1053 * be freed by release() when the file is closed. The array is used
1054 * to sprintf the PIDs and then used by read().
1055 */
1056
1057/* cpusets_tasks_read array */
1058
1059struct ctr_struct {
1060 char *buf;
1061 int bufsz;
1062};
1063
1064/*
1065 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
1066 * Return actual number of pids loaded.
1067 */
1068static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1069{
1070 int n = 0;
1071 struct task_struct *g, *p;
1072
1073 read_lock(&tasklist_lock);
1074
1075 do_each_thread(g, p) {
1076 if (p->cpuset == cs) {
1077 pidarray[n++] = p->pid;
1078 if (unlikely(n == npids))
1079 goto array_full;
1080 }
1081 } while_each_thread(g, p);
1082
1083array_full:
1084 read_unlock(&tasklist_lock);
1085 return n;
1086}
1087
1088static int cmppid(const void *a, const void *b)
1089{
1090 return *(pid_t *)a - *(pid_t *)b;
1091}
1092
1093/*
1094 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1095 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1096 * count 'cnt' of how many chars would be written if buf were large enough.
1097 */
1098static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1099{
1100 int cnt = 0;
1101 int i;
1102
1103 for (i = 0; i < npids; i++)
1104 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1105 return cnt;
1106}
1107
1108static int cpuset_tasks_open(struct inode *unused, struct file *file)
1109{
1110 struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
1111 struct ctr_struct *ctr;
1112 pid_t *pidarray;
1113 int npids;
1114 char c;
1115
1116 if (!(file->f_mode & FMODE_READ))
1117 return 0;
1118
1119 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1120 if (!ctr)
1121 goto err0;
1122
1123 /*
1124 * If cpuset gets more users after we read count, we won't have
1125 * enough space - tough. This race is indistinguishable to the
1126 * caller from the case that the additional cpuset users didn't
1127 * show up until sometime later on.
1128 */
1129 npids = atomic_read(&cs->count);
1130 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1131 if (!pidarray)
1132 goto err1;
1133
1134 npids = pid_array_load(pidarray, npids, cs);
1135 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1136
1137 /* Call pid_array_to_buf() twice, first just to get bufsz */
1138 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1139 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1140 if (!ctr->buf)
1141 goto err2;
1142 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1143
1144 kfree(pidarray);
1145 file->private_data = ctr;
1146 return 0;
1147
1148err2:
1149 kfree(pidarray);
1150err1:
1151 kfree(ctr);
1152err0:
1153 return -ENOMEM;
1154}
1155
1156static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1157 size_t nbytes, loff_t *ppos)
1158{
1159 struct ctr_struct *ctr = file->private_data;
1160
1161 if (*ppos + nbytes > ctr->bufsz)
1162 nbytes = ctr->bufsz - *ppos;
1163 if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
1164 return -EFAULT;
1165 *ppos += nbytes;
1166 return nbytes;
1167}
1168
1169static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
1170{
1171 struct ctr_struct *ctr;
1172
1173 if (file->f_mode & FMODE_READ) {
1174 ctr = file->private_data;
1175 kfree(ctr->buf);
1176 kfree(ctr);
1177 }
1178 return 0;
1179}
1180
1181/*
1182 * for the common functions, 'private' gives the type of file
1183 */
1184
1185static struct cftype cft_tasks = {
1186 .name = "tasks",
1187 .open = cpuset_tasks_open,
1188 .read = cpuset_tasks_read,
1189 .release = cpuset_tasks_release,
1190 .private = FILE_TASKLIST,
1191};
1192
1193static struct cftype cft_cpus = {
1194 .name = "cpus",
1195 .private = FILE_CPULIST,
1196};
1197
1198static struct cftype cft_mems = {
1199 .name = "mems",
1200 .private = FILE_MEMLIST,
1201};
1202
1203static struct cftype cft_cpu_exclusive = {
1204 .name = "cpu_exclusive",
1205 .private = FILE_CPU_EXCLUSIVE,
1206};
1207
1208static struct cftype cft_mem_exclusive = {
1209 .name = "mem_exclusive",
1210 .private = FILE_MEM_EXCLUSIVE,
1211};
1212
1213static struct cftype cft_notify_on_release = {
1214 .name = "notify_on_release",
1215 .private = FILE_NOTIFY_ON_RELEASE,
1216};
1217
1218static int cpuset_populate_dir(struct dentry *cs_dentry)
1219{
1220 int err;
1221
1222 if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0)
1223 return err;
1224 if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
1225 return err;
1226 if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
1227 return err;
1228 if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
1229 return err;
1230 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
1231 return err;
1232 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1233 return err;
1234 return 0;
1235}
1236
1237/*
1238 * cpuset_create - create a cpuset
1239 * parent: cpuset that will be parent of the new cpuset.
1240 * name: name of the new cpuset. Will be strcpy'ed.
1241 * mode: mode to set on new inode
1242 *
1243 * Must be called with the semaphore on the parent inode held
1244 */
1245
1246static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1247{
1248 struct cpuset *cs;
1249 int err;
1250
1251 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1252 if (!cs)
1253 return -ENOMEM;
1254
1255 down(&cpuset_sem);
1256 refresh_mems();
1257 cs->flags = 0;
1258 if (notify_on_release(parent))
1259 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1260 cs->cpus_allowed = CPU_MASK_NONE;
1261 cs->mems_allowed = NODE_MASK_NONE;
1262 atomic_set(&cs->count, 0);
1263 INIT_LIST_HEAD(&cs->sibling);
1264 INIT_LIST_HEAD(&cs->children);
1265 atomic_inc(&cpuset_mems_generation);
1266 cs->mems_generation = atomic_read(&cpuset_mems_generation);
1267
1268 cs->parent = parent;
1269
1270 list_add(&cs->sibling, &cs->parent->children);
1271
1272 err = cpuset_create_dir(cs, name, mode);
1273 if (err < 0)
1274 goto err;
1275
1276 /*
1277 * Release cpuset_sem before cpuset_populate_dir() because it
1278 * will down() this new directory's i_sem and if we race with
1279 * another mkdir, we might deadlock.
1280 */
1281 up(&cpuset_sem);
1282
1283 err = cpuset_populate_dir(cs->dentry);
1284 /* If err < 0, we have a half-filled directory - oh well ;) */
1285 return 0;
1286err:
1287 list_del(&cs->sibling);
1288 up(&cpuset_sem);
1289 kfree(cs);
1290 return err;
1291}
1292
1293static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1294{
1295 struct cpuset *c_parent = dentry->d_parent->d_fsdata;
1296
1297 /* the vfs holds inode->i_sem already */
1298 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
1299}
1300
1301static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1302{
1303 struct cpuset *cs = dentry->d_fsdata;
1304 struct dentry *d;
1305 struct cpuset *parent;
1306
1307	/* the vfs already holds both the parent's and this dentry's inode->i_sem */
1308
1309 down(&cpuset_sem);
1310 refresh_mems();
1311 if (atomic_read(&cs->count) > 0) {
1312 up(&cpuset_sem);
1313 return -EBUSY;
1314 }
1315 if (!list_empty(&cs->children)) {
1316 up(&cpuset_sem);
1317 return -EBUSY;
1318 }
1319 spin_lock(&cs->dentry->d_lock);
1320 parent = cs->parent;
1321 set_bit(CS_REMOVED, &cs->flags);
1322 list_del(&cs->sibling); /* delete my sibling from parent->children */
1323 if (list_empty(&parent->children))
1324 check_for_release(parent);
1325 d = dget(cs->dentry);
1326 cs->dentry = NULL;
1327 spin_unlock(&d->d_lock);
1328 cpuset_d_remove_dir(d);
1329 dput(d);
1330 up(&cpuset_sem);
1331 return 0;
1332}
1333
1334/**
1335 * cpuset_init - initialize cpusets at system boot
1336 *
1337 * Description: Initialize top_cpuset and the cpuset internal file system.
1338 **/
1339
1340int __init cpuset_init(void)
1341{
1342 struct dentry *root;
1343 int err;
1344
1345 top_cpuset.cpus_allowed = CPU_MASK_ALL;
1346 top_cpuset.mems_allowed = NODE_MASK_ALL;
1347
1348 atomic_inc(&cpuset_mems_generation);
1349 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
1350
1351 init_task.cpuset = &top_cpuset;
1352
1353 err = register_filesystem(&cpuset_fs_type);
1354 if (err < 0)
1355 goto out;
1356 cpuset_mount = kern_mount(&cpuset_fs_type);
1357 if (IS_ERR(cpuset_mount)) {
1358 printk(KERN_ERR "cpuset: could not mount!\n");
1359 err = PTR_ERR(cpuset_mount);
1360 cpuset_mount = NULL;
1361 goto out;
1362 }
1363 root = cpuset_mount->mnt_sb->s_root;
1364 root->d_fsdata = &top_cpuset;
1365 root->d_inode->i_nlink++;
1366 top_cpuset.dentry = root;
1367 root->d_inode->i_op = &cpuset_dir_inode_operations;
1368 err = cpuset_populate_dir(root);
1369out:
1370 return err;
1371}
1372
1373/**
1374 * cpuset_init_smp - initialize cpus_allowed
1375 *
1376 * Description: Finish top cpuset after cpu, node maps are initialized
1377 **/
1378
1379void __init cpuset_init_smp(void)
1380{
1381 top_cpuset.cpus_allowed = cpu_online_map;
1382 top_cpuset.mems_allowed = node_online_map;
1383}
1384
1385/**
1386 * cpuset_fork - attach newly forked task to its parent's cpuset.
1387 * @tsk: pointer to task_struct of forking parent process.
1388 *
1389 * Description: By default, on fork, a task inherits its
1390 * parent's cpuset. The pointer to the shared cpuset is
1391 * automatically copied in fork.c by dup_task_struct().
1392 * This cpuset_fork() routine need only increment the usage
1393 * counter in that cpuset.
1394 **/
1395
1396void cpuset_fork(struct task_struct *tsk)
1397{
1398 atomic_inc(&tsk->cpuset->count);
1399}
1400
1401/**
1402 * cpuset_exit - detach cpuset from exiting task
1403 * @tsk: pointer to task_struct of exiting process
1404 *
1405 * Description: Detach cpuset from @tsk and release it.
1406 *
1407 **/
1408
1409void cpuset_exit(struct task_struct *tsk)
1410{
1411 struct cpuset *cs;
1412
1413 task_lock(tsk);
1414 cs = tsk->cpuset;
1415 tsk->cpuset = NULL;
1416 task_unlock(tsk);
1417
1418 if (atomic_dec_and_test(&cs->count)) {
1419 down(&cpuset_sem);
1420 check_for_release(cs);
1421 up(&cpuset_sem);
1422 }
1423}
1424
1425/**
1426 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
1427 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1428 *
1429 * Description: Returns the cpumask_t cpus_allowed of the cpuset
1430 * attached to the specified @tsk. Guaranteed to return some non-empty
1431 * subset of cpu_online_map, even if this means going outside the
1432 * tasks cpuset.
1433 **/
1434
1435const cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1436{
1437 cpumask_t mask;
1438
1439 down(&cpuset_sem);
1440 task_lock((struct task_struct *)tsk);
1441 guarantee_online_cpus(tsk->cpuset, &mask);
1442 task_unlock((struct task_struct *)tsk);
1443 up(&cpuset_sem);
1444
1445 return mask;
1446}
1447
1448void cpuset_init_current_mems_allowed(void)
1449{
1450 current->mems_allowed = NODE_MASK_ALL;
1451}
1452
1453/*
1454 * If the current task's cpuset's mems_allowed changed behind our backs,
1455 * update current->mems_allowed and mems_generation to the new value.
1456 * Do not call this routine if in_interrupt().
1457 */
1458
1459void cpuset_update_current_mems_allowed(void)
1460{
1461 struct cpuset *cs = current->cpuset;
1462
1463 if (!cs)
1464 return; /* task is exiting */
1465 if (current->cpuset_mems_generation != cs->mems_generation) {
1466 down(&cpuset_sem);
1467 refresh_mems();
1468 up(&cpuset_sem);
1469 }
1470}
1471
1472void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
1473{
1474 bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
1475 MAX_NUMNODES);
1476}
1477
1478/*
1479 * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
1480 */
1481int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1482{
1483 int i;
1484
1485 for (i = 0; zl->zones[i]; i++) {
1486 int nid = zl->zones[i]->zone_pgdat->node_id;
1487
1488 if (node_isset(nid, current->mems_allowed))
1489 return 1;
1490 }
1491 return 0;
1492}
1493
1494/*
1495 * Is 'current' valid, and is zone z allowed in current->mems_allowed?
1496 */
1497int cpuset_zone_allowed(struct zone *z)
1498{
1499 return in_interrupt() ||
1500 node_isset(z->zone_pgdat->node_id, current->mems_allowed);
1501}
1502
1503/*
1504 * proc_cpuset_show()
1505 * - Print the task's cpuset path into seq_file.
1506 * - Used for /proc/<pid>/cpuset.
1507 */
1508
1509static int proc_cpuset_show(struct seq_file *m, void *v)
1510{
1511 struct cpuset *cs;
1512 struct task_struct *tsk;
1513 char *buf;
1514 int retval = 0;
1515
1516 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
1517 if (!buf)
1518 return -ENOMEM;
1519
1520 tsk = m->private;
1521 down(&cpuset_sem);
1522 task_lock(tsk);
1523 cs = tsk->cpuset;
1524 task_unlock(tsk);
1525 if (!cs) {
1526 retval = -EINVAL;
1527 goto out;
1528 }
1529
1530 retval = cpuset_path(cs, buf, PAGE_SIZE);
1531 if (retval < 0)
1532 goto out;
1533 seq_puts(m, buf);
1534 seq_putc(m, '\n');
1535out:
1536 up(&cpuset_sem);
1537 kfree(buf);
1538 return retval;
1539}
1540
1541static int cpuset_open(struct inode *inode, struct file *file)
1542{
1543 struct task_struct *tsk = PROC_I(inode)->task;
1544 return single_open(file, proc_cpuset_show, tsk);
1545}
1546
1547struct file_operations proc_cpuset_operations = {
1548 .open = cpuset_open,
1549 .read = seq_read,
1550 .llseek = seq_lseek,
1551 .release = single_release,
1552};
1553
1554/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
1555char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
1556{
1557 buffer += sprintf(buffer, "Cpus_allowed:\t");
1558 buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed);
1559 buffer += sprintf(buffer, "\n");
1560 buffer += sprintf(buffer, "Mems_allowed:\t");
1561 buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed);
1562 buffer += sprintf(buffer, "\n");
1563 return buffer;
1564}
diff --git a/kernel/dma.c b/kernel/dma.c
new file mode 100644
index 000000000000..aef0a45b7893
--- /dev/null
+++ b/kernel/dma.c
@@ -0,0 +1,158 @@
1/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $
2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
3 *
4 * Written by Hennus Bergman, 1992.
5 *
6 * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
7 * In the previous version the reported device could end up being wrong,
8 * if a device requested a DMA channel that was already in use.
9 * [It also happened to remove the sizeof(char *) == sizeof(int)
10 * assumption introduced because of those /proc/dma patches. -- Hennus]
11 */
12#include <linux/module.h>
13#include <linux/kernel.h>
14#include <linux/errno.h>
15#include <linux/spinlock.h>
16#include <linux/string.h>
17#include <linux/seq_file.h>
18#include <linux/proc_fs.h>
19#include <linux/init.h>
20#include <asm/dma.h>
21#include <asm/system.h>
22
23
24
25/* A note on resource allocation:
26 *
27 * All drivers needing DMA channels, should allocate and release them
28 * through the public routines `request_dma()' and `free_dma()'.
29 *
30 * In order to avoid problems, all processes should allocate resources in
31 * the same sequence and release them in the reverse order.
32 *
33 * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
34 * When releasing them, first release the DMA, then release the IRQ.
35 * If you don't, you may cause allocation requests to fail unnecessarily.
36 * This doesn't really matter now, but it will once we get real semaphores
37 * in the kernel.
38 */
39
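/*
 * Illustrative sketch (not from the kernel tree) of the ordering advice
 * above for a hypothetical ISA-style driver: IRQ before DMA on setup,
 * DMA before IRQ on teardown.  MYDEV_IRQ, MYDEV_DMA and the "mydev"
 * names are made up.
 */
#include <linux/interrupt.h>
#include <asm/dma.h>

#define MYDEV_IRQ	5
#define MYDEV_DMA	1

static irqreturn_t mydev_interrupt(int irq, void *dev_id,
				   struct pt_regs *regs)
{
	return IRQ_HANDLED;
}

static int mydev_open(void)
{
	int err;

	err = request_irq(MYDEV_IRQ, mydev_interrupt, 0, "mydev", NULL);
	if (err)
		return err;
	err = request_dma(MYDEV_DMA, "mydev");	/* DMA after the IRQ */
	if (err) {
		free_irq(MYDEV_IRQ, NULL);
		return err;
	}
	return 0;
}

static void mydev_release(void)
{
	free_dma(MYDEV_DMA);			/* DMA before the IRQ */
	free_irq(MYDEV_IRQ, NULL);
}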
40
41DEFINE_SPINLOCK(dma_spin_lock);
42
43/*
44 * If our port doesn't define this it has no PC like DMA
45 */
46
47#ifdef MAX_DMA_CHANNELS
48
49
50/* Channel n is busy iff dma_chan_busy[n].lock != 0.
51 * DMA0 used to be reserved for DRAM refresh, but apparently not any more...
52 * DMA4 is reserved for cascading.
53 */
54
55struct dma_chan {
56 int lock;
57 const char *device_id;
58};
59
60static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
61 [4] = { 1, "cascade" },
62};
63
64
65int request_dma(unsigned int dmanr, const char * device_id)
66{
67 if (dmanr >= MAX_DMA_CHANNELS)
68 return -EINVAL;
69
70 if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
71 return -EBUSY;
72
73 dma_chan_busy[dmanr].device_id = device_id;
74
75 /* old flag was 0, now contains 1 to indicate busy */
76 return 0;
77} /* request_dma */
78
79
80void free_dma(unsigned int dmanr)
81{
82 if (dmanr >= MAX_DMA_CHANNELS) {
83 printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
84 return;
85 }
86
87 if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
88 printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
89 return;
90 }
91
92} /* free_dma */
93
94#else
95
96int request_dma(unsigned int dmanr, const char *device_id)
97{
98 return -EINVAL;
99}
100
101void free_dma(unsigned int dmanr)
102{
103}
104
105#endif
106
107#ifdef CONFIG_PROC_FS
108
109#ifdef MAX_DMA_CHANNELS
110static int proc_dma_show(struct seq_file *m, void *v)
111{
112 int i;
113
114 for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
115 if (dma_chan_busy[i].lock) {
116 seq_printf(m, "%2d: %s\n", i,
117 dma_chan_busy[i].device_id);
118 }
119 }
120 return 0;
121}
122#else
123static int proc_dma_show(struct seq_file *m, void *v)
124{
125 seq_puts(m, "No DMA\n");
126 return 0;
127}
128#endif /* MAX_DMA_CHANNELS */
129
130static int proc_dma_open(struct inode *inode, struct file *file)
131{
132 return single_open(file, proc_dma_show, NULL);
133}
134
135static struct file_operations proc_dma_operations = {
136 .open = proc_dma_open,
137 .read = seq_read,
138 .llseek = seq_lseek,
139 .release = single_release,
140};
141
142static int __init proc_dma_init(void)
143{
144 struct proc_dir_entry *e;
145
146 e = create_proc_entry("dma", 0, NULL);
147 if (e)
148 e->proc_fops = &proc_dma_operations;
149
150 return 0;
151}
152
153__initcall(proc_dma_init);
154#endif
155
156EXPORT_SYMBOL(request_dma);
157EXPORT_SYMBOL(free_dma);
158EXPORT_SYMBOL(dma_spin_lock);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
new file mode 100644
index 000000000000..867d6dbeb574
--- /dev/null
+++ b/kernel/exec_domain.c
@@ -0,0 +1,209 @@
1/*
2 * Handling of different ABIs (personalities).
3 *
4 * We group personalities into execution domains which have their
5 * own handlers for kernel entry points, signal mapping, etc...
6 *
7 * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
8 */
9
10#include <linux/config.h>
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/kmod.h>
14#include <linux/module.h>
15#include <linux/personality.h>
16#include <linux/sched.h>
17#include <linux/syscalls.h>
18#include <linux/sysctl.h>
19#include <linux/types.h>
20
21
22static void default_handler(int, struct pt_regs *);
23
24static struct exec_domain *exec_domains = &default_exec_domain;
25static DEFINE_RWLOCK(exec_domains_lock);
26
27
28static u_long ident_map[32] = {
29 0, 1, 2, 3, 4, 5, 6, 7,
30 8, 9, 10, 11, 12, 13, 14, 15,
31 16, 17, 18, 19, 20, 21, 22, 23,
32 24, 25, 26, 27, 28, 29, 30, 31
33};
34
35struct exec_domain default_exec_domain = {
36 .name = "Linux", /* name */
37 .handler = default_handler, /* lcall7 causes a seg fault. */
38 .pers_low = 0, /* PER_LINUX personality. */
39 .pers_high = 0, /* PER_LINUX personality. */
40 .signal_map = ident_map, /* Identity map signals. */
41 .signal_invmap = ident_map, /* - both ways. */
42};
43
44
45static void
46default_handler(int segment, struct pt_regs *regp)
47{
48 set_personality(0);
49
50 if (current_thread_info()->exec_domain->handler != default_handler)
51 current_thread_info()->exec_domain->handler(segment, regp);
52 else
53 send_sig(SIGSEGV, current, 1);
54}
55
56static struct exec_domain *
57lookup_exec_domain(u_long personality)
58{
59 struct exec_domain * ep;
60 u_long pers = personality(personality);
61
62 read_lock(&exec_domains_lock);
63 for (ep = exec_domains; ep; ep = ep->next) {
64 if (pers >= ep->pers_low && pers <= ep->pers_high)
65 if (try_module_get(ep->module))
66 goto out;
67 }
68
69#ifdef CONFIG_KMOD
70 read_unlock(&exec_domains_lock);
71 request_module("personality-%ld", pers);
72 read_lock(&exec_domains_lock);
73
74 for (ep = exec_domains; ep; ep = ep->next) {
75 if (pers >= ep->pers_low && pers <= ep->pers_high)
76 if (try_module_get(ep->module))
77 goto out;
78 }
79#endif
80
81 ep = &default_exec_domain;
82out:
83 read_unlock(&exec_domains_lock);
84 return (ep);
85}
86
87int
88register_exec_domain(struct exec_domain *ep)
89{
90 struct exec_domain *tmp;
91 int err = -EBUSY;
92
93 if (ep == NULL)
94 return -EINVAL;
95
96 if (ep->next != NULL)
97 return -EBUSY;
98
99 write_lock(&exec_domains_lock);
100 for (tmp = exec_domains; tmp; tmp = tmp->next) {
101 if (tmp == ep)
102 goto out;
103 }
104
105 ep->next = exec_domains;
106 exec_domains = ep;
107 err = 0;
108
109out:
110 write_unlock(&exec_domains_lock);
111 return (err);
112}
113
114int
115unregister_exec_domain(struct exec_domain *ep)
116{
117 struct exec_domain **epp;
118
119 epp = &exec_domains;
120 write_lock(&exec_domains_lock);
121 for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
122 if (ep == *epp)
123 goto unregister;
124 }
125 write_unlock(&exec_domains_lock);
126 return -EINVAL;
127
128unregister:
129 *epp = ep->next;
130 ep->next = NULL;
131 write_unlock(&exec_domains_lock);
132 return 0;
133}
134
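/*
 * Illustrative sketch (not from the kernel tree): how a personality
 * module might register itself with the interface above.  The handler,
 * the 0x10 personality range and the "my_" names are hypothetical;
 * signal_map/signal_invmap reuse an identity table like ident_map above.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/sched.h>

static u_long my_ident_map[32] = {
	0, 1, 2, 3, 4, 5, 6, 7,
	8, 9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23,
	24, 25, 26, 27, 28, 29, 30, 31
};

static void my_lcall7_handler(int segment, struct pt_regs *regp)
{
	send_sig(SIGSEGV, current, 1);	/* no lcall7 support either */
}

static struct exec_domain my_exec_domain = {
	.name		= "MyABI",
	.handler	= my_lcall7_handler,
	.pers_low	= 0x10,		/* hypothetical personality range */
	.pers_high	= 0x10,
	.signal_map	= my_ident_map,
	.signal_invmap	= my_ident_map,
	.module		= THIS_MODULE,
};

static int __init my_abi_init(void)
{
	return register_exec_domain(&my_exec_domain);
}

static void __exit my_abi_exit(void)
{
	unregister_exec_domain(&my_exec_domain);
}

module_init(my_abi_init);
module_exit(my_abi_exit);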
135int
136__set_personality(u_long personality)
137{
138 struct exec_domain *ep, *oep;
139
140 ep = lookup_exec_domain(personality);
141 if (ep == current_thread_info()->exec_domain) {
142 current->personality = personality;
143 return 0;
144 }
145
146 if (atomic_read(&current->fs->count) != 1) {
147 struct fs_struct *fsp, *ofsp;
148
149 fsp = copy_fs_struct(current->fs);
150 if (fsp == NULL) {
151 module_put(ep->module);
152 return -ENOMEM;
153 }
154
155 task_lock(current);
156 ofsp = current->fs;
157 current->fs = fsp;
158 task_unlock(current);
159
160 put_fs_struct(ofsp);
161 }
162
163 /*
164 * At that point we are guaranteed to be the sole owner of
165 * current->fs.
166 */
167
168 current->personality = personality;
169 oep = current_thread_info()->exec_domain;
170 current_thread_info()->exec_domain = ep;
171 set_fs_altroot();
172
173 module_put(oep->module);
174 return 0;
175}
176
177int
178get_exec_domain_list(char *page)
179{
180 struct exec_domain *ep;
181 int len = 0;
182
183 read_lock(&exec_domains_lock);
184 for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next)
185 len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n",
186 ep->pers_low, ep->pers_high, ep->name,
187 module_name(ep->module));
188 read_unlock(&exec_domains_lock);
189 return (len);
190}
191
192asmlinkage long
193sys_personality(u_long personality)
194{
195 u_long old = current->personality;
196
197 if (personality != 0xffffffff) {
198 set_personality(personality);
199 if (current->personality != personality)
200 return -EINVAL;
201 }
202
203 return (long)old;
204}
205
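/*
 * Illustrative sketch (userspace, not kernel code): because an argument
 * of 0xffffffff skips the assignment above, a process can query its
 * current personality without changing it through glibc's
 * personality(2) wrapper.
 */
#include <stdio.h>
#include <sys/personality.h>

int main(void)
{
	int cur = personality(0xffffffff);	/* query only */

	printf("current personality: %#x\n", (unsigned int)cur);
	return 0;
}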
206
207EXPORT_SYMBOL(register_exec_domain);
208EXPORT_SYMBOL(unregister_exec_domain);
209EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
new file mode 100644
index 000000000000..6dd4ebe1dd90
--- /dev/null
+++ b/kernel/exit.c
@@ -0,0 +1,1527 @@
1/*
2 * linux/kernel/exit.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7#include <linux/config.h>
8#include <linux/mm.h>
9#include <linux/slab.h>
10#include <linux/interrupt.h>
11#include <linux/smp_lock.h>
12#include <linux/module.h>
13#include <linux/completion.h>
14#include <linux/personality.h>
15#include <linux/tty.h>
16#include <linux/namespace.h>
17#include <linux/key.h>
18#include <linux/security.h>
19#include <linux/cpu.h>
20#include <linux/acct.h>
21#include <linux/file.h>
22#include <linux/binfmts.h>
23#include <linux/ptrace.h>
24#include <linux/profile.h>
25#include <linux/mount.h>
26#include <linux/proc_fs.h>
27#include <linux/mempolicy.h>
28#include <linux/cpuset.h>
29#include <linux/syscalls.h>
30
31#include <asm/uaccess.h>
32#include <asm/unistd.h>
33#include <asm/pgtable.h>
34#include <asm/mmu_context.h>
35
36extern void sem_exit (void);
37extern struct task_struct *child_reaper;
38
39int getrusage(struct task_struct *, int, struct rusage __user *);
40
41static void __unhash_process(struct task_struct *p)
42{
43 nr_threads--;
44 detach_pid(p, PIDTYPE_PID);
45 detach_pid(p, PIDTYPE_TGID);
46 if (thread_group_leader(p)) {
47 detach_pid(p, PIDTYPE_PGID);
48 detach_pid(p, PIDTYPE_SID);
49 if (p->pid)
50 __get_cpu_var(process_counts)--;
51 }
52
53 REMOVE_LINKS(p);
54}
55
56void release_task(struct task_struct * p)
57{
58 int zap_leader;
59 task_t *leader;
60 struct dentry *proc_dentry;
61
62repeat:
63 atomic_dec(&p->user->processes);
64 spin_lock(&p->proc_lock);
65 proc_dentry = proc_pid_unhash(p);
66 write_lock_irq(&tasklist_lock);
67 if (unlikely(p->ptrace))
68 __ptrace_unlink(p);
69 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
70 __exit_signal(p);
71 __exit_sighand(p);
72 __unhash_process(p);
73
74 /*
75 * If we are the last non-leader member of the thread
76	 * group, and the leader is a zombie, then notify the
77	 * group leader's parent process (if it wants notification).
78 */
79 zap_leader = 0;
80 leader = p->group_leader;
81 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
82 BUG_ON(leader->exit_signal == -1);
83 do_notify_parent(leader, leader->exit_signal);
84 /*
85 * If we were the last child thread and the leader has
86 * exited already, and the leader's parent ignores SIGCHLD,
87 * then we are the one who should release the leader.
88 *
89 * do_notify_parent() will have marked it self-reaping in
90 * that case.
91 */
92 zap_leader = (leader->exit_signal == -1);
93 }
94
95 sched_exit(p);
96 write_unlock_irq(&tasklist_lock);
97 spin_unlock(&p->proc_lock);
98 proc_pid_flush(proc_dentry);
99 release_thread(p);
100 put_task_struct(p);
101
102 p = leader;
103 if (unlikely(zap_leader))
104 goto repeat;
105}
106
107/* we are using it only for SMP init */
108
109void unhash_process(struct task_struct *p)
110{
111 struct dentry *proc_dentry;
112
113 spin_lock(&p->proc_lock);
114 proc_dentry = proc_pid_unhash(p);
115 write_lock_irq(&tasklist_lock);
116 __unhash_process(p);
117 write_unlock_irq(&tasklist_lock);
118 spin_unlock(&p->proc_lock);
119 proc_pid_flush(proc_dentry);
120}
121
122/*
123 * This checks not only the pgrp, but falls back on the pid if no
124 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
125 * without this...
126 */
127int session_of_pgrp(int pgrp)
128{
129 struct task_struct *p;
130 int sid = -1;
131
132 read_lock(&tasklist_lock);
133 do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
134 if (p->signal->session > 0) {
135 sid = p->signal->session;
136 goto out;
137 }
138 } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
139 p = find_task_by_pid(pgrp);
140 if (p)
141 sid = p->signal->session;
142out:
143 read_unlock(&tasklist_lock);
144
145 return sid;
146}
147
148/*
149 * Determine if a process group is "orphaned", according to the POSIX
150 * definition in 2.2.2.52. Orphaned process groups are not to be affected
151 * by terminal-generated stop signals. Newly orphaned process groups are
152 * to receive a SIGHUP and a SIGCONT.
153 *
154 * "I ask you, have you ever known what it is to be an orphan?"
155 */
156static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
157{
158 struct task_struct *p;
159 int ret = 1;
160
161 do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
162 if (p == ignored_task
163 || p->exit_state
164 || p->real_parent->pid == 1)
165 continue;
166 if (process_group(p->real_parent) != pgrp
167 && p->real_parent->signal->session == p->signal->session) {
168 ret = 0;
169 break;
170 }
171 } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
172 return ret; /* (sighing) "Often!" */
173}
174
175int is_orphaned_pgrp(int pgrp)
176{
177 int retval;
178
179 read_lock(&tasklist_lock);
180 retval = will_become_orphaned_pgrp(pgrp, NULL);
181 read_unlock(&tasklist_lock);
182
183 return retval;
184}
185
186static inline int has_stopped_jobs(int pgrp)
187{
188 int retval = 0;
189 struct task_struct *p;
190
191 do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
192 if (p->state != TASK_STOPPED)
193 continue;
194
195 /* If p is stopped by a debugger on a signal that won't
196 stop it, then don't count p as stopped. This isn't
197 perfect but it's a good approximation. */
198 if (unlikely (p->ptrace)
199 && p->exit_code != SIGSTOP
200 && p->exit_code != SIGTSTP
201 && p->exit_code != SIGTTOU
202 && p->exit_code != SIGTTIN)
203 continue;
204
205 retval = 1;
206 break;
207 } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
208 return retval;
209}
210
211/**
212 * reparent_to_init() - Reparent the calling kernel thread to the init task.
213 *
214 * If a kernel thread is launched as a result of a system call, or if
215 * it ever exits, it should generally reparent itself to init so that
216 * it is correctly cleaned up on exit.
217 *
218 * Various pieces of task state, such as scheduling policy and priority, may
219 * have been inherited from a user process, so we reset them to sane values here.
220 *
221 * NOTE that reparent_to_init() gives the caller full capabilities.
222 */
223void reparent_to_init(void)
224{
225 write_lock_irq(&tasklist_lock);
226
227 ptrace_unlink(current);
228 /* Reparent to init */
229 REMOVE_LINKS(current);
230 current->parent = child_reaper;
231 current->real_parent = child_reaper;
232 SET_LINKS(current);
233
234 /* Set the exit signal to SIGCHLD so we signal init on exit */
235 current->exit_signal = SIGCHLD;
236
237 if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
238 set_user_nice(current, 0);
239 /* cpus_allowed? */
240 /* rt_priority? */
241 /* signals? */
242 security_task_reparent_to_init(current);
243 memcpy(current->signal->rlim, init_task.signal->rlim,
244 sizeof(current->signal->rlim));
245 atomic_inc(&(INIT_USER->__count));
246 write_unlock_irq(&tasklist_lock);
247 switch_uid(INIT_USER);
248}
249
250void __set_special_pids(pid_t session, pid_t pgrp)
251{
252 struct task_struct *curr = current;
253
254 if (curr->signal->session != session) {
255 detach_pid(curr, PIDTYPE_SID);
256 curr->signal->session = session;
257 attach_pid(curr, PIDTYPE_SID, session);
258 }
259 if (process_group(curr) != pgrp) {
260 detach_pid(curr, PIDTYPE_PGID);
261 curr->signal->pgrp = pgrp;
262 attach_pid(curr, PIDTYPE_PGID, pgrp);
263 }
264}
265
266void set_special_pids(pid_t session, pid_t pgrp)
267{
268 write_lock_irq(&tasklist_lock);
269 __set_special_pids(session, pgrp);
270 write_unlock_irq(&tasklist_lock);
271}
272
273/*
274 * Let kernel threads use this to say that they
275 * allow a certain signal (since daemonize() will
276 * have disabled all of them by default).
277 */
278int allow_signal(int sig)
279{
280 if (sig < 1 || sig > _NSIG)
281 return -EINVAL;
282
283 spin_lock_irq(&current->sighand->siglock);
284 sigdelset(&current->blocked, sig);
285 if (!current->mm) {
286 /* Kernel threads handle their own signals.
287 Let the signal code know it'll be handled, so
288 that they don't get converted to SIGKILL or
289 just silently dropped */
290 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
291 }
292 recalc_sigpending();
293 spin_unlock_irq(&current->sighand->siglock);
294 return 0;
295}
296
297EXPORT_SYMBOL(allow_signal);
298
299int disallow_signal(int sig)
300{
301 if (sig < 1 || sig > _NSIG)
302 return -EINVAL;
303
304 spin_lock_irq(&current->sighand->siglock);
305 sigaddset(&current->blocked, sig);
306 recalc_sigpending();
307 spin_unlock_irq(&current->sighand->siglock);
308 return 0;
309}
310
311EXPORT_SYMBOL(disallow_signal);
312
313/*
314 * Put all the gunge required to become a kernel thread without
315 * attached user resources in one place where it belongs.
316 */
317
318void daemonize(const char *name, ...)
319{
320 va_list args;
321 struct fs_struct *fs;
322 sigset_t blocked;
323
324 va_start(args, name);
325 vsnprintf(current->comm, sizeof(current->comm), name, args);
326 va_end(args);
327
328 /*
329	 * If we were started as a result of loading a module, close all of the
330 * user space pages. We don't need them, and if we didn't close them
331 * they would be locked into memory.
332 */
333 exit_mm(current);
334
335 set_special_pids(1, 1);
336 down(&tty_sem);
337 current->signal->tty = NULL;
338 up(&tty_sem);
339
340 /* Block and flush all signals */
341 sigfillset(&blocked);
342 sigprocmask(SIG_BLOCK, &blocked, NULL);
343 flush_signals(current);
344
345 /* Become as one with the init task */
346
347 exit_fs(current); /* current->fs->count--; */
348 fs = init_task.fs;
349 current->fs = fs;
350 atomic_inc(&fs->count);
351 exit_files(current);
352 current->files = init_task.files;
353 atomic_inc(&current->files->count);
354
355 reparent_to_init();
356}
357
358EXPORT_SYMBOL(daemonize);
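/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the 2.6-era pattern a driver's kernel thread would follow with the
 * daemonize()/allow_signal() helpers above.  my_loop() is a hypothetical
 * thread function that a driver would start via kernel_thread().
 */
static int my_loop(void *unused)
{
	daemonize("my_loopd");		/* shed user-space resources */
	allow_signal(SIGKILL);		/* daemonize() blocked everything */

	while (!signal_pending(current)) {
		/* ... periodic work goes here ... */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);
	}
	return 0;
}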
359
360static inline void close_files(struct files_struct * files)
361{
362 int i, j;
363
364 j = 0;
365 for (;;) {
366 unsigned long set;
367 i = j * __NFDBITS;
368 if (i >= files->max_fdset || i >= files->max_fds)
369 break;
370 set = files->open_fds->fds_bits[j++];
371 while (set) {
372 if (set & 1) {
373 struct file * file = xchg(&files->fd[i], NULL);
374 if (file)
375 filp_close(file, files);
376 }
377 i++;
378 set >>= 1;
379 }
380 }
381}
382
383struct files_struct *get_files_struct(struct task_struct *task)
384{
385 struct files_struct *files;
386
387 task_lock(task);
388 files = task->files;
389 if (files)
390 atomic_inc(&files->count);
391 task_unlock(task);
392
393 return files;
394}
395
396void fastcall put_files_struct(struct files_struct *files)
397{
398 if (atomic_dec_and_test(&files->count)) {
399 close_files(files);
400 /*
401 * Free the fd and fdset arrays if we expanded them.
402 */
403 if (files->fd != &files->fd_array[0])
404 free_fd_array(files->fd, files->max_fds);
405 if (files->max_fdset > __FD_SETSIZE) {
406 free_fdset(files->open_fds, files->max_fdset);
407 free_fdset(files->close_on_exec, files->max_fdset);
408 }
409 kmem_cache_free(files_cachep, files);
410 }
411}
412
413EXPORT_SYMBOL(put_files_struct);
414
415static inline void __exit_files(struct task_struct *tsk)
416{
417 struct files_struct * files = tsk->files;
418
419 if (files) {
420 task_lock(tsk);
421 tsk->files = NULL;
422 task_unlock(tsk);
423 put_files_struct(files);
424 }
425}
426
427void exit_files(struct task_struct *tsk)
428{
429 __exit_files(tsk);
430}
431
432static inline void __put_fs_struct(struct fs_struct *fs)
433{
434 /* No need to hold fs->lock if we are killing it */
435 if (atomic_dec_and_test(&fs->count)) {
436 dput(fs->root);
437 mntput(fs->rootmnt);
438 dput(fs->pwd);
439 mntput(fs->pwdmnt);
440 if (fs->altroot) {
441 dput(fs->altroot);
442 mntput(fs->altrootmnt);
443 }
444 kmem_cache_free(fs_cachep, fs);
445 }
446}
447
448void put_fs_struct(struct fs_struct *fs)
449{
450 __put_fs_struct(fs);
451}
452
453static inline void __exit_fs(struct task_struct *tsk)
454{
455 struct fs_struct * fs = tsk->fs;
456
457 if (fs) {
458 task_lock(tsk);
459 tsk->fs = NULL;
460 task_unlock(tsk);
461 __put_fs_struct(fs);
462 }
463}
464
465void exit_fs(struct task_struct *tsk)
466{
467 __exit_fs(tsk);
468}
469
470EXPORT_SYMBOL_GPL(exit_fs);
471
472/*
473 * Turn us into a lazy TLB process if we
474 * aren't one already.
475 */
476void exit_mm(struct task_struct * tsk)
477{
478 struct mm_struct *mm = tsk->mm;
479
480 mm_release(tsk, mm);
481 if (!mm)
482 return;
483 /*
484 * Serialize with any possible pending coredump.
485 * We must hold mmap_sem around checking core_waiters
486 * and clearing tsk->mm. The core-inducing thread
487 * will increment core_waiters for each thread in the
488 * group with ->mm != NULL.
489 */
490 down_read(&mm->mmap_sem);
491 if (mm->core_waiters) {
492 up_read(&mm->mmap_sem);
493 down_write(&mm->mmap_sem);
494 if (!--mm->core_waiters)
495 complete(mm->core_startup_done);
496 up_write(&mm->mmap_sem);
497
498 wait_for_completion(&mm->core_done);
499 down_read(&mm->mmap_sem);
500 }
501 atomic_inc(&mm->mm_count);
502 if (mm != tsk->active_mm) BUG();
503 /* more a memory barrier than a real lock */
504 task_lock(tsk);
505 tsk->mm = NULL;
506 up_read(&mm->mmap_sem);
507 enter_lazy_tlb(mm, current);
508 task_unlock(tsk);
509 mmput(mm);
510}
511
512static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
513{
514 /*
515 * Make sure we're not reparenting to ourselves and that
516 * the parent is not a zombie.
517 */
518 BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE);
519 p->real_parent = reaper;
520 if (p->parent == p->real_parent)
521 BUG();
522}
523
524static inline void reparent_thread(task_t *p, task_t *father, int traced)
525{
526 /* We don't want people slaying init. */
527 if (p->exit_signal != -1)
528 p->exit_signal = SIGCHLD;
529
530 if (p->pdeath_signal)
531 /* We already hold the tasklist_lock here. */
532 group_send_sig_info(p->pdeath_signal, (void *) 0, p);
533
534 /* Move the child from its dying parent to the new one. */
535 if (unlikely(traced)) {
536 /* Preserve ptrace links if someone else is tracing this child. */
537 list_del_init(&p->ptrace_list);
538 if (p->parent != p->real_parent)
539 list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
540 } else {
541 /* If this child is being traced, then we're the one tracing it
542 * anyway, so let go of it.
543 */
544 p->ptrace = 0;
545 list_del_init(&p->sibling);
546 p->parent = p->real_parent;
547 list_add_tail(&p->sibling, &p->parent->children);
548
549 /* If we'd notified the old parent about this child's death,
550 * also notify the new parent.
551 */
552 if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
553 thread_group_empty(p))
554 do_notify_parent(p, p->exit_signal);
555 else if (p->state == TASK_TRACED) {
556 /*
557 * If it was at a trace stop, turn it into
558 * a normal stop since it's no longer being
559 * traced.
560 */
561 ptrace_untrace(p);
562 }
563 }
564
565 /*
566 * process group orphan check
567 * Case ii: Our child is in a different pgrp
568 * than we are, and it was the only connection
569 * outside, so the child pgrp is now orphaned.
570 */
571 if ((process_group(p) != process_group(father)) &&
572 (p->signal->session == father->signal->session)) {
573 int pgrp = process_group(p);
574
575 if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
576 __kill_pg_info(SIGHUP, (void *)1, pgrp);
577 __kill_pg_info(SIGCONT, (void *)1, pgrp);
578 }
579 }
580}
581
582/*
583 * When we die, we re-parent all our children.
584 * Try to give them to another thread in our thread
585 * group, and if no such member exists, give it to
586 * the global child reaper process (ie "init")
587 */
588static inline void forget_original_parent(struct task_struct * father,
589 struct list_head *to_release)
590{
591 struct task_struct *p, *reaper = father;
592 struct list_head *_p, *_n;
593
594 do {
595 reaper = next_thread(reaper);
596 if (reaper == father) {
597 reaper = child_reaper;
598 break;
599 }
600 } while (reaper->exit_state);
601
602 /*
603 * There are only two places where our children can be:
604 *
605 * - in our child list
606 * - in our ptraced child list
607 *
608 * Search them and reparent children.
609 */
610 list_for_each_safe(_p, _n, &father->children) {
611 int ptrace;
612 p = list_entry(_p,struct task_struct,sibling);
613
614 ptrace = p->ptrace;
615
616 /* if father isn't the real parent, then ptrace must be enabled */
617 BUG_ON(father != p->real_parent && !ptrace);
618
619 if (father == p->real_parent) {
620			/* reparent to the reaper; the real father is us */
621 choose_new_parent(p, reaper, child_reaper);
622 reparent_thread(p, father, 0);
623 } else {
624 /* reparent ptraced task to its real parent */
625 __ptrace_unlink (p);
626 if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
627 thread_group_empty(p))
628 do_notify_parent(p, p->exit_signal);
629 }
630
631 /*
632 * if the ptraced child is a zombie with exit_signal == -1
633		 * we must collect it before we exit, or it will remain
634		 * a zombie forever, since we prevented it from reaping itself
635		 * while it was being traced by us, so that we could see it in wait4.
636 */
637 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
638 list_add(&p->ptrace_list, to_release);
639 }
640 list_for_each_safe(_p, _n, &father->ptrace_children) {
641 p = list_entry(_p,struct task_struct,ptrace_list);
642 choose_new_parent(p, reaper, child_reaper);
643 reparent_thread(p, father, 1);
644 }
645}
646
647/*
648 * Send signals to all our closest relatives so that they know
649 * to properly mourn us..
650 */
651static void exit_notify(struct task_struct *tsk)
652{
653 int state;
654 struct task_struct *t;
655 struct list_head ptrace_dead, *_p, *_n;
656
657 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
658 && !thread_group_empty(tsk)) {
659 /*
660 * This occurs when there was a race between our exit
661 * syscall and a group signal choosing us as the one to
662 * wake up. It could be that we are the only thread
663 * alerted to check for pending signals, but another thread
664 * should be woken now to take the signal since we will not.
665 * Now we'll wake all the threads in the group just to make
666 * sure someone gets all the pending signals.
667 */
668 read_lock(&tasklist_lock);
669 spin_lock_irq(&tsk->sighand->siglock);
670 for (t = next_thread(tsk); t != tsk; t = next_thread(t))
671 if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
672 recalc_sigpending_tsk(t);
673 if (signal_pending(t))
674 signal_wake_up(t, 0);
675 }
676 spin_unlock_irq(&tsk->sighand->siglock);
677 read_unlock(&tasklist_lock);
678 }
679
680 write_lock_irq(&tasklist_lock);
681
682 /*
683 * This does two things:
684 *
685 * A. Make init inherit all the child processes
686 * B. Check to see if any process groups have become orphaned
687 * as a result of our exiting, and if they have any stopped
688 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
689 */
690
691 INIT_LIST_HEAD(&ptrace_dead);
692 forget_original_parent(tsk, &ptrace_dead);
693 BUG_ON(!list_empty(&tsk->children));
694 BUG_ON(!list_empty(&tsk->ptrace_children));
695
696 /*
697 * Check to see if any process groups have become orphaned
698 * as a result of our exiting, and if they have any stopped
699 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
700 *
701 * Case i: Our father is in a different pgrp than we are
702 * and we were the only connection outside, so our pgrp
703 * is about to become orphaned.
704 */
705
706 t = tsk->real_parent;
707
708 if ((process_group(t) != process_group(tsk)) &&
709 (t->signal->session == tsk->signal->session) &&
710 will_become_orphaned_pgrp(process_group(tsk), tsk) &&
711 has_stopped_jobs(process_group(tsk))) {
712 __kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
713 __kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
714 }
715
716 /* Let father know we died
717 *
718 * Thread signals are configurable, but you aren't going to use
719	 * that to send signals to arbitrary processes.
720 * That stops right now.
721 *
722 * If the parent exec id doesn't match the exec id we saved
723 * when we started then we know the parent has changed security
724 * domain.
725 *
726	 * If our self_exec_id doesn't match our parent_exec_id then
727	 * we have changed execution domain, as these two values start
728	 * out the same after a fork.
729 *
730 */
731
732 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
733 ( tsk->parent_exec_id != t->self_exec_id ||
734 tsk->self_exec_id != tsk->parent_exec_id)
735 && !capable(CAP_KILL))
736 tsk->exit_signal = SIGCHLD;
737
738
739 /* If something other than our normal parent is ptracing us, then
740 * send it a SIGCHLD instead of honoring exit_signal. exit_signal
741 * only has special meaning to our real parent.
742 */
743 if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
744 int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
745 do_notify_parent(tsk, signal);
746 } else if (tsk->ptrace) {
747 do_notify_parent(tsk, SIGCHLD);
748 }
749
750 state = EXIT_ZOMBIE;
751 if (tsk->exit_signal == -1 &&
752 (likely(tsk->ptrace == 0) ||
753 unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT)))
754 state = EXIT_DEAD;
755 tsk->exit_state = state;
756
757 write_unlock_irq(&tasklist_lock);
758
759 list_for_each_safe(_p, _n, &ptrace_dead) {
760 list_del_init(_p);
761 t = list_entry(_p,struct task_struct,ptrace_list);
762 release_task(t);
763 }
764
765 /* If the process is dead, release it - nobody will wait for it */
766 if (state == EXIT_DEAD)
767 release_task(tsk);
768
769 /* PF_DEAD causes final put_task_struct after we schedule. */
770 preempt_disable();
771 tsk->flags |= PF_DEAD;
772}
773
774fastcall NORET_TYPE void do_exit(long code)
775{
776 struct task_struct *tsk = current;
777 int group_dead;
778
779 profile_task_exit(tsk);
780
781 if (unlikely(in_interrupt()))
782 panic("Aiee, killing interrupt handler!");
783 if (unlikely(!tsk->pid))
784 panic("Attempted to kill the idle task!");
785 if (unlikely(tsk->pid == 1))
786 panic("Attempted to kill init!");
787 if (tsk->io_context)
788 exit_io_context();
789
790 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
791 current->ptrace_message = code;
792 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
793 }
794
795 tsk->flags |= PF_EXITING;
796
797 /*
798 * Make sure we don't try to process any timer firings
799 * while we are already exiting.
800 */
801 tsk->it_virt_expires = cputime_zero;
802 tsk->it_prof_expires = cputime_zero;
803 tsk->it_sched_expires = 0;
804
805 if (unlikely(in_atomic()))
806 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
807 current->comm, current->pid,
808 preempt_count());
809
810 acct_update_integrals(tsk);
811 update_mem_hiwater(tsk);
812 group_dead = atomic_dec_and_test(&tsk->signal->live);
813 if (group_dead) {
814 del_timer_sync(&tsk->signal->real_timer);
815 acct_process(code);
816 }
817 exit_mm(tsk);
818
819 exit_sem(tsk);
820 __exit_files(tsk);
821 __exit_fs(tsk);
822 exit_namespace(tsk);
823 exit_thread();
824 cpuset_exit(tsk);
825 exit_keys(tsk);
826
827 if (group_dead && tsk->signal->leader)
828 disassociate_ctty(1);
829
830 module_put(tsk->thread_info->exec_domain->module);
831 if (tsk->binfmt)
832 module_put(tsk->binfmt->module);
833
834 tsk->exit_code = code;
835 exit_notify(tsk);
836#ifdef CONFIG_NUMA
837 mpol_free(tsk->mempolicy);
838 tsk->mempolicy = NULL;
839#endif
840
841 BUG_ON(!(current->flags & PF_DEAD));
842 schedule();
843 BUG();
844 /* Avoid "noreturn function does return". */
845 for (;;) ;
846}
847
848NORET_TYPE void complete_and_exit(struct completion *comp, long code)
849{
850 if (comp)
851 complete(comp);
852
853 do_exit(code);
854}
855
856EXPORT_SYMBOL(complete_and_exit);
857
858asmlinkage long sys_exit(int error_code)
859{
860 do_exit((error_code&0xff)<<8);
861}
862
863task_t fastcall *next_thread(const task_t *p)
864{
865 return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
866}
867
868EXPORT_SYMBOL(next_thread);
869
870/*
871 * Take down every thread in the group. This is called by fatal signals
872 * as well as by sys_exit_group (below).
873 */
874NORET_TYPE void
875do_group_exit(int exit_code)
876{
877 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
878
879 if (current->signal->flags & SIGNAL_GROUP_EXIT)
880 exit_code = current->signal->group_exit_code;
881 else if (!thread_group_empty(current)) {
882 struct signal_struct *const sig = current->signal;
883 struct sighand_struct *const sighand = current->sighand;
884 read_lock(&tasklist_lock);
885 spin_lock_irq(&sighand->siglock);
886 if (sig->flags & SIGNAL_GROUP_EXIT)
887 /* Another thread got here before we took the lock. */
888 exit_code = sig->group_exit_code;
889 else {
890 sig->flags = SIGNAL_GROUP_EXIT;
891 sig->group_exit_code = exit_code;
892 zap_other_threads(current);
893 }
894 spin_unlock_irq(&sighand->siglock);
895 read_unlock(&tasklist_lock);
896 }
897
898 do_exit(exit_code);
899 /* NOTREACHED */
900}
901
902/*
903 * this kills every thread in the thread group. Note that any externally
904 * wait4()-ing process will get the correct exit code - even if this
905 * thread is not the thread group leader.
906 */
907asmlinkage void sys_exit_group(int error_code)
908{
909 do_group_exit((error_code & 0xff) << 8);
910}
911
912static int eligible_child(pid_t pid, int options, task_t *p)
913{
914 if (pid > 0) {
915 if (p->pid != pid)
916 return 0;
917 } else if (!pid) {
918 if (process_group(p) != process_group(current))
919 return 0;
920 } else if (pid != -1) {
921 if (process_group(p) != -pid)
922 return 0;
923 }
924
925 /*
926 * Do not consider detached threads that are
927 * not ptraced:
928 */
929 if (p->exit_signal == -1 && !p->ptrace)
930 return 0;
931
932 /* Wait for all children (clone and not) if __WALL is set;
933 * otherwise, wait for clone children *only* if __WCLONE is
934 * set; otherwise, wait for non-clone children *only*. (Note:
935 * A "clone" child here is one that reports to its parent
936 * using a signal other than SIGCHLD.) */
937 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
938 && !(options & __WALL))
939 return 0;
940 /*
941 * Do not consider thread group leaders that are
942 * in a non-empty thread group:
943 */
944 if (current->tgid != p->tgid && delay_group_leader(p))
945 return 2;
946
947 if (security_task_wait(p))
948 return 0;
949
950 return 1;
951}
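/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * what the __WCLONE/__WALL test above means from userspace.  A child whose
 * exit signal is not SIGCHLD (a "clone" child) is skipped by a plain
 * waitpid() unless __WCLONE or __WALL is passed.  Assumes a GNU/Linux
 * toolchain that exposes __WALL from <sys/wait.h> under _GNU_SOURCE.
 */
#define _GNU_SOURCE
#include <sys/wait.h>
#include <stdio.h>

static void reap_any_child(pid_t pid)
{
	int status;

	if (waitpid(pid, &status, __WALL) == pid)	/* clone child or not */
		printf("child %d exited, status %#x\n", (int)pid, status);
	else
		perror("waitpid");
}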
952
953static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
954 int why, int status,
955 struct siginfo __user *infop,
956 struct rusage __user *rusagep)
957{
958 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
959 put_task_struct(p);
960 if (!retval)
961 retval = put_user(SIGCHLD, &infop->si_signo);
962 if (!retval)
963 retval = put_user(0, &infop->si_errno);
964 if (!retval)
965 retval = put_user((short)why, &infop->si_code);
966 if (!retval)
967 retval = put_user(pid, &infop->si_pid);
968 if (!retval)
969 retval = put_user(uid, &infop->si_uid);
970 if (!retval)
971 retval = put_user(status, &infop->si_status);
972 if (!retval)
973 retval = pid;
974 return retval;
975}
976
977/*
978 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
979 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
980 * the lock and this task is uninteresting. If we return nonzero, we have
981 * released the lock and the system call should return.
982 */
983static int wait_task_zombie(task_t *p, int noreap,
984 struct siginfo __user *infop,
985 int __user *stat_addr, struct rusage __user *ru)
986{
987 unsigned long state;
988 int retval;
989 int status;
990
991 if (unlikely(noreap)) {
992 pid_t pid = p->pid;
993 uid_t uid = p->uid;
994 int exit_code = p->exit_code;
995 int why, status;
996
997 if (unlikely(p->exit_state != EXIT_ZOMBIE))
998 return 0;
999 if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
1000 return 0;
1001 get_task_struct(p);
1002 read_unlock(&tasklist_lock);
1003 if ((exit_code & 0x7f) == 0) {
1004 why = CLD_EXITED;
1005 status = exit_code >> 8;
1006 } else {
1007 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1008 status = exit_code & 0x7f;
1009 }
1010 return wait_noreap_copyout(p, pid, uid, why,
1011 status, infop, ru);
1012 }
1013
1014 /*
1015 * Try to move the task's state to DEAD
1016 * only one thread is allowed to do this:
1017 */
1018 state = xchg(&p->exit_state, EXIT_DEAD);
1019 if (state != EXIT_ZOMBIE) {
1020 BUG_ON(state != EXIT_DEAD);
1021 return 0;
1022 }
1023 if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) {
1024 /*
1025 * This can only happen in a race with a ptraced thread
1026 * dying on another processor.
1027 */
1028 return 0;
1029 }
1030
1031 if (likely(p->real_parent == p->parent) && likely(p->signal)) {
1032 /*
1033 * The resource counters for the group leader are in its
1034 * own task_struct. Those for dead threads in the group
1035 * are in its signal_struct, as are those for the child
1036 * processes it has previously reaped. All these
1037 * accumulate in the parent's signal_struct c* fields.
1038 *
1039 * We don't bother to take a lock here to protect these
1040 * p->signal fields, because they are only touched by
1041 * __exit_signal, which runs with tasklist_lock
1042 * write-locked anyway, and so is excluded here. We do
1043 * need to protect the access to p->parent->signal fields,
1044 * as other threads in the parent group can be right
1045 * here reaping other children at the same time.
1046 */
1047 spin_lock_irq(&p->parent->sighand->siglock);
1048 p->parent->signal->cutime =
1049 cputime_add(p->parent->signal->cutime,
1050 cputime_add(p->utime,
1051 cputime_add(p->signal->utime,
1052 p->signal->cutime)));
1053 p->parent->signal->cstime =
1054 cputime_add(p->parent->signal->cstime,
1055 cputime_add(p->stime,
1056 cputime_add(p->signal->stime,
1057 p->signal->cstime)));
1058 p->parent->signal->cmin_flt +=
1059 p->min_flt + p->signal->min_flt + p->signal->cmin_flt;
1060 p->parent->signal->cmaj_flt +=
1061 p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt;
1062 p->parent->signal->cnvcsw +=
1063 p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw;
1064 p->parent->signal->cnivcsw +=
1065 p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw;
1066 spin_unlock_irq(&p->parent->sighand->siglock);
1067 }
1068
1069 /*
1070 * Now we are sure this task is interesting, and no other
1071 * thread can reap it because we set its state to EXIT_DEAD.
1072 */
1073 read_unlock(&tasklist_lock);
1074
1075 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1076 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1077 ? p->signal->group_exit_code : p->exit_code;
1078 if (!retval && stat_addr)
1079 retval = put_user(status, stat_addr);
1080 if (!retval && infop)
1081 retval = put_user(SIGCHLD, &infop->si_signo);
1082 if (!retval && infop)
1083 retval = put_user(0, &infop->si_errno);
1084 if (!retval && infop) {
1085 int why;
1086
1087 if ((status & 0x7f) == 0) {
1088 why = CLD_EXITED;
1089 status >>= 8;
1090 } else {
1091 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1092 status &= 0x7f;
1093 }
1094 retval = put_user((short)why, &infop->si_code);
1095 if (!retval)
1096 retval = put_user(status, &infop->si_status);
1097 }
1098 if (!retval && infop)
1099 retval = put_user(p->pid, &infop->si_pid);
1100 if (!retval && infop)
1101 retval = put_user(p->uid, &infop->si_uid);
1102 if (retval) {
1103 // TODO: is this safe?
1104 p->exit_state = EXIT_ZOMBIE;
1105 return retval;
1106 }
1107 retval = p->pid;
1108 if (p->real_parent != p->parent) {
1109 write_lock_irq(&tasklist_lock);
1110 /* Double-check with lock held. */
1111 if (p->real_parent != p->parent) {
1112 __ptrace_unlink(p);
1113 // TODO: is this safe?
1114 p->exit_state = EXIT_ZOMBIE;
1115 /*
1116 * If this is not a detached task, notify the parent.
1117 * If it's still not detached after that, don't release
1118 * it now.
1119 */
1120 if (p->exit_signal != -1) {
1121 do_notify_parent(p, p->exit_signal);
1122 if (p->exit_signal != -1)
1123 p = NULL;
1124 }
1125 }
1126 write_unlock_irq(&tasklist_lock);
1127 }
1128 if (p != NULL)
1129 release_task(p);
1130 BUG_ON(!retval);
1131 return retval;
1132}
1133
1134/*
1135 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
1136 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1137 * the lock and this task is uninteresting. If we return nonzero, we have
1138 * released the lock and the system call should return.
1139 */
1140static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
1141 struct siginfo __user *infop,
1142 int __user *stat_addr, struct rusage __user *ru)
1143{
1144 int retval, exit_code;
1145
1146 if (!p->exit_code)
1147 return 0;
1148 if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
1149 p->signal && p->signal->group_stop_count > 0)
1150 /*
1151 * A group stop is in progress and this is the group leader.
1152 * We won't report until all threads have stopped.
1153 */
1154 return 0;
1155
1156 /*
1157 * Now we are pretty sure this task is interesting.
1158 * Make sure it doesn't get reaped out from under us while we
1159 * give up the lock and then examine it below. We don't want to
1160 * keep holding onto the tasklist_lock while we call getrusage and
1161 * possibly take page faults for user memory.
1162 */
1163 get_task_struct(p);
1164 read_unlock(&tasklist_lock);
1165
1166 if (unlikely(noreap)) {
1167 pid_t pid = p->pid;
1168 uid_t uid = p->uid;
1169 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1170
1171 exit_code = p->exit_code;
1172 if (unlikely(!exit_code) ||
1173 unlikely(p->state > TASK_STOPPED))
1174 goto bail_ref;
1175 return wait_noreap_copyout(p, pid, uid,
1176 why, (exit_code << 8) | 0x7f,
1177 infop, ru);
1178 }
1179
1180 write_lock_irq(&tasklist_lock);
1181
1182 /*
1183 * This uses xchg to be atomic with the thread resuming and setting
1184 * it. It must also be done with the write lock held to prevent a
1185 * race with the EXIT_ZOMBIE case.
1186 */
1187 exit_code = xchg(&p->exit_code, 0);
1188 if (unlikely(p->exit_state)) {
1189 /*
1190 * The task resumed and then died. Let the next iteration
1191 * catch it in EXIT_ZOMBIE. Note that exit_code might
1192 * already be zero here if it resumed and did _exit(0).
1193 * The task itself is dead and won't touch exit_code again;
1194 * other processors in this function are locked out.
1195 */
1196 p->exit_code = exit_code;
1197 exit_code = 0;
1198 }
1199 if (unlikely(exit_code == 0)) {
1200 /*
1201 * Another thread in this function got to it first, or it
1202 * resumed, or it resumed and then died.
1203 */
1204 write_unlock_irq(&tasklist_lock);
1205bail_ref:
1206 put_task_struct(p);
1207 /*
1208 * We are returning to the wait loop without having successfully
1209 * removed the process and having released the lock. We cannot
1210 * continue, since the "p" task pointer is potentially stale.
1211 *
1212 * Return -EAGAIN, and do_wait() will restart the loop from the
1213 * beginning. Do _not_ re-acquire the lock.
1214 */
1215 return -EAGAIN;
1216 }
1217
1218 /* move to end of parent's list to avoid starvation */
1219 remove_parent(p);
1220 add_parent(p, p->parent);
1221
1222 write_unlock_irq(&tasklist_lock);
1223
1224 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1225 if (!retval && stat_addr)
1226 retval = put_user((exit_code << 8) | 0x7f, stat_addr);
1227 if (!retval && infop)
1228 retval = put_user(SIGCHLD, &infop->si_signo);
1229 if (!retval && infop)
1230 retval = put_user(0, &infop->si_errno);
1231 if (!retval && infop)
1232 retval = put_user((short)((p->ptrace & PT_PTRACED)
1233 ? CLD_TRAPPED : CLD_STOPPED),
1234 &infop->si_code);
1235 if (!retval && infop)
1236 retval = put_user(exit_code, &infop->si_status);
1237 if (!retval && infop)
1238 retval = put_user(p->pid, &infop->si_pid);
1239 if (!retval && infop)
1240 retval = put_user(p->uid, &infop->si_uid);
1241 if (!retval)
1242 retval = p->pid;
1243 put_task_struct(p);
1244
1245 BUG_ON(!retval);
1246 return retval;
1247}
1248
1249/*
1250 * Handle do_wait work for one task in a live, non-stopped state.
1251 * We hold read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1252 * the lock and this task is uninteresting. If we return nonzero, we have
1253 * released the lock and the system call should return.
1254 */
1255static int wait_task_continued(task_t *p, int noreap,
1256 struct siginfo __user *infop,
1257 int __user *stat_addr, struct rusage __user *ru)
1258{
1259 int retval;
1260 pid_t pid;
1261 uid_t uid;
1262
1263 if (unlikely(!p->signal))
1264 return 0;
1265
1266 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1267 return 0;
1268
1269 spin_lock_irq(&p->sighand->siglock);
1270 /* Re-check with the lock held. */
1271 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1272 spin_unlock_irq(&p->sighand->siglock);
1273 return 0;
1274 }
1275 if (!noreap)
1276 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1277 spin_unlock_irq(&p->sighand->siglock);
1278
1279 pid = p->pid;
1280 uid = p->uid;
1281 get_task_struct(p);
1282 read_unlock(&tasklist_lock);
1283
1284 if (!infop) {
1285 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1286 put_task_struct(p);
1287 if (!retval && stat_addr)
1288 retval = put_user(0xffff, stat_addr);
1289 if (!retval)
1290 retval = p->pid;
1291 } else {
1292 retval = wait_noreap_copyout(p, pid, uid,
1293 CLD_CONTINUED, SIGCONT,
1294 infop, ru);
1295 BUG_ON(retval == 0);
1296 }
1297
1298 return retval;
1299}
1300
1301
1302static inline int my_ptrace_child(struct task_struct *p)
1303{
1304 if (!(p->ptrace & PT_PTRACED))
1305 return 0;
1306 if (!(p->ptrace & PT_ATTACHED))
1307 return 1;
1308 /*
1309 * This child was PTRACE_ATTACH'd. We should be seeing it only if
1310 * we are the attacher. If we are the real parent, this is a race
1311	 * inside ptrace_attach. It is waiting for the tasklist_lock, which
1312	 * it needs in order to switch the parent links, but it has already
1313	 * set the flags in p->ptrace.
1314 */
1315 return (p->parent != p->real_parent);
1316}
1317
1318static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
1319 int __user *stat_addr, struct rusage __user *ru)
1320{
1321 DECLARE_WAITQUEUE(wait, current);
1322 struct task_struct *tsk;
1323 int flag, retval;
1324
1325 add_wait_queue(&current->signal->wait_chldexit,&wait);
1326repeat:
1327 /*
1328 * We will set this flag if we see any child that might later
1329 * match our criteria, even if we are not able to reap it yet.
1330 */
1331 flag = 0;
1332 current->state = TASK_INTERRUPTIBLE;
1333 read_lock(&tasklist_lock);
1334 tsk = current;
1335 do {
1336 struct task_struct *p;
1337 struct list_head *_p;
1338 int ret;
1339
1340 list_for_each(_p,&tsk->children) {
1341 p = list_entry(_p,struct task_struct,sibling);
1342
1343 ret = eligible_child(pid, options, p);
1344 if (!ret)
1345 continue;
1346
1347 switch (p->state) {
1348 case TASK_TRACED:
1349 if (!my_ptrace_child(p))
1350 continue;
1351 /*FALLTHROUGH*/
1352 case TASK_STOPPED:
1353 /*
1354 * It's stopped now, so it might later
1355 * continue, exit, or stop again.
1356 */
1357 flag = 1;
1358 if (!(options & WUNTRACED) &&
1359 !my_ptrace_child(p))
1360 continue;
1361 retval = wait_task_stopped(p, ret == 2,
1362 (options & WNOWAIT),
1363 infop,
1364 stat_addr, ru);
1365 if (retval == -EAGAIN)
1366 goto repeat;
1367 if (retval != 0) /* He released the lock. */
1368 goto end;
1369 break;
1370 default:
1371 // case EXIT_DEAD:
1372 if (p->exit_state == EXIT_DEAD)
1373 continue;
1374 // case EXIT_ZOMBIE:
1375 if (p->exit_state == EXIT_ZOMBIE) {
1376 /*
1377 * Eligible but we cannot release
1378 * it yet:
1379 */
1380 if (ret == 2)
1381 goto check_continued;
1382 if (!likely(options & WEXITED))
1383 continue;
1384 retval = wait_task_zombie(
1385 p, (options & WNOWAIT),
1386 infop, stat_addr, ru);
1387 /* He released the lock. */
1388 if (retval != 0)
1389 goto end;
1390 break;
1391 }
1392check_continued:
1393 /*
1394 * It's running now, so it might later
1395 * exit, stop, or stop and then continue.
1396 */
1397 flag = 1;
1398 if (!unlikely(options & WCONTINUED))
1399 continue;
1400 retval = wait_task_continued(
1401 p, (options & WNOWAIT),
1402 infop, stat_addr, ru);
1403 if (retval != 0) /* He released the lock. */
1404 goto end;
1405 break;
1406 }
1407 }
1408 if (!flag) {
1409 list_for_each(_p, &tsk->ptrace_children) {
1410 p = list_entry(_p, struct task_struct,
1411 ptrace_list);
1412 if (!eligible_child(pid, options, p))
1413 continue;
1414 flag = 1;
1415 break;
1416 }
1417 }
1418 if (options & __WNOTHREAD)
1419 break;
1420 tsk = next_thread(tsk);
1421 if (tsk->signal != current->signal)
1422 BUG();
1423 } while (tsk != current);
1424
1425 read_unlock(&tasklist_lock);
1426 if (flag) {
1427 retval = 0;
1428 if (options & WNOHANG)
1429 goto end;
1430 retval = -ERESTARTSYS;
1431 if (signal_pending(current))
1432 goto end;
1433 schedule();
1434 goto repeat;
1435 }
1436 retval = -ECHILD;
1437end:
1438 current->state = TASK_RUNNING;
1439 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1440 if (infop) {
1441 if (retval > 0)
1442 retval = 0;
1443 else {
1444 /*
1445 * For a WNOHANG return, clear out all the fields
1446 * we would set so the user can easily tell the
1447 * difference.
1448 */
1449 if (!retval)
1450 retval = put_user(0, &infop->si_signo);
1451 if (!retval)
1452 retval = put_user(0, &infop->si_errno);
1453 if (!retval)
1454 retval = put_user(0, &infop->si_code);
1455 if (!retval)
1456 retval = put_user(0, &infop->si_pid);
1457 if (!retval)
1458 retval = put_user(0, &infop->si_uid);
1459 if (!retval)
1460 retval = put_user(0, &infop->si_status);
1461 }
1462 }
1463 return retval;
1464}
1465
1466asmlinkage long sys_waitid(int which, pid_t pid,
1467 struct siginfo __user *infop, int options,
1468 struct rusage __user *ru)
1469{
1470 long ret;
1471
1472 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
1473 return -EINVAL;
1474 if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1475 return -EINVAL;
1476
1477 switch (which) {
1478 case P_ALL:
1479 pid = -1;
1480 break;
1481 case P_PID:
1482 if (pid <= 0)
1483 return -EINVAL;
1484 break;
1485 case P_PGID:
1486 if (pid <= 0)
1487 return -EINVAL;
1488 pid = -pid;
1489 break;
1490 default:
1491 return -EINVAL;
1492 }
1493
1494 ret = do_wait(pid, options, infop, NULL, ru);
1495
1496 /* avoid REGPARM breakage on x86: */
1497 prevent_tail_call(ret);
1498 return ret;
1499}
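/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * peeking at a child with waitid(2) and WNOWAIT, which exercises the
 * "noreap" paths above; the child remains reapable afterwards.
 * peek_child_status() is a hypothetical helper with minimal error handling.
 */
#include <signal.h>
#include <sys/wait.h>
#include <stdio.h>

static void peek_child_status(pid_t pid)
{
	siginfo_t info;

	if (waitid(P_PID, pid, &info, WEXITED | WNOWAIT) == 0)
		printf("child %d: si_code=%d si_status=%d (still reapable)\n",
		       (int)pid, info.si_code, info.si_status);

	/* A later waitid()/wait4() without WNOWAIT actually reaps it. */
}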
1500
1501asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
1502 int options, struct rusage __user *ru)
1503{
1504 long ret;
1505
1506 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1507 __WNOTHREAD|__WCLONE|__WALL))
1508 return -EINVAL;
1509 ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru);
1510
1511 /* avoid REGPARM breakage on x86: */
1512 prevent_tail_call(ret);
1513 return ret;
1514}
1515
1516#ifdef __ARCH_WANT_SYS_WAITPID
1517
1518/*
1519 * sys_waitpid() remains for compatibility. waitpid() should be
1520 * implemented by calling sys_wait4() from libc.a.
1521 */
1522asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options)
1523{
1524 return sys_wait4(pid, stat_addr, options, NULL);
1525}
1526
1527#endif
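/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the comment above says libc should implement waitpid() on top of
 * wait4(); a minimal userspace wrapper along those lines might look like
 * this (my_waitpid is a made-up name).
 */
#include <sys/types.h>
#include <sys/resource.h>
#include <sys/wait.h>

static pid_t my_waitpid(pid_t pid, int *status, int options)
{
	/* waitpid() is wait4() with no rusage reporting. */
	return wait4(pid, status, options, (struct rusage *)0);
}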
diff --git a/kernel/extable.c b/kernel/extable.c
new file mode 100644
index 000000000000..7501b531ceed
--- /dev/null
+++ b/kernel/extable.c
@@ -0,0 +1,67 @@
1/* Rewritten by Rusty Russell, on the backs of many others...
2 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/module.h>
19#include <linux/init.h>
20#include <asm/uaccess.h>
21#include <asm/sections.h>
22
23extern struct exception_table_entry __start___ex_table[];
24extern struct exception_table_entry __stop___ex_table[];
25
26/* Sort the kernel's built-in exception table */
27void __init sort_main_extable(void)
28{
29 sort_extable(__start___ex_table, __stop___ex_table);
30}
31
32/* Given an address, look for it in the exception tables. */
33const struct exception_table_entry *search_exception_tables(unsigned long addr)
34{
35 const struct exception_table_entry *e;
36
37 e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
38 if (!e)
39 e = search_module_extables(addr);
40 return e;
41}
42
43static int core_kernel_text(unsigned long addr)
44{
45 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext)
47 return 1;
48
49 if (addr >= (unsigned long)_sinittext &&
50 addr <= (unsigned long)_einittext)
51 return 1;
52 return 0;
53}
54
55int __kernel_text_address(unsigned long addr)
56{
57 if (core_kernel_text(addr))
58 return 1;
59 return __module_text_address(addr) != NULL;
60}
61
62int kernel_text_address(unsigned long addr)
63{
64 if (core_kernel_text(addr))
65 return 1;
66 return module_text_address(addr) != NULL;
67}
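/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the usual consumer of kernel_text_address() is stack-trace code, which
 * probes candidate words found on a stack and only reports the ones that
 * point into kernel or module text.  dump_candidate() is a hypothetical
 * helper.
 */
static void dump_candidate(unsigned long addr)
{
	if (kernel_text_address(addr))
		printk(" [<%08lx>]", addr);	/* plausible return address */
}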
diff --git a/kernel/fork.c b/kernel/fork.c
new file mode 100644
index 000000000000..f42a17f88699
--- /dev/null
+++ b/kernel/fork.c
@@ -0,0 +1,1274 @@
1/*
2 * linux/kernel/fork.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7/*
8 * 'fork.c' contains the help-routines for the 'fork' system call
9 * (see also entry.S and others).
10 * Fork is rather simple, once you get the hang of it, but the memory
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */
13
14#include <linux/config.h>
15#include <linux/slab.h>
16#include <linux/init.h>
17#include <linux/unistd.h>
18#include <linux/smp_lock.h>
19#include <linux/module.h>
20#include <linux/vmalloc.h>
21#include <linux/completion.h>
22#include <linux/namespace.h>
23#include <linux/personality.h>
24#include <linux/mempolicy.h>
25#include <linux/sem.h>
26#include <linux/file.h>
27#include <linux/key.h>
28#include <linux/binfmts.h>
29#include <linux/mman.h>
30#include <linux/fs.h>
31#include <linux/cpu.h>
32#include <linux/cpuset.h>
33#include <linux/security.h>
34#include <linux/swap.h>
35#include <linux/syscalls.h>
36#include <linux/jiffies.h>
37#include <linux/futex.h>
38#include <linux/ptrace.h>
39#include <linux/mount.h>
40#include <linux/audit.h>
41#include <linux/profile.h>
42#include <linux/rmap.h>
43#include <linux/acct.h>
44
45#include <asm/pgtable.h>
46#include <asm/pgalloc.h>
47#include <asm/uaccess.h>
48#include <asm/mmu_context.h>
49#include <asm/cacheflush.h>
50#include <asm/tlbflush.h>
51
52/*
53 * Counters protected by write_lock_irq(&tasklist_lock)
54 */
55unsigned long total_forks; /* Handle normal Linux uptimes. */
56int nr_threads; /* The idle threads do not count.. */
57
58int max_threads; /* tunable limit on nr_threads */
59
60DEFINE_PER_CPU(unsigned long, process_counts) = 0;
61
62 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
63
64EXPORT_SYMBOL(tasklist_lock);
65
66int nr_processes(void)
67{
68 int cpu;
69 int total = 0;
70
71 for_each_online_cpu(cpu)
72 total += per_cpu(process_counts, cpu);
73
74 return total;
75}
76
77#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
78# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
79# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
80static kmem_cache_t *task_struct_cachep;
81#endif
82
83/* SLAB cache for signal_struct structures (tsk->signal) */
84kmem_cache_t *signal_cachep;
85
86/* SLAB cache for sighand_struct structures (tsk->sighand) */
87kmem_cache_t *sighand_cachep;
88
89/* SLAB cache for files_struct structures (tsk->files) */
90kmem_cache_t *files_cachep;
91
92/* SLAB cache for fs_struct structures (tsk->fs) */
93kmem_cache_t *fs_cachep;
94
95/* SLAB cache for vm_area_struct structures */
96kmem_cache_t *vm_area_cachep;
97
98/* SLAB cache for mm_struct structures (tsk->mm) */
99static kmem_cache_t *mm_cachep;
100
101void free_task(struct task_struct *tsk)
102{
103 free_thread_info(tsk->thread_info);
104 free_task_struct(tsk);
105}
106EXPORT_SYMBOL(free_task);
107
108void __put_task_struct(struct task_struct *tsk)
109{
110 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
111 WARN_ON(atomic_read(&tsk->usage));
112 WARN_ON(tsk == current);
113
114 if (unlikely(tsk->audit_context))
115 audit_free(tsk);
116 security_task_free(tsk);
117 free_uid(tsk->user);
118 put_group_info(tsk->group_info);
119
120 if (!profile_handoff_task(tsk))
121 free_task(tsk);
122}
123
124void __init fork_init(unsigned long mempages)
125{
126#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
127#ifndef ARCH_MIN_TASKALIGN
128#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
129#endif
130 /* create a slab on which task_structs can be allocated */
131 task_struct_cachep =
132 kmem_cache_create("task_struct", sizeof(struct task_struct),
133 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
134#endif
135
136 /*
137 * The default maximum number of threads is set to a safe
138 * value: the thread structures can take up at most half
139 * of memory.
140 */
141 max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
142
143 /*
144 * we need to allow at least 20 threads to boot a system
145 */
146 if(max_threads < 20)
147 max_threads = 20;
148
149 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
150 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
151 init_task.signal->rlim[RLIMIT_SIGPENDING] =
152 init_task.signal->rlim[RLIMIT_NPROC];
153}
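/*
 * Worked example for the formula above (editor's addition, not part of the
 * original file), assuming 4 KiB pages and an 8 KiB THREAD_SIZE (both are
 * architecture-dependent): a 512 MiB machine has mempages = 131072, so
 *
 *	max_threads = 131072 / (8 * 8192 / 4096) = 131072 / 16 = 8192
 *
 * which is then clamped to at least 20, and RLIMIT_NPROC defaults to half
 * of it (4096).
 */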
154
155static struct task_struct *dup_task_struct(struct task_struct *orig)
156{
157 struct task_struct *tsk;
158 struct thread_info *ti;
159
160 prepare_to_copy(orig);
161
162 tsk = alloc_task_struct();
163 if (!tsk)
164 return NULL;
165
166 ti = alloc_thread_info(tsk);
167 if (!ti) {
168 free_task_struct(tsk);
169 return NULL;
170 }
171
172 *ti = *orig->thread_info;
173 *tsk = *orig;
174 tsk->thread_info = ti;
175 ti->task = tsk;
176
177 /* One for us, one for whoever does the "release_task()" (usually parent) */
178 atomic_set(&tsk->usage,2);
179 return tsk;
180}
181
182#ifdef CONFIG_MMU
183static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
184{
185 struct vm_area_struct * mpnt, *tmp, **pprev;
186 struct rb_node **rb_link, *rb_parent;
187 int retval;
188 unsigned long charge;
189 struct mempolicy *pol;
190
191 down_write(&oldmm->mmap_sem);
192 flush_cache_mm(current->mm);
193 mm->locked_vm = 0;
194 mm->mmap = NULL;
195 mm->mmap_cache = NULL;
196 mm->free_area_cache = oldmm->mmap_base;
197 mm->map_count = 0;
198 set_mm_counter(mm, rss, 0);
199 set_mm_counter(mm, anon_rss, 0);
200 cpus_clear(mm->cpu_vm_mask);
201 mm->mm_rb = RB_ROOT;
202 rb_link = &mm->mm_rb.rb_node;
203 rb_parent = NULL;
204 pprev = &mm->mmap;
205
206 for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
207 struct file *file;
208
209 if (mpnt->vm_flags & VM_DONTCOPY) {
210 __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
211 -vma_pages(mpnt));
212 continue;
213 }
214 charge = 0;
215 if (mpnt->vm_flags & VM_ACCOUNT) {
216 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
217 if (security_vm_enough_memory(len))
218 goto fail_nomem;
219 charge = len;
220 }
221 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
222 if (!tmp)
223 goto fail_nomem;
224 *tmp = *mpnt;
225 pol = mpol_copy(vma_policy(mpnt));
226 retval = PTR_ERR(pol);
227 if (IS_ERR(pol))
228 goto fail_nomem_policy;
229 vma_set_policy(tmp, pol);
230 tmp->vm_flags &= ~VM_LOCKED;
231 tmp->vm_mm = mm;
232 tmp->vm_next = NULL;
233 anon_vma_link(tmp);
234 file = tmp->vm_file;
235 if (file) {
236 struct inode *inode = file->f_dentry->d_inode;
237 get_file(file);
238 if (tmp->vm_flags & VM_DENYWRITE)
239 atomic_dec(&inode->i_writecount);
240
241 /* insert tmp into the share list, just after mpnt */
242 spin_lock(&file->f_mapping->i_mmap_lock);
243 tmp->vm_truncate_count = mpnt->vm_truncate_count;
244 flush_dcache_mmap_lock(file->f_mapping);
245 vma_prio_tree_add(tmp, mpnt);
246 flush_dcache_mmap_unlock(file->f_mapping);
247 spin_unlock(&file->f_mapping->i_mmap_lock);
248 }
249
250 /*
251 * Link in the new vma and copy the page table entries:
252 * link in first so that swapoff can see swap entries,
253		 * and try_to_unmap_one's find_vma can find the new vma.
254 */
255 spin_lock(&mm->page_table_lock);
256 *pprev = tmp;
257 pprev = &tmp->vm_next;
258
259 __vma_link_rb(mm, tmp, rb_link, rb_parent);
260 rb_link = &tmp->vm_rb.rb_right;
261 rb_parent = &tmp->vm_rb;
262
263 mm->map_count++;
264 retval = copy_page_range(mm, current->mm, tmp);
265 spin_unlock(&mm->page_table_lock);
266
267 if (tmp->vm_ops && tmp->vm_ops->open)
268 tmp->vm_ops->open(tmp);
269
270 if (retval)
271 goto out;
272 }
273 retval = 0;
274
275out:
276 flush_tlb_mm(current->mm);
277 up_write(&oldmm->mmap_sem);
278 return retval;
279fail_nomem_policy:
280 kmem_cache_free(vm_area_cachep, tmp);
281fail_nomem:
282 retval = -ENOMEM;
283 vm_unacct_memory(charge);
284 goto out;
285}
286
287static inline int mm_alloc_pgd(struct mm_struct * mm)
288{
289 mm->pgd = pgd_alloc(mm);
290 if (unlikely(!mm->pgd))
291 return -ENOMEM;
292 return 0;
293}
294
295static inline void mm_free_pgd(struct mm_struct * mm)
296{
297 pgd_free(mm->pgd);
298}
299#else
300#define dup_mmap(mm, oldmm) (0)
301#define mm_alloc_pgd(mm) (0)
302#define mm_free_pgd(mm)
303#endif /* CONFIG_MMU */
304
305 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
306
307#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
308#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
309
310#include <linux/init_task.h>
311
312static struct mm_struct * mm_init(struct mm_struct * mm)
313{
314 atomic_set(&mm->mm_users, 1);
315 atomic_set(&mm->mm_count, 1);
316 init_rwsem(&mm->mmap_sem);
317 INIT_LIST_HEAD(&mm->mmlist);
318 mm->core_waiters = 0;
319 mm->nr_ptes = 0;
320 spin_lock_init(&mm->page_table_lock);
321 rwlock_init(&mm->ioctx_list_lock);
322 mm->ioctx_list = NULL;
323 mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
324 mm->free_area_cache = TASK_UNMAPPED_BASE;
325
326 if (likely(!mm_alloc_pgd(mm))) {
327 mm->def_flags = 0;
328 return mm;
329 }
330 free_mm(mm);
331 return NULL;
332}
333
334/*
335 * Allocate and initialize an mm_struct.
336 */
337struct mm_struct * mm_alloc(void)
338{
339 struct mm_struct * mm;
340
341 mm = allocate_mm();
342 if (mm) {
343 memset(mm, 0, sizeof(*mm));
344 mm = mm_init(mm);
345 }
346 return mm;
347}
348
349/*
350 * Called when the last reference to the mm
351 * is dropped: either by a lazy thread or by
352 * mmput. Free the page directory and the mm.
353 */
354void fastcall __mmdrop(struct mm_struct *mm)
355{
356 BUG_ON(mm == &init_mm);
357 mm_free_pgd(mm);
358 destroy_context(mm);
359 free_mm(mm);
360}
361
362/*
363 * Decrement the use count and release all resources for an mm.
364 */
365void mmput(struct mm_struct *mm)
366{
367 if (atomic_dec_and_test(&mm->mm_users)) {
368 exit_aio(mm);
369 exit_mmap(mm);
370 if (!list_empty(&mm->mmlist)) {
371 spin_lock(&mmlist_lock);
372 list_del(&mm->mmlist);
373 spin_unlock(&mmlist_lock);
374 }
375 put_swap_token(mm);
376 mmdrop(mm);
377 }
378}
379EXPORT_SYMBOL_GPL(mmput);
380
381/**
382 * get_task_mm - acquire a reference to the task's mm
383 *
384 * Returns %NULL if the task has no mm, or if PF_BORROWED_MM is set
385 * (meaning this kernel worker thread has transiently adopted a user mm
386 * with use_mm, to do its AIO). Otherwise returns a reference to the mm,
387 * after bumping up the use count. The user must release the mm via
388 * mmput() after use. Typically used by /proc and ptrace.
389 */
390struct mm_struct *get_task_mm(struct task_struct *task)
391{
392 struct mm_struct *mm;
393
394 task_lock(task);
395 mm = task->mm;
396 if (mm) {
397 if (task->flags & PF_BORROWED_MM)
398 mm = NULL;
399 else
400 atomic_inc(&mm->mm_users);
401 }
402 task_unlock(task);
403 return mm;
404}
405EXPORT_SYMBOL_GPL(get_task_mm);
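/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the get_task_mm()/mmput() pairing described above, roughly as a
 * /proc-style reader would use it.  report_task_vm() is a hypothetical
 * helper.
 */
static void report_task_vm(struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);

	if (!mm)
		return;		/* kernel thread, or borrowed mm */

	printk("%s[%d]: total_vm=%lu pages\n",
	       task->comm, task->pid, mm->total_vm);
	mmput(mm);		/* drop the reference get_task_mm() took */
}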
406
407/* Please note the differences between mmput and mm_release.
408 * mmput is called whenever we stop holding onto a mm_struct,
409 * error success whatever.
410 *
411 * mm_release is called after a mm_struct has been removed
412 * from the current process.
413 *
414 * This difference is important for error handling, when we
415 * only half set up a mm_struct for a new process and need to restore
416 * the old one. Because we mmput the new mm_struct before
417 * restoring the old one. . .
418 * Eric Biederman 10 January 1998
419 */
420void mm_release(struct task_struct *tsk, struct mm_struct *mm)
421{
422 struct completion *vfork_done = tsk->vfork_done;
423
424 /* Get rid of any cached register state */
425 deactivate_mm(tsk, mm);
426
427 /* notify parent sleeping on vfork() */
428 if (vfork_done) {
429 tsk->vfork_done = NULL;
430 complete(vfork_done);
431 }
432 if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
433 u32 __user * tidptr = tsk->clear_child_tid;
434 tsk->clear_child_tid = NULL;
435
436 /*
437 * We don't check the error code - if userspace has
438 * not set up a proper pointer then tough luck.
439 */
440 put_user(0, tidptr);
441 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
442 }
443}
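/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the clear_child_tid/FUTEX_WAKE step above is what lets userspace join a
 * clone()d thread.  A raw-clone user of the mechanism might look roughly
 * like this; spawn_and_join() and child_fn() are made-up names, the stack
 * handling is simplified, and error checking is omitted.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static volatile pid_t child_tid = 1;	/* any non-zero initial value */

static int child_fn(void *unused)
{
	return 0;	/* on exit the kernel zeroes child_tid and wakes us */
}

static void spawn_and_join(void)
{
	char *stack = malloc(64 * 1024);
	pid_t cur;

	clone(child_fn, stack + 64 * 1024,
	      CLONE_VM | CLONE_CHILD_CLEARTID | SIGCHLD,
	      NULL, NULL, NULL, (pid_t *)&child_tid);

	/* Wait until mm_release() clears child_tid and does FUTEX_WAKE. */
	while ((cur = child_tid) != 0)
		syscall(SYS_futex, &child_tid, FUTEX_WAIT, cur, NULL, NULL, 0);

	free(stack);
	/* The child still needs a waitpid() eventually to be reaped. */
}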
444
445static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
446{
447 struct mm_struct * mm, *oldmm;
448 int retval;
449
450 tsk->min_flt = tsk->maj_flt = 0;
451 tsk->nvcsw = tsk->nivcsw = 0;
452
453 tsk->mm = NULL;
454 tsk->active_mm = NULL;
455
456 /*
457 * Are we cloning a kernel thread?
458 *
459	 * We need to steal an active VM for that.
460 */
461 oldmm = current->mm;
462 if (!oldmm)
463 return 0;
464
465 if (clone_flags & CLONE_VM) {
466 atomic_inc(&oldmm->mm_users);
467 mm = oldmm;
468 /*
469 * There are cases where the PTL is held to ensure no
470 * new threads start up in user mode using an mm, which
471 * allows optimizing out ipis; the tlb_gather_mmu code
472 * is an example.
473 */
474 spin_unlock_wait(&oldmm->page_table_lock);
475 goto good_mm;
476 }
477
478 retval = -ENOMEM;
479 mm = allocate_mm();
480 if (!mm)
481 goto fail_nomem;
482
483 /* Copy the current MM stuff.. */
484 memcpy(mm, oldmm, sizeof(*mm));
485 if (!mm_init(mm))
486 goto fail_nomem;
487
488 if (init_new_context(tsk,mm))
489 goto fail_nocontext;
490
491 retval = dup_mmap(mm, oldmm);
492 if (retval)
493 goto free_pt;
494
495 mm->hiwater_rss = get_mm_counter(mm,rss);
496 mm->hiwater_vm = mm->total_vm;
497
498good_mm:
499 tsk->mm = mm;
500 tsk->active_mm = mm;
501 return 0;
502
503free_pt:
504 mmput(mm);
505fail_nomem:
506 return retval;
507
508fail_nocontext:
509 /*
510 * If init_new_context() failed, we cannot use mmput() to free the mm
511 * because it calls destroy_context()
512 */
513 mm_free_pgd(mm);
514 free_mm(mm);
515 return retval;
516}
517
518static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
519{
520 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
521 /* We don't need to lock fs - think why ;-) */
522 if (fs) {
523 atomic_set(&fs->count, 1);
524 rwlock_init(&fs->lock);
525 fs->umask = old->umask;
526 read_lock(&old->lock);
527 fs->rootmnt = mntget(old->rootmnt);
528 fs->root = dget(old->root);
529 fs->pwdmnt = mntget(old->pwdmnt);
530 fs->pwd = dget(old->pwd);
531 if (old->altroot) {
532 fs->altrootmnt = mntget(old->altrootmnt);
533 fs->altroot = dget(old->altroot);
534 } else {
535 fs->altrootmnt = NULL;
536 fs->altroot = NULL;
537 }
538 read_unlock(&old->lock);
539 }
540 return fs;
541}
542
543struct fs_struct *copy_fs_struct(struct fs_struct *old)
544{
545 return __copy_fs_struct(old);
546}
547
548EXPORT_SYMBOL_GPL(copy_fs_struct);
549
550static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
551{
552 if (clone_flags & CLONE_FS) {
553 atomic_inc(&current->fs->count);
554 return 0;
555 }
556 tsk->fs = __copy_fs_struct(current->fs);
557 if (!tsk->fs)
558 return -ENOMEM;
559 return 0;
560}
561
562static int count_open_files(struct files_struct *files, int size)
563{
564 int i;
565
566 /* Find the last open fd */
567 for (i = size/(8*sizeof(long)); i > 0; ) {
568 if (files->open_fds->fds_bits[--i])
569 break;
570 }
571 i = (i+1) * 8 * sizeof(long);
572 return i;
573}
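For example, assuming 64-bit longs and size == 256, a table whose highest open descriptor is fd 70 has a bit set only in fds_bits[1]; the loop starts at i == 4, walks down to i == 1, and the function returns (1 + 1) * 64 == 128. The result is the highest used slot rounded up to a whole bitmap word, not a literal count of open files.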
574
575static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
576{
577 struct files_struct *oldf, *newf;
578 struct file **old_fds, **new_fds;
579 int open_files, size, i, error = 0, expand;
580
581 /*
582 * A background process may not have any files ...
583 */
584 oldf = current->files;
585 if (!oldf)
586 goto out;
587
588 if (clone_flags & CLONE_FILES) {
589 atomic_inc(&oldf->count);
590 goto out;
591 }
592
593 /*
594 * Note: we may be using current for both targets (see exec.c).
595 * This works because we cache current->files (old) as oldf. Don't
596 * break this.
597 */
598 tsk->files = NULL;
599 error = -ENOMEM;
600 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
601 if (!newf)
602 goto out;
603
604 atomic_set(&newf->count, 1);
605
606 spin_lock_init(&newf->file_lock);
607 newf->next_fd = 0;
608 newf->max_fds = NR_OPEN_DEFAULT;
609 newf->max_fdset = __FD_SETSIZE;
610 newf->close_on_exec = &newf->close_on_exec_init;
611 newf->open_fds = &newf->open_fds_init;
612 newf->fd = &newf->fd_array[0];
613
614 spin_lock(&oldf->file_lock);
615
616 open_files = count_open_files(oldf, oldf->max_fdset);
617 expand = 0;
618
619 /*
620 * Check whether we need to allocate a larger fd array or fd set.
621 * Note: we're not a clone task, so the open count won't change.
622 */
623 if (open_files > newf->max_fdset) {
624 newf->max_fdset = 0;
625 expand = 1;
626 }
627 if (open_files > newf->max_fds) {
628 newf->max_fds = 0;
629 expand = 1;
630 }
631
632 /* if the old fdset gets grown now, we'll only copy up to "size" fds */
633 if (expand) {
634 spin_unlock(&oldf->file_lock);
635 spin_lock(&newf->file_lock);
636 error = expand_files(newf, open_files-1);
637 spin_unlock(&newf->file_lock);
638 if (error < 0)
639 goto out_release;
640 spin_lock(&oldf->file_lock);
641 }
642
643 old_fds = oldf->fd;
644 new_fds = newf->fd;
645
646 memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
647 memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
648
649 for (i = open_files; i != 0; i--) {
650 struct file *f = *old_fds++;
651 if (f) {
652 get_file(f);
653 } else {
654 /*
655 * The fd may be claimed in the fd bitmap but not yet
656 * instantiated in the files array if a sibling thread
657 * is partway through open(). So make sure that this
658 * fd is available to the new process.
659 */
660 FD_CLR(open_files - i, newf->open_fds);
661 }
662 *new_fds++ = f;
663 }
664 spin_unlock(&oldf->file_lock);
665
666 /* compute the remainder to be cleared */
667 size = (newf->max_fds - open_files) * sizeof(struct file *);
668
669 /* This is long-word aligned and thus could use an optimized version */
670 memset(new_fds, 0, size);
671
672 if (newf->max_fdset > open_files) {
673 int left = (newf->max_fdset-open_files)/8;
674 int start = open_files / (8 * sizeof(unsigned long));
675
676 memset(&newf->open_fds->fds_bits[start], 0, left);
677 memset(&newf->close_on_exec->fds_bits[start], 0, left);
678 }
679
680 tsk->files = newf;
681 error = 0;
682out:
683 return error;
684
685out_release:
686 free_fdset (newf->close_on_exec, newf->max_fdset);
687 free_fdset (newf->open_fds, newf->max_fdset);
688 free_fd_array(newf->fd, newf->max_fds);
689 kmem_cache_free(files_cachep, newf);
690 goto out;
691}
692
693/*
694 * Helper to unshare the files of the current task.
695 * We don't want to expose copy_files internals to
696 * the exec layer of the kernel.
697 */
698
699int unshare_files(void)
700{
701 struct files_struct *files = current->files;
702 int rc;
703
704 if(!files)
705 BUG();
706
707 /* This can race, but the race only causes us to make a copy we
708 don't need and then drop it */
709 if(atomic_read(&files->count) == 1)
710 {
711 atomic_inc(&files->count);
712 return 0;
713 }
714 rc = copy_files(0, current);
715 if(rc)
716 current->files = files;
717 return rc;
718}
719
720EXPORT_SYMBOL(unshare_files);
721
722static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
723{
724 struct sighand_struct *sig;
725
726 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
727 atomic_inc(&current->sighand->count);
728 return 0;
729 }
730 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
731 tsk->sighand = sig;
732 if (!sig)
733 return -ENOMEM;
734 spin_lock_init(&sig->siglock);
735 atomic_set(&sig->count, 1);
736 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
737 return 0;
738}
739
740static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
741{
742 struct signal_struct *sig;
743 int ret;
744
745 if (clone_flags & CLONE_THREAD) {
746 atomic_inc(&current->signal->count);
747 atomic_inc(&current->signal->live);
748 return 0;
749 }
750 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
751 tsk->signal = sig;
752 if (!sig)
753 return -ENOMEM;
754
755 ret = copy_thread_group_keys(tsk);
756 if (ret < 0) {
757 kmem_cache_free(signal_cachep, sig);
758 return ret;
759 }
760
761 atomic_set(&sig->count, 1);
762 atomic_set(&sig->live, 1);
763 init_waitqueue_head(&sig->wait_chldexit);
764 sig->flags = 0;
765 sig->group_exit_code = 0;
766 sig->group_exit_task = NULL;
767 sig->group_stop_count = 0;
768 sig->curr_target = NULL;
769 init_sigpending(&sig->shared_pending);
770 INIT_LIST_HEAD(&sig->posix_timers);
771
772 sig->it_real_value = sig->it_real_incr = 0;
773 sig->real_timer.function = it_real_fn;
774 sig->real_timer.data = (unsigned long) tsk;
775 init_timer(&sig->real_timer);
776
777 sig->it_virt_expires = cputime_zero;
778 sig->it_virt_incr = cputime_zero;
779 sig->it_prof_expires = cputime_zero;
780 sig->it_prof_incr = cputime_zero;
781
782 sig->tty = current->signal->tty;
783 sig->pgrp = process_group(current);
784 sig->session = current->signal->session;
785 sig->leader = 0; /* session leadership doesn't inherit */
786 sig->tty_old_pgrp = 0;
787
788 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
789 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
790 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
791 sig->sched_time = 0;
792 INIT_LIST_HEAD(&sig->cpu_timers[0]);
793 INIT_LIST_HEAD(&sig->cpu_timers[1]);
794 INIT_LIST_HEAD(&sig->cpu_timers[2]);
795
796 task_lock(current->group_leader);
797 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
798 task_unlock(current->group_leader);
799
800 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
801 /*
802 * New sole thread in the process gets an expiry time
803 * of the whole CPU time limit.
804 */
805 tsk->it_prof_expires =
806 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
807 }
808
809 return 0;
810}
811
812static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
813{
814 unsigned long new_flags = p->flags;
815
816 new_flags &= ~PF_SUPERPRIV;
817 new_flags |= PF_FORKNOEXEC;
818 if (!(clone_flags & CLONE_PTRACE))
819 p->ptrace = 0;
820 p->flags = new_flags;
821}
822
823asmlinkage long sys_set_tid_address(int __user *tidptr)
824{
825 current->clear_child_tid = tidptr;
826
827 return current->pid;
828}
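A hedged userspace sketch of what this plumbing is for: a threading library can pass CLONE_CHILD_CLEARTID (or call set_tid_address()) so that mm_release() above zeroes the tid word and issues a FUTEX_WAKE on thread exit, letting a joiner simply wait on that word. The function name and the direct syscall() use are illustrative, not a prescribed interface.

#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/futex.h>

/* Block until the kernel clears *child_tid and wakes us (thread exit). */
static void example_join(volatile pid_t *child_tid)
{
	pid_t tid;

	while ((tid = *child_tid) != 0)
		syscall(SYS_futex, (void *)child_tid, FUTEX_WAIT, tid,
			NULL, NULL, 0);
}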
829
830/*
831 * This creates a new process as a copy of the old one,
832 * but does not actually start it yet.
833 *
834 * It copies the registers, and all the appropriate
835 * parts of the process environment (as per the clone
836 * flags). The actual kick-off is left to the caller.
837 */
838static task_t *copy_process(unsigned long clone_flags,
839 unsigned long stack_start,
840 struct pt_regs *regs,
841 unsigned long stack_size,
842 int __user *parent_tidptr,
843 int __user *child_tidptr,
844 int pid)
845{
846 int retval;
847 struct task_struct *p = NULL;
848
849 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
850 return ERR_PTR(-EINVAL);
851
852 /*
853 * Thread groups must share signals as well, and detached threads
854 * can only be started up within the thread group.
855 */
856 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
857 return ERR_PTR(-EINVAL);
858
859 /*
860 * Shared signal handlers imply shared VM. By way of the above,
861 * thread groups also imply shared VM. Blocking this case allows
862 * for various simplifications in other code.
863 */
864 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
865 return ERR_PTR(-EINVAL);
866
867 retval = security_task_create(clone_flags);
868 if (retval)
869 goto fork_out;
870
871 retval = -ENOMEM;
872 p = dup_task_struct(current);
873 if (!p)
874 goto fork_out;
875
876 retval = -EAGAIN;
877 if (atomic_read(&p->user->processes) >=
878 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
879 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
880 p->user != &root_user)
881 goto bad_fork_free;
882 }
883
884 atomic_inc(&p->user->__count);
885 atomic_inc(&p->user->processes);
886 get_group_info(p->group_info);
887
888 /*
889 * If multiple threads are within copy_process(), then this check
890 * triggers too late. This doesn't hurt; the check is only there
891 * to stop root fork bombs.
892 */
893 if (nr_threads >= max_threads)
894 goto bad_fork_cleanup_count;
895
896 if (!try_module_get(p->thread_info->exec_domain->module))
897 goto bad_fork_cleanup_count;
898
899 if (p->binfmt && !try_module_get(p->binfmt->module))
900 goto bad_fork_cleanup_put_domain;
901
902 p->did_exec = 0;
903 copy_flags(clone_flags, p);
904 p->pid = pid;
905 retval = -EFAULT;
906 if (clone_flags & CLONE_PARENT_SETTID)
907 if (put_user(p->pid, parent_tidptr))
908 goto bad_fork_cleanup;
909
910 p->proc_dentry = NULL;
911
912 INIT_LIST_HEAD(&p->children);
913 INIT_LIST_HEAD(&p->sibling);
914 p->vfork_done = NULL;
915 spin_lock_init(&p->alloc_lock);
916 spin_lock_init(&p->proc_lock);
917
918 clear_tsk_thread_flag(p, TIF_SIGPENDING);
919 init_sigpending(&p->pending);
920
921 p->utime = cputime_zero;
922 p->stime = cputime_zero;
923 p->sched_time = 0;
924 p->rchar = 0; /* I/O counter: bytes read */
925 p->wchar = 0; /* I/O counter: bytes written */
926 p->syscr = 0; /* I/O counter: read syscalls */
927 p->syscw = 0; /* I/O counter: write syscalls */
928 acct_clear_integrals(p);
929
930 p->it_virt_expires = cputime_zero;
931 p->it_prof_expires = cputime_zero;
932 p->it_sched_expires = 0;
933 INIT_LIST_HEAD(&p->cpu_timers[0]);
934 INIT_LIST_HEAD(&p->cpu_timers[1]);
935 INIT_LIST_HEAD(&p->cpu_timers[2]);
936
937 p->lock_depth = -1; /* -1 = no lock */
938 do_posix_clock_monotonic_gettime(&p->start_time);
939 p->security = NULL;
940 p->io_context = NULL;
941 p->io_wait = NULL;
942 p->audit_context = NULL;
943#ifdef CONFIG_NUMA
944 p->mempolicy = mpol_copy(p->mempolicy);
945 if (IS_ERR(p->mempolicy)) {
946 retval = PTR_ERR(p->mempolicy);
947 p->mempolicy = NULL;
948 goto bad_fork_cleanup;
949 }
950#endif
951
952 p->tgid = p->pid;
953 if (clone_flags & CLONE_THREAD)
954 p->tgid = current->tgid;
955
956 if ((retval = security_task_alloc(p)))
957 goto bad_fork_cleanup_policy;
958 if ((retval = audit_alloc(p)))
959 goto bad_fork_cleanup_security;
960 /* copy all the process information */
961 if ((retval = copy_semundo(clone_flags, p)))
962 goto bad_fork_cleanup_audit;
963 if ((retval = copy_files(clone_flags, p)))
964 goto bad_fork_cleanup_semundo;
965 if ((retval = copy_fs(clone_flags, p)))
966 goto bad_fork_cleanup_files;
967 if ((retval = copy_sighand(clone_flags, p)))
968 goto bad_fork_cleanup_fs;
969 if ((retval = copy_signal(clone_flags, p)))
970 goto bad_fork_cleanup_sighand;
971 if ((retval = copy_mm(clone_flags, p)))
972 goto bad_fork_cleanup_signal;
973 if ((retval = copy_keys(clone_flags, p)))
974 goto bad_fork_cleanup_mm;
975 if ((retval = copy_namespace(clone_flags, p)))
976 goto bad_fork_cleanup_keys;
977 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
978 if (retval)
979 goto bad_fork_cleanup_namespace;
980
981 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
982 /*
983 * Clear TID on mm_release()?
984 */
985 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
986
987 /*
988 * Syscall tracing should be turned off in the child regardless
989 * of CLONE_PTRACE.
990 */
991 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
992
993 /* Our parent execution domain becomes the current domain.
994 These must match for thread signalling to apply. */
995
996 p->parent_exec_id = p->self_exec_id;
997
998 /* ok, now we should be set up.. */
999 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1000 p->pdeath_signal = 0;
1001 p->exit_state = 0;
1002
1003 /* Perform scheduler related setup */
1004 sched_fork(p);
1005
1006 /*
1007 * Ok, make it visible to the rest of the system.
1008 * We don't wake it up yet.
1009 */
1010 p->group_leader = p;
1011 INIT_LIST_HEAD(&p->ptrace_children);
1012 INIT_LIST_HEAD(&p->ptrace_list);
1013
1014 /* Need tasklist lock for parent etc handling! */
1015 write_lock_irq(&tasklist_lock);
1016
1017 /*
1018 * The task hasn't been attached yet, so cpus_allowed mask cannot
1019 * have changed. The cpus_allowed mask of the parent may have
1020 * changed after it was first copied, and it may then move to
1021 * another CPU - so we re-copy it here and set the child's CPU to
1022 * the parent's CPU. This avoids a lot of nasty races.
1023 */
1024 p->cpus_allowed = current->cpus_allowed;
1025 set_task_cpu(p, smp_processor_id());
1026
1027 /*
1028 * Check for pending SIGKILL! The new thread should not be allowed
1029 * to slip out of an OOM kill (or a normal SIGKILL).
1030 */
1031 if (sigismember(&current->pending.signal, SIGKILL)) {
1032 write_unlock_irq(&tasklist_lock);
1033 retval = -EINTR;
1034 goto bad_fork_cleanup_namespace;
1035 }
1036
1037 /* CLONE_PARENT re-uses the old parent */
1038 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1039 p->real_parent = current->real_parent;
1040 else
1041 p->real_parent = current;
1042 p->parent = p->real_parent;
1043
1044 if (clone_flags & CLONE_THREAD) {
1045 spin_lock(&current->sighand->siglock);
1046 /*
1047 * Important: if an exit-all has been started then
1048 * do not create this new thread - the whole thread
1049 * group is supposed to exit anyway.
1050 */
1051 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1052 spin_unlock(&current->sighand->siglock);
1053 write_unlock_irq(&tasklist_lock);
1054 retval = -EAGAIN;
1055 goto bad_fork_cleanup_namespace;
1056 }
1057 p->group_leader = current->group_leader;
1058
1059 if (current->signal->group_stop_count > 0) {
1060 /*
1061 * There is an all-stop in progress for the group.
1062 * We ourselves will stop as soon as we check signals.
1063 * Make the new thread part of that group stop too.
1064 */
1065 current->signal->group_stop_count++;
1066 set_tsk_thread_flag(p, TIF_SIGPENDING);
1067 }
1068
1069 if (!cputime_eq(current->signal->it_virt_expires,
1070 cputime_zero) ||
1071 !cputime_eq(current->signal->it_prof_expires,
1072 cputime_zero) ||
1073 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
1074 !list_empty(&current->signal->cpu_timers[0]) ||
1075 !list_empty(&current->signal->cpu_timers[1]) ||
1076 !list_empty(&current->signal->cpu_timers[2])) {
1077 /*
1078 * Have child wake up on its first tick to check
1079 * for process CPU timers.
1080 */
1081 p->it_prof_expires = jiffies_to_cputime(1);
1082 }
1083
1084 spin_unlock(&current->sighand->siglock);
1085 }
1086
1087 SET_LINKS(p);
1088 if (unlikely(p->ptrace & PT_PTRACED))
1089 __ptrace_link(p, current->parent);
1090
1091 cpuset_fork(p);
1092
1093 attach_pid(p, PIDTYPE_PID, p->pid);
1094 attach_pid(p, PIDTYPE_TGID, p->tgid);
1095 if (thread_group_leader(p)) {
1096 attach_pid(p, PIDTYPE_PGID, process_group(p));
1097 attach_pid(p, PIDTYPE_SID, p->signal->session);
1098 if (p->pid)
1099 __get_cpu_var(process_counts)++;
1100 }
1101
1102 nr_threads++;
1103 total_forks++;
1104 write_unlock_irq(&tasklist_lock);
1105 retval = 0;
1106
1107fork_out:
1108 if (retval)
1109 return ERR_PTR(retval);
1110 return p;
1111
1112bad_fork_cleanup_namespace:
1113 exit_namespace(p);
1114bad_fork_cleanup_keys:
1115 exit_keys(p);
1116bad_fork_cleanup_mm:
1117 if (p->mm)
1118 mmput(p->mm);
1119bad_fork_cleanup_signal:
1120 exit_signal(p);
1121bad_fork_cleanup_sighand:
1122 exit_sighand(p);
1123bad_fork_cleanup_fs:
1124 exit_fs(p); /* blocking */
1125bad_fork_cleanup_files:
1126 exit_files(p); /* blocking */
1127bad_fork_cleanup_semundo:
1128 exit_sem(p);
1129bad_fork_cleanup_audit:
1130 audit_free(p);
1131bad_fork_cleanup_security:
1132 security_task_free(p);
1133bad_fork_cleanup_policy:
1134#ifdef CONFIG_NUMA
1135 mpol_free(p->mempolicy);
1136#endif
1137bad_fork_cleanup:
1138 if (p->binfmt)
1139 module_put(p->binfmt->module);
1140bad_fork_cleanup_put_domain:
1141 module_put(p->thread_info->exec_domain->module);
1142bad_fork_cleanup_count:
1143 put_group_info(p->group_info);
1144 atomic_dec(&p->user->processes);
1145 free_uid(p->user);
1146bad_fork_free:
1147 free_task(p);
1148 goto fork_out;
1149}
1150
1151struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1152{
1153 memset(regs, 0, sizeof(struct pt_regs));
1154 return regs;
1155}
1156
1157task_t * __devinit fork_idle(int cpu)
1158{
1159 task_t *task;
1160 struct pt_regs regs;
1161
1162 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
1163 if (!task)
1164 return ERR_PTR(-ENOMEM);
1165 init_idle(task, cpu);
1166 unhash_process(task);
1167 return task;
1168}
1169
1170static inline int fork_traceflag (unsigned clone_flags)
1171{
1172 if (clone_flags & CLONE_UNTRACED)
1173 return 0;
1174 else if (clone_flags & CLONE_VFORK) {
1175 if (current->ptrace & PT_TRACE_VFORK)
1176 return PTRACE_EVENT_VFORK;
1177 } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
1178 if (current->ptrace & PT_TRACE_CLONE)
1179 return PTRACE_EVENT_CLONE;
1180 } else if (current->ptrace & PT_TRACE_FORK)
1181 return PTRACE_EVENT_FORK;
1182
1183 return 0;
1184}
1185
1186/*
1187 * Ok, this is the main fork-routine.
1188 *
1189 * It copies the process, and if successful kick-starts
1190 * it and waits for it to finish using the VM if required.
1191 */
1192long do_fork(unsigned long clone_flags,
1193 unsigned long stack_start,
1194 struct pt_regs *regs,
1195 unsigned long stack_size,
1196 int __user *parent_tidptr,
1197 int __user *child_tidptr)
1198{
1199 struct task_struct *p;
1200 int trace = 0;
1201 long pid = alloc_pidmap();
1202
1203 if (pid < 0)
1204 return -EAGAIN;
1205 if (unlikely(current->ptrace)) {
1206 trace = fork_traceflag (clone_flags);
1207 if (trace)
1208 clone_flags |= CLONE_PTRACE;
1209 }
1210
1211 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
1212 /*
1213 * Do this prior to waking up the new thread - the thread pointer
1214 * might get invalid after that point, if the thread exits quickly.
1215 */
1216 if (!IS_ERR(p)) {
1217 struct completion vfork;
1218
1219 if (clone_flags & CLONE_VFORK) {
1220 p->vfork_done = &vfork;
1221 init_completion(&vfork);
1222 }
1223
1224 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
1225 /*
1226 * We'll start up with an immediate SIGSTOP.
1227 */
1228 sigaddset(&p->pending.signal, SIGSTOP);
1229 set_tsk_thread_flag(p, TIF_SIGPENDING);
1230 }
1231
1232 if (!(clone_flags & CLONE_STOPPED))
1233 wake_up_new_task(p, clone_flags);
1234 else
1235 p->state = TASK_STOPPED;
1236
1237 if (unlikely (trace)) {
1238 current->ptrace_message = pid;
1239 ptrace_notify ((trace << 8) | SIGTRAP);
1240 }
1241
1242 if (clone_flags & CLONE_VFORK) {
1243 wait_for_completion(&vfork);
1244 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
1245 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1246 }
1247 } else {
1248 free_pidmap(pid);
1249 pid = PTR_ERR(p);
1250 }
1251 return pid;
1252}
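For orientation, the architecture entry points of this era reduce to do_fork() calls roughly like the sketch below; 'sp' stands in for the arch-specific user stack pointer taken from regs, so treat the exact wrappers as per-arch details rather than what is shown here.

/* fork(): duplicate everything, parent gets SIGCHLD when the child exits */
do_fork(SIGCHLD, sp, regs, 0, NULL, NULL);

/* vfork(): share the VM and block the parent until the child execs or exits */
do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, sp, regs, 0, NULL, NULL);

/* thread-style clone(): share VM, files, fs and signal handlers */
do_fork(CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD,
	child_stack, regs, 0, parent_tidptr, child_tidptr);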
1253
1254void __init proc_caches_init(void)
1255{
1256 sighand_cachep = kmem_cache_create("sighand_cache",
1257 sizeof(struct sighand_struct), 0,
1258 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1259 signal_cachep = kmem_cache_create("signal_cache",
1260 sizeof(struct signal_struct), 0,
1261 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1262 files_cachep = kmem_cache_create("files_cache",
1263 sizeof(struct files_struct), 0,
1264 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1265 fs_cachep = kmem_cache_create("fs_cache",
1266 sizeof(struct fs_struct), 0,
1267 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1268 vm_area_cachep = kmem_cache_create("vm_area_struct",
1269 sizeof(struct vm_area_struct), 0,
1270 SLAB_PANIC, NULL, NULL);
1271 mm_cachep = kmem_cache_create("mm_struct",
1272 sizeof(struct mm_struct), 0,
1273 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1274}
diff --git a/kernel/futex.c b/kernel/futex.c
new file mode 100644
index 000000000000..7b54a672d0ad
--- /dev/null
+++ b/kernel/futex.c
@@ -0,0 +1,798 @@
1/*
2 * Fast Userspace Mutexes (which I call "Futexes!").
3 * (C) Rusty Russell, IBM 2002
4 *
5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7 *
8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier
10 *
11 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
12 * enough at me, Linus for the original (flawed) idea, Matthew
13 * Kirkwood for proof-of-concept implementation.
14 *
15 * "The futexes are also cursed."
16 * "But they come in a choice of three flavours!"
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License
29 * along with this program; if not, write to the Free Software
30 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31 */
32#include <linux/slab.h>
33#include <linux/poll.h>
34#include <linux/fs.h>
35#include <linux/file.h>
36#include <linux/jhash.h>
37#include <linux/init.h>
38#include <linux/futex.h>
39#include <linux/mount.h>
40#include <linux/pagemap.h>
41#include <linux/syscalls.h>
42
43#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
44
45/*
46 * Futexes are matched on equal values of this key.
47 * The key type depends on whether it's a shared or private mapping.
48 * Don't rearrange members without looking at hash_futex().
49 *
50 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
51 * We set bit 0 to indicate if it's an inode-based key.
52 */
53union futex_key {
54 struct {
55 unsigned long pgoff;
56 struct inode *inode;
57 int offset;
58 } shared;
59 struct {
60 unsigned long uaddr;
61 struct mm_struct *mm;
62 int offset;
63 } private;
64 struct {
65 unsigned long word;
66 void *ptr;
67 int offset;
68 } both;
69};
70
71/*
72 * We use this hashed waitqueue instead of a normal wait_queue_t, so
73 * we can wake only the relevant ones (hashed queues may be shared).
74 *
75 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
76 * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
77 * The order of wakeup is always to make the first condition true, then
78 * wake up q->waiters, then make the second condition true.
79 */
80struct futex_q {
81 struct list_head list;
82 wait_queue_head_t waiters;
83
84 /* Which hash list lock to use. */
85 spinlock_t *lock_ptr;
86
87 /* Key which the futex is hashed on. */
88 union futex_key key;
89
90 /* For fd, sigio sent using these. */
91 int fd;
92 struct file *filp;
93};
94
95/*
96 * Split the global futex_lock into every hash list lock.
97 */
98struct futex_hash_bucket {
99 spinlock_t lock;
100 struct list_head chain;
101};
102
103static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
104
105/* Futex-fs vfsmount entry: */
106static struct vfsmount *futex_mnt;
107
108/*
109 * We hash on the keys returned from get_futex_key (see below).
110 */
111static struct futex_hash_bucket *hash_futex(union futex_key *key)
112{
113 u32 hash = jhash2((u32*)&key->both.word,
114 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
115 key->both.offset);
116 return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
117}
118
119/*
120 * Return 1 if two futex_keys are equal, 0 otherwise.
121 */
122static inline int match_futex(union futex_key *key1, union futex_key *key2)
123{
124 return (key1->both.word == key2->both.word
125 && key1->both.ptr == key2->both.ptr
126 && key1->both.offset == key2->both.offset);
127}
128
129/*
130 * Get parameters which are the keys for a futex.
131 *
132 * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
133 * offset_within_page). For private mappings, it's (uaddr, current->mm).
134 * We can usually work out the index without swapping in the page.
135 *
136 * Returns: 0, or negative error code.
137 * The key words are stored in *key on success.
138 *
139 * Should be called holding &current->mm->mmap_sem, but NOT any spinlocks.
140 */
141static int get_futex_key(unsigned long uaddr, union futex_key *key)
142{
143 struct mm_struct *mm = current->mm;
144 struct vm_area_struct *vma;
145 struct page *page;
146 int err;
147
148 /*
149 * The futex address must be "naturally" aligned.
150 */
151 key->both.offset = uaddr % PAGE_SIZE;
152 if (unlikely((key->both.offset % sizeof(u32)) != 0))
153 return -EINVAL;
154 uaddr -= key->both.offset;
155
156 /*
157 * The futex is hashed differently depending on whether
158 * it's in a shared or private mapping. So check vma first.
159 */
160 vma = find_extend_vma(mm, uaddr);
161 if (unlikely(!vma))
162 return -EFAULT;
163
164 /*
165 * Permissions.
166 */
167 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
168 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
169
170 /*
171 * Private mappings are handled in a simple way.
172 *
173 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
174 * it's a read-only handle, it's expected that futexes attach to
175 * the object not the particular process. Therefore we use
176 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
177 * mappings of _writable_ handles.
178 */
179 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
180 key->private.mm = mm;
181 key->private.uaddr = uaddr;
182 return 0;
183 }
184
185 /*
186 * Linear file mappings are also simple.
187 */
188 key->shared.inode = vma->vm_file->f_dentry->d_inode;
189 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
190 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
191 key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
192 + vma->vm_pgoff);
193 return 0;
194 }
195
196 /*
197 * We could walk the page table to read the non-linear
198 * pte, and get the page index without fetching the page
199 * from swap. But that's a lot of code to duplicate here
200 * for a rare case, so we simply fetch the page.
201 */
202
203 /*
204 * Do a quick atomic lookup first - this is the fastpath.
205 */
206 spin_lock(&current->mm->page_table_lock);
207 page = follow_page(mm, uaddr, 0);
208 if (likely(page != NULL)) {
209 key->shared.pgoff =
210 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
211 spin_unlock(&current->mm->page_table_lock);
212 return 0;
213 }
214 spin_unlock(&current->mm->page_table_lock);
215
216 /*
217 * Do it the general way.
218 */
219 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
220 if (err >= 0) {
221 key->shared.pgoff =
222 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
223 put_page(page);
224 return 0;
225 }
226 return err;
227}
228
229/*
230 * Take a reference to the resource addressed by a key.
231 * Can be called while holding spinlocks.
232 *
233 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
234 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
235 */
236static inline void get_key_refs(union futex_key *key)
237{
238 if (key->both.ptr != 0) {
239 if (key->both.offset & 1)
240 atomic_inc(&key->shared.inode->i_count);
241 else
242 atomic_inc(&key->private.mm->mm_count);
243 }
244}
245
246/*
247 * Drop a reference to the resource addressed by a key.
248 * The hash bucket spinlock must not be held.
249 */
250static void drop_key_refs(union futex_key *key)
251{
252 if (key->both.ptr != 0) {
253 if (key->both.offset & 1)
254 iput(key->shared.inode);
255 else
256 mmdrop(key->private.mm);
257 }
258}
259
260static inline int get_futex_value_locked(int *dest, int __user *from)
261{
262 int ret;
263
264 inc_preempt_count();
265 ret = __copy_from_user_inatomic(dest, from, sizeof(int));
266 dec_preempt_count();
267
268 return ret ? -EFAULT : 0;
269}
270
271/*
272 * The hash bucket lock must be held when this is called.
273 * Afterwards, the futex_q must not be accessed.
274 */
275static void wake_futex(struct futex_q *q)
276{
277 list_del_init(&q->list);
278 if (q->filp)
279 send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
280 /*
281 * The lock in wake_up_all() is a crucial memory barrier after the
282 * list_del_init() and also before assigning to q->lock_ptr.
283 */
284 wake_up_all(&q->waiters);
285 /*
286 * The waiting task can free the futex_q as soon as this is written,
287 * without taking any locks. This must come last.
288 */
289 q->lock_ptr = NULL;
290}
291
292/*
293 * Wake up all waiters hashed on the physical page that is mapped
294 * to this virtual address:
295 */
296static int futex_wake(unsigned long uaddr, int nr_wake)
297{
298 union futex_key key;
299 struct futex_hash_bucket *bh;
300 struct list_head *head;
301 struct futex_q *this, *next;
302 int ret;
303
304 down_read(&current->mm->mmap_sem);
305
306 ret = get_futex_key(uaddr, &key);
307 if (unlikely(ret != 0))
308 goto out;
309
310 bh = hash_futex(&key);
311 spin_lock(&bh->lock);
312 head = &bh->chain;
313
314 list_for_each_entry_safe(this, next, head, list) {
315 if (match_futex (&this->key, &key)) {
316 wake_futex(this);
317 if (++ret >= nr_wake)
318 break;
319 }
320 }
321
322 spin_unlock(&bh->lock);
323out:
324 up_read(&current->mm->mmap_sem);
325 return ret;
326}
327
328/*
329 * Requeue all waiters hashed on one physical page to another
330 * physical page.
331 */
332static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
333 int nr_wake, int nr_requeue, int *valp)
334{
335 union futex_key key1, key2;
336 struct futex_hash_bucket *bh1, *bh2;
337 struct list_head *head1;
338 struct futex_q *this, *next;
339 int ret, drop_count = 0;
340
341 retry:
342 down_read(&current->mm->mmap_sem);
343
344 ret = get_futex_key(uaddr1, &key1);
345 if (unlikely(ret != 0))
346 goto out;
347 ret = get_futex_key(uaddr2, &key2);
348 if (unlikely(ret != 0))
349 goto out;
350
351 bh1 = hash_futex(&key1);
352 bh2 = hash_futex(&key2);
353
354 if (bh1 < bh2)
355 spin_lock(&bh1->lock);
356 spin_lock(&bh2->lock);
357 if (bh1 > bh2)
358 spin_lock(&bh1->lock);
359
360 if (likely(valp != NULL)) {
361 int curval;
362
363 ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
364
365 if (unlikely(ret)) {
366 spin_unlock(&bh1->lock);
367 if (bh1 != bh2)
368 spin_unlock(&bh2->lock);
369
370 /* If we would have faulted, release mmap_sem, fault
371 * it in and start all over again.
372 */
373 up_read(&current->mm->mmap_sem);
374
375 ret = get_user(curval, (int __user *)uaddr1);
376
377 if (!ret)
378 goto retry;
379
380 return ret;
381 }
382 if (curval != *valp) {
383 ret = -EAGAIN;
384 goto out_unlock;
385 }
386 }
387
388 head1 = &bh1->chain;
389 list_for_each_entry_safe(this, next, head1, list) {
390 if (!match_futex (&this->key, &key1))
391 continue;
392 if (++ret <= nr_wake) {
393 wake_futex(this);
394 } else {
395 list_move_tail(&this->list, &bh2->chain);
396 this->lock_ptr = &bh2->lock;
397 this->key = key2;
398 get_key_refs(&key2);
399 drop_count++;
400
401 if (ret - nr_wake >= nr_requeue)
402 break;
403 /* Make sure to stop if key1 == key2 */
404 if (head1 == &bh2->chain && head1 != &next->list)
405 head1 = &this->list;
406 }
407 }
408
409out_unlock:
410 spin_unlock(&bh1->lock);
411 if (bh1 != bh2)
412 spin_unlock(&bh2->lock);
413
414 /* drop_key_refs() must be called outside the spinlocks. */
415 while (--drop_count >= 0)
416 drop_key_refs(&key1);
417
418out:
419 up_read(&current->mm->mmap_sem);
420 return ret;
421}
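The *valp compare above is what distinguishes FUTEX_CMP_REQUEUE from plain FUTEX_REQUEUE: the caller tells the kernel what it last saw in the futex word, and -EAGAIN is returned if another thread changed it in the meantime, so the caller can retry. The classic user is a condition-variable broadcast; the fragment below is an illustrative sketch, with cond, mutex and cond_val standing in for a library's own state.

/* Broadcast: wake one waiter and requeue the rest onto the mutex futex,
 * but only if the condvar futex still holds the value we sampled. */
syscall(SYS_futex, &cond, FUTEX_CMP_REQUEUE, 1,
	(void *)(long)INT_MAX /* nr_requeue, carried in the timeout slot */,
	&mutex, cond_val);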
422
423/* The key must be already stored in q->key. */
424static inline struct futex_hash_bucket *
425queue_lock(struct futex_q *q, int fd, struct file *filp)
426{
427 struct futex_hash_bucket *bh;
428
429 q->fd = fd;
430 q->filp = filp;
431
432 init_waitqueue_head(&q->waiters);
433
434 get_key_refs(&q->key);
435 bh = hash_futex(&q->key);
436 q->lock_ptr = &bh->lock;
437
438 spin_lock(&bh->lock);
439 return bh;
440}
441
442static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
443{
444 list_add_tail(&q->list, &bh->chain);
445 spin_unlock(&bh->lock);
446}
447
448static inline void
449queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
450{
451 spin_unlock(&bh->lock);
452 drop_key_refs(&q->key);
453}
454
455/*
456 * queue_me and unqueue_me must be called as a pair, each
457 * exactly once. They are called with the hashed spinlock held.
458 */
459
460/* The key must be already stored in q->key. */
461static void queue_me(struct futex_q *q, int fd, struct file *filp)
462{
463 struct futex_hash_bucket *bh;
464 bh = queue_lock(q, fd, filp);
465 __queue_me(q, bh);
466}
467
468/* Return 1 if we were still queued (ie. 0 means we were woken) */
469static int unqueue_me(struct futex_q *q)
470{
471 int ret = 0;
472 spinlock_t *lock_ptr;
473
474 /* In the common case we don't take the spinlock, which is nice. */
475 retry:
476 lock_ptr = q->lock_ptr;
477 if (lock_ptr != 0) {
478 spin_lock(lock_ptr);
479 /*
480 * q->lock_ptr can change between reading it and
481 * spin_lock(), causing us to take the wrong lock. This
482 * corrects the race condition.
483 *
484 * Reasoning goes like this: if we have the wrong lock,
485 * q->lock_ptr must have changed (maybe several times)
486 * between reading it and the spin_lock(). It can
487 * change again after the spin_lock() but only if it was
488 * already changed before the spin_lock(). It cannot,
489 * however, change back to the original value. Therefore
490 * we can detect whether we acquired the correct lock.
491 */
492 if (unlikely(lock_ptr != q->lock_ptr)) {
493 spin_unlock(lock_ptr);
494 goto retry;
495 }
496 WARN_ON(list_empty(&q->list));
497 list_del(&q->list);
498 spin_unlock(lock_ptr);
499 ret = 1;
500 }
501
502 drop_key_refs(&q->key);
503 return ret;
504}
505
506static int futex_wait(unsigned long uaddr, int val, unsigned long time)
507{
508 DECLARE_WAITQUEUE(wait, current);
509 int ret, curval;
510 struct futex_q q;
511 struct futex_hash_bucket *bh;
512
513 retry:
514 down_read(&current->mm->mmap_sem);
515
516 ret = get_futex_key(uaddr, &q.key);
517 if (unlikely(ret != 0))
518 goto out_release_sem;
519
520 bh = queue_lock(&q, -1, NULL);
521
522 /*
523 * Access the page AFTER the futex is queued.
524 * Order is important:
525 *
526 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
527 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
528 *
529 * The basic logical guarantee of a futex is that it blocks ONLY
530 * if cond(var) is known to be true at the time of blocking, for
531 * any cond. If we queued after testing *uaddr, that would open
532 * a race condition where we could block indefinitely with
533 * cond(var) false, which would violate the guarantee.
534 *
535 * A consequence is that futex_wait() can return zero and absorb
536 * a wakeup when *uaddr != val on entry to the syscall. This is
537 * rare, but normal.
538 *
539 * We hold the mmap semaphore, so the mapping cannot have changed
540 * since we looked it up in get_futex_key.
541 */
542
543 ret = get_futex_value_locked(&curval, (int __user *)uaddr);
544
545 if (unlikely(ret)) {
546 queue_unlock(&q, bh);
547
548 /* If we would have faulted, release mmap_sem, fault it in and
549 * start all over again.
550 */
551 up_read(&current->mm->mmap_sem);
552
553 ret = get_user(curval, (int __user *)uaddr);
554
555 if (!ret)
556 goto retry;
557 return ret;
558 }
559 if (curval != val) {
560 ret = -EWOULDBLOCK;
561 queue_unlock(&q, bh);
562 goto out_release_sem;
563 }
564
565 /* Only actually queue if *uaddr contained val. */
566 __queue_me(&q, bh);
567
568 /*
569 * Now the futex is queued and we have checked the data, we
570 * don't want to hold mmap_sem while we sleep.
571 */
572 up_read(&current->mm->mmap_sem);
573
574 /*
575 * There might have been scheduling since the queue_me(), as we
576 * cannot hold a spinlock across the get_user() in case it
577 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
578 * queueing ourselves into the futex hash. This code thus has to
579 * rely on the futex_wake() code removing us from hash when it
580 * wakes us up.
581 */
582
583 /* add_wait_queue is the barrier after __set_current_state. */
584 __set_current_state(TASK_INTERRUPTIBLE);
585 add_wait_queue(&q.waiters, &wait);
586 /*
587 * !list_empty() is safe here without any lock.
588 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
589 */
590 if (likely(!list_empty(&q.list)))
591 time = schedule_timeout(time);
592 __set_current_state(TASK_RUNNING);
593
594 /*
595 * NOTE: we don't remove ourselves from the waitqueue because
596 * we are the only user of it.
597 */
598
599 /* If we were woken (and unqueued), we succeeded, whatever. */
600 if (!unqueue_me(&q))
601 return 0;
602 if (time == 0)
603 return -ETIMEDOUT;
604 /* We expect signal_pending(current), but another thread may
605 * have handled it for us already. */
606 return -EINTR;
607
608 out_release_sem:
609 up_read(&current->mm->mmap_sem);
610 return ret;
611}
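Putting the ordering comment above into a concrete shape: a minimal userspace event flag, assuming direct syscall() access and a plain int shared between two threads. Because futex_wait() re-reads the word under the hash-bucket lock after queueing, a wake that lands between the waiter's test and its syscall is not lost; the call simply fails with EWOULDBLOCK and the loop re-tests.

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static volatile int flag;

static void waiter(void)
{
	while (flag == 0)	/* val = var; if (cond(val)) futex_wait(...) */
		syscall(SYS_futex, (void *)&flag, FUTEX_WAIT, 0, NULL, NULL, 0);
}

static void waker(void)
{
	flag = 1;		/* publish the new state first ... */
	syscall(SYS_futex, (void *)&flag, FUTEX_WAKE, 1, NULL, NULL, 0);	/* ... then wake */
}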
612
613static int futex_close(struct inode *inode, struct file *filp)
614{
615 struct futex_q *q = filp->private_data;
616
617 unqueue_me(q);
618 kfree(q);
619 return 0;
620}
621
622/* This is one-shot: once it's gone off you need a new fd */
623static unsigned int futex_poll(struct file *filp,
624 struct poll_table_struct *wait)
625{
626 struct futex_q *q = filp->private_data;
627 int ret = 0;
628
629 poll_wait(filp, &q->waiters, wait);
630
631 /*
632 * list_empty() is safe here without any lock.
633 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
634 */
635 if (list_empty(&q->list))
636 ret = POLLIN | POLLRDNORM;
637
638 return ret;
639}
640
641static struct file_operations futex_fops = {
642 .release = futex_close,
643 .poll = futex_poll,
644};
645
646/*
647 * Signal allows caller to avoid the race which would occur if they
648 * set the sigio stuff up afterwards.
649 */
650static int futex_fd(unsigned long uaddr, int signal)
651{
652 struct futex_q *q;
653 struct file *filp;
654 int ret, err;
655
656 ret = -EINVAL;
657 if (signal < 0 || signal > _NSIG)
658 goto out;
659
660 ret = get_unused_fd();
661 if (ret < 0)
662 goto out;
663 filp = get_empty_filp();
664 if (!filp) {
665 put_unused_fd(ret);
666 ret = -ENFILE;
667 goto out;
668 }
669 filp->f_op = &futex_fops;
670 filp->f_vfsmnt = mntget(futex_mnt);
671 filp->f_dentry = dget(futex_mnt->mnt_root);
672 filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
673
674 if (signal) {
675 int err;
676 err = f_setown(filp, current->pid, 1);
677 if (err < 0) {
678 put_unused_fd(ret);
679 put_filp(filp);
680 ret = err;
681 goto out;
682 }
683 filp->f_owner.signum = signal;
684 }
685
686 q = kmalloc(sizeof(*q), GFP_KERNEL);
687 if (!q) {
688 put_unused_fd(ret);
689 put_filp(filp);
690 ret = -ENOMEM;
691 goto out;
692 }
693
694 down_read(&current->mm->mmap_sem);
695 err = get_futex_key(uaddr, &q->key);
696
697 if (unlikely(err != 0)) {
698 up_read(&current->mm->mmap_sem);
699 put_unused_fd(ret);
700 put_filp(filp);
701 kfree(q);
702 return err;
703 }
704
705 /*
706 * queue_me() must be called before releasing mmap_sem, because
707 * key->shared.inode needs to be referenced while holding it.
708 */
709 filp->private_data = q;
710
711 queue_me(q, ret, filp);
712 up_read(&current->mm->mmap_sem);
713
714 /* Now we map fd to filp, so userspace can access it */
715 fd_install(ret, filp);
716out:
717 return ret;
718}
719
720long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
721 unsigned long uaddr2, int val2, int val3)
722{
723 int ret;
724
725 switch (op) {
726 case FUTEX_WAIT:
727 ret = futex_wait(uaddr, val, timeout);
728 break;
729 case FUTEX_WAKE:
730 ret = futex_wake(uaddr, val);
731 break;
732 case FUTEX_FD:
733 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
734 ret = futex_fd(uaddr, val);
735 break;
736 case FUTEX_REQUEUE:
737 ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
738 break;
739 case FUTEX_CMP_REQUEUE:
740 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
741 break;
742 default:
743 ret = -ENOSYS;
744 }
745 return ret;
746}
747
748
749asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
750 struct timespec __user *utime, u32 __user *uaddr2,
751 int val3)
752{
753 struct timespec t;
754 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
755 int val2 = 0;
756
757 if ((op == FUTEX_WAIT) && utime) {
758 if (copy_from_user(&t, utime, sizeof(t)) != 0)
759 return -EFAULT;
760 timeout = timespec_to_jiffies(&t) + 1;
761 }
762 /*
763 * The requeue parameter is passed in 'utime' when op >= FUTEX_REQUEUE.
764 */
765 if (op >= FUTEX_REQUEUE)
766 val2 = (int) (unsigned long) utime;
767
768 return do_futex((unsigned long)uaddr, op, val, timeout,
769 (unsigned long)uaddr2, val2, val3);
770}
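Note how the utime argument is overloaded: for FUTEX_WAIT it is a relative struct timespec converted to a jiffies timeout, while for the requeue operations the same slot carries nr_requeue as a plain integer. A caller's-eye sketch of the timed wait, with uaddr and val standing in for the application's futex word and expected value:

struct timespec rel = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 };

/* Wait at most ~500ms for *uaddr to change away from 'val';
 * the timespec is relative and is rounded up to a jiffies timeout. */
syscall(SYS_futex, uaddr, FUTEX_WAIT, val, &rel, NULL, 0);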
771
772static struct super_block *
773futexfs_get_sb(struct file_system_type *fs_type,
774 int flags, const char *dev_name, void *data)
775{
776 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
777}
778
779static struct file_system_type futex_fs_type = {
780 .name = "futexfs",
781 .get_sb = futexfs_get_sb,
782 .kill_sb = kill_anon_super,
783};
784
785static int __init init(void)
786{
787 unsigned int i;
788
789 register_filesystem(&futex_fs_type);
790 futex_mnt = kern_mount(&futex_fs_type);
791
792 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
793 INIT_LIST_HEAD(&futex_queues[i].chain);
794 spin_lock_init(&futex_queues[i].lock);
795 }
796 return 0;
797}
798__initcall(init);
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
new file mode 100644
index 000000000000..388977f3e9b7
--- /dev/null
+++ b/kernel/intermodule.c
@@ -0,0 +1,182 @@
1/* Deprecated, do not use. Moved from module.c to here. --RR */
2
3/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
4#include <linux/module.h>
5#include <linux/kmod.h>
6#include <linux/spinlock.h>
7#include <linux/list.h>
8#include <linux/slab.h>
9
10/* inter_module functions are always available, even when the kernel is
11 * compiled without modules. Consumers of inter_module_xxx routines
12 * will always work, even when both sides are built into the kernel;
13 * this approach removes lots of #ifdefs in mainline code.
14 */
15
16static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
17static DEFINE_SPINLOCK(ime_lock);
18static int kmalloc_failed;
19
20struct inter_module_entry {
21 struct list_head list;
22 const char *im_name;
23 struct module *owner;
24 const void *userdata;
25};
26
27/**
28 * inter_module_register - register a new set of inter module data.
29 * @im_name: an arbitrary string to identify the data, must be unique
30 * @owner: module that is registering the data, always use THIS_MODULE
31 * @userdata: pointer to arbitrary userdata to be registered
32 *
33 * Description: Check that the im_name has not already been registered,
34 * complain if it has. For new data, add it to the inter_module_entry
35 * list.
36 */
37void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
38{
39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new;
41
42 if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
46 im_name);
47 kmalloc_failed = 1;
48 return;
49 }
50 memset(ime_new, 0, sizeof(*ime_new));
51 ime_new->im_name = im_name;
52 ime_new->owner = owner;
53 ime_new->userdata = userdata;
54
55 spin_lock(&ime_lock);
56 list_for_each(tmp, &ime_list) {
57 ime = list_entry(tmp, struct inter_module_entry, list);
58 if (strcmp(ime->im_name, im_name) == 0) {
59 spin_unlock(&ime_lock);
60 kfree(ime_new);
61 /* Program logic error, fatal */
62 printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
63 BUG();
64 }
65 }
66 list_add(&(ime_new->list), &ime_list);
67 spin_unlock(&ime_lock);
68}
69
70/**
71 * inter_module_unregister - unregister a set of inter module data.
72 * @im_name: an arbitrary string to identify the data, must be unique
73 *
74 * Description: Check that the im_name has been registered, complain if
75 * it has not. For existing data, remove it from the
76 * inter_module_entry list.
77 */
78void inter_module_unregister(const char *im_name)
79{
80 struct list_head *tmp;
81 struct inter_module_entry *ime;
82
83 spin_lock(&ime_lock);
84 list_for_each(tmp, &ime_list) {
85 ime = list_entry(tmp, struct inter_module_entry, list);
86 if (strcmp(ime->im_name, im_name) == 0) {
87 list_del(&(ime->list));
88 spin_unlock(&ime_lock);
89 kfree(ime);
90 return;
91 }
92 }
93 spin_unlock(&ime_lock);
94 if (kmalloc_failed) {
95 printk(KERN_ERR
96 "inter_module_unregister: no entry for '%s', "
97 "probably caused by previous kmalloc failure\n",
98 im_name);
99 return;
100 }
101 else {
102 /* Program logic error, fatal */
103 printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
104 BUG();
105 }
106}
107
108/**
109 * inter_module_get - return arbitrary userdata from another module.
110 * @im_name: an arbitrary string to identify the data, must be unique
111 *
112 * Description: If the im_name has not been registered, return NULL.
113 * Try to increment the use count on the owning module, if that fails
114 * then return NULL. Otherwise return the userdata.
115 */
116static const void *inter_module_get(const char *im_name)
117{
118 struct list_head *tmp;
119 struct inter_module_entry *ime;
120 const void *result = NULL;
121
122 spin_lock(&ime_lock);
123 list_for_each(tmp, &ime_list) {
124 ime = list_entry(tmp, struct inter_module_entry, list);
125 if (strcmp(ime->im_name, im_name) == 0) {
126 if (try_module_get(ime->owner))
127 result = ime->userdata;
128 break;
129 }
130 }
131 spin_unlock(&ime_lock);
132 return(result);
133}
134
135/**
136 * inter_module_get_request - im get with automatic request_module.
137 * @im_name: an arbitrary string to identify the data, must be unique
138 * @modname: module that is expected to register im_name
139 *
140 * Description: If inter_module_get fails, do request_module then retry.
141 */
142const void *inter_module_get_request(const char *im_name, const char *modname)
143{
144 const void *result = inter_module_get(im_name);
145 if (!result) {
146 request_module("%s", modname);
147 result = inter_module_get(im_name);
148 }
149 return(result);
150}
151
152/**
153 * inter_module_put - release use of data from another module.
154 * @im_name: an arbitrary string to identify the data, must be unique
155 *
156 * Description: If the im_name has not been registered, complain,
157 * otherwise decrement the use count on the owning module.
158 */
159void inter_module_put(const char *im_name)
160{
161 struct list_head *tmp;
162 struct inter_module_entry *ime;
163
164 spin_lock(&ime_lock);
165 list_for_each(tmp, &ime_list) {
166 ime = list_entry(tmp, struct inter_module_entry, list);
167 if (strcmp(ime->im_name, im_name) == 0) {
168 if (ime->owner)
169 module_put(ime->owner);
170 spin_unlock(&ime_lock);
171 return;
172 }
173 }
174 spin_unlock(&ime_lock);
175 printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
176 BUG();
177}
178
179EXPORT_SYMBOL(inter_module_register);
180EXPORT_SYMBOL(inter_module_unregister);
181EXPORT_SYMBOL(inter_module_get_request);
182EXPORT_SYMBOL(inter_module_put);
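A sketch of how this (already deprecated) pairing is meant to be used; "example-ops", example_provider and struct example_ops are invented names for illustration.

/* Provider module */
struct example_ops {
	void (*hello)(void);
};

static struct example_ops my_ops = { .hello = NULL /* fill in */ };

static int __init provider_init(void)
{
	inter_module_register("example-ops", THIS_MODULE, &my_ops);
	return 0;
}

static void __exit provider_exit(void)
{
	inter_module_unregister("example-ops");
}

/* Consumer module: loads the provider on demand and pins it while in use */
static void consumer_use(void)
{
	const struct example_ops *ops =
		inter_module_get_request("example-ops", "example_provider");

	if (ops) {
		/* ... call through ops here ... */
		inter_module_put("example-ops");	/* drops the provider's module ref */
	}
}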
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
new file mode 100644
index 000000000000..49378738ff5e
--- /dev/null
+++ b/kernel/irq/Makefile
@@ -0,0 +1,5 @@
1
2obj-y := handle.o manage.o spurious.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o
5
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
new file mode 100644
index 000000000000..98d62d8efeaf
--- /dev/null
+++ b/kernel/irq/autoprobe.c
@@ -0,0 +1,189 @@
1/*
2 * linux/kernel/irq/autoprobe.c
3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the interrupt probing code and driver APIs.
7 */
8
9#include <linux/irq.h>
10#include <linux/module.h>
11#include <linux/interrupt.h>
12
13/*
14 * Autodetection depends on the fact that any interrupt that
15 * comes in on to an unassigned handler will get stuck with
16 * "IRQ_WAITING" cleared and the interrupt disabled.
17 */
18static DECLARE_MUTEX(probe_sem);
19
20/**
21 * probe_irq_on - begin an interrupt autodetect
22 *
23 * Commence probing for an interrupt. The interrupts are scanned
24 * and a mask of potential interrupt lines is returned.
25 *
26 */
27unsigned long probe_irq_on(void)
28{
29 unsigned long val, delay;
30 irq_desc_t *desc;
31 unsigned int i;
32
33 down(&probe_sem);
34 /*
35 * something may have generated an irq long ago and we want to
36 * flush such a longstanding irq before considering it as spurious.
37 */
38 for (i = NR_IRQS-1; i > 0; i--) {
39 desc = irq_desc + i;
40
41 spin_lock_irq(&desc->lock);
42 if (!irq_desc[i].action)
43 irq_desc[i].handler->startup(i);
44 spin_unlock_irq(&desc->lock);
45 }
46
47 /* Wait for longstanding interrupts to trigger. */
48 for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
49 /* about 20ms delay */ barrier();
50
51 /*
52 * enable any unassigned irqs
53 * (we must start up again here because if a longstanding irq
54 * happened in the previous stage, it may have masked itself)
55 */
56 for (i = NR_IRQS-1; i > 0; i--) {
57 desc = irq_desc + i;
58
59 spin_lock_irq(&desc->lock);
60 if (!desc->action) {
61 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
62 if (desc->handler->startup(i))
63 desc->status |= IRQ_PENDING;
64 }
65 spin_unlock_irq(&desc->lock);
66 }
67
68 /*
69 * Wait for spurious interrupts to trigger
70 */
71 for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
72 /* about 100ms delay */ barrier();
73
74 /*
75 * Now filter out any obviously spurious interrupts
76 */
77 val = 0;
78 for (i = 0; i < NR_IRQS; i++) {
79 irq_desc_t *desc = irq_desc + i;
80 unsigned int status;
81
82 spin_lock_irq(&desc->lock);
83 status = desc->status;
84
85 if (status & IRQ_AUTODETECT) {
86 /* It triggered already - consider it spurious. */
87 if (!(status & IRQ_WAITING)) {
88 desc->status = status & ~IRQ_AUTODETECT;
89 desc->handler->shutdown(i);
90 } else
91 if (i < 32)
92 val |= 1 << i;
93 }
94 spin_unlock_irq(&desc->lock);
95 }
96
97 return val;
98}
99
100EXPORT_SYMBOL(probe_irq_on);
101
102/**
103 * probe_irq_mask - scan a bitmap of interrupt lines
104 * @val: mask of interrupts to consider
105 *
106 * Scan the interrupt lines and return a bitmap of active
107 * autodetect interrupts. The interrupt probe logic state
108 * is then returned to its previous value.
109 *
110 * Note: we need to scan all the irq's even though we will
111 * only return autodetect irq numbers - just so that we reset
112 * them all to a known state.
113 */
114unsigned int probe_irq_mask(unsigned long val)
115{
116 unsigned int mask;
117 int i;
118
119 mask = 0;
120 for (i = 0; i < NR_IRQS; i++) {
121 irq_desc_t *desc = irq_desc + i;
122 unsigned int status;
123
124 spin_lock_irq(&desc->lock);
125 status = desc->status;
126
127 if (status & IRQ_AUTODETECT) {
128 if (i < 16 && !(status & IRQ_WAITING))
129 mask |= 1 << i;
130
131 desc->status = status & ~IRQ_AUTODETECT;
132 desc->handler->shutdown(i);
133 }
134 spin_unlock_irq(&desc->lock);
135 }
136 up(&probe_sem);
137
138 return mask & val;
139}
140EXPORT_SYMBOL(probe_irq_mask);
141
142/**
143 * probe_irq_off - end an interrupt autodetect
144 * @val: mask of potential interrupts (unused)
145 *
146 * Scans the unused interrupt lines and returns the line which
147 * appears to have triggered the interrupt. If no interrupt was
148 * found then zero is returned. If more than one interrupt is
149 * found then minus the first candidate is returned to indicate
150 * there is doubt.
151 *
152 * The interrupt probe logic state is returned to its previous
153 * value.
154 *
155 * BUGS: When used in a module (which arguably shouldn't happen)
156 * nothing prevents two IRQ probe callers from overlapping. The
157 * results of this are non-optimal.
158 */
159int probe_irq_off(unsigned long val)
160{
161 int i, irq_found = 0, nr_irqs = 0;
162
163 for (i = 0; i < NR_IRQS; i++) {
164 irq_desc_t *desc = irq_desc + i;
165 unsigned int status;
166
167 spin_lock_irq(&desc->lock);
168 status = desc->status;
169
170 if (status & IRQ_AUTODETECT) {
171 if (!(status & IRQ_WAITING)) {
172 if (!nr_irqs)
173 irq_found = i;
174 nr_irqs++;
175 }
176 desc->status = status & ~IRQ_AUTODETECT;
177 desc->handler->shutdown(i);
178 }
179 spin_unlock_irq(&desc->lock);
180 }
181 up(&probe_sem);
182
183 if (nr_irqs > 1)
184 irq_found = -irq_found;
185 return irq_found;
186}
187
188EXPORT_SYMBOL(probe_irq_off);
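The intended calling sequence for a legacy ISA-style driver, as a sketch; the exampledev name, the delay value and the device-programming step are placeholders.

static int __init exampledev_find_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();
	/* ... program the hardware so that it raises its interrupt ... */
	udelay(100);			/* give the interrupt time to arrive */
	irq = probe_irq_off(mask);	/* 0: nothing seen, <0: several candidates */

	if (irq > 0)
		printk(KERN_INFO "exampledev: probed IRQ %d\n", irq);
	return irq;
}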
189
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
new file mode 100644
index 000000000000..2fb0e46e11f3
--- /dev/null
+++ b/kernel/irq/handle.c
@@ -0,0 +1,193 @@
1/*
2 * linux/kernel/irq/handle.c
3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the core interrupt handling code.
7 */
8
9#include <linux/irq.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13#include <linux/kernel_stat.h>
14
15#include "internals.h"
16
17/*
18 * Linux has a controller-independent interrupt architecture.
19 * Every controller has a 'controller-template' that is used
20 * by the main code to do the right thing. Each driver-visible
21 * interrupt source is transparently wired to the appropriate
22 * controller. Thus drivers need not be aware of the
23 * interrupt-controller.
24 *
25 * The code is designed to be easily extended with new/different
26 * interrupt controllers, without having to do assembly magic or
27 * having to touch the generic code.
28 *
29 * Controller mappings for all interrupt sources:
30 */
31irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
32 [0 ... NR_IRQS-1] = {
33 .handler = &no_irq_type,
34 .lock = SPIN_LOCK_UNLOCKED
35 }
36};
37
38/*
39 * Generic 'no controller' code
40 */
41static void end_none(unsigned int irq) { }
42static void enable_none(unsigned int irq) { }
43static void disable_none(unsigned int irq) { }
44static void shutdown_none(unsigned int irq) { }
45static unsigned int startup_none(unsigned int irq) { return 0; }
46
47static void ack_none(unsigned int irq)
48{
49 /*
50 * 'What should we do if we get a hw irq event on an illegal vector?'
51 * Each architecture has to answer this for itself.
52 */
53 ack_bad_irq(irq);
54}
55
56struct hw_interrupt_type no_irq_type = {
57 .typename = "none",
58 .startup = startup_none,
59 .shutdown = shutdown_none,
60 .enable = enable_none,
61 .disable = disable_none,
62 .ack = ack_none,
63 .end = end_none,
64 .set_affinity = NULL
65};
66
67/*
68 * Special, empty irq handler:
69 */
70irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
71{
72 return IRQ_NONE;
73}
74
75/*
76 * Have got an event to handle:
77 */
78fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
79 struct irqaction *action)
80{
81 int ret, retval = 0, status = 0;
82
83 if (!(action->flags & SA_INTERRUPT))
84 local_irq_enable();
85
86 do {
87 ret = action->handler(irq, action->dev_id, regs);
88 if (ret == IRQ_HANDLED)
89 status |= action->flags;
90 retval |= ret;
91 action = action->next;
92 } while (action);
93
94 if (status & SA_SAMPLE_RANDOM)
95 add_interrupt_randomness(irq);
96 local_irq_disable();
97
98 return retval;
99}
100
101/*
102 * do_IRQ handles all normal device IRQs (the special
103 * SMP cross-CPU interrupts have their own specific
104 * handlers).
105 */
106fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
107{
108 irq_desc_t *desc = irq_desc + irq;
109 struct irqaction * action;
110 unsigned int status;
111
112 kstat_this_cpu.irqs[irq]++;
113 if (desc->status & IRQ_PER_CPU) {
114 irqreturn_t action_ret;
115
116 /*
117 * No locking required for CPU-local interrupts:
118 */
119 desc->handler->ack(irq);
120 action_ret = handle_IRQ_event(irq, regs, desc->action);
121 if (!noirqdebug)
122 note_interrupt(irq, desc, action_ret);
123 desc->handler->end(irq);
124 return 1;
125 }
126
127 spin_lock(&desc->lock);
128 desc->handler->ack(irq);
129 /*
130 * REPLAY is when Linux resends an IRQ that was dropped earlier
131 * WAITING is used by probe to mark irqs that are being tested
132 */
133 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
134 status |= IRQ_PENDING; /* we _want_ to handle it */
135
136 /*
137 * If the IRQ is disabled for whatever reason, we cannot
138 * use the action we have.
139 */
140 action = NULL;
141 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
142 action = desc->action;
143 status &= ~IRQ_PENDING; /* we commit to handling */
144 status |= IRQ_INPROGRESS; /* we are handling it */
145 }
146 desc->status = status;
147
148 /*
149 * If there is no IRQ handler or it was disabled, exit early.
150 * Since we set PENDING, if another processor is handling
151 * a different instance of this same irq, the other processor
152 * will take care of it.
153 */
154 if (unlikely(!action))
155 goto out;
156
157 /*
158 * Edge triggered interrupts need to remember
159 * pending events.
160 * This applies to any hw interrupts that allow a second
161 * instance of the same irq to arrive while we are in do_IRQ
162 * or in the handler. But the code here only handles the _second_
163 * instance of the irq, not the third or fourth. So it is mostly
164 * useful for irq hardware that does not mask cleanly in an
165 * SMP environment.
166 */
167 for (;;) {
168 irqreturn_t action_ret;
169
170 spin_unlock(&desc->lock);
171
172 action_ret = handle_IRQ_event(irq, regs, action);
173
174 spin_lock(&desc->lock);
175 if (!noirqdebug)
176 note_interrupt(irq, desc, action_ret);
177 if (likely(!(desc->status & IRQ_PENDING)))
178 break;
179 desc->status &= ~IRQ_PENDING;
180 }
181 desc->status &= ~IRQ_INPROGRESS;
182
183out:
184 /*
185 * The ->end() handler has to deal with interrupts which got
186 * disabled while the handler was running.
187 */
188 desc->handler->end(irq);
189 spin_unlock(&desc->lock);
190
191 return 1;
192}
193
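
[Editor's note — illustrative sketch, not part of this commit: the handler contract that handle_IRQ_event() relies on. A handler returns IRQ_NONE when its device did not raise the line (which feeds the spurious-IRQ accounting) and IRQ_HANDLED otherwise; the register layout and the pending bit are hypothetical.]

    #include <linux/interrupt.h>
    #include <asm/io.h>

    struct my_dev {				/* hypothetical device state */
            void __iomem *regs;
    };

    static irqreturn_t my_isr(int irq, void *dev_id, struct pt_regs *regs)
    {
            struct my_dev *dev = dev_id;

            if (!(readl(dev->regs) & 0x1))	/* hypothetical "IRQ pending" bit */
                    return IRQ_NONE;	/* lets the core spot stuck/foreign interrupts */

            /* ... acknowledge and service the device ... */
            return IRQ_HANDLED;		/* handled actions contribute their SA_* flags */
    }
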
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
new file mode 100644
index 000000000000..46feba630266
--- /dev/null
+++ b/kernel/irq/internals.h
@@ -0,0 +1,18 @@
1/*
2 * IRQ subsystem internal functions and variables:
3 */
4
5extern int noirqdebug;
6
7#ifdef CONFIG_PROC_FS
8extern void register_irq_proc(unsigned int irq);
9extern void register_handler_proc(unsigned int irq, struct irqaction *action);
10extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
11#else
12static inline void register_irq_proc(unsigned int irq) { }
13static inline void register_handler_proc(unsigned int irq,
14 struct irqaction *action) { }
15static inline void unregister_handler_proc(unsigned int irq,
16 struct irqaction *action) { }
17#endif
18
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
new file mode 100644
index 000000000000..5202e4c4a5b6
--- /dev/null
+++ b/kernel/irq/manage.c
@@ -0,0 +1,349 @@
1/*
2 * linux/kernel/irq/manage.c
3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains driver APIs to the irq subsystem.
7 */
8
9#include <linux/irq.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13
14#include "internals.h"
15
16#ifdef CONFIG_SMP
17
18cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
19
20/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for
23 * This function waits for any pending IRQ handlers for this interrupt
24 * to complete before returning. If you use this function while
25 * holding a resource the IRQ handler may need you will deadlock.
26 *
27 * This function may be called - with care - from IRQ context.
28 */
29void synchronize_irq(unsigned int irq)
30{
31 struct irq_desc *desc = irq_desc + irq;
32
33 while (desc->status & IRQ_INPROGRESS)
34 cpu_relax();
35}
36
37EXPORT_SYMBOL(synchronize_irq);
38
39#endif
40
41/**
42 * disable_irq_nosync - disable an irq without waiting
43 * @irq: Interrupt to disable
44 *
45 * Disable the selected interrupt line. Disables and Enables are
46 * nested.
47 * Unlike disable_irq(), this function does not ensure existing
48 * instances of the IRQ handler have completed before returning.
49 *
50 * This function may be called from IRQ context.
51 */
52void disable_irq_nosync(unsigned int irq)
53{
54 irq_desc_t *desc = irq_desc + irq;
55 unsigned long flags;
56
57 spin_lock_irqsave(&desc->lock, flags);
58 if (!desc->depth++) {
59 desc->status |= IRQ_DISABLED;
60 desc->handler->disable(irq);
61 }
62 spin_unlock_irqrestore(&desc->lock, flags);
63}
64
65EXPORT_SYMBOL(disable_irq_nosync);
66
67/**
68 * disable_irq - disable an irq and wait for completion
69 * @irq: Interrupt to disable
70 *
71 * Disable the selected interrupt line. Enables and Disables are
72 * nested.
73 * This function waits for any pending IRQ handlers for this interrupt
74 * to complete before returning. If you use this function while
75 * holding a resource the IRQ handler may need you will deadlock.
76 *
77 * This function may be called - with care - from IRQ context.
78 */
79void disable_irq(unsigned int irq)
80{
81 irq_desc_t *desc = irq_desc + irq;
82
83 disable_irq_nosync(irq);
84 if (desc->action)
85 synchronize_irq(irq);
86}
87
88EXPORT_SYMBOL(disable_irq);
89
90/**
91 * enable_irq - enable handling of an irq
92 * @irq: Interrupt to enable
93 *
94 * Undoes the effect of one call to disable_irq(). If this
95 * matches the last disable, processing of interrupts on this
96 * IRQ line is re-enabled.
97 *
98 * This function may be called from IRQ context.
99 */
100void enable_irq(unsigned int irq)
101{
102 irq_desc_t *desc = irq_desc + irq;
103 unsigned long flags;
104
105 spin_lock_irqsave(&desc->lock, flags);
106 switch (desc->depth) {
107 case 0:
108 WARN_ON(1);
109 break;
110 case 1: {
111 unsigned int status = desc->status & ~IRQ_DISABLED;
112
113 desc->status = status;
114 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
115 desc->status = status | IRQ_REPLAY;
116 hw_resend_irq(desc->handler,irq);
117 }
118 desc->handler->enable(irq);
119 /* fall-through */
120 }
121 default:
122 desc->depth--;
123 }
124 spin_unlock_irqrestore(&desc->lock, flags);
125}
126
127EXPORT_SYMBOL(enable_irq);
128
129/*
130 * Internal function that tells the architecture code whether a
131 * particular irq has been exclusively allocated or is available
132 * for driver use.
133 */
134int can_request_irq(unsigned int irq, unsigned long irqflags)
135{
136 struct irqaction *action;
137
138 if (irq >= NR_IRQS)
139 return 0;
140
141 action = irq_desc[irq].action;
142 if (action)
143 if (irqflags & action->flags & SA_SHIRQ)
144 action = NULL;
145
146 return !action;
147}
148
149/*
150 * Internal function to register an irqaction - typically used to
151 * allocate special interrupts that are part of the architecture.
152 */
153int setup_irq(unsigned int irq, struct irqaction * new)
154{
155 struct irq_desc *desc = irq_desc + irq;
156 struct irqaction *old, **p;
157 unsigned long flags;
158 int shared = 0;
159
160 if (desc->handler == &no_irq_type)
161 return -ENOSYS;
162 /*
163 * Some drivers like serial.c use request_irq() heavily,
164 * so we have to be careful not to interfere with a
165 * running system.
166 */
167 if (new->flags & SA_SAMPLE_RANDOM) {
168 /*
169 * This function might sleep, we want to call it first,
170 * outside of the atomic block.
171 * Yes, this might clear the entropy pool if the wrong
172 * driver is loaded without actually installing a new
173 * handler, but is this really a problem? Only the
174 * sysadmin is able to do this.
175 */
176 rand_initialize_irq(irq);
177 }
178
179 /*
180 * The following block of code has to be executed atomically
181 */
182 spin_lock_irqsave(&desc->lock,flags);
183 p = &desc->action;
184 if ((old = *p) != NULL) {
185 /* Can't share interrupts unless both agree to */
186 if (!(old->flags & new->flags & SA_SHIRQ)) {
187 spin_unlock_irqrestore(&desc->lock,flags);
188 return -EBUSY;
189 }
190
191 /* add new interrupt at end of irq queue */
192 do {
193 p = &old->next;
194 old = *p;
195 } while (old);
196 shared = 1;
197 }
198
199 *p = new;
200
201 if (!shared) {
202 desc->depth = 0;
203 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
204 IRQ_WAITING | IRQ_INPROGRESS);
205 if (desc->handler->startup)
206 desc->handler->startup(irq);
207 else
208 desc->handler->enable(irq);
209 }
210 spin_unlock_irqrestore(&desc->lock,flags);
211
212 new->irq = irq;
213 register_irq_proc(irq);
214 new->dir = NULL;
215 register_handler_proc(irq, new);
216
217 return 0;
218}
219
220/**
221 * free_irq - free an interrupt
222 * @irq: Interrupt line to free
223 * @dev_id: Device identity to free
224 *
225 * Remove an interrupt handler. The handler is removed and if the
226 * interrupt line is no longer in use by any driver it is disabled.
227 * On a shared IRQ the caller must ensure the interrupt is disabled
228 * on the card it drives before calling this function. The function
229 * does not return until any executing interrupts for this IRQ
230 * have completed.
231 *
232 * This function must not be called from interrupt context.
233 */
234void free_irq(unsigned int irq, void *dev_id)
235{
236 struct irq_desc *desc;
237 struct irqaction **p;
238 unsigned long flags;
239
240 if (irq >= NR_IRQS)
241 return;
242
243 desc = irq_desc + irq;
244 spin_lock_irqsave(&desc->lock,flags);
245 p = &desc->action;
246 for (;;) {
247 struct irqaction * action = *p;
248
249 if (action) {
250 struct irqaction **pp = p;
251
252 p = &action->next;
253 if (action->dev_id != dev_id)
254 continue;
255
256 /* Found it - now remove it from the list of entries */
257 *pp = action->next;
258 if (!desc->action) {
259 desc->status |= IRQ_DISABLED;
260 if (desc->handler->shutdown)
261 desc->handler->shutdown(irq);
262 else
263 desc->handler->disable(irq);
264 }
265 spin_unlock_irqrestore(&desc->lock,flags);
266 unregister_handler_proc(irq, action);
267
268 /* Make sure it's not being used on another CPU */
269 synchronize_irq(irq);
270 kfree(action);
271 return;
272 }
273 printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
274 spin_unlock_irqrestore(&desc->lock,flags);
275 return;
276 }
277}
278
279EXPORT_SYMBOL(free_irq);
280
281/**
282 * request_irq - allocate an interrupt line
283 * @irq: Interrupt line to allocate
284 * @handler: Function to be called when the IRQ occurs
285 * @irqflags: Interrupt type flags
286 * @devname: An ascii name for the claiming device
287 * @dev_id: A cookie passed back to the handler function
288 *
289 * This call allocates interrupt resources and enables the
290 * interrupt line and IRQ handling. From the point this
291 * call is made your handler function may be invoked. Since
292 * your handler function must clear any interrupt the board
293 * raises, you must take care both to initialise your hardware
294 * and to set up the interrupt handler in the right order.
295 *
296 * Dev_id must be globally unique. Normally the address of the
297 * device data structure is used as the cookie. Since the handler
298 * receives this value it makes sense to use it.
299 *
300 * If your interrupt is shared you must pass a non-NULL dev_id
301 * as this is required when freeing the interrupt.
302 *
303 * Flags:
304 *
305 * SA_SHIRQ Interrupt is shared
306 * SA_INTERRUPT Disable local interrupts while processing
307 * SA_SAMPLE_RANDOM The interrupt can be used for entropy
308 *
309 */
310int request_irq(unsigned int irq,
311 irqreturn_t (*handler)(int, void *, struct pt_regs *),
312 unsigned long irqflags, const char * devname, void *dev_id)
313{
314 struct irqaction * action;
315 int retval;
316
317 /*
318 * Sanity-check: shared interrupts must pass in a real dev-ID,
319 * otherwise we'll have trouble later trying to figure out
320 * which interrupt is which (messes up the interrupt freeing
321 * logic etc).
322 */
323 if ((irqflags & SA_SHIRQ) && !dev_id)
324 return -EINVAL;
325 if (irq >= NR_IRQS)
326 return -EINVAL;
327 if (!handler)
328 return -EINVAL;
329
330 action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
331 if (!action)
332 return -ENOMEM;
333
334 action->handler = handler;
335 action->flags = irqflags;
336 cpus_clear(action->mask);
337 action->name = devname;
338 action->next = NULL;
339 action->dev_id = dev_id;
340
341 retval = setup_irq(irq, action);
342 if (retval)
343 kfree(action);
344
345 return retval;
346}
347
348EXPORT_SYMBOL(request_irq);
349
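
[Editor's note — illustrative sketch, not part of this commit: a minimal request_irq()/free_irq() life cycle using the API documented above. The IRQ number, device name and flags are hypothetical; dev_id is a unique cookie and must be non-NULL when SA_SHIRQ is used.]

    #include <linux/interrupt.h>

    #define MY_IRQ 9			/* hypothetical interrupt line */

    static int my_cookie;			/* dev_id: unique, non-NULL (needed for SA_SHIRQ) */

    static irqreturn_t my_isr(int irq, void *dev_id, struct pt_regs *regs)
    {
            return IRQ_HANDLED;		/* see the handler sketch after handle.c */
    }

    static int my_attach(void)
    {
            int err;

            err = request_irq(MY_IRQ, my_isr, SA_SHIRQ, "mydev", &my_cookie);
            if (err)
                    return err;		/* e.g. -EBUSY: line owned with incompatible flags */

            disable_irq(MY_IRQ);		/* nests; waits for running handlers */
            /* ... reprogram the hardware while the line is quiet ... */
            enable_irq(MY_IRQ);
            return 0;
    }

    static void my_detach(void)
    {
            free_irq(MY_IRQ, &my_cookie);	/* must not be called from interrupt context */
    }
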
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
new file mode 100644
index 000000000000..85d08daa6600
--- /dev/null
+++ b/kernel/irq/proc.c
@@ -0,0 +1,159 @@
1/*
2 * linux/kernel/irq/proc.c
3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the /proc/irq/ handling code.
7 */
8
9#include <linux/irq.h>
10#include <linux/proc_fs.h>
11#include <linux/interrupt.h>
12
13static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
14
15#ifdef CONFIG_SMP
16
17/*
18 * The /proc/irq/<irq>/smp_affinity values:
19 */
20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
21
22void __attribute__((weak))
23proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{
25 irq_affinity[irq] = mask_val;
26 irq_desc[irq].handler->set_affinity(irq, mask_val);
27}
28
29static int irq_affinity_read_proc(char *page, char **start, off_t off,
30 int count, int *eof, void *data)
31{
32 int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
33
34 if (count - len < 2)
35 return -EINVAL;
36 len += sprintf(page + len, "\n");
37 return len;
38}
39
40int no_irq_affinity;
41static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
42 unsigned long count, void *data)
43{
44 unsigned int irq = (int)(long)data, full_count = count, err;
45 cpumask_t new_value, tmp;
46
47 if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
48 return -EIO;
49
50 err = cpumask_parse(buffer, count, new_value);
51 if (err)
52 return err;
53
54 /*
55 * Do not allow disabling IRQs completely - it's too easy a
56 * way to make the system unusable accidentally :-) At least
57 * one online CPU still has to be targeted.
58 */
59 cpus_and(tmp, new_value, cpu_online_map);
60 if (cpus_empty(tmp))
61 return -EINVAL;
62
63 proc_set_irq_affinity(irq, new_value);
64
65 return full_count;
66}
67
68#endif
69
70#define MAX_NAMELEN 128
71
72static int name_unique(unsigned int irq, struct irqaction *new_action)
73{
74 struct irq_desc *desc = irq_desc + irq;
75 struct irqaction *action;
76
77 for (action = desc->action ; action; action = action->next)
78 if ((action != new_action) && action->name &&
79 !strcmp(new_action->name, action->name))
80 return 0;
81 return 1;
82}
83
84void register_handler_proc(unsigned int irq, struct irqaction *action)
85{
86 char name [MAX_NAMELEN];
87
88 if (!irq_dir[irq] || action->dir || !action->name ||
89 !name_unique(irq, action))
90 return;
91
92 memset(name, 0, MAX_NAMELEN);
93 snprintf(name, MAX_NAMELEN, "%s", action->name);
94
95 /* create /proc/irq/1234/handler/ */
96 action->dir = proc_mkdir(name, irq_dir[irq]);
97}
98
99#undef MAX_NAMELEN
100
101#define MAX_NAMELEN 10
102
103void register_irq_proc(unsigned int irq)
104{
105 char name [MAX_NAMELEN];
106
107 if (!root_irq_dir ||
108 (irq_desc[irq].handler == &no_irq_type) ||
109 irq_dir[irq])
110 return;
111
112 memset(name, 0, MAX_NAMELEN);
113 sprintf(name, "%d", irq);
114
115 /* create /proc/irq/1234 */
116 irq_dir[irq] = proc_mkdir(name, root_irq_dir);
117
118#ifdef CONFIG_SMP
119 {
120 struct proc_dir_entry *entry;
121
122 /* create /proc/irq/<irq>/smp_affinity */
123 entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
124
125 if (entry) {
126 entry->nlink = 1;
127 entry->data = (void *)(long)irq;
128 entry->read_proc = irq_affinity_read_proc;
129 entry->write_proc = irq_affinity_write_proc;
130 }
131 smp_affinity_entry[irq] = entry;
132 }
133#endif
134}
135
136#undef MAX_NAMELEN
137
138void unregister_handler_proc(unsigned int irq, struct irqaction *action)
139{
140 if (action->dir)
141 remove_proc_entry(action->dir->name, irq_dir[irq]);
142}
143
144void init_irq_proc(void)
145{
146 int i;
147
148 /* create /proc/irq */
149 root_irq_dir = proc_mkdir("irq", NULL);
150 if (!root_irq_dir)
151 return;
152
153 /*
154 * Create entries for all existing IRQs.
155 */
156 for (i = 0; i < NR_IRQS; i++)
157 register_irq_proc(i);
158}
159
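
[Editor's note — illustrative sketch, not part of this commit: the userspace side of the smp_affinity file created above. The IRQ number is hypothetical; the value written is a hexadecimal CPU mask parsed by irq_affinity_write_proc(), and masks containing no online CPU are rejected.]

    /* pin the (hypothetical) IRQ 19 to CPU 0 from userspace */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

            if (!f)
                    return 1;
            fprintf(f, "1\n");	/* hexadecimal CPU mask: bit 0 = CPU 0 */
            return fclose(f) ? 1 : 0;
    }
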
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
new file mode 100644
index 000000000000..f6297c306905
--- /dev/null
+++ b/kernel/irq/spurious.c
@@ -0,0 +1,96 @@
1/*
2 * linux/kernel/irq/spurious.c
3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains spurious interrupt handling.
7 */
8
9#include <linux/irq.h>
10#include <linux/module.h>
11#include <linux/kallsyms.h>
12#include <linux/interrupt.h>
13
14/*
15 * If 99,900 of the previous 100,000 interrupts have not been handled
16 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
17 * and try to turn the IRQ off.
18 *
19 * (The other 100-of-100,000 interrupts may have been a correctly
20 * functioning device sharing an IRQ with the failing one)
21 *
22 * Called under desc->lock
23 */
24
25static void
26__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
27{
28 struct irqaction *action;
29
30 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
31 printk(KERN_ERR "irq event %d: bogus return value %x\n",
32 irq, action_ret);
33 } else {
34 printk(KERN_ERR "irq %d: nobody cared!\n", irq);
35 }
36 dump_stack();
37 printk(KERN_ERR "handlers:\n");
38 action = desc->action;
39 while (action) {
40 printk(KERN_ERR "[<%p>]", action->handler);
41 print_symbol(" (%s)",
42 (unsigned long)action->handler);
43 printk("\n");
44 action = action->next;
45 }
46}
47
48void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
49{
50 static int count = 100;
51
52 if (count > 0) {
53 count--;
54 __report_bad_irq(irq, desc, action_ret);
55 }
56}
57
58void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
59{
60 if (action_ret != IRQ_HANDLED) {
61 desc->irqs_unhandled++;
62 if (action_ret != IRQ_NONE)
63 report_bad_irq(irq, desc, action_ret);
64 }
65
66 desc->irq_count++;
67 if (desc->irq_count < 100000)
68 return;
69
70 desc->irq_count = 0;
71 if (desc->irqs_unhandled > 99900) {
72 /*
73 * The interrupt is stuck
74 */
75 __report_bad_irq(irq, desc, action_ret);
76 /*
77 * Now kill the IRQ
78 */
79 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
80 desc->status |= IRQ_DISABLED;
81 desc->handler->disable(irq);
82 }
83 desc->irqs_unhandled = 0;
84}
85
86int noirqdebug;
87
88int __init noirqdebug_setup(char *str)
89{
90 noirqdebug = 1;
91 printk(KERN_INFO "IRQ lockup detection disabled\n");
92 return 1;
93}
94
95__setup("noirqdebug", noirqdebug_setup);
96
diff --git a/kernel/itimer.c b/kernel/itimer.c
new file mode 100644
index 000000000000..e9a40e947e07
--- /dev/null
+++ b/kernel/itimer.c
@@ -0,0 +1,241 @@
1/*
2 * linux/kernel/itimer.c
3 *
4 * Copyright (C) 1992 Darren Senn
5 */
6
7/* These are all the functions necessary to implement itimers */
8
9#include <linux/mm.h>
10#include <linux/smp_lock.h>
11#include <linux/interrupt.h>
12#include <linux/syscalls.h>
13#include <linux/time.h>
14#include <linux/posix-timers.h>
15
16#include <asm/uaccess.h>
17
18static unsigned long it_real_value(struct signal_struct *sig)
19{
20 unsigned long val = 0;
21 if (timer_pending(&sig->real_timer)) {
22 val = sig->real_timer.expires - jiffies;
23
24 /* look out for negative/zero itimer.. */
25 if ((long) val <= 0)
26 val = 1;
27 }
28 return val;
29}
30
31int do_getitimer(int which, struct itimerval *value)
32{
33 struct task_struct *tsk = current;
34 unsigned long interval, val;
35 cputime_t cinterval, cval;
36
37 switch (which) {
38 case ITIMER_REAL:
39 spin_lock_irq(&tsk->sighand->siglock);
40 interval = tsk->signal->it_real_incr;
41 val = it_real_value(tsk->signal);
42 spin_unlock_irq(&tsk->sighand->siglock);
43 jiffies_to_timeval(val, &value->it_value);
44 jiffies_to_timeval(interval, &value->it_interval);
45 break;
46 case ITIMER_VIRTUAL:
47 read_lock(&tasklist_lock);
48 spin_lock_irq(&tsk->sighand->siglock);
49 cval = tsk->signal->it_virt_expires;
50 cinterval = tsk->signal->it_virt_incr;
51 if (!cputime_eq(cval, cputime_zero)) {
52 struct task_struct *t = tsk;
53 cputime_t utime = tsk->signal->utime;
54 do {
55 utime = cputime_add(utime, t->utime);
56 t = next_thread(t);
57 } while (t != tsk);
58 if (cputime_le(cval, utime)) { /* about to fire */
59 cval = jiffies_to_cputime(1);
60 } else {
61 cval = cputime_sub(cval, utime);
62 }
63 }
64 spin_unlock_irq(&tsk->sighand->siglock);
65 read_unlock(&tasklist_lock);
66 cputime_to_timeval(cval, &value->it_value);
67 cputime_to_timeval(cinterval, &value->it_interval);
68 break;
69 case ITIMER_PROF:
70 read_lock(&tasklist_lock);
71 spin_lock_irq(&tsk->sighand->siglock);
72 cval = tsk->signal->it_prof_expires;
73 cinterval = tsk->signal->it_prof_incr;
74 if (!cputime_eq(cval, cputime_zero)) {
75 struct task_struct *t = tsk;
76 cputime_t ptime = cputime_add(tsk->signal->utime,
77 tsk->signal->stime);
78 do {
79 ptime = cputime_add(ptime,
80 cputime_add(t->utime,
81 t->stime));
82 t = next_thread(t);
83 } while (t != tsk);
84 if (cputime_le(cval, ptime)) { /* about to fire */
85 cval = jiffies_to_cputime(1);
86 } else {
87 cval = cputime_sub(cval, ptime);
88 }
89 }
90 spin_unlock_irq(&tsk->sighand->siglock);
91 read_unlock(&tasklist_lock);
92 cputime_to_timeval(cval, &value->it_value);
93 cputime_to_timeval(cinterval, &value->it_interval);
94 break;
95 default:
96 return(-EINVAL);
97 }
98 return 0;
99}
100
101asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
102{
103 int error = -EFAULT;
104 struct itimerval get_buffer;
105
106 if (value) {
107 error = do_getitimer(which, &get_buffer);
108 if (!error &&
109 copy_to_user(value, &get_buffer, sizeof(get_buffer)))
110 error = -EFAULT;
111 }
112 return error;
113}
114
115/*
116 * Called with P->sighand->siglock held and P->signal->real_timer inactive.
117 * If interval is nonzero, arm the timer for interval ticks from now.
118 */
119static inline void it_real_arm(struct task_struct *p, unsigned long interval)
120{
121 p->signal->it_real_value = interval; /* XXX unnecessary field?? */
122 if (interval == 0)
123 return;
124 if (interval > (unsigned long) LONG_MAX)
125 interval = LONG_MAX;
126 p->signal->real_timer.expires = jiffies + interval;
127 add_timer(&p->signal->real_timer);
128}
129
130void it_real_fn(unsigned long __data)
131{
132 struct task_struct * p = (struct task_struct *) __data;
133
134 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
135
136 /*
137 * Now restart the timer if necessary. We don't need any locking
138 * here because do_setitimer makes sure we have finished running
139 * before it touches anything.
140 */
141 it_real_arm(p, p->signal->it_real_incr);
142}
143
144int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
145{
146 struct task_struct *tsk = current;
147 unsigned long val, interval;
148 cputime_t cval, cinterval, nval, ninterval;
149
150 switch (which) {
151 case ITIMER_REAL:
152 spin_lock_irq(&tsk->sighand->siglock);
153 interval = tsk->signal->it_real_incr;
154 val = it_real_value(tsk->signal);
155 if (val)
156 del_timer_sync(&tsk->signal->real_timer);
157 tsk->signal->it_real_incr =
158 timeval_to_jiffies(&value->it_interval);
159 it_real_arm(tsk, timeval_to_jiffies(&value->it_value));
160 spin_unlock_irq(&tsk->sighand->siglock);
161 if (ovalue) {
162 jiffies_to_timeval(val, &ovalue->it_value);
163 jiffies_to_timeval(interval,
164 &ovalue->it_interval);
165 }
166 break;
167 case ITIMER_VIRTUAL:
168 nval = timeval_to_cputime(&value->it_value);
169 ninterval = timeval_to_cputime(&value->it_interval);
170 read_lock(&tasklist_lock);
171 spin_lock_irq(&tsk->sighand->siglock);
172 cval = tsk->signal->it_virt_expires;
173 cinterval = tsk->signal->it_virt_incr;
174 if (!cputime_eq(cval, cputime_zero) ||
175 !cputime_eq(nval, cputime_zero)) {
176 if (cputime_gt(nval, cputime_zero))
177 nval = cputime_add(nval,
178 jiffies_to_cputime(1));
179 set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
180 &nval, &cval);
181 }
182 tsk->signal->it_virt_expires = nval;
183 tsk->signal->it_virt_incr = ninterval;
184 spin_unlock_irq(&tsk->sighand->siglock);
185 read_unlock(&tasklist_lock);
186 if (ovalue) {
187 cputime_to_timeval(cval, &ovalue->it_value);
188 cputime_to_timeval(cinterval, &ovalue->it_interval);
189 }
190 break;
191 case ITIMER_PROF:
192 nval = timeval_to_cputime(&value->it_value);
193 ninterval = timeval_to_cputime(&value->it_interval);
194 read_lock(&tasklist_lock);
195 spin_lock_irq(&tsk->sighand->siglock);
196 cval = tsk->signal->it_prof_expires;
197 cinterval = tsk->signal->it_prof_incr;
198 if (!cputime_eq(cval, cputime_zero) ||
199 !cputime_eq(nval, cputime_zero)) {
200 if (cputime_gt(nval, cputime_zero))
201 nval = cputime_add(nval,
202 jiffies_to_cputime(1));
203 set_process_cpu_timer(tsk, CPUCLOCK_PROF,
204 &nval, &cval);
205 }
206 tsk->signal->it_prof_expires = nval;
207 tsk->signal->it_prof_incr = ninterval;
208 spin_unlock_irq(&tsk->sighand->siglock);
209 read_unlock(&tasklist_lock);
210 if (ovalue) {
211 cputime_to_timeval(cval, &ovalue->it_value);
212 cputime_to_timeval(cinterval, &ovalue->it_interval);
213 }
214 break;
215 default:
216 return -EINVAL;
217 }
218 return 0;
219}
220
221asmlinkage long sys_setitimer(int which,
222 struct itimerval __user *value,
223 struct itimerval __user *ovalue)
224{
225 struct itimerval set_buffer, get_buffer;
226 int error;
227
228 if (value) {
229 if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
230 return -EFAULT;
231 } else
232 memset((char *) &set_buffer, 0, sizeof(set_buffer));
233
234 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
235 if (error || !ovalue)
236 return error;
237
238 if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
239 return -EFAULT;
240 return 0;
241}
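
[Editor's note — illustrative sketch, not part of this commit: the userspace view of the setitimer()/getitimer() syscalls implemented above — a 500 ms periodic ITIMER_REAL whose expiry is delivered as SIGALRM by it_real_fn().]

    #include <signal.h>
    #include <sys/time.h>
    #include <unistd.h>

    static void on_alarm(int sig)
    {
            write(1, "tick\n", 5);			/* async-signal-safe */
    }

    int main(void)
    {
            struct itimerval it = {
                    .it_interval = { 0, 500000 },	/* reload: 500 ms */
                    .it_value    = { 0, 500000 },	/* first expiry: 500 ms */
            };

            signal(SIGALRM, on_alarm);
            setitimer(ITIMER_REAL, &it, NULL);	/* sys_setitimer() above */
            for (;;)
                    pause();
    }
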
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
new file mode 100644
index 000000000000..1627f8d6e0cd
--- /dev/null
+++ b/kernel/kallsyms.c
@@ -0,0 +1,411 @@
1/*
2 * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
3 *
4 * Rewritten and vastly simplified by Rusty Russell for in-kernel
5 * module loader:
6 * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
7 *
8 * ChangeLog:
9 *
10 * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
11 * Changed the compression method from stem compression to "table lookup"
12 * compression (see scripts/kallsyms.c for a more complete description)
13 */
14#include <linux/kallsyms.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/seq_file.h>
18#include <linux/fs.h>
19#include <linux/err.h>
20#include <linux/proc_fs.h>
21#include <linux/mm.h>
22
23#include <asm/sections.h>
24
25#ifdef CONFIG_KALLSYMS_ALL
26#define all_var 1
27#else
28#define all_var 0
29#endif
30
31/* These will be re-linked against their real values during the second link stage */
32extern unsigned long kallsyms_addresses[] __attribute__((weak));
33extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
34extern u8 kallsyms_names[] __attribute__((weak));
35
36extern u8 kallsyms_token_table[] __attribute__((weak));
37extern u16 kallsyms_token_index[] __attribute__((weak));
38
39extern unsigned long kallsyms_markers[] __attribute__((weak));
40
41static inline int is_kernel_inittext(unsigned long addr)
42{
43 if (addr >= (unsigned long)_sinittext
44 && addr <= (unsigned long)_einittext)
45 return 1;
46 return 0;
47}
48
49static inline int is_kernel_text(unsigned long addr)
50{
51 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
52 return 1;
53 return in_gate_area_no_task(addr);
54}
55
56static inline int is_kernel(unsigned long addr)
57{
58 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
59 return 1;
60 return in_gate_area_no_task(addr);
61}
62
63/* expand compressed symbol data into the resulting uncompressed string,
64 given the offset to where the symbol is in the compressed stream */
65static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
66{
67 int len, skipped_first = 0;
68 u8 *tptr, *data;
69
70 /* get the compressed symbol length from the first symbol byte */
71 data = &kallsyms_names[off];
72 len = *data;
73 data++;
74
75 /* update the offset to return the offset for the next symbol on
76 * the compressed stream */
77 off += len + 1;
78
79 /* for every byte on the compressed symbol data, copy the table
80 entry for that byte */
81 while(len) {
82 tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
83 data++;
84 len--;
85
86 while (*tptr) {
87 if(skipped_first) {
88 *result = *tptr;
89 result++;
90 } else
91 skipped_first = 1;
92 tptr++;
93 }
94 }
95
96 *result = '\0';
97
98	/* return the offset to the next symbol */
99 return off;
100}
101
102/* get symbol type information. This is encoded as a single char at the
103 * beginning of the symbol name */
104static char kallsyms_get_symbol_type(unsigned int off)
105{
106 /* get just the first code, look it up in the token table, and return the
107 * first char from this token */
108 return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
109}
110
111
112/* find the offset on the compressed stream given an index in the
113 * kallsyms array */
114static unsigned int get_symbol_offset(unsigned long pos)
115{
116 u8 *name;
117 int i;
118
119 /* use the closest marker we have. We have markers every 256 positions,
120 * so that should be close enough */
121 name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
122
123 /* sequentially scan all the symbols up to the point we're searching for.
124 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
125 * just need to add the len to the current pointer for every symbol we
126 * wish to skip */
127 for(i = 0; i < (pos&0xFF); i++)
128 name = name + (*name) + 1;
129
130 return name - kallsyms_names;
131}
132
133/* Lookup the address for this symbol. Returns 0 if not found. */
134unsigned long kallsyms_lookup_name(const char *name)
135{
136 char namebuf[KSYM_NAME_LEN+1];
137 unsigned long i;
138 unsigned int off;
139
140 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
141 off = kallsyms_expand_symbol(off, namebuf);
142
143 if (strcmp(namebuf, name) == 0)
144 return kallsyms_addresses[i];
145 }
146 return module_kallsyms_lookup_name(name);
147}
148EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
149
150/*
151 * Lookup an address
152 * - modname is set to NULL if it's in the kernel
153 * - we guarantee that the returned name is valid until we reschedule even if
154 * it resides in a module
155 * - we also guarantee that modname will be valid until rescheduled
156 */
157const char *kallsyms_lookup(unsigned long addr,
158 unsigned long *symbolsize,
159 unsigned long *offset,
160 char **modname, char *namebuf)
161{
162 unsigned long i, low, high, mid;
163 const char *msym;
164
165	/* This kernel should never have been booted. */
166 BUG_ON(!kallsyms_addresses);
167
168 namebuf[KSYM_NAME_LEN] = 0;
169 namebuf[0] = 0;
170
171 if ((all_var && is_kernel(addr)) ||
172 (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) {
173 unsigned long symbol_end=0;
174
175 /* do a binary search on the sorted kallsyms_addresses array */
176 low = 0;
177 high = kallsyms_num_syms;
178
179 while (high-low > 1) {
180 mid = (low + high) / 2;
181 if (kallsyms_addresses[mid] <= addr) low = mid;
182 else high = mid;
183 }
184
185 /* search for the first aliased symbol. Aliased symbols are
186 symbols with the same address */
187 while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low])
188 --low;
189
190 /* Grab name */
191 kallsyms_expand_symbol(get_symbol_offset(low), namebuf);
192
193 /* Search for next non-aliased symbol */
194 for (i = low + 1; i < kallsyms_num_syms; i++) {
195 if (kallsyms_addresses[i] > kallsyms_addresses[low]) {
196 symbol_end = kallsyms_addresses[i];
197 break;
198 }
199 }
200
201 /* if we found no next symbol, we use the end of the section */
202 if (!symbol_end) {
203 if (is_kernel_inittext(addr))
204 symbol_end = (unsigned long)_einittext;
205 else
206 symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext;
207 }
208
209 *symbolsize = symbol_end - kallsyms_addresses[low];
210 *modname = NULL;
211 *offset = addr - kallsyms_addresses[low];
212 return namebuf;
213 }
214
215 /* see if it's in a module */
216 msym = module_address_lookup(addr, symbolsize, offset, modname);
217 if (msym)
218 return strncpy(namebuf, msym, KSYM_NAME_LEN);
219
220 return NULL;
221}
222
223/* Replace "%s" in format with the symbolic form of address. */
224void __print_symbol(const char *fmt, unsigned long address)
225{
226 char *modname;
227 const char *name;
228 unsigned long offset, size;
229 char namebuf[KSYM_NAME_LEN+1];
230 char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN +
231 2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1];
232
233 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
234
235 if (!name)
236 sprintf(buffer, "0x%lx", address);
237 else {
238 if (modname)
239 sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
240 size, modname);
241 else
242 sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
243 }
244 printk(fmt, buffer);
245}
246
247/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
248struct kallsym_iter
249{
250 loff_t pos;
251 struct module *owner;
252 unsigned long value;
253 unsigned int nameoff; /* If iterating in core kernel symbols */
254 char type;
255 char name[KSYM_NAME_LEN+1];
256};
257
258/* Only label it "global" if it is exported. */
259static void upcase_if_global(struct kallsym_iter *iter)
260{
261 if (is_exported(iter->name, iter->owner))
262 iter->type += 'A' - 'a';
263}
264
265static int get_ksymbol_mod(struct kallsym_iter *iter)
266{
267 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
268 &iter->value,
269 &iter->type, iter->name);
270 if (iter->owner == NULL)
271 return 0;
272
273 upcase_if_global(iter);
274 return 1;
275}
276
277/* Returns the number of bytes consumed, i.e. the offset to the next name. */
278static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
279{
280 unsigned off = iter->nameoff;
281
282 iter->owner = NULL;
283 iter->value = kallsyms_addresses[iter->pos];
284
285 iter->type = kallsyms_get_symbol_type(off);
286
287 off = kallsyms_expand_symbol(off, iter->name);
288
289 return off - iter->nameoff;
290}
291
292static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
293{
294 iter->name[0] = '\0';
295 iter->nameoff = get_symbol_offset(new_pos);
296 iter->pos = new_pos;
297}
298
299/* Returns false if pos at or past end of file. */
300static int update_iter(struct kallsym_iter *iter, loff_t pos)
301{
302 /* Module symbols can be accessed randomly. */
303 if (pos >= kallsyms_num_syms) {
304 iter->pos = pos;
305 return get_ksymbol_mod(iter);
306 }
307
308 /* If we're not on the desired position, reset to new position. */
309 if (pos != iter->pos)
310 reset_iter(iter, pos);
311
312 iter->nameoff += get_ksymbol_core(iter);
313 iter->pos++;
314
315 return 1;
316}
317
318static void *s_next(struct seq_file *m, void *p, loff_t *pos)
319{
320 (*pos)++;
321
322 if (!update_iter(m->private, *pos))
323 return NULL;
324 return p;
325}
326
327static void *s_start(struct seq_file *m, loff_t *pos)
328{
329 if (!update_iter(m->private, *pos))
330 return NULL;
331 return m->private;
332}
333
334static void s_stop(struct seq_file *m, void *p)
335{
336}
337
338static int s_show(struct seq_file *m, void *p)
339{
340 struct kallsym_iter *iter = m->private;
341
342 /* Some debugging symbols have no name. Ignore them. */
343 if (!iter->name[0])
344 return 0;
345
346 if (iter->owner)
347 seq_printf(m, "%0*lx %c %s\t[%s]\n",
348 (int)(2*sizeof(void*)),
349 iter->value, iter->type, iter->name,
350 module_name(iter->owner));
351 else
352 seq_printf(m, "%0*lx %c %s\n",
353 (int)(2*sizeof(void*)),
354 iter->value, iter->type, iter->name);
355 return 0;
356}
357
358static struct seq_operations kallsyms_op = {
359 .start = s_start,
360 .next = s_next,
361 .stop = s_stop,
362 .show = s_show
363};
364
365static int kallsyms_open(struct inode *inode, struct file *file)
366{
367	/* We keep the iterator in m->private, since the normal case is to
368	 * s_start from where we left off, so we avoid calling
369	 * get_symbol_offset for every symbol */
370 struct kallsym_iter *iter;
371 int ret;
372
373 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
374 if (!iter)
375 return -ENOMEM;
376 reset_iter(iter, 0);
377
378 ret = seq_open(file, &kallsyms_op);
379 if (ret == 0)
380 ((struct seq_file *)file->private_data)->private = iter;
381 else
382 kfree(iter);
383 return ret;
384}
385
386static int kallsyms_release(struct inode *inode, struct file *file)
387{
388 struct seq_file *m = (struct seq_file *)file->private_data;
389 kfree(m->private);
390 return seq_release(inode, file);
391}
392
393static struct file_operations kallsyms_operations = {
394 .open = kallsyms_open,
395 .read = seq_read,
396 .llseek = seq_lseek,
397 .release = kallsyms_release,
398};
399
400static int __init kallsyms_init(void)
401{
402 struct proc_dir_entry *entry;
403
404 entry = create_proc_entry("kallsyms", 0444, NULL);
405 if (entry)
406 entry->proc_fops = &kallsyms_operations;
407 return 0;
408}
409__initcall(kallsyms_init);
410
411EXPORT_SYMBOL(__print_symbol);
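
[Editor's note — illustrative sketch, not part of this commit: in-kernel use of the lookup and printing helpers above from a hypothetical debugging helper; "schedule" is just an example symbol name, and the lookup returns 0 if the symbol is absent.]

    #include <linux/kallsyms.h>
    #include <linux/kernel.h>

    static void show_symbol(void)			/* hypothetical debug helper */
    {
            unsigned long addr = kallsyms_lookup_name("schedule");

            if (!addr)
                    return;				/* symbol not found */
            __print_symbol("schedule() resolves to %s\n", addr);
    }
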
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
new file mode 100644
index 000000000000..179baafcdd96
--- /dev/null
+++ b/kernel/kfifo.c
@@ -0,0 +1,168 @@
1/*
2 * A simple kernel FIFO implementation.
3 *
4 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 */
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/slab.h>
25#include <linux/err.h>
26#include <linux/kfifo.h>
27
28/**
29 * kfifo_init - allocates a new FIFO using a preallocated buffer
30 * @buffer: the preallocated buffer to be used.
31 * @size: the size of the internal buffer, this has to be a power of 2.
32 * @gfp_mask: get_free_pages mask, passed to kmalloc()
33 * @lock: the lock to be used to protect the fifo buffer
34 *
35 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
36 * struct kfifo with kfree().
37 */
38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
39 unsigned int __nocast gfp_mask, spinlock_t *lock)
40{
41 struct kfifo *fifo;
42
43 /* size must be a power of 2 */
44 BUG_ON(size & (size - 1));
45
46 fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
47 if (!fifo)
48 return ERR_PTR(-ENOMEM);
49
50 fifo->buffer = buffer;
51 fifo->size = size;
52 fifo->in = fifo->out = 0;
53 fifo->lock = lock;
54
55 return fifo;
56}
57EXPORT_SYMBOL(kfifo_init);
58
59/**
60 * kfifo_alloc - allocates a new FIFO and its internal buffer
61 * @size: the size of the internal buffer to be allocated.
62 * @gfp_mask: get_free_pages mask, passed to kmalloc()
63 * @lock: the lock to be used to protect the fifo buffer
64 *
65 * The size will be rounded up to a power of 2.
66 */
67struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock)
68{
69 unsigned char *buffer;
70 struct kfifo *ret;
71
72 /*
73 * round up to the next power of 2, since our 'let the indices
74	 * wrap' technique works only in this case.
75 */
76 if (size & (size - 1)) {
77 BUG_ON(size > 0x80000000);
78 size = roundup_pow_of_two(size);
79 }
80
81 buffer = kmalloc(size, gfp_mask);
82 if (!buffer)
83 return ERR_PTR(-ENOMEM);
84
85 ret = kfifo_init(buffer, size, gfp_mask, lock);
86
87 if (IS_ERR(ret))
88 kfree(buffer);
89
90 return ret;
91}
92EXPORT_SYMBOL(kfifo_alloc);
93
94/**
95 * kfifo_free - frees the FIFO
96 * @fifo: the fifo to be freed.
97 */
98void kfifo_free(struct kfifo *fifo)
99{
100 kfree(fifo->buffer);
101 kfree(fifo);
102}
103EXPORT_SYMBOL(kfifo_free);
104
105/**
106 * __kfifo_put - puts some data into the FIFO, no locking version
107 * @fifo: the fifo to be used.
108 * @buffer: the data to be added.
109 * @len: the length of the data to be added.
110 *
111 * This function copies at most 'len' bytes from the 'buffer' into
112 * the FIFO depending on the free space, and returns the number of
113 * bytes copied.
114 *
115 * Note that with only one concurrent reader and one concurrent
116 * writer, you don't need extra locking to use these functions.
117 */
118unsigned int __kfifo_put(struct kfifo *fifo,
119 unsigned char *buffer, unsigned int len)
120{
121 unsigned int l;
122
123 len = min(len, fifo->size - fifo->in + fifo->out);
124
125 /* first put the data starting from fifo->in to buffer end */
126 l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
127 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
128
129 /* then put the rest (if any) at the beginning of the buffer */
130 memcpy(fifo->buffer, buffer + l, len - l);
131
132 fifo->in += len;
133
134 return len;
135}
136EXPORT_SYMBOL(__kfifo_put);
137
138/**
139 * __kfifo_get - gets some data from the FIFO, no locking version
140 * @fifo: the fifo to be used.
141 * @buffer: where the data must be copied.
142 * @len: the size of the destination buffer.
143 *
144 * This function copies at most 'len' bytes from the FIFO into the
145 * 'buffer' and returns the number of copied bytes.
146 *
147 * Note that with only one concurrent reader and one concurrent
148 * writer, you don't need extra locking to use these functions.
149 */
150unsigned int __kfifo_get(struct kfifo *fifo,
151 unsigned char *buffer, unsigned int len)
152{
153 unsigned int l;
154
155 len = min(len, fifo->in - fifo->out);
156
157 /* first get the data from fifo->out until the end of the buffer */
158 l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
159 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
160
161 /* then get the rest (if any) from the beginning of the buffer */
162 memcpy(buffer + l, fifo->buffer, len - l);
163
164 fifo->out += len;
165
166 return len;
167}
168EXPORT_SYMBOL(__kfifo_get);
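
[Editor's note — illustrative sketch, not part of this commit: allocating and using a kfifo with the API above. As the comments note, a single reader and a single writer need no extra locking, so the __kfifo_put()/__kfifo_get() variants are used directly; the sizes and data are arbitrary.]

    #include <linux/kfifo.h>
    #include <linux/spinlock.h>
    #include <linux/slab.h>
    #include <linux/err.h>
    #include <linux/kernel.h>

    static DEFINE_SPINLOCK(my_fifo_lock);

    static int my_fifo_demo(void)		/* hypothetical: one reader, one writer */
    {
            unsigned char in[] = "hello", out[8];
            struct kfifo *fifo;
            unsigned int n;

            fifo = kfifo_alloc(100, GFP_KERNEL, &my_fifo_lock);	/* rounded up to 128 */
            if (IS_ERR(fifo))
                    return PTR_ERR(fifo);

            __kfifo_put(fifo, in, sizeof(in) - 1);	/* returns bytes actually stored */
            n = __kfifo_get(fifo, out, sizeof(out));	/* returns bytes actually read */
            printk(KERN_INFO "read %u bytes back\n", n);

            kfifo_free(fifo);
            return 0;
    }
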
diff --git a/kernel/kmod.c b/kernel/kmod.c
new file mode 100644
index 000000000000..eed53d4f5230
--- /dev/null
+++ b/kernel/kmod.c
@@ -0,0 +1,256 @@
1/*
2 kmod, the new module loader (replaces kerneld)
3 Kirk Petersen
4
5 Reorganized not to be a daemon by Adam Richter, with guidance
6 from Greg Zornetzer.
7
8 Modified to avoid chroot and file sharing problems.
9 Mikael Pettersson
10
11 Limit the concurrent number of kmod modprobes to catch loops from
12 "modprobe needs a service that is in a module".
13 Keith Owens <kaos@ocs.com.au> December 1999
14
15 Unblock all signals when we exec a usermode process.
16 Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
17
18 call_usermodehelper wait flag, and remove exec_usermodehelper.
19 Rusty Russell <rusty@rustcorp.com.au> Jan 2003
20*/
21#define __KERNEL_SYSCALLS__
22
23#include <linux/config.h>
24#include <linux/module.h>
25#include <linux/sched.h>
26#include <linux/syscalls.h>
27#include <linux/unistd.h>
28#include <linux/kmod.h>
29#include <linux/smp_lock.h>
30#include <linux/slab.h>
31#include <linux/namespace.h>
32#include <linux/completion.h>
33#include <linux/file.h>
34#include <linux/workqueue.h>
35#include <linux/security.h>
36#include <linux/mount.h>
37#include <linux/kernel.h>
38#include <linux/init.h>
39#include <asm/uaccess.h>
40
41extern int max_threads;
42
43static struct workqueue_struct *khelper_wq;
44
45#ifdef CONFIG_KMOD
46
47/*
48 modprobe_path is set via /proc/sys.
49*/
50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51
52/**
53 * request_module - try to load a kernel module
54 * @fmt: printf style format string for the name of the module
55 * @varargs: arguments as specified in the format string
56 *
57 * Load a module using the user mode module loader. The function returns
58 * zero on success or a negative errno code on failure. Note that a
59 * successful module load does not mean the module did not then unload
60 * and exit on an error of its own. Callers must check that the service
61 * they requested is now available rather than blindly invoking it.
62 *
63 * If module auto-loading support is disabled then this function
64 * becomes a no-operation.
65 */
66int request_module(const char *fmt, ...)
67{
68 va_list args;
69 char module_name[MODULE_NAME_LEN];
70 unsigned int max_modprobes;
71 int ret;
72 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
73 static char *envp[] = { "HOME=/",
74 "TERM=linux",
75 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
76 NULL };
77 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
78#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
79 static int kmod_loop_msg;
80
81 va_start(args, fmt);
82 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
83 va_end(args);
84 if (ret >= MODULE_NAME_LEN)
85 return -ENAMETOOLONG;
86
87 /* If modprobe needs a service that is in a module, we get a recursive
88 * loop. Limit the number of running kmod threads to max_threads/2 or
89 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
90 * would be to run the parents of this process, counting how many times
91 * kmod was invoked. That would mean accessing the internals of the
92 * process tables to get the command line, proc_pid_cmdline is static
93 * and it is not worth changing the proc code just to handle this case.
94 * KAO.
95 *
96 * "trace the ppid" is simple, but will fail if someone's
97 * parent exits. I think this is as good as it gets. --RR
98 */
99 max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
100 atomic_inc(&kmod_concurrent);
101 if (atomic_read(&kmod_concurrent) > max_modprobes) {
102 /* We may be blaming an innocent here, but unlikely */
103 if (kmod_loop_msg++ < 5)
104 printk(KERN_ERR
105 "request_module: runaway loop modprobe %s\n",
106 module_name);
107 atomic_dec(&kmod_concurrent);
108 return -ENOMEM;
109 }
110
111 ret = call_usermodehelper(modprobe_path, argv, envp, 1);
112 atomic_dec(&kmod_concurrent);
113 return ret;
114}
115EXPORT_SYMBOL(request_module);
116#endif /* CONFIG_KMOD */
117
118struct subprocess_info {
119 struct completion *complete;
120 char *path;
121 char **argv;
122 char **envp;
123 int wait;
124 int retval;
125};
126
127/*
128 * This is the task which runs the usermode application
129 */
130static int ____call_usermodehelper(void *data)
131{
132 struct subprocess_info *sub_info = data;
133 int retval;
134
135 /* Unblock all signals. */
136 flush_signals(current);
137 spin_lock_irq(&current->sighand->siglock);
138 flush_signal_handlers(current, 1);
139 sigemptyset(&current->blocked);
140 recalc_sigpending();
141 spin_unlock_irq(&current->sighand->siglock);
142
143 /* We can run anywhere, unlike our parent keventd(). */
144 set_cpus_allowed(current, CPU_MASK_ALL);
145
146 retval = -EPERM;
147 if (current->fs->root)
148 retval = execve(sub_info->path, sub_info->argv,sub_info->envp);
149
150 /* Exec failed? */
151 sub_info->retval = retval;
152 do_exit(0);
153}
154
155/* Keventd can't block, but this (a child) can. */
156static int wait_for_helper(void *data)
157{
158 struct subprocess_info *sub_info = data;
159 pid_t pid;
160 struct k_sigaction sa;
161
162 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
163 * populate the status, but will return -ECHILD. */
164 sa.sa.sa_handler = SIG_IGN;
165 sa.sa.sa_flags = 0;
166 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
167 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
168 allow_signal(SIGCHLD);
169
170 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
171 if (pid < 0) {
172 sub_info->retval = pid;
173 } else {
174 /*
175 * Normally it is bogus to call wait4() from in-kernel because
176 * wait4() wants to write the exit code to a userspace address.
177 * But wait_for_helper() always runs as keventd, and put_user()
178 * to a kernel address works OK for kernel threads, due to their
179 * having an mm_segment_t which spans the entire address space.
180 *
181 * Thus the __user pointer cast is valid here.
182 */
183 sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
184 }
185
186 complete(sub_info->complete);
187 return 0;
188}
189
190/* This is run by the khelper thread */
191static void __call_usermodehelper(void *data)
192{
193 struct subprocess_info *sub_info = data;
194 pid_t pid;
195
196 /* CLONE_VFORK: wait until the usermode helper has execve'd
197	 * successfully. We need the data structures to stay around
198 * until that is done. */
199 if (sub_info->wait)
200 pid = kernel_thread(wait_for_helper, sub_info,
201 CLONE_FS | CLONE_FILES | SIGCHLD);
202 else
203 pid = kernel_thread(____call_usermodehelper, sub_info,
204 CLONE_VFORK | SIGCHLD);
205
206 if (pid < 0) {
207 sub_info->retval = pid;
208 complete(sub_info->complete);
209 } else if (!sub_info->wait)
210 complete(sub_info->complete);
211}
212
213/**
214 * call_usermodehelper - start a usermode application
215 * @path: pathname for the application
216 * @argv: null-terminated argument list
217 * @envp: null-terminated environment list
218 * @wait: wait for the application to finish and return status.
219 *
220 * Runs a user-space application. The application is started
221 * asynchronously if wait is not set, and runs as a child of keventd.
222 * (ie. it runs with full root capabilities).
223 *
224 * Must be called from process context. Returns a negative error code
225 * if program was not execed successfully, or 0.
226 */
227int call_usermodehelper(char *path, char **argv, char **envp, int wait)
228{
229 DECLARE_COMPLETION(done);
230 struct subprocess_info sub_info = {
231 .complete = &done,
232 .path = path,
233 .argv = argv,
234 .envp = envp,
235 .wait = wait,
236 .retval = 0,
237 };
238 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
239
240 if (!khelper_wq)
241 return -EBUSY;
242
243 if (path[0] == '\0')
244 return 0;
245
246 queue_work(khelper_wq, &work);
247 wait_for_completion(&done);
248 return sub_info.retval;
249}
250EXPORT_SYMBOL(call_usermodehelper);
251
252void __init usermodehelper_init(void)
253{
254 khelper_wq = create_singlethread_workqueue("khelper");
255 BUG_ON(!khelper_wq);
256}
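
[Editor's note — illustrative sketch, not part of this commit: requesting a module by alias and running a userspace helper with the API above. The device major number, the helper path and its arguments are hypothetical.]

    #include <linux/kmod.h>

    static int my_helpers(void)			/* hypothetical */
    {
            char *argv[] = { "/bin/logger", "-t", "mydrv", "device ready", NULL };
            char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
            int ret;

            /* ask modprobe for a module by alias, then verify the service exists */
            ret = request_module("char-major-%d-%d", 240, 0);
            if (ret)
                    return ret;

            /* run a userspace program and wait for its exit status */
            return call_usermodehelper(argv[0], argv, envp, 1);
    }
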
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
new file mode 100644
index 000000000000..1d5dd1337bd1
--- /dev/null
+++ b/kernel/kprobes.c
@@ -0,0 +1,157 @@
1/*
2 * Kernel Probes (KProbes)
3 * kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation (includes suggestions from
23 * Rusty Russell).
24 * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
25 * hlists and exceptions notifier as suggested by Andi Kleen.
26 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
27 * interface to access function arguments.
28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
29 * exceptions notifier to be first on the priority list.
30 */
31#include <linux/kprobes.h>
32#include <linux/spinlock.h>
33#include <linux/hash.h>
34#include <linux/init.h>
35#include <linux/module.h>
36#include <asm/cacheflush.h>
37#include <asm/errno.h>
38#include <asm/kdebug.h>
39
40#define KPROBE_HASH_BITS 6
41#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
42
43static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
44
45unsigned int kprobe_cpu = NR_CPUS;
46static DEFINE_SPINLOCK(kprobe_lock);
47
48/* Locks kprobe: irqs must be disabled */
49void lock_kprobes(void)
50{
51 spin_lock(&kprobe_lock);
52 kprobe_cpu = smp_processor_id();
53}
54
55void unlock_kprobes(void)
56{
57 kprobe_cpu = NR_CPUS;
58 spin_unlock(&kprobe_lock);
59}
60
61/* You have to be holding the kprobe_lock */
62struct kprobe *get_kprobe(void *addr)
63{
64 struct hlist_head *head;
65 struct hlist_node *node;
66
67 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
68 hlist_for_each(node, head) {
69 struct kprobe *p = hlist_entry(node, struct kprobe, hlist);
70 if (p->addr == addr)
71 return p;
72 }
73 return NULL;
74}
75
76int register_kprobe(struct kprobe *p)
77{
78 int ret = 0;
79 unsigned long flags = 0;
80
81 if ((ret = arch_prepare_kprobe(p)) != 0) {
82 goto rm_kprobe;
83 }
84 spin_lock_irqsave(&kprobe_lock, flags);
85 INIT_HLIST_NODE(&p->hlist);
86 if (get_kprobe(p->addr)) {
87 ret = -EEXIST;
88 goto out;
89 }
90 arch_copy_kprobe(p);
91
92 hlist_add_head(&p->hlist,
93 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
94
95 p->opcode = *p->addr;
96 *p->addr = BREAKPOINT_INSTRUCTION;
97 flush_icache_range((unsigned long) p->addr,
98 (unsigned long) p->addr + sizeof(kprobe_opcode_t));
99out:
100 spin_unlock_irqrestore(&kprobe_lock, flags);
101rm_kprobe:
102 if (ret == -EEXIST)
103 arch_remove_kprobe(p);
104 return ret;
105}
106
107void unregister_kprobe(struct kprobe *p)
108{
109 unsigned long flags;
110 arch_remove_kprobe(p);
111 spin_lock_irqsave(&kprobe_lock, flags);
112 *p->addr = p->opcode;
113 hlist_del(&p->hlist);
114 flush_icache_range((unsigned long) p->addr,
115 (unsigned long) p->addr + sizeof(kprobe_opcode_t));
116 spin_unlock_irqrestore(&kprobe_lock, flags);
117}
118
119static struct notifier_block kprobe_exceptions_nb = {
120 .notifier_call = kprobe_exceptions_notify,
121	.priority = 0x7fffffff /* we need to be notified first */
122};
123
124int register_jprobe(struct jprobe *jp)
125{
126 /* Todo: Verify probepoint is a function entry point */
127 jp->kp.pre_handler = setjmp_pre_handler;
128 jp->kp.break_handler = longjmp_break_handler;
129
130 return register_kprobe(&jp->kp);
131}
132
133void unregister_jprobe(struct jprobe *jp)
134{
135 unregister_kprobe(&jp->kp);
136}
137
138static int __init init_kprobes(void)
139{
140 int i, err = 0;
141
142 /* FIXME allocate the probe table, currently defined statically */
143 /* initialize all list heads */
144 for (i = 0; i < KPROBE_TABLE_SIZE; i++)
145 INIT_HLIST_HEAD(&kprobe_table[i]);
146
147 err = register_die_notifier(&kprobe_exceptions_nb);
148 return err;
149}
150
151__initcall(init_kprobes);
152
153EXPORT_SYMBOL_GPL(register_kprobe);
154EXPORT_SYMBOL_GPL(unregister_kprobe);
155EXPORT_SYMBOL_GPL(register_jprobe);
156EXPORT_SYMBOL_GPL(unregister_jprobe);
157EXPORT_SYMBOL_GPL(jprobe_return);
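
The functions exported above form the whole probe-registration API: fill in a
struct kprobe, call register_kprobe(), and remove it with unregister_kprobe().
A minimal usage sketch follows (illustrative only, not part of this patch; the
sample_* names and the probed_target() function are hypothetical):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Trivial function inside the module whose entry we probe. */
static noinline void probed_target(void)
{
}

/* Pre-handler: runs just before the probed instruction executes. */
static int sample_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;		/* 0 = let kprobes single-step the original insn */
}

static struct kprobe sample_kp = {
	.pre_handler = sample_pre_handler,
};

static int __init sample_init(void)
{
	int ret;

	sample_kp.addr = (kprobe_opcode_t *)probed_target;
	ret = register_kprobe(&sample_kp);
	if (ret == 0)
		probed_target();	/* hits the breakpoint, runs the handler */
	return ret;
}

static void __exit sample_exit(void)
{
	unregister_kprobe(&sample_kp);	/* restores the original opcode */
}

module_init(sample_init);
module_exit(sample_exit);
MODULE_LICENSE("GPL");
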
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
new file mode 100644
index 000000000000..1f064a63f8cf
--- /dev/null
+++ b/kernel/ksysfs.c
@@ -0,0 +1,57 @@
1/*
2 * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
3 * are not related to any other subsystem
4 *
5 * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
6 *
7 * This file is released under the GPLv2
8 *
9 */
10
11#include <linux/config.h>
12#include <linux/kobject.h>
13#include <linux/string.h>
14#include <linux/sysfs.h>
15#include <linux/module.h>
16#include <linux/init.h>
17
18#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
20
21#define KERNEL_ATTR_RW(_name) \
22static struct subsys_attribute _name##_attr = \
23 __ATTR(_name, 0644, _name##_show, _name##_store)
24
25#ifdef CONFIG_HOTPLUG
26static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
27{
28 return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum);
29}
30KERNEL_ATTR_RO(hotplug_seqnum);
31#endif
32
33decl_subsys(kernel, NULL, NULL);
34EXPORT_SYMBOL_GPL(kernel_subsys);
35
36static struct attribute * kernel_attrs[] = {
37#ifdef CONFIG_HOTPLUG
38 &hotplug_seqnum_attr.attr,
39#endif
40 NULL
41};
42
43static struct attribute_group kernel_attr_group = {
44 .attrs = kernel_attrs,
45};
46
47static int __init ksysfs_init(void)
48{
49 int error = subsystem_register(&kernel_subsys);
50 if (!error)
51 error = sysfs_create_group(&kernel_subsys.kset.kobj,
52 &kernel_attr_group);
53
54 return error;
55}
56
57core_initcall(ksysfs_init);
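
The KERNEL_ATTR_RO()/KERNEL_ATTR_RW() macros are the intended way to grow
/sys/kernel. As a sketch only (the example_flag name is made up and not part
of this patch), one more read-only attribute inside this file would look like:

static ssize_t example_flag_show(struct subsystem *subsys, char *page)
{
	return sprintf(page, "%d\n", 1);
}
KERNEL_ATTR_RO(example_flag);

Its &example_flag_attr.attr pointer would then be added to kernel_attrs[],
after which the value appears as /sys/kernel/example_flag once ksysfs_init()
has registered the attribute group.
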
diff --git a/kernel/kthread.c b/kernel/kthread.c
new file mode 100644
index 000000000000..e377e2244103
--- /dev/null
+++ b/kernel/kthread.c
@@ -0,0 +1,202 @@
1/* Kernel thread helper functions.
2 * Copyright (C) 2004 IBM Corporation, Rusty Russell.
3 *
4 * Creation is done via keventd, so that we get a clean environment
5 * even if we're invoked from userspace (think modprobe, hotplug cpu,
6 * etc.).
7 */
8#include <linux/sched.h>
9#include <linux/kthread.h>
10#include <linux/completion.h>
11#include <linux/err.h>
12#include <linux/unistd.h>
13#include <linux/file.h>
14#include <linux/module.h>
15#include <asm/semaphore.h>
16
17/*
18 * We don't want to execute off keventd since it might
19 * hold a semaphore our callers hold too:
20 */
21static struct workqueue_struct *helper_wq;
22
23struct kthread_create_info
24{
25 /* Information passed to kthread() from keventd. */
26 int (*threadfn)(void *data);
27 void *data;
28 struct completion started;
29
30 /* Result passed back to kthread_create() from keventd. */
31 struct task_struct *result;
32 struct completion done;
33};
34
35struct kthread_stop_info
36{
37 struct task_struct *k;
38 int err;
39 struct completion done;
40};
41
42/* Thread stopping is done by setting this var: lock serializes
43 * multiple kthread_stop calls. */
44static DECLARE_MUTEX(kthread_stop_lock);
45static struct kthread_stop_info kthread_stop_info;
46
47int kthread_should_stop(void)
48{
49 return (kthread_stop_info.k == current);
50}
51EXPORT_SYMBOL(kthread_should_stop);
52
53static void kthread_exit_files(void)
54{
55 struct fs_struct *fs;
56 struct task_struct *tsk = current;
57
58 exit_fs(tsk); /* current->fs->count--; */
59 fs = init_task.fs;
60 tsk->fs = fs;
61 atomic_inc(&fs->count);
62 exit_files(tsk);
63 current->files = init_task.files;
64 atomic_inc(&tsk->files->count);
65}
66
67static int kthread(void *_create)
68{
69 struct kthread_create_info *create = _create;
70 int (*threadfn)(void *data);
71 void *data;
72 sigset_t blocked;
73 int ret = -EINTR;
74
75 kthread_exit_files();
76
77 /* Copy data: it's on keventd's stack */
78 threadfn = create->threadfn;
79 data = create->data;
80
81 /* Block and flush all signals (in case we're not from keventd). */
82 sigfillset(&blocked);
83 sigprocmask(SIG_BLOCK, &blocked, NULL);
84 flush_signals(current);
85
86 /* By default we can run anywhere, unlike keventd. */
87 set_cpus_allowed(current, CPU_MASK_ALL);
88
89 /* OK, tell user we're spawned, wait for stop or wakeup */
90 __set_current_state(TASK_INTERRUPTIBLE);
91 complete(&create->started);
92 schedule();
93
94 if (!kthread_should_stop())
95 ret = threadfn(data);
96
97 /* It might have exited on its own, w/o kthread_stop. Check. */
98 if (kthread_should_stop()) {
99 kthread_stop_info.err = ret;
100 complete(&kthread_stop_info.done);
101 }
102 return 0;
103}
104
105/* We are keventd: create a thread. */
106static void keventd_create_kthread(void *_create)
107{
108 struct kthread_create_info *create = _create;
109 int pid;
110
111 /* We want our own signal handler (we take no signals by default). */
112 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
113 if (pid < 0) {
114 create->result = ERR_PTR(pid);
115 } else {
116 wait_for_completion(&create->started);
117 create->result = find_task_by_pid(pid);
118 }
119 complete(&create->done);
120}
121
122struct task_struct *kthread_create(int (*threadfn)(void *data),
123 void *data,
124 const char namefmt[],
125 ...)
126{
127 struct kthread_create_info create;
128 DECLARE_WORK(work, keventd_create_kthread, &create);
129
130 create.threadfn = threadfn;
131 create.data = data;
132 init_completion(&create.started);
133 init_completion(&create.done);
134
135 /*
136 * The workqueue needs to start up first:
137 */
138 if (!helper_wq)
139 work.func(work.data);
140 else {
141 queue_work(helper_wq, &work);
142 wait_for_completion(&create.done);
143 }
144 if (!IS_ERR(create.result)) {
145 va_list args;
146 va_start(args, namefmt);
147 vsnprintf(create.result->comm, sizeof(create.result->comm),
148 namefmt, args);
149 va_end(args);
150 }
151
152 return create.result;
153}
154EXPORT_SYMBOL(kthread_create);
155
156void kthread_bind(struct task_struct *k, unsigned int cpu)
157{
158 BUG_ON(k->state != TASK_INTERRUPTIBLE);
159 /* Must have done schedule() in kthread() before we set_task_cpu */
160 wait_task_inactive(k);
161 set_task_cpu(k, cpu);
162 k->cpus_allowed = cpumask_of_cpu(cpu);
163}
164EXPORT_SYMBOL(kthread_bind);
165
166int kthread_stop(struct task_struct *k)
167{
168 int ret;
169
170 down(&kthread_stop_lock);
171
172 /* It could exit after stop_info.k set, but before wake_up_process. */
173 get_task_struct(k);
174
175 /* Must init completion *before* thread sees kthread_stop_info.k */
176 init_completion(&kthread_stop_info.done);
177 wmb();
178
179 /* Now set kthread_should_stop() to true, and wake it up. */
180 kthread_stop_info.k = k;
181 wake_up_process(k);
182 put_task_struct(k);
183
184 /* Once it dies, reset stop ptr, gather result and we're done. */
185 wait_for_completion(&kthread_stop_info.done);
186 kthread_stop_info.k = NULL;
187 ret = kthread_stop_info.err;
188 up(&kthread_stop_lock);
189
190 return ret;
191}
192EXPORT_SYMBOL(kthread_stop);
193
194static __init int helper_init(void)
195{
196 helper_wq = create_singlethread_workqueue("kthread");
197 BUG_ON(!helper_wq);
198
199 return 0;
200}
201core_initcall(helper_init);
202
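
Taken together, the helpers above are used in a create / wake / stop pattern.
A minimal sketch (illustrative only, not part of this patch; the example_*
names and the one-second sleep are made up):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *example_task;

/* Worker loop: runs until kthread_stop() asks it to stop. */
static int example_worker(void *data)
{
	while (!kthread_should_stop()) {
		/* ... do one unit of work here ... */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);	/* kthread_stop() wakes us early */
	}
	return 0;			/* returned by kthread_stop() */
}

static int __init example_init(void)
{
	example_task = kthread_create(example_worker, NULL, "example_worker");
	if (IS_ERR(example_task))
		return PTR_ERR(example_task);
	/* kthread() parks in TASK_INTERRUPTIBLE until explicitly woken. */
	wake_up_process(example_task);
	return 0;
}

static void __exit example_exit(void)
{
	/* Wakes the worker, waits for it to exit, collects its return value. */
	kthread_stop(example_task);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
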
diff --git a/kernel/module.c b/kernel/module.c
new file mode 100644
index 000000000000..2dbfa0773faf
--- /dev/null
+++ b/kernel/module.c
@@ -0,0 +1,2108 @@
1/* Rewritten by Rusty Russell, on the backs of many others...
2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/
19#include <linux/config.h>
20#include <linux/module.h>
21#include <linux/moduleloader.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/elf.h>
26#include <linux/seq_file.h>
27#include <linux/syscalls.h>
28#include <linux/fcntl.h>
29#include <linux/rcupdate.h>
30#include <linux/cpu.h>
31#include <linux/moduleparam.h>
32#include <linux/errno.h>
33#include <linux/err.h>
34#include <linux/vermagic.h>
35#include <linux/notifier.h>
36#include <linux/stop_machine.h>
37#include <linux/device.h>
38#include <asm/uaccess.h>
39#include <asm/semaphore.h>
40#include <asm/cacheflush.h>
41
42#if 0
43#define DEBUGP printk
44#else
45#define DEBUGP(fmt , a...)
46#endif
47
48#ifndef ARCH_SHF_SMALL
49#define ARCH_SHF_SMALL 0
50#endif
51
52/* If this is set, the section belongs in the init part of the module */
53#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
54
55/* Protects module list */
56static DEFINE_SPINLOCK(modlist_lock);
57
58/* List of modules, protected by module_mutex AND modlist_lock */
59static DECLARE_MUTEX(module_mutex);
60static LIST_HEAD(modules);
61
62static DECLARE_MUTEX(notify_mutex);
63static struct notifier_block * module_notify_list;
64
65int register_module_notifier(struct notifier_block * nb)
66{
67 int err;
68 down(&notify_mutex);
69 err = notifier_chain_register(&module_notify_list, nb);
70 up(&notify_mutex);
71 return err;
72}
73EXPORT_SYMBOL(register_module_notifier);
74
75int unregister_module_notifier(struct notifier_block * nb)
76{
77 int err;
78 down(&notify_mutex);
79 err = notifier_chain_unregister(&module_notify_list, nb);
80 up(&notify_mutex);
81 return err;
82}
83EXPORT_SYMBOL(unregister_module_notifier);
84
85/* We require a truly strong try_module_get() */
86static inline int strong_try_module_get(struct module *mod)
87{
88 if (mod && mod->state == MODULE_STATE_COMING)
89 return 0;
90 return try_module_get(mod);
91}
92
93/* A thread that wants to hold a reference to a module only while it
94 * is running can call this to safely exit.
95 * nfsd and lockd use this.
96 */
97void __module_put_and_exit(struct module *mod, long code)
98{
99 module_put(mod);
100 do_exit(code);
101}
102EXPORT_SYMBOL(__module_put_and_exit);
103
104/* Find a module section: 0 means not found. */
105static unsigned int find_sec(Elf_Ehdr *hdr,
106 Elf_Shdr *sechdrs,
107 const char *secstrings,
108 const char *name)
109{
110 unsigned int i;
111
112 for (i = 1; i < hdr->e_shnum; i++)
113 /* Alloc bit cleared means "ignore it." */
114 if ((sechdrs[i].sh_flags & SHF_ALLOC)
115 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
116 return i;
117 return 0;
118}
119
120/* Provided by the linker */
121extern const struct kernel_symbol __start___ksymtab[];
122extern const struct kernel_symbol __stop___ksymtab[];
123extern const struct kernel_symbol __start___ksymtab_gpl[];
124extern const struct kernel_symbol __stop___ksymtab_gpl[];
125extern const unsigned long __start___kcrctab[];
126extern const unsigned long __start___kcrctab_gpl[];
127
128#ifndef CONFIG_MODVERSIONS
129#define symversion(base, idx) NULL
130#else
131#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
132#endif
133
134/* Find a symbol, return value, crc and module which owns it */
135static unsigned long __find_symbol(const char *name,
136 struct module **owner,
137 const unsigned long **crc,
138 int gplok)
139{
140 struct module *mod;
141 unsigned int i;
142
143 /* Core kernel first. */
144 *owner = NULL;
145 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) {
146 if (strcmp(__start___ksymtab[i].name, name) == 0) {
147 *crc = symversion(__start___kcrctab, i);
148 return __start___ksymtab[i].value;
149 }
150 }
151 if (gplok) {
152 for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++)
153 if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) {
154 *crc = symversion(__start___kcrctab_gpl, i);
155 return __start___ksymtab_gpl[i].value;
156 }
157 }
158
159 /* Now try modules. */
160 list_for_each_entry(mod, &modules, list) {
161 *owner = mod;
162 for (i = 0; i < mod->num_syms; i++)
163 if (strcmp(mod->syms[i].name, name) == 0) {
164 *crc = symversion(mod->crcs, i);
165 return mod->syms[i].value;
166 }
167
168 if (gplok) {
169 for (i = 0; i < mod->num_gpl_syms; i++) {
170 if (strcmp(mod->gpl_syms[i].name, name) == 0) {
171 *crc = symversion(mod->gpl_crcs, i);
172 return mod->gpl_syms[i].value;
173 }
174 }
175 }
176 }
177 DEBUGP("Failed to find symbol %s\n", name);
178 return 0;
179}
180
181/* Find a symbol in this elf symbol table */
182static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
183 unsigned int symindex,
184 const char *strtab,
185 const char *name)
186{
187 unsigned int i;
188 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
189
190 /* Search (defined) internal symbols first. */
191 for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
192 if (sym[i].st_shndx != SHN_UNDEF
193 && strcmp(name, strtab + sym[i].st_name) == 0)
194 return sym[i].st_value;
195 }
196 return 0;
197}
198
199/* Search for module by name: must hold module_mutex. */
200static struct module *find_module(const char *name)
201{
202 struct module *mod;
203
204 list_for_each_entry(mod, &modules, list) {
205 if (strcmp(mod->name, name) == 0)
206 return mod;
207 }
208 return NULL;
209}
210
211#ifdef CONFIG_SMP
212/* Number of blocks used and allocated. */
213static unsigned int pcpu_num_used, pcpu_num_allocated;
214/* Size of each block. -ve means used. */
215static int *pcpu_size;
216
217static int split_block(unsigned int i, unsigned short size)
218{
219 /* Reallocation required? */
220 if (pcpu_num_used + 1 > pcpu_num_allocated) {
221 int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
222 GFP_KERNEL);
223 if (!new)
224 return 0;
225
226 memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
227 pcpu_num_allocated *= 2;
228 kfree(pcpu_size);
229 pcpu_size = new;
230 }
231
232 /* Insert a new subblock */
233 memmove(&pcpu_size[i+1], &pcpu_size[i],
234 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
235 pcpu_num_used++;
236
237 pcpu_size[i+1] -= size;
238 pcpu_size[i] = size;
239 return 1;
240}
241
242static inline unsigned int block_size(int val)
243{
244 if (val < 0)
245 return -val;
246 return val;
247}
248
249/* Created by linker magic */
250extern char __per_cpu_start[], __per_cpu_end[];
251
252static void *percpu_modalloc(unsigned long size, unsigned long align)
253{
254 unsigned long extra;
255 unsigned int i;
256 void *ptr;
257
258 BUG_ON(align > SMP_CACHE_BYTES);
259
260 ptr = __per_cpu_start;
261 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
262 /* Extra for alignment requirement. */
263 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
264 BUG_ON(i == 0 && extra != 0);
265
266 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
267 continue;
268
269 /* Transfer extra to previous block. */
270 if (pcpu_size[i-1] < 0)
271 pcpu_size[i-1] -= extra;
272 else
273 pcpu_size[i-1] += extra;
274 pcpu_size[i] -= extra;
275 ptr += extra;
276
277 /* Split block if warranted */
278 if (pcpu_size[i] - size > sizeof(unsigned long))
279 if (!split_block(i, size))
280 return NULL;
281
282 /* Mark allocated */
283 pcpu_size[i] = -pcpu_size[i];
284 return ptr;
285 }
286
287 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
288 size);
289 return NULL;
290}
291
292static void percpu_modfree(void *freeme)
293{
294 unsigned int i;
295 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
296
297 /* First entry is core kernel percpu data. */
298 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
299 if (ptr == freeme) {
300 pcpu_size[i] = -pcpu_size[i];
301 goto free;
302 }
303 }
304 BUG();
305
306 free:
307 /* Merge with previous? */
308 if (pcpu_size[i-1] >= 0) {
309 pcpu_size[i-1] += pcpu_size[i];
310 pcpu_num_used--;
311 memmove(&pcpu_size[i], &pcpu_size[i+1],
312 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
313 i--;
314 }
315 /* Merge with next? */
316 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
317 pcpu_size[i] += pcpu_size[i+1];
318 pcpu_num_used--;
319 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
320 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
321 }
322}
323
324static unsigned int find_pcpusec(Elf_Ehdr *hdr,
325 Elf_Shdr *sechdrs,
326 const char *secstrings)
327{
328 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
329}
330
331static int percpu_modinit(void)
332{
333 pcpu_num_used = 2;
334 pcpu_num_allocated = 2;
335 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
336 GFP_KERNEL);
337 /* Static in-kernel percpu data (used). */
338 pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
339 /* Free room. */
340 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
341 if (pcpu_size[1] < 0) {
342 printk(KERN_ERR "No per-cpu room for modules.\n");
343 pcpu_num_used = 1;
344 }
345
346 return 0;
347}
348__initcall(percpu_modinit);
349#else /* ... !CONFIG_SMP */
350static inline void *percpu_modalloc(unsigned long size, unsigned long align)
351{
352 return NULL;
353}
354static inline void percpu_modfree(void *pcpuptr)
355{
356 BUG();
357}
358static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
359 Elf_Shdr *sechdrs,
360 const char *secstrings)
361{
362 return 0;
363}
364static inline void percpu_modcopy(void *pcpudst, const void *src,
365 unsigned long size)
366{
367 /* pcpusec should be 0, and size of that section should be 0. */
368 BUG_ON(size != 0);
369}
370#endif /* CONFIG_SMP */
371
372#ifdef CONFIG_MODULE_UNLOAD
373/* Init the unload section of the module. */
374static void module_unload_init(struct module *mod)
375{
376 unsigned int i;
377
378 INIT_LIST_HEAD(&mod->modules_which_use_me);
379 for (i = 0; i < NR_CPUS; i++)
380 local_set(&mod->ref[i].count, 0);
381 /* Hold reference count during initialization. */
382 local_set(&mod->ref[_smp_processor_id()].count, 1);
383 /* Backwards compatibility macros put refcount during init. */
384 mod->waiter = current;
385}
386
387/* modules using other modules */
388struct module_use
389{
390 struct list_head list;
391 struct module *module_which_uses;
392};
393
394/* Does a already use b? */
395static int already_uses(struct module *a, struct module *b)
396{
397 struct module_use *use;
398
399 list_for_each_entry(use, &b->modules_which_use_me, list) {
400 if (use->module_which_uses == a) {
401 DEBUGP("%s uses %s!\n", a->name, b->name);
402 return 1;
403 }
404 }
405 DEBUGP("%s does not use %s!\n", a->name, b->name);
406 return 0;
407}
408
409/* Module a uses b */
410static int use_module(struct module *a, struct module *b)
411{
412 struct module_use *use;
413 if (b == NULL || already_uses(a, b)) return 1;
414
415 if (!strong_try_module_get(b))
416 return 0;
417
418 DEBUGP("Allocating new usage for %s.\n", a->name);
419 use = kmalloc(sizeof(*use), GFP_ATOMIC);
420 if (!use) {
421 printk("%s: out of memory loading\n", a->name);
422 module_put(b);
423 return 0;
424 }
425
426 use->module_which_uses = a;
427 list_add(&use->list, &b->modules_which_use_me);
428 return 1;
429}
430
431/* Clear the unload stuff of the module. */
432static void module_unload_free(struct module *mod)
433{
434 struct module *i;
435
436 list_for_each_entry(i, &modules, list) {
437 struct module_use *use;
438
439 list_for_each_entry(use, &i->modules_which_use_me, list) {
440 if (use->module_which_uses == mod) {
441 DEBUGP("%s unusing %s\n", mod->name, i->name);
442 module_put(i);
443 list_del(&use->list);
444 kfree(use);
445 /* There can be at most one match. */
446 break;
447 }
448 }
449 }
450}
451
452#ifdef CONFIG_MODULE_FORCE_UNLOAD
453static inline int try_force(unsigned int flags)
454{
455 int ret = (flags & O_TRUNC);
456 if (ret)
457 tainted |= TAINT_FORCED_MODULE;
458 return ret;
459}
460#else
461static inline int try_force(unsigned int flags)
462{
463 return 0;
464}
465#endif /* CONFIG_MODULE_FORCE_UNLOAD */
466
467struct stopref
468{
469 struct module *mod;
470 int flags;
471 int *forced;
472};
473
474/* Whole machine is stopped with interrupts off when this runs. */
475static int __try_stop_module(void *_sref)
476{
477 struct stopref *sref = _sref;
478
479 /* If it's not unused, quit unless we are told to block. */
480 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
481 if (!(*sref->forced = try_force(sref->flags)))
482 return -EWOULDBLOCK;
483 }
484
485 /* Mark it as dying. */
486 sref->mod->state = MODULE_STATE_GOING;
487 return 0;
488}
489
490static int try_stop_module(struct module *mod, int flags, int *forced)
491{
492 struct stopref sref = { mod, flags, forced };
493
494 return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
495}
496
497unsigned int module_refcount(struct module *mod)
498{
499 unsigned int i, total = 0;
500
501 for (i = 0; i < NR_CPUS; i++)
502 total += local_read(&mod->ref[i].count);
503 return total;
504}
505EXPORT_SYMBOL(module_refcount);
506
507/* This exists whether we can unload or not */
508static void free_module(struct module *mod);
509
510static void wait_for_zero_refcount(struct module *mod)
511{
512 /* Since we might sleep for some time, drop the semaphore first */
513 up(&module_mutex);
514 for (;;) {
515 DEBUGP("Looking at refcount...\n");
516 set_current_state(TASK_UNINTERRUPTIBLE);
517 if (module_refcount(mod) == 0)
518 break;
519 schedule();
520 }
521 current->state = TASK_RUNNING;
522 down(&module_mutex);
523}
524
525asmlinkage long
526sys_delete_module(const char __user *name_user, unsigned int flags)
527{
528 struct module *mod;
529 char name[MODULE_NAME_LEN];
530 int ret, forced = 0;
531
532 if (!capable(CAP_SYS_MODULE))
533 return -EPERM;
534
535 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
536 return -EFAULT;
537 name[MODULE_NAME_LEN-1] = '\0';
538
539 if (down_interruptible(&module_mutex) != 0)
540 return -EINTR;
541
542 mod = find_module(name);
543 if (!mod) {
544 ret = -ENOENT;
545 goto out;
546 }
547
548 if (!list_empty(&mod->modules_which_use_me)) {
549 /* Other modules depend on us: get rid of them first. */
550 ret = -EWOULDBLOCK;
551 goto out;
552 }
553
554 /* Doing init or already dying? */
555 if (mod->state != MODULE_STATE_LIVE) {
556 /* FIXME: if (force), slam module count and wake up
557 waiter --RR */
558 DEBUGP("%s already dying\n", mod->name);
559 ret = -EBUSY;
560 goto out;
561 }
562
563 /* If it has an init func, it must have an exit func to unload */
564 if ((mod->init != NULL && mod->exit == NULL)
565 || mod->unsafe) {
566 forced = try_force(flags);
567 if (!forced) {
568 /* This module can't be removed */
569 ret = -EBUSY;
570 goto out;
571 }
572 }
573
574 /* Set this up before setting mod->state */
575 mod->waiter = current;
576
577 /* Stop the machine so refcounts can't move and disable module. */
578 ret = try_stop_module(mod, flags, &forced);
579 if (ret != 0)
580 goto out;
581
582 /* Never wait if forced. */
583 if (!forced && module_refcount(mod) != 0)
584 wait_for_zero_refcount(mod);
585
586	/* Final destruction, now that no one is using it. */
587 if (mod->exit != NULL) {
588 up(&module_mutex);
589 mod->exit();
590 down(&module_mutex);
591 }
592 free_module(mod);
593
594 out:
595 up(&module_mutex);
596 return ret;
597}
598
599static void print_unload_info(struct seq_file *m, struct module *mod)
600{
601 struct module_use *use;
602 int printed_something = 0;
603
604 seq_printf(m, " %u ", module_refcount(mod));
605
606 /* Always include a trailing , so userspace can differentiate
607 between this and the old multi-field proc format. */
608 list_for_each_entry(use, &mod->modules_which_use_me, list) {
609 printed_something = 1;
610 seq_printf(m, "%s,", use->module_which_uses->name);
611 }
612
613 if (mod->unsafe) {
614 printed_something = 1;
615 seq_printf(m, "[unsafe],");
616 }
617
618 if (mod->init != NULL && mod->exit == NULL) {
619 printed_something = 1;
620 seq_printf(m, "[permanent],");
621 }
622
623 if (!printed_something)
624 seq_printf(m, "-");
625}
626
627void __symbol_put(const char *symbol)
628{
629 struct module *owner;
630 unsigned long flags;
631 const unsigned long *crc;
632
633 spin_lock_irqsave(&modlist_lock, flags);
634 if (!__find_symbol(symbol, &owner, &crc, 1))
635 BUG();
636 module_put(owner);
637 spin_unlock_irqrestore(&modlist_lock, flags);
638}
639EXPORT_SYMBOL(__symbol_put);
640
641void symbol_put_addr(void *addr)
642{
643 unsigned long flags;
644
645 spin_lock_irqsave(&modlist_lock, flags);
646 if (!kernel_text_address((unsigned long)addr))
647 BUG();
648
649 module_put(module_text_address((unsigned long)addr));
650 spin_unlock_irqrestore(&modlist_lock, flags);
651}
652EXPORT_SYMBOL_GPL(symbol_put_addr);
653
654static ssize_t show_refcnt(struct module_attribute *mattr,
655 struct module *mod, char *buffer)
656{
657 /* sysfs holds a reference */
658 return sprintf(buffer, "%u\n", module_refcount(mod)-1);
659}
660
661static struct module_attribute refcnt = {
662 .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE },
663 .show = show_refcnt,
664};
665
666#else /* !CONFIG_MODULE_UNLOAD */
667static void print_unload_info(struct seq_file *m, struct module *mod)
668{
669	/* We don't know the usage count, or what modules are using it. */
670 seq_printf(m, " - -");
671}
672
673static inline void module_unload_free(struct module *mod)
674{
675}
676
677static inline int use_module(struct module *a, struct module *b)
678{
679 return strong_try_module_get(b);
680}
681
682static inline void module_unload_init(struct module *mod)
683{
684}
685#endif /* CONFIG_MODULE_UNLOAD */
686
687#ifdef CONFIG_OBSOLETE_MODPARM
688/* Bounds checking done below */
689static int obsparm_copy_string(const char *val, struct kernel_param *kp)
690{
691 strcpy(kp->arg, val);
692 return 0;
693}
694
695int set_obsolete(const char *val, struct kernel_param *kp)
696{
697 unsigned int min, max;
698 unsigned int size, maxsize;
699 int dummy;
700 char *endp;
701 const char *p;
702 struct obsolete_modparm *obsparm = kp->arg;
703
704 if (!val) {
705 printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
706 return -EINVAL;
707 }
708
709 /* type is: [min[-max]]{b,h,i,l,s} */
710 p = obsparm->type;
711 min = simple_strtol(p, &endp, 10);
712 if (endp == obsparm->type)
713 min = max = 1;
714 else if (*endp == '-') {
715 p = endp+1;
716 max = simple_strtol(p, &endp, 10);
717 } else
718 max = min;
719 switch (*endp) {
720 case 'b':
721 return param_array(kp->name, val, min, max, obsparm->addr,
722 1, param_set_byte, &dummy);
723 case 'h':
724 return param_array(kp->name, val, min, max, obsparm->addr,
725 sizeof(short), param_set_short, &dummy);
726 case 'i':
727 return param_array(kp->name, val, min, max, obsparm->addr,
728 sizeof(int), param_set_int, &dummy);
729 case 'l':
730 return param_array(kp->name, val, min, max, obsparm->addr,
731 sizeof(long), param_set_long, &dummy);
732 case 's':
733 return param_array(kp->name, val, min, max, obsparm->addr,
734 sizeof(char *), param_set_charp, &dummy);
735
736 case 'c':
737 /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
738 and the decl is "char xxx[5][50];" */
739 p = endp+1;
740 maxsize = simple_strtol(p, &endp, 10);
741 /* We check lengths here (yes, this is a hack). */
742 p = val;
743 while (p[size = strcspn(p, ",")]) {
744 if (size >= maxsize)
745 goto oversize;
746 p += size+1;
747 }
748 if (size >= maxsize)
749 goto oversize;
750 return param_array(kp->name, val, min, max, obsparm->addr,
751 maxsize, obsparm_copy_string, &dummy);
752 }
753 printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
754 return -EINVAL;
755 oversize:
756 printk(KERN_ERR
757 "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
758 return -EINVAL;
759}
760
761static int obsolete_params(const char *name,
762 char *args,
763 struct obsolete_modparm obsparm[],
764 unsigned int num,
765 Elf_Shdr *sechdrs,
766 unsigned int symindex,
767 const char *strtab)
768{
769 struct kernel_param *kp;
770 unsigned int i;
771 int ret;
772
773 kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
774 if (!kp)
775 return -ENOMEM;
776
777 for (i = 0; i < num; i++) {
778 char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
779
780 snprintf(sym_name, sizeof(sym_name), "%s%s",
781 MODULE_SYMBOL_PREFIX, obsparm[i].name);
782
783 kp[i].name = obsparm[i].name;
784 kp[i].perm = 000;
785 kp[i].set = set_obsolete;
786 kp[i].get = NULL;
787 obsparm[i].addr
788 = (void *)find_local_symbol(sechdrs, symindex, strtab,
789 sym_name);
790 if (!obsparm[i].addr) {
791 printk("%s: falsely claims to have parameter %s\n",
792 name, obsparm[i].name);
793 ret = -EINVAL;
794 goto out;
795 }
796 kp[i].arg = &obsparm[i];
797 }
798
799 ret = parse_args(name, args, kp, num, NULL);
800 out:
801 kfree(kp);
802 return ret;
803}
804#else
805static int obsolete_params(const char *name,
806 char *args,
807 struct obsolete_modparm obsparm[],
808 unsigned int num,
809 Elf_Shdr *sechdrs,
810 unsigned int symindex,
811 const char *strtab)
812{
813 if (num != 0)
814 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
815 name);
816 return 0;
817}
818#endif /* CONFIG_OBSOLETE_MODPARM */
819
820static const char vermagic[] = VERMAGIC_STRING;
821
822#ifdef CONFIG_MODVERSIONS
823static int check_version(Elf_Shdr *sechdrs,
824 unsigned int versindex,
825 const char *symname,
826 struct module *mod,
827 const unsigned long *crc)
828{
829 unsigned int i, num_versions;
830 struct modversion_info *versions;
831
832 /* Exporting module didn't supply crcs? OK, we're already tainted. */
833 if (!crc)
834 return 1;
835
836 versions = (void *) sechdrs[versindex].sh_addr;
837 num_versions = sechdrs[versindex].sh_size
838 / sizeof(struct modversion_info);
839
840 for (i = 0; i < num_versions; i++) {
841 if (strcmp(versions[i].name, symname) != 0)
842 continue;
843
844 if (versions[i].crc == *crc)
845 return 1;
846 printk("%s: disagrees about version of symbol %s\n",
847 mod->name, symname);
848 DEBUGP("Found checksum %lX vs module %lX\n",
849 *crc, versions[i].crc);
850 return 0;
851 }
852 /* Not in module's version table. OK, but that taints the kernel. */
853 if (!(tainted & TAINT_FORCED_MODULE)) {
854 printk("%s: no version for \"%s\" found: kernel tainted.\n",
855 mod->name, symname);
856 tainted |= TAINT_FORCED_MODULE;
857 }
858 return 1;
859}
860
861static inline int check_modstruct_version(Elf_Shdr *sechdrs,
862 unsigned int versindex,
863 struct module *mod)
864{
865 const unsigned long *crc;
866 struct module *owner;
867
868 if (!__find_symbol("struct_module", &owner, &crc, 1))
869 BUG();
870 return check_version(sechdrs, versindex, "struct_module", mod,
871 crc);
872}
873
874/* First part is kernel version, which we ignore. */
875static inline int same_magic(const char *amagic, const char *bmagic)
876{
877 amagic += strcspn(amagic, " ");
878 bmagic += strcspn(bmagic, " ");
879 return strcmp(amagic, bmagic) == 0;
880}
881#else
882static inline int check_version(Elf_Shdr *sechdrs,
883 unsigned int versindex,
884 const char *symname,
885 struct module *mod,
886 const unsigned long *crc)
887{
888 return 1;
889}
890
891static inline int check_modstruct_version(Elf_Shdr *sechdrs,
892 unsigned int versindex,
893 struct module *mod)
894{
895 return 1;
896}
897
898static inline int same_magic(const char *amagic, const char *bmagic)
899{
900 return strcmp(amagic, bmagic) == 0;
901}
902#endif /* CONFIG_MODVERSIONS */
903
904/* Resolve a symbol for this module. I.e. if we find one, record usage.
905 Must be holding module_mutex. */
906static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
907 unsigned int versindex,
908 const char *name,
909 struct module *mod)
910{
911 struct module *owner;
912 unsigned long ret;
913 const unsigned long *crc;
914
915 spin_lock_irq(&modlist_lock);
916 ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
917 if (ret) {
918 /* use_module can fail due to OOM, or module unloading */
919 if (!check_version(sechdrs, versindex, name, mod, crc) ||
920 !use_module(mod, owner))
921 ret = 0;
922 }
923 spin_unlock_irq(&modlist_lock);
924 return ret;
925}
926
927
928/*
929 * /sys/module/foo/sections stuff
930 * J. Corbet <corbet@lwn.net>
931 */
932#ifdef CONFIG_KALLSYMS
933static ssize_t module_sect_show(struct module_attribute *mattr,
934 struct module *mod, char *buf)
935{
936 struct module_sect_attr *sattr =
937 container_of(mattr, struct module_sect_attr, mattr);
938 return sprintf(buf, "0x%lx\n", sattr->address);
939}
940
941static void add_sect_attrs(struct module *mod, unsigned int nsect,
942 char *secstrings, Elf_Shdr *sechdrs)
943{
944 unsigned int nloaded = 0, i, size[2];
945 struct module_sect_attrs *sect_attrs;
946 struct module_sect_attr *sattr;
947 struct attribute **gattr;
948
949 /* Count loaded sections and allocate structures */
950 for (i = 0; i < nsect; i++)
951 if (sechdrs[i].sh_flags & SHF_ALLOC)
952 nloaded++;
953 size[0] = ALIGN(sizeof(*sect_attrs)
954 + nloaded * sizeof(sect_attrs->attrs[0]),
955 sizeof(sect_attrs->grp.attrs[0]));
956 size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
957 if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
958 return;
959
960 /* Setup section attributes. */
961 sect_attrs->grp.name = "sections";
962 sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
963
964 sattr = &sect_attrs->attrs[0];
965 gattr = &sect_attrs->grp.attrs[0];
966 for (i = 0; i < nsect; i++) {
967 if (! (sechdrs[i].sh_flags & SHF_ALLOC))
968 continue;
969 sattr->address = sechdrs[i].sh_addr;
970 strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
971 MODULE_SECT_NAME_LEN);
972 sattr->mattr.show = module_sect_show;
973 sattr->mattr.store = NULL;
974 sattr->mattr.attr.name = sattr->name;
975 sattr->mattr.attr.owner = mod;
976 sattr->mattr.attr.mode = S_IRUGO;
977 *(gattr++) = &(sattr++)->mattr.attr;
978 }
979 *gattr = NULL;
980
981 if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
982 goto out;
983
984 mod->sect_attrs = sect_attrs;
985 return;
986 out:
987 kfree(sect_attrs);
988}
989
990static void remove_sect_attrs(struct module *mod)
991{
992 if (mod->sect_attrs) {
993 sysfs_remove_group(&mod->mkobj.kobj,
994 &mod->sect_attrs->grp);
995 /* We are positive that no one is using any sect attrs
996 * at this point. Deallocate immediately. */
997 kfree(mod->sect_attrs);
998 mod->sect_attrs = NULL;
999 }
1000}
1001
1002
1003#else
1004static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
1005 char *sectstrings, Elf_Shdr *sechdrs)
1006{
1007}
1008
1009static inline void remove_sect_attrs(struct module *mod)
1010{
1011}
1012#endif /* CONFIG_KALLSYMS */
1013
1014
1015#ifdef CONFIG_MODULE_UNLOAD
1016static inline int module_add_refcnt_attr(struct module *mod)
1017{
1018 return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
1019}
1020static void module_remove_refcnt_attr(struct module *mod)
1021{
1022 return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
1023}
1024#else
1025static inline int module_add_refcnt_attr(struct module *mod)
1026{
1027 return 0;
1028}
1029static void module_remove_refcnt_attr(struct module *mod)
1030{
1031}
1032#endif
1033
1034
1035static int mod_sysfs_setup(struct module *mod,
1036 struct kernel_param *kparam,
1037 unsigned int num_params)
1038{
1039 int err;
1040
1041 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
1042 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
1043 if (err)
1044 goto out;
1045 kobj_set_kset_s(&mod->mkobj, module_subsys);
1046 mod->mkobj.mod = mod;
1047 err = kobject_register(&mod->mkobj.kobj);
1048 if (err)
1049 goto out;
1050
1051 err = module_add_refcnt_attr(mod);
1052 if (err)
1053 goto out_unreg;
1054
1055 err = module_param_sysfs_setup(mod, kparam, num_params);
1056 if (err)
1057 goto out_unreg;
1058
1059 return 0;
1060
1061out_unreg:
1062 kobject_unregister(&mod->mkobj.kobj);
1063out:
1064 return err;
1065}
1066
1067static void mod_kobject_remove(struct module *mod)
1068{
1069 module_remove_refcnt_attr(mod);
1070 module_param_sysfs_remove(mod);
1071
1072 kobject_unregister(&mod->mkobj.kobj);
1073}
1074
1075/*
1076 * unlink the module while the whole machine is stopped with interrupts off
1077 * - this defends against kallsyms not taking locks
1078 */
1079static int __unlink_module(void *_mod)
1080{
1081 struct module *mod = _mod;
1082 list_del(&mod->list);
1083 return 0;
1084}
1085
1086/* Free a module, remove from lists, etc (must hold module mutex). */
1087static void free_module(struct module *mod)
1088{
1089 /* Delete from various lists */
1090 stop_machine_run(__unlink_module, mod, NR_CPUS);
1091 remove_sect_attrs(mod);
1092 mod_kobject_remove(mod);
1093
1094 /* Arch-specific cleanup. */
1095 module_arch_cleanup(mod);
1096
1097 /* Module unload stuff */
1098 module_unload_free(mod);
1099
1100 /* This may be NULL, but that's OK */
1101 module_free(mod, mod->module_init);
1102 kfree(mod->args);
1103 if (mod->percpu)
1104 percpu_modfree(mod->percpu);
1105
1106 /* Finally, free the core (containing the module structure) */
1107 module_free(mod, mod->module_core);
1108}
1109
1110void *__symbol_get(const char *symbol)
1111{
1112 struct module *owner;
1113 unsigned long value, flags;
1114 const unsigned long *crc;
1115
1116 spin_lock_irqsave(&modlist_lock, flags);
1117 value = __find_symbol(symbol, &owner, &crc, 1);
1118 if (value && !strong_try_module_get(owner))
1119 value = 0;
1120 spin_unlock_irqrestore(&modlist_lock, flags);
1121
1122 return (void *)value;
1123}
1124EXPORT_SYMBOL_GPL(__symbol_get);
1125
1126/* Change all symbols so that st_value encodes the pointer directly. */
1127static int simplify_symbols(Elf_Shdr *sechdrs,
1128 unsigned int symindex,
1129 const char *strtab,
1130 unsigned int versindex,
1131 unsigned int pcpuindex,
1132 struct module *mod)
1133{
1134 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1135 unsigned long secbase;
1136 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1137 int ret = 0;
1138
1139 for (i = 1; i < n; i++) {
1140 switch (sym[i].st_shndx) {
1141 case SHN_COMMON:
1142 /* We compiled with -fno-common. These are not
1143 supposed to happen. */
1144 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name);
1145 printk("%s: please compile with -fno-common\n",
1146 mod->name);
1147 ret = -ENOEXEC;
1148 break;
1149
1150 case SHN_ABS:
1151 /* Don't need to do anything */
1152 DEBUGP("Absolute symbol: 0x%08lx\n",
1153 (long)sym[i].st_value);
1154 break;
1155
1156 case SHN_UNDEF:
1157 sym[i].st_value
1158 = resolve_symbol(sechdrs, versindex,
1159 strtab + sym[i].st_name, mod);
1160
1161 /* Ok if resolved. */
1162 if (sym[i].st_value != 0)
1163 break;
1164 /* Ok if weak. */
1165 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1166 break;
1167
1168 printk(KERN_WARNING "%s: Unknown symbol %s\n",
1169 mod->name, strtab + sym[i].st_name);
1170 ret = -ENOENT;
1171 break;
1172
1173 default:
1174 /* Divert to percpu allocation if a percpu var. */
1175 if (sym[i].st_shndx == pcpuindex)
1176 secbase = (unsigned long)mod->percpu;
1177 else
1178 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1179 sym[i].st_value += secbase;
1180 break;
1181 }
1182 }
1183
1184 return ret;
1185}
1186
1187/* Update size with this section: return offset. */
1188static long get_offset(unsigned long *size, Elf_Shdr *sechdr)
1189{
1190 long ret;
1191
1192 ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
1193 *size = ret + sechdr->sh_size;
1194 return ret;
1195}
1196
1197/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
1198 might -- code, read-only data, read-write data, small data. Tally
1199 sizes, and place the offsets into sh_entsize fields: high bit means it
1200 belongs in init. */
1201static void layout_sections(struct module *mod,
1202 const Elf_Ehdr *hdr,
1203 Elf_Shdr *sechdrs,
1204 const char *secstrings)
1205{
1206 static unsigned long const masks[][2] = {
1207 /* NOTE: all executable code must be the first section
1208 * in this array; otherwise modify the text_size
1209 * finder in the two loops below */
1210 { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
1211 { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
1212 { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
1213 { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
1214 };
1215 unsigned int m, i;
1216
1217 for (i = 0; i < hdr->e_shnum; i++)
1218 sechdrs[i].sh_entsize = ~0UL;
1219
1220 DEBUGP("Core section allocation order:\n");
1221 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1222 for (i = 0; i < hdr->e_shnum; ++i) {
1223 Elf_Shdr *s = &sechdrs[i];
1224
1225 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1226 || (s->sh_flags & masks[m][1])
1227 || s->sh_entsize != ~0UL
1228 || strncmp(secstrings + s->sh_name,
1229 ".init", 5) == 0)
1230 continue;
1231 s->sh_entsize = get_offset(&mod->core_size, s);
1232 DEBUGP("\t%s\n", secstrings + s->sh_name);
1233 }
1234 if (m == 0)
1235 mod->core_text_size = mod->core_size;
1236 }
1237
1238 DEBUGP("Init section allocation order:\n");
1239 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1240 for (i = 0; i < hdr->e_shnum; ++i) {
1241 Elf_Shdr *s = &sechdrs[i];
1242
1243 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1244 || (s->sh_flags & masks[m][1])
1245 || s->sh_entsize != ~0UL
1246 || strncmp(secstrings + s->sh_name,
1247 ".init", 5) != 0)
1248 continue;
1249 s->sh_entsize = (get_offset(&mod->init_size, s)
1250 | INIT_OFFSET_MASK);
1251 DEBUGP("\t%s\n", secstrings + s->sh_name);
1252 }
1253 if (m == 0)
1254 mod->init_text_size = mod->init_size;
1255 }
1256}
1257
1258static inline int license_is_gpl_compatible(const char *license)
1259{
1260 return (strcmp(license, "GPL") == 0
1261 || strcmp(license, "GPL v2") == 0
1262 || strcmp(license, "GPL and additional rights") == 0
1263 || strcmp(license, "Dual BSD/GPL") == 0
1264 || strcmp(license, "Dual MPL/GPL") == 0);
1265}
1266
1267static void set_license(struct module *mod, const char *license)
1268{
1269 if (!license)
1270 license = "unspecified";
1271
1272 mod->license_gplok = license_is_gpl_compatible(license);
1273 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
1274 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
1275 mod->name, license);
1276 tainted |= TAINT_PROPRIETARY_MODULE;
1277 }
1278}
1279
1280/* Parse tag=value strings from .modinfo section */
1281static char *next_string(char *string, unsigned long *secsize)
1282{
1283 /* Skip non-zero chars */
1284 while (string[0]) {
1285 string++;
1286 if ((*secsize)-- <= 1)
1287 return NULL;
1288 }
1289
1290 /* Skip any zero padding. */
1291 while (!string[0]) {
1292 string++;
1293 if ((*secsize)-- <= 1)
1294 return NULL;
1295 }
1296 return string;
1297}
1298
1299static char *get_modinfo(Elf_Shdr *sechdrs,
1300 unsigned int info,
1301 const char *tag)
1302{
1303 char *p;
1304 unsigned int taglen = strlen(tag);
1305 unsigned long size = sechdrs[info].sh_size;
1306
1307 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) {
1308 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1309 return p + taglen + 1;
1310 }
1311 return NULL;
1312}
1313
1314#ifdef CONFIG_KALLSYMS
1315int is_exported(const char *name, const struct module *mod)
1316{
1317 unsigned int i;
1318
1319 if (!mod) {
1320 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++)
1321 if (strcmp(__start___ksymtab[i].name, name) == 0)
1322 return 1;
1323 return 0;
1324 }
1325 for (i = 0; i < mod->num_syms; i++)
1326 if (strcmp(mod->syms[i].name, name) == 0)
1327 return 1;
1328 return 0;
1329}
1330
1331/* As per nm */
1332static char elf_type(const Elf_Sym *sym,
1333 Elf_Shdr *sechdrs,
1334 const char *secstrings,
1335 struct module *mod)
1336{
1337 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1338 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1339 return 'v';
1340 else
1341 return 'w';
1342 }
1343 if (sym->st_shndx == SHN_UNDEF)
1344 return 'U';
1345 if (sym->st_shndx == SHN_ABS)
1346 return 'a';
1347 if (sym->st_shndx >= SHN_LORESERVE)
1348 return '?';
1349 if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
1350 return 't';
1351 if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
1352 && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
1353 if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
1354 return 'r';
1355 else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
1356 return 'g';
1357 else
1358 return 'd';
1359 }
1360 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
1361 if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
1362 return 's';
1363 else
1364 return 'b';
1365 }
1366 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
1367 ".debug", strlen(".debug")) == 0)
1368 return 'n';
1369 return '?';
1370}
1371
1372static void add_kallsyms(struct module *mod,
1373 Elf_Shdr *sechdrs,
1374 unsigned int symindex,
1375 unsigned int strindex,
1376 const char *secstrings)
1377{
1378 unsigned int i;
1379
1380 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1381 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1382 mod->strtab = (void *)sechdrs[strindex].sh_addr;
1383
1384 /* Set types up while we still have access to sections. */
1385 for (i = 0; i < mod->num_symtab; i++)
1386 mod->symtab[i].st_info
1387 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1388}
1389#else
1390static inline void add_kallsyms(struct module *mod,
1391 Elf_Shdr *sechdrs,
1392 unsigned int symindex,
1393 unsigned int strindex,
1394 const char *secstrings)
1395{
1396}
1397#endif /* CONFIG_KALLSYMS */
1398
1399/* Allocate and load the module: note that size of section 0 is always
1400 zero, and we rely on this for optional sections. */
1401static struct module *load_module(void __user *umod,
1402 unsigned long len,
1403 const char __user *uargs)
1404{
1405 Elf_Ehdr *hdr;
1406 Elf_Shdr *sechdrs;
1407 char *secstrings, *args, *modmagic, *strtab = NULL;
1408 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
1409 exportindex, modindex, obsparmindex, infoindex, gplindex,
1410 crcindex, gplcrcindex, versindex, pcpuindex;
1411 long arglen;
1412 struct module *mod;
1413 long err = 0;
1414 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1415 struct exception_table_entry *extable;
1416
1417 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1418 umod, len, uargs);
1419 if (len < sizeof(*hdr))
1420 return ERR_PTR(-ENOEXEC);
1421
1422 /* Suck in entire file: we'll want most of it. */
1423 /* vmalloc barfs on "unusual" numbers. Check here */
1424 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1425 return ERR_PTR(-ENOMEM);
1426 if (copy_from_user(hdr, umod, len) != 0) {
1427 err = -EFAULT;
1428 goto free_hdr;
1429 }
1430
1431 /* Sanity checks against insmoding binaries or wrong arch,
1432 weird elf version */
1433 if (memcmp(hdr->e_ident, ELFMAG, 4) != 0
1434 || hdr->e_type != ET_REL
1435 || !elf_check_arch(hdr)
1436 || hdr->e_shentsize != sizeof(*sechdrs)) {
1437 err = -ENOEXEC;
1438 goto free_hdr;
1439 }
1440
1441 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr))
1442 goto truncated;
1443
1444 /* Convenience variables */
1445 sechdrs = (void *)hdr + hdr->e_shoff;
1446 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
1447 sechdrs[0].sh_addr = 0;
1448
1449 for (i = 1; i < hdr->e_shnum; i++) {
1450 if (sechdrs[i].sh_type != SHT_NOBITS
1451 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
1452 goto truncated;
1453
1454 /* Mark all sections sh_addr with their address in the
1455 temporary image. */
1456 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
1457
1458 /* Internal symbols and strings. */
1459 if (sechdrs[i].sh_type == SHT_SYMTAB) {
1460 symindex = i;
1461 strindex = sechdrs[i].sh_link;
1462 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
1463 }
1464#ifndef CONFIG_MODULE_UNLOAD
1465 /* Don't load .exit sections */
1466 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
1467 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1468#endif
1469 }
1470
1471 modindex = find_sec(hdr, sechdrs, secstrings,
1472 ".gnu.linkonce.this_module");
1473 if (!modindex) {
1474 printk(KERN_WARNING "No module found in object\n");
1475 err = -ENOEXEC;
1476 goto free_hdr;
1477 }
1478 mod = (void *)sechdrs[modindex].sh_addr;
1479
1480 if (symindex == 0) {
1481 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
1482 mod->name);
1483 err = -ENOEXEC;
1484 goto free_hdr;
1485 }
1486
1487 /* Optional sections */
1488 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1489 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1490 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1491 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1492 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1493 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1494 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1495 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1496 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1497 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1498
1499 /* Don't keep modinfo section */
1500 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1501#ifdef CONFIG_KALLSYMS
1502 /* Keep symbol and string tables for decoding later. */
1503 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1504 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1505#endif
1506
1507 /* Check module struct version now, before we try to use module. */
1508 if (!check_modstruct_version(sechdrs, versindex, mod)) {
1509 err = -ENOEXEC;
1510 goto free_hdr;
1511 }
1512
1513 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1514 /* This is allowed: modprobe --force will invalidate it. */
1515 if (!modmagic) {
1516 tainted |= TAINT_FORCED_MODULE;
1517 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1518 mod->name);
1519 } else if (!same_magic(modmagic, vermagic)) {
1520 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
1521 mod->name, modmagic, vermagic);
1522 err = -ENOEXEC;
1523 goto free_hdr;
1524 }
1525
1526 /* Now copy in args */
1527 arglen = strlen_user(uargs);
1528 if (!arglen) {
1529 err = -EFAULT;
1530 goto free_hdr;
1531 }
1532 args = kmalloc(arglen, GFP_KERNEL);
1533 if (!args) {
1534 err = -ENOMEM;
1535 goto free_hdr;
1536 }
1537 if (copy_from_user(args, uargs, arglen) != 0) {
1538 err = -EFAULT;
1539 goto free_mod;
1540 }
1541
1542 if (find_module(mod->name)) {
1543 err = -EEXIST;
1544 goto free_mod;
1545 }
1546
1547 mod->state = MODULE_STATE_COMING;
1548
1549 /* Allow arches to frob section contents and sizes. */
1550 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod);
1551 if (err < 0)
1552 goto free_mod;
1553
1554 if (pcpuindex) {
1555 /* We have a special allocation for this section. */
1556 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
1557 sechdrs[pcpuindex].sh_addralign);
1558 if (!percpu) {
1559 err = -ENOMEM;
1560 goto free_mod;
1561 }
1562 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1563 mod->percpu = percpu;
1564 }
1565
1566 /* Determine total sizes, and put offsets in sh_entsize. For now
1567	   this is done generically; there don't appear to be any
1568 special cases for the architectures. */
1569 layout_sections(mod, hdr, sechdrs, secstrings);
1570
1571 /* Do the allocs. */
1572 ptr = module_alloc(mod->core_size);
1573 if (!ptr) {
1574 err = -ENOMEM;
1575 goto free_percpu;
1576 }
1577 memset(ptr, 0, mod->core_size);
1578 mod->module_core = ptr;
1579
1580 ptr = module_alloc(mod->init_size);
1581 if (!ptr && mod->init_size) {
1582 err = -ENOMEM;
1583 goto free_core;
1584 }
1585 memset(ptr, 0, mod->init_size);
1586 mod->module_init = ptr;
1587
1588 /* Transfer each section which specifies SHF_ALLOC */
1589 DEBUGP("final section addresses:\n");
1590 for (i = 0; i < hdr->e_shnum; i++) {
1591 void *dest;
1592
1593 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1594 continue;
1595
1596 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
1597 dest = mod->module_init
1598 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
1599 else
1600 dest = mod->module_core + sechdrs[i].sh_entsize;
1601
1602 if (sechdrs[i].sh_type != SHT_NOBITS)
1603 memcpy(dest, (void *)sechdrs[i].sh_addr,
1604 sechdrs[i].sh_size);
1605 /* Update sh_addr to point to copy in image. */
1606 sechdrs[i].sh_addr = (unsigned long)dest;
1607 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
1608 }
1609 /* Module has been moved. */
1610 mod = (void *)sechdrs[modindex].sh_addr;
1611
1612 /* Now we've moved module, initialize linked lists, etc. */
1613 module_unload_init(mod);
1614
1615 /* Set up license info based on the info section */
1616 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1617
1618 /* Fix up syms, so that st_value is a pointer to location. */
1619 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
1620 mod);
1621 if (err < 0)
1622 goto cleanup;
1623
1624 /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
1625 mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
1626 mod->syms = (void *)sechdrs[exportindex].sh_addr;
1627 if (crcindex)
1628 mod->crcs = (void *)sechdrs[crcindex].sh_addr;
1629 mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
1630 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
1631 if (gplcrcindex)
1632 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1633
1634#ifdef CONFIG_MODVERSIONS
1635 if ((mod->num_syms && !crcindex) ||
1636 (mod->num_gpl_syms && !gplcrcindex)) {
1637 printk(KERN_WARNING "%s: No versions for exported symbols."
1638 " Tainting kernel.\n", mod->name);
1639 tainted |= TAINT_FORCED_MODULE;
1640 }
1641#endif
1642
1643 /* Now do relocations. */
1644 for (i = 1; i < hdr->e_shnum; i++) {
1645 const char *strtab = (char *)sechdrs[strindex].sh_addr;
1646 unsigned int info = sechdrs[i].sh_info;
1647
1648 /* Not a valid relocation section? */
1649 if (info >= hdr->e_shnum)
1650 continue;
1651
1652 /* Don't bother with non-allocated sections */
1653 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
1654 continue;
1655
1656 if (sechdrs[i].sh_type == SHT_REL)
1657 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
1658 else if (sechdrs[i].sh_type == SHT_RELA)
1659 err = apply_relocate_add(sechdrs, strtab, symindex, i,
1660 mod);
1661 if (err < 0)
1662 goto cleanup;
1663 }
1664
1665 /* Set up and sort exception table */
1666 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
1667 mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
1668 sort_extable(extable, extable + mod->num_exentries);
1669
1670 /* Finally, copy percpu area over. */
1671 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
1672 sechdrs[pcpuindex].sh_size);
1673
1674 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
1675
1676 err = module_finalize(hdr, sechdrs, mod);
1677 if (err < 0)
1678 goto cleanup;
1679
1680 mod->args = args;
1681 if (obsparmindex) {
1682 err = obsolete_params(mod->name, mod->args,
1683 (struct obsolete_modparm *)
1684 sechdrs[obsparmindex].sh_addr,
1685 sechdrs[obsparmindex].sh_size
1686 / sizeof(struct obsolete_modparm),
1687 sechdrs, symindex,
1688 (char *)sechdrs[strindex].sh_addr);
1689 if (setupindex)
1690 printk(KERN_WARNING "%s: Ignoring new-style "
1691 "parameters in presence of obsolete ones\n",
1692 mod->name);
1693 } else {
1694 /* Size of section 0 is 0, so this works well if no params */
1695 err = parse_args(mod->name, mod->args,
1696 (struct kernel_param *)
1697 sechdrs[setupindex].sh_addr,
1698 sechdrs[setupindex].sh_size
1699 / sizeof(struct kernel_param),
1700 NULL);
1701 }
1702 if (err < 0)
1703 goto arch_cleanup;
1704
1705 err = mod_sysfs_setup(mod,
1706 (struct kernel_param *)
1707 sechdrs[setupindex].sh_addr,
1708 sechdrs[setupindex].sh_size
1709 / sizeof(struct kernel_param));
1710 if (err < 0)
1711 goto arch_cleanup;
1712 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1713
1714 /* Get rid of temporary copy */
1715 vfree(hdr);
1716
1717 /* Done! */
1718 return mod;
1719
1720 arch_cleanup:
1721 module_arch_cleanup(mod);
1722 cleanup:
1723 module_unload_free(mod);
1724 module_free(mod, mod->module_init);
1725 free_core:
1726 module_free(mod, mod->module_core);
1727 free_percpu:
1728 if (percpu)
1729 percpu_modfree(percpu);
1730 free_mod:
1731 kfree(args);
1732 free_hdr:
1733 vfree(hdr);
1734 if (err < 0) return ERR_PTR(err);
1735 else return ptr;
1736
1737 truncated:
1738 printk(KERN_ERR "Module len %lu truncated\n", len);
1739 err = -ENOEXEC;
1740 goto free_hdr;
1741}
1742
1743/*
1744 * link the module while the whole machine is stopped with interrupts off
1745 * - this defends against kallsyms not taking locks
1746 */
1747static int __link_module(void *_mod)
1748{
1749 struct module *mod = _mod;
1750 list_add(&mod->list, &modules);
1751 return 0;
1752}
1753
1754/* This is where the real work happens */
1755asmlinkage long
1756sys_init_module(void __user *umod,
1757 unsigned long len,
1758 const char __user *uargs)
1759{
1760 struct module *mod;
1761 int ret = 0;
1762
1763 /* Must have permission */
1764 if (!capable(CAP_SYS_MODULE))
1765 return -EPERM;
1766
1767 /* Only one module load at a time, please */
1768 if (down_interruptible(&module_mutex) != 0)
1769 return -EINTR;
1770
1771 /* Do all the hard work */
1772 mod = load_module(umod, len, uargs);
1773 if (IS_ERR(mod)) {
1774 up(&module_mutex);
1775 return PTR_ERR(mod);
1776 }
1777
1778 /* Flush the instruction cache, since we've played with text */
1779 if (mod->module_init)
1780 flush_icache_range((unsigned long)mod->module_init,
1781 (unsigned long)mod->module_init
1782 + mod->init_size);
1783 flush_icache_range((unsigned long)mod->module_core,
1784 (unsigned long)mod->module_core + mod->core_size);
1785
1786 /* Now sew it into the lists. They won't access us, since
1787 strong_try_module_get() will fail. */
1788 stop_machine_run(__link_module, mod, NR_CPUS);
1789
1790 /* Drop lock so they can recurse */
1791 up(&module_mutex);
1792
1793 down(&notify_mutex);
1794 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
1795 up(&notify_mutex);
1796
1797 /* Start the module */
1798 if (mod->init != NULL)
1799 ret = mod->init();
1800 if (ret < 0) {
1801 /* Init routine failed: abort. Try to protect us from
1802 buggy refcounters. */
1803 mod->state = MODULE_STATE_GOING;
1804 synchronize_kernel();
1805 if (mod->unsafe)
1806 printk(KERN_ERR "%s: module is now stuck!\n",
1807 mod->name);
1808 else {
1809 module_put(mod);
1810 down(&module_mutex);
1811 free_module(mod);
1812 up(&module_mutex);
1813 }
1814 return ret;
1815 }
1816
1817 /* Now it's a first class citizen! */
1818 down(&module_mutex);
1819 mod->state = MODULE_STATE_LIVE;
1820 /* Drop initial reference. */
1821 module_put(mod);
1822 module_free(mod, mod->module_init);
1823 mod->module_init = NULL;
1824 mod->init_size = 0;
1825 mod->init_text_size = 0;
1826 up(&module_mutex);
1827
1828 return 0;
1829}
1830
1831static inline int within(unsigned long addr, void *start, unsigned long size)
1832{
1833 return ((void *)addr >= start && (void *)addr < start + size);
1834}
1835
1836#ifdef CONFIG_KALLSYMS
1837/*
1838 * This ignores the intensely annoying "mapping symbols" found
1839 * in ARM ELF files: $a, $t and $d.
1840 */
1841static inline int is_arm_mapping_symbol(const char *str)
1842{
1843 return str[0] == '$' && strchr("atd", str[1])
1844 && (str[2] == '\0' || str[2] == '.');
1845}
1846
1847static const char *get_ksymbol(struct module *mod,
1848 unsigned long addr,
1849 unsigned long *size,
1850 unsigned long *offset)
1851{
1852 unsigned int i, best = 0;
1853 unsigned long nextval;
1854
1855	/* At worst, the next value is at the end of the module */
1856 if (within(addr, mod->module_init, mod->init_size))
1857 nextval = (unsigned long)mod->module_init+mod->init_text_size;
1858 else
1859 nextval = (unsigned long)mod->module_core+mod->core_text_size;
1860
1861	/* Scan for closest preceding symbol, and next symbol. (ELF
1862 starts real symbols at 1). */
1863 for (i = 1; i < mod->num_symtab; i++) {
1864 if (mod->symtab[i].st_shndx == SHN_UNDEF)
1865 continue;
1866
1867 /* We ignore unnamed symbols: they're uninformative
1868 * and inserted at a whim. */
1869 if (mod->symtab[i].st_value <= addr
1870 && mod->symtab[i].st_value > mod->symtab[best].st_value
1871 && *(mod->strtab + mod->symtab[i].st_name) != '\0'
1872 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
1873 best = i;
1874 if (mod->symtab[i].st_value > addr
1875 && mod->symtab[i].st_value < nextval
1876 && *(mod->strtab + mod->symtab[i].st_name) != '\0'
1877 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
1878 nextval = mod->symtab[i].st_value;
1879 }
1880
1881 if (!best)
1882 return NULL;
1883
1884 *size = nextval - mod->symtab[best].st_value;
1885 *offset = addr - mod->symtab[best].st_value;
1886 return mod->strtab + mod->symtab[best].st_name;
1887}
1888
1889/* For kallsyms to ask for address resolution. NULL means not found.
1890 We don't lock, as this is used for oops resolution and races are a
1891 lesser concern. */
1892const char *module_address_lookup(unsigned long addr,
1893 unsigned long *size,
1894 unsigned long *offset,
1895 char **modname)
1896{
1897 struct module *mod;
1898
1899 list_for_each_entry(mod, &modules, list) {
1900 if (within(addr, mod->module_init, mod->init_size)
1901 || within(addr, mod->module_core, mod->core_size)) {
1902 *modname = mod->name;
1903 return get_ksymbol(mod, addr, size, offset);
1904 }
1905 }
1906 return NULL;
1907}
1908
1909struct module *module_get_kallsym(unsigned int symnum,
1910 unsigned long *value,
1911 char *type,
1912 char namebuf[128])
1913{
1914 struct module *mod;
1915
1916 down(&module_mutex);
1917 list_for_each_entry(mod, &modules, list) {
1918 if (symnum < mod->num_symtab) {
1919 *value = mod->symtab[symnum].st_value;
1920 *type = mod->symtab[symnum].st_info;
1921 strncpy(namebuf,
1922 mod->strtab + mod->symtab[symnum].st_name,
1923 127);
1924 up(&module_mutex);
1925 return mod;
1926 }
1927 symnum -= mod->num_symtab;
1928 }
1929 up(&module_mutex);
1930 return NULL;
1931}
1932
1933static unsigned long mod_find_symname(struct module *mod, const char *name)
1934{
1935 unsigned int i;
1936
1937 for (i = 0; i < mod->num_symtab; i++)
1938 if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0)
1939 return mod->symtab[i].st_value;
1940 return 0;
1941}
1942
1943/* Look for this name: can be of form module:name. */
1944unsigned long module_kallsyms_lookup_name(const char *name)
1945{
1946 struct module *mod;
1947 char *colon;
1948 unsigned long ret = 0;
1949
1950 /* Don't lock: we're in enough trouble already. */
1951 if ((colon = strchr(name, ':')) != NULL) {
1952 *colon = '\0';
1953 if ((mod = find_module(name)) != NULL)
1954 ret = mod_find_symname(mod, colon+1);
1955 *colon = ':';
1956 } else {
1957 list_for_each_entry(mod, &modules, list)
1958 if ((ret = mod_find_symname(mod, name)) != 0)
1959 break;
1960 }
1961 return ret;
1962}
1963#endif /* CONFIG_KALLSYMS */
1964
1965/* Called by the /proc file system to return a list of modules. */
1966static void *m_start(struct seq_file *m, loff_t *pos)
1967{
1968 struct list_head *i;
1969 loff_t n = 0;
1970
1971 down(&module_mutex);
1972 list_for_each(i, &modules) {
1973 if (n++ == *pos)
1974 break;
1975 }
1976 if (i == &modules)
1977 return NULL;
1978 return i;
1979}
1980
1981static void *m_next(struct seq_file *m, void *p, loff_t *pos)
1982{
1983 struct list_head *i = p;
1984 (*pos)++;
1985 if (i->next == &modules)
1986 return NULL;
1987 return i->next;
1988}
1989
1990static void m_stop(struct seq_file *m, void *p)
1991{
1992 up(&module_mutex);
1993}
1994
1995static int m_show(struct seq_file *m, void *p)
1996{
1997 struct module *mod = list_entry(p, struct module, list);
1998 seq_printf(m, "%s %lu",
1999 mod->name, mod->init_size + mod->core_size);
2000 print_unload_info(m, mod);
2001
2002 /* Informative for users. */
2003 seq_printf(m, " %s",
2004 mod->state == MODULE_STATE_GOING ? "Unloading":
2005 mod->state == MODULE_STATE_COMING ? "Loading":
2006 "Live");
2007 /* Used by oprofile and other similar tools. */
2008 seq_printf(m, " 0x%p", mod->module_core);
2009
2010 seq_printf(m, "\n");
2011 return 0;
2012}
2013
2014/* Format: modulename size refcount deps state address
2015
2016   Where refcount is a number or -, deps is a comma-separated list
2017   of dependencies or -, and state is Live, Loading or Unloading.
2018*/
2019struct seq_operations modules_op = {
2020 .start = m_start,
2021 .next = m_next,
2022 .stop = m_stop,
2023 .show = m_show
2024};
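For illustration only (module name, sizes, dependencies and address below are made up), a /proc/modules line produced by m_show() and print_unload_info() in the format described above looks roughly like this:

    usbcore 104212 4 ehci_hcd,ohci_hcd, Live 0xf89a1000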
2025
2026/* Given an address, look for it in the module exception tables. */
2027const struct exception_table_entry *search_module_extables(unsigned long addr)
2028{
2029 unsigned long flags;
2030 const struct exception_table_entry *e = NULL;
2031 struct module *mod;
2032
2033 spin_lock_irqsave(&modlist_lock, flags);
2034 list_for_each_entry(mod, &modules, list) {
2035 if (mod->num_exentries == 0)
2036 continue;
2037
2038 e = search_extable(mod->extable,
2039 mod->extable + mod->num_exentries - 1,
2040 addr);
2041 if (e)
2042 break;
2043 }
2044 spin_unlock_irqrestore(&modlist_lock, flags);
2045
2046	/* If we found one, we are currently running inside it, hence
2047	   we cannot unload the module and no refcount is needed. */
2048 return e;
2049}
2050
2051/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */
2052struct module *__module_text_address(unsigned long addr)
2053{
2054 struct module *mod;
2055
2056 list_for_each_entry(mod, &modules, list)
2057 if (within(addr, mod->module_init, mod->init_text_size)
2058 || within(addr, mod->module_core, mod->core_text_size))
2059 return mod;
2060 return NULL;
2061}
2062
2063struct module *module_text_address(unsigned long addr)
2064{
2065 struct module *mod;
2066 unsigned long flags;
2067
2068 spin_lock_irqsave(&modlist_lock, flags);
2069 mod = __module_text_address(addr);
2070 spin_unlock_irqrestore(&modlist_lock, flags);
2071
2072 return mod;
2073}
2074
2075/* Don't grab lock, we're oopsing. */
2076void print_modules(void)
2077{
2078 struct module *mod;
2079
2080 printk("Modules linked in:");
2081 list_for_each_entry(mod, &modules, list)
2082 printk(" %s", mod->name);
2083 printk("\n");
2084}
2085
2086void module_add_driver(struct module *mod, struct device_driver *drv)
2087{
2088 if (!mod || !drv)
2089 return;
2090
2091 /* Don't check return code; this call is idempotent */
2092 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
2093}
2094EXPORT_SYMBOL(module_add_driver);
2095
2096void module_remove_driver(struct device_driver *drv)
2097{
2098 if (!drv)
2099 return;
2100 sysfs_remove_link(&drv->kobj, "module");
2101}
2102EXPORT_SYMBOL(module_remove_driver);
2103
2104#ifdef CONFIG_MODVERSIONS
2105/* Generate the signature for struct module here, too, for modversions. */
2106void struct_module(struct module *mod) { return; }
2107EXPORT_SYMBOL(struct_module);
2108#endif
diff --git a/kernel/panic.c b/kernel/panic.c
new file mode 100644
index 000000000000..0fa3f3a66fb6
--- /dev/null
+++ b/kernel/panic.c
@@ -0,0 +1,157 @@
1/*
2 * linux/kernel/panic.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7/*
8 * This function is used throughout the kernel (including mm and fs)
9 * to indicate a major problem.
10 */
11#include <linux/config.h>
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/delay.h>
15#include <linux/reboot.h>
16#include <linux/notifier.h>
17#include <linux/init.h>
18#include <linux/sysrq.h>
19#include <linux/interrupt.h>
20#include <linux/nmi.h>
21
22int panic_timeout;
23int panic_on_oops;
24int tainted;
25
26EXPORT_SYMBOL(panic_timeout);
27
28struct notifier_block *panic_notifier_list;
29
30EXPORT_SYMBOL(panic_notifier_list);
31
32static int __init panic_setup(char *str)
33{
34 panic_timeout = simple_strtoul(str, NULL, 0);
35 return 1;
36}
37__setup("panic=", panic_setup);
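As a usage illustration (the device names and value are hypothetical), booting with the command line below makes panic_setup() store 30 in panic_timeout, so the machine reboots 30 seconds after a panic; omitting panic= keeps the default of 0, which means wait forever:

    root=/dev/sda1 console=ttyS0 panic=30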
38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */
45long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink);
47
48/**
49 * panic - halt the system
50 * @fmt: The text string to print
51 *
52 * Display a message, then perform cleanups.
53 *
54 * This function never returns.
55 */
56
57NORET_TYPE void panic(const char * fmt, ...)
58{
59 long i;
60 static char buf[1024];
61 va_list args;
62#if defined(CONFIG_ARCH_S390)
63 unsigned long caller = (unsigned long) __builtin_return_address(0);
64#endif
65
66 bust_spinlocks(1);
67 va_start(args, fmt);
68 vsnprintf(buf, sizeof(buf), fmt, args);
69 va_end(args);
70 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
71 bust_spinlocks(0);
72
73#ifdef CONFIG_SMP
74 smp_send_stop();
75#endif
76
77 notifier_call_chain(&panic_notifier_list, 0, buf);
78
79 if (!panic_blink)
80 panic_blink = no_blink;
81
82 if (panic_timeout > 0)
83 {
84 /*
85 * Delay timeout seconds before rebooting the machine.
86		 * We can't use the "normal" timers since we just panicked.
87 */
88 printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
89 for (i = 0; i < panic_timeout*1000; ) {
90 touch_nmi_watchdog();
91 i += panic_blink(i);
92 mdelay(1);
93 i++;
94 }
95 /*
96		 * Should we run the reboot notifier? For the moment I'm
97		 * choosing not to. It might crash, be corrupt or do
98 * more harm than good for other reasons.
99 */
100 machine_restart(NULL);
101 }
102#ifdef __sparc__
103 {
104 extern int stop_a_enabled;
105 /* Make sure the user can actually press L1-A */
106 stop_a_enabled = 1;
107 printk(KERN_EMERG "Press L1-A to return to the boot prom\n");
108 }
109#endif
110#if defined(CONFIG_ARCH_S390)
111 disabled_wait(caller);
112#endif
113 local_irq_enable();
114 for (i = 0;;) {
115 i += panic_blink(i);
116 mdelay(1);
117 i++;
118 }
119}
120
121EXPORT_SYMBOL(panic);
122
123/**
124 * print_tainted - return a string to represent the kernel taint state.
125 *
126 * 'P' - Proprietary module has been loaded.
127 * 'F' - Module has been forcibly loaded.
128 * 'S' - SMP with CPUs not designed for SMP.
129 * 'R' - User forced a module unload.
130 * 'M' - Machine had a machine check experience.
131 * 'B' - System has hit bad_page.
132 *
133 * The string is overwritten by the next call to print_tainted().
134 */
135
136const char *print_tainted(void)
137{
138 static char buf[20];
139 if (tainted) {
140 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c",
141 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
142 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
143 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
144 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
145 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
146 tainted & TAINT_BAD_PAGE ? 'B' : ' ');
147 }
148 else
149 snprintf(buf, sizeof(buf), "Not tainted");
150 return(buf);
151}
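For example (the flag combination is chosen purely for illustration): a kernel whose only taint is a forcibly loaded module has tainted == TAINT_FORCED_MODULE, so print_tainted() returns the string below, with the four unset flags rendered as trailing spaces; an untainted kernel returns "Not tainted".

    Tainted: GF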
152
153void add_taint(unsigned flag)
154{
155 tainted |= flag;
156}
157EXPORT_SYMBOL(add_taint);
diff --git a/kernel/params.c b/kernel/params.c
new file mode 100644
index 000000000000..5538608bd339
--- /dev/null
+++ b/kernel/params.c
@@ -0,0 +1,721 @@
1/* Helpers for initial module or kernel cmdline parsing
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/config.h>
19#include <linux/moduleparam.h>
20#include <linux/kernel.h>
21#include <linux/string.h>
22#include <linux/errno.h>
23#include <linux/module.h>
24#include <linux/device.h>
25#include <linux/err.h>
26
27#if 0
28#define DEBUGP printk
29#else
30#define DEBUGP(fmt, a...)
31#endif
32
33static inline int dash2underscore(char c)
34{
35 if (c == '-')
36 return '_';
37 return c;
38}
39
40static inline int parameq(const char *input, const char *paramname)
41{
42 unsigned int i;
43 for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
44 if (input[i] == '\0')
45 return 1;
46 return 0;
47}
48
49static int parse_one(char *param,
50 char *val,
51 struct kernel_param *params,
52 unsigned num_params,
53 int (*handle_unknown)(char *param, char *val))
54{
55 unsigned int i;
56
57 /* Find parameter */
58 for (i = 0; i < num_params; i++) {
59 if (parameq(param, params[i].name)) {
60 DEBUGP("They are equal! Calling %p\n",
61 params[i].set);
62 return params[i].set(val, &params[i]);
63 }
64 }
65
66 if (handle_unknown) {
67 DEBUGP("Unknown argument: calling %p\n", handle_unknown);
68 return handle_unknown(param, val);
69 }
70
71 DEBUGP("Unknown argument `%s'\n", param);
72 return -ENOENT;
73}
74
75/* You can use " around spaces, but can't escape ". */
76/* Hyphens and underscores equivalent in parameter names. */
77static char *next_arg(char *args, char **param, char **val)
78{
79 unsigned int i, equals = 0;
80 int in_quote = 0, quoted = 0;
81 char *next;
82
83 /* Chew any extra spaces */
84 while (*args == ' ') args++;
85 if (*args == '"') {
86 args++;
87 in_quote = 1;
88 quoted = 1;
89 }
90
91 for (i = 0; args[i]; i++) {
92 if (args[i] == ' ' && !in_quote)
93 break;
94 if (equals == 0) {
95 if (args[i] == '=')
96 equals = i;
97 }
98 if (args[i] == '"')
99 in_quote = !in_quote;
100 }
101
102 *param = args;
103 if (!equals)
104 *val = NULL;
105 else {
106 args[equals] = '\0';
107 *val = args + equals + 1;
108
109 /* Don't include quotes in value. */
110 if (**val == '"') {
111 (*val)++;
112 if (args[i-1] == '"')
113 args[i-1] = '\0';
114 }
115 if (quoted && args[i-1] == '"')
116 args[i-1] = '\0';
117 }
118
119 if (args[i]) {
120 args[i] = '\0';
121 next = args + i + 1;
122 } else
123 next = args + i;
124 return next;
125}
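As a parsing illustration (the parameter names are hypothetical), given the argument string below, next_arg() returns one token per call, strips the quotes around a quoted value, and leaves val NULL when there is no '=':

    name="my device" debug=1 force

    1st call: param = "name",  val = "my device"
    2nd call: param = "debug", val = "1"
    3rd call: param = "force", val = NULL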
126
127/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
128int parse_args(const char *name,
129 char *args,
130 struct kernel_param *params,
131 unsigned num,
132 int (*unknown)(char *param, char *val))
133{
134 char *param, *val;
135
136 DEBUGP("Parsing ARGS: %s\n", args);
137
138 while (*args) {
139 int ret;
140
141 args = next_arg(args, &param, &val);
142 ret = parse_one(param, val, params, num, unknown);
143 switch (ret) {
144 case -ENOENT:
145 printk(KERN_ERR "%s: Unknown parameter `%s'\n",
146 name, param);
147 return ret;
148 case -ENOSPC:
149 printk(KERN_ERR
150 "%s: `%s' too large for parameter `%s'\n",
151 name, val ?: "", param);
152 return ret;
153 case 0:
154 break;
155 default:
156 printk(KERN_ERR
157 "%s: `%s' invalid for parameter `%s'\n",
158 name, val ?: "", param);
159 return ret;
160 }
161 }
162
163 /* All parsed OK. */
164 return 0;
165}
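For context, a minimal sketch (module and parameter names hypothetical) of how these entry points are normally reached: a driver declares a parameter with module_param(), which generates a struct kernel_param whose set/get hooks are the param_set_int()/param_get_int() helpers defined just below, and parse_args() dispatches to them when the module is loaded with, say, debug=1 as a module argument or mydrv.debug=1 on the kernel command line.

    #include <linux/module.h>
    #include <linux/moduleparam.h>

    static int debug;                 /* written by param_set_int() via parse_args() */
    module_param(debug, int, 0644);   /* also shows up as /sys/module/mydrv/parameters/debug */
    MODULE_PARM_DESC(debug, "Enable debug output (0 or 1)");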
166
167/* Lazy bastard, eh? */
168#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
169 int param_set_##name(const char *val, struct kernel_param *kp) \
170 { \
171 char *endp; \
172 tmptype l; \
173 \
174 if (!val) return -EINVAL; \
175 l = strtolfn(val, &endp, 0); \
176 if (endp == val || ((type)l != l)) \
177 return -EINVAL; \
178 *((type *)kp->arg) = l; \
179 return 0; \
180 } \
181 int param_get_##name(char *buffer, struct kernel_param *kp) \
182 { \
183 return sprintf(buffer, format, *((type *)kp->arg)); \
184 }
185
186STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul);
187STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol);
188STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul);
189STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol);
190STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul);
191STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol);
192STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul);
193
194int param_set_charp(const char *val, struct kernel_param *kp)
195{
196 if (!val) {
197 printk(KERN_ERR "%s: string parameter expected\n",
198 kp->name);
199 return -EINVAL;
200 }
201
202 if (strlen(val) > 1024) {
203 printk(KERN_ERR "%s: string parameter too long\n",
204 kp->name);
205 return -ENOSPC;
206 }
207
208 *(char **)kp->arg = (char *)val;
209 return 0;
210}
211
212int param_get_charp(char *buffer, struct kernel_param *kp)
213{
214 return sprintf(buffer, "%s", *((char **)kp->arg));
215}
216
217int param_set_bool(const char *val, struct kernel_param *kp)
218{
219 /* No equals means "set"... */
220 if (!val) val = "1";
221
222 /* One of =[yYnN01] */
223 switch (val[0]) {
224 case 'y': case 'Y': case '1':
225 *(int *)kp->arg = 1;
226 return 0;
227 case 'n': case 'N': case '0':
228 *(int *)kp->arg = 0;
229 return 0;
230 }
231 return -EINVAL;
232}
233
234int param_get_bool(char *buffer, struct kernel_param *kp)
235{
236 /* Y and N chosen as being relatively non-coder friendly */
237 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
238}
239
240int param_set_invbool(const char *val, struct kernel_param *kp)
241{
242 int boolval, ret;
243 struct kernel_param dummy = { .arg = &boolval };
244
245 ret = param_set_bool(val, &dummy);
246 if (ret == 0)
247 *(int *)kp->arg = !boolval;
248 return ret;
249}
250
251int param_get_invbool(char *buffer, struct kernel_param *kp)
252{
253 int val;
254 struct kernel_param dummy = { .arg = &val };
255
256 val = !*(int *)kp->arg;
257 return param_get_bool(buffer, &dummy);
258}
259
260/* We cheat here and temporarily mangle the string. */
261int param_array(const char *name,
262 const char *val,
263 unsigned int min, unsigned int max,
264 void *elem, int elemsize,
265 int (*set)(const char *, struct kernel_param *kp),
266 int *num)
267{
268 int ret;
269 struct kernel_param kp;
270 char save;
271
272 /* Get the name right for errors. */
273 kp.name = name;
274 kp.arg = elem;
275
276 /* No equals sign? */
277 if (!val) {
278 printk(KERN_ERR "%s: expects arguments\n", name);
279 return -EINVAL;
280 }
281
282 *num = 0;
283 /* We expect a comma-separated list of values. */
284 do {
285 int len;
286
287 if (*num == max) {
288 printk(KERN_ERR "%s: can only take %i arguments\n",
289 name, max);
290 return -EINVAL;
291 }
292 len = strcspn(val, ",");
293
294 /* nul-terminate and parse */
295 save = val[len];
296 ((char *)val)[len] = '\0';
297 ret = set(val, &kp);
298
299 if (ret != 0)
300 return ret;
301 kp.arg += elemsize;
302 val += len+1;
303 (*num)++;
304 } while (save == ',');
305
306 if (*num < min) {
307 printk(KERN_ERR "%s: needs at least %i arguments\n",
308 name, min);
309 return -EINVAL;
310 }
311 return 0;
312}
313
314int param_array_set(const char *val, struct kernel_param *kp)
315{
316 struct kparam_array *arr = kp->arg;
317
318 return param_array(kp->name, val, 1, arr->max, arr->elem,
319 arr->elemsize, arr->set, arr->num ?: &arr->max);
320}
321
322int param_array_get(char *buffer, struct kernel_param *kp)
323{
324 int i, off, ret;
325 struct kparam_array *arr = kp->arg;
326 struct kernel_param p;
327
328 p = *kp;
329 for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
330 if (i)
331 buffer[off++] = ',';
332 p.arg = arr->elem + arr->elemsize * i;
333 ret = arr->get(buffer + off, &p);
334 if (ret < 0)
335 return ret;
336 off += ret;
337 }
338 buffer[off] = '\0';
339 return off;
340}
341
342int param_set_copystring(const char *val, struct kernel_param *kp)
343{
344 struct kparam_string *kps = kp->arg;
345
346 if (strlen(val)+1 > kps->maxlen) {
347 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
348 kp->name, kps->maxlen-1);
349 return -ENOSPC;
350 }
351 strcpy(kps->string, val);
352 return 0;
353}
354
355int param_get_string(char *buffer, struct kernel_param *kp)
356{
357 struct kparam_string *kps = kp->arg;
358 return strlcpy(buffer, kps->string, kps->maxlen);
359}
360
361/* sysfs output in /sys/modules/XYZ/parameters/ */
362
363extern struct kernel_param __start___param[], __stop___param[];
364
365#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
366
367struct param_attribute
368{
369 struct module_attribute mattr;
370 struct kernel_param *param;
371};
372
373struct module_param_attrs
374{
375 struct attribute_group grp;
376 struct param_attribute attrs[0];
377};
378
379#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
380
381static ssize_t param_attr_show(struct module_attribute *mattr,
382 struct module *mod, char *buf)
383{
384 int count;
385 struct param_attribute *attribute = to_param_attr(mattr);
386
387 if (!attribute->param->get)
388 return -EPERM;
389
390 count = attribute->param->get(buf, attribute->param);
391 if (count > 0) {
392 strcat(buf, "\n");
393 ++count;
394 }
395 return count;
396}
397
398/* sysfs always hands a nul-terminated string in buf. We rely on that. */
399static ssize_t param_attr_store(struct module_attribute *mattr,
400 struct module *owner,
401 const char *buf, size_t len)
402{
403 int err;
404 struct param_attribute *attribute = to_param_attr(mattr);
405
406 if (!attribute->param->set)
407 return -EPERM;
408
409 err = attribute->param->set(buf, attribute->param);
410 if (!err)
411 return len;
412 return err;
413}
414
415#ifdef CONFIG_MODULES
416#define __modinit
417#else
418#define __modinit __init
419#endif
420
421/*
422 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
423 * @mk: struct module_kobject (contains parent kobject)
424 * @kparam: array of struct kernel_param, the actual parameter definitions
425 * @num_params: number of entries in array
426 * @name_skip: offset where the parameter name starts in kparam[].name. Needed for built-in "modules".
427 *
428 * Create a kobject for a (per-module) group of parameters, and create files
429 * in sysfs. A pointer to the param_kobject is returned on success,
430 * NULL if there is no parameter to export, or ERR_PTR(err) on failure.
431 */
432static __modinit struct module_param_attrs *
433param_sysfs_setup(struct module_kobject *mk,
434 struct kernel_param *kparam,
435 unsigned int num_params,
436 unsigned int name_skip)
437{
438 struct module_param_attrs *mp;
439 unsigned int valid_attrs = 0;
440 unsigned int i, size[2];
441 struct param_attribute *pattr;
442 struct attribute **gattr;
443 int err;
444
445 for (i=0; i<num_params; i++) {
446 if (kparam[i].perm)
447 valid_attrs++;
448 }
449
450 if (!valid_attrs)
451 return NULL;
452
453 size[0] = ALIGN(sizeof(*mp) +
454 valid_attrs * sizeof(mp->attrs[0]),
455 sizeof(mp->grp.attrs[0]));
456 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
457
458 mp = kmalloc(size[0] + size[1], GFP_KERNEL);
459 if (!mp)
460 return ERR_PTR(-ENOMEM);
461
462 mp->grp.name = "parameters";
463 mp->grp.attrs = (void *)mp + size[0];
464
465 pattr = &mp->attrs[0];
466 gattr = &mp->grp.attrs[0];
467 for (i = 0; i < num_params; i++) {
468 struct kernel_param *kp = &kparam[i];
469 if (kp->perm) {
470 pattr->param = kp;
471 pattr->mattr.show = param_attr_show;
472 pattr->mattr.store = param_attr_store;
473 pattr->mattr.attr.name = (char *)&kp->name[name_skip];
474 pattr->mattr.attr.owner = mk->mod;
475 pattr->mattr.attr.mode = kp->perm;
476 *(gattr++) = &(pattr++)->mattr.attr;
477 }
478 }
479 *gattr = NULL;
480
481 if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
482 kfree(mp);
483 return ERR_PTR(err);
484 }
485 return mp;
486}
487
488
489#ifdef CONFIG_MODULES
490
491/*
492 * module_param_sysfs_setup - setup sysfs support for one module
493 * @mod: module
494 * @kparam: module parameters (array)
495 * @num_params: number of module parameters
496 *
497 * Adds sysfs entries for module parameters, and creates a link from
498 * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/
499 */
500int module_param_sysfs_setup(struct module *mod,
501 struct kernel_param *kparam,
502 unsigned int num_params)
503{
504 struct module_param_attrs *mp;
505
506 mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
507 if (IS_ERR(mp))
508 return PTR_ERR(mp);
509
510 mod->param_attrs = mp;
511 return 0;
512}
513
514/*
515 * module_param_sysfs_remove - remove sysfs support for one module
516 * @mod: module
517 *
518 * Remove sysfs entries for module parameters and the corresponding
519 * kobject.
520 */
521void module_param_sysfs_remove(struct module *mod)
522{
523 if (mod->param_attrs) {
524 sysfs_remove_group(&mod->mkobj.kobj,
525 &mod->param_attrs->grp);
526 /* We are positive that no one is using any param
527 * attrs at this point. Deallocate immediately. */
528 kfree(mod->param_attrs);
529 mod->param_attrs = NULL;
530 }
531}
532#endif
533
534/*
535 * kernel_param_sysfs_setup - wrapper for built-in params support
536 */
537static void __init kernel_param_sysfs_setup(const char *name,
538 struct kernel_param *kparam,
539 unsigned int num_params,
540 unsigned int name_skip)
541{
542 struct module_kobject *mk;
543
544 mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
545 memset(mk, 0, sizeof(struct module_kobject));
546
547 mk->mod = THIS_MODULE;
548 kobj_set_kset_s(mk, module_subsys);
549 kobject_set_name(&mk->kobj, name);
550 kobject_register(&mk->kobj);
551
552 /* no need to keep the kobject if no parameter is exported */
553 if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
554 kobject_unregister(&mk->kobj);
555 kfree(mk);
556 }
557}
558
559/*
560 * param_sysfs_builtin - add contents in /sys/parameters for built-in modules
561 *
562 * Add module_parameters to sysfs for "modules" built into the kernel.
563 *
564 * The "module" name (KBUILD_MODNAME) is stored before a dot, the
565 * "parameter" name is stored after the dot in kernel_param->name. So,
566 * extract the "module" name for all built-in kernel parameters,
567 * and for all that share the same module name, call kernel_param_sysfs_setup().
568 */
569static void __init param_sysfs_builtin(void)
570{
571 struct kernel_param *kp, *kp_begin = NULL;
572 unsigned int i, name_len, count = 0;
573 char modname[MAX_KBUILD_MODNAME + 1] = "";
574
575 for (i=0; i < __stop___param - __start___param; i++) {
576 char *dot;
577
578 kp = &__start___param[i];
579
580 /* We do not handle args without periods. */
581 dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME);
582 if (!dot) {
583 DEBUGP("couldn't find period in %s\n", kp->name);
584 continue;
585 }
586 name_len = dot - kp->name;
587
588 /* new kbuild_modname? */
589 if (strlen(modname) != name_len
590 || strncmp(modname, kp->name, name_len) != 0) {
591 /* add a new kobject for previous kernel_params. */
592 if (count)
593 kernel_param_sysfs_setup(modname,
594 kp_begin,
595 count,
596 strlen(modname)+1);
597
598 strncpy(modname, kp->name, name_len);
599 modname[name_len] = '\0';
600 count = 0;
601 kp_begin = kp;
602 }
603 count++;
604 }
605
606 /* last kernel_params need to be registered as well */
607 if (count)
608 kernel_param_sysfs_setup(modname, kp_begin, count,
609 strlen(modname)+1);
610}
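As an illustration (the parameter name is only an example): a built-in parameter registered as "snd.cards_limit" is split at the dot, so the code above creates a module kobject named "snd" and, thanks to name_skip, the attribute appears without the prefix:

    /sys/module/snd/parameters/cards_limit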
611
612
613/* module-related sysfs stuff */
614#ifdef CONFIG_MODULES
615
616#define to_module_attr(n) container_of(n, struct module_attribute, attr);
617#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
618
619static ssize_t module_attr_show(struct kobject *kobj,
620 struct attribute *attr,
621 char *buf)
622{
623 struct module_attribute *attribute;
624 struct module_kobject *mk;
625 int ret;
626
627 attribute = to_module_attr(attr);
628 mk = to_module_kobject(kobj);
629
630 if (!attribute->show)
631 return -EPERM;
632
633 if (!try_module_get(mk->mod))
634 return -ENODEV;
635
636 ret = attribute->show(attribute, mk->mod, buf);
637
638 module_put(mk->mod);
639
640 return ret;
641}
642
643static ssize_t module_attr_store(struct kobject *kobj,
644 struct attribute *attr,
645 const char *buf, size_t len)
646{
647 struct module_attribute *attribute;
648 struct module_kobject *mk;
649 int ret;
650
651 attribute = to_module_attr(attr);
652 mk = to_module_kobject(kobj);
653
654 if (!attribute->store)
655 return -EPERM;
656
657 if (!try_module_get(mk->mod))
658 return -ENODEV;
659
660 ret = attribute->store(attribute, mk->mod, buf, len);
661
662 module_put(mk->mod);
663
664 return ret;
665}
666
667static struct sysfs_ops module_sysfs_ops = {
668 .show = module_attr_show,
669 .store = module_attr_store,
670};
671
672#else
673static struct sysfs_ops module_sysfs_ops = {
674 .show = NULL,
675 .store = NULL,
676};
677#endif
678
679static struct kobj_type module_ktype = {
680 .sysfs_ops = &module_sysfs_ops,
681};
682
683decl_subsys(module, &module_ktype, NULL);
684
685/*
686 * param_sysfs_init - wrapper for built-in params support
687 */
688static int __init param_sysfs_init(void)
689{
690 subsystem_register(&module_subsys);
691
692 param_sysfs_builtin();
693
694 return 0;
695}
696__initcall(param_sysfs_init);
697
698EXPORT_SYMBOL(param_set_byte);
699EXPORT_SYMBOL(param_get_byte);
700EXPORT_SYMBOL(param_set_short);
701EXPORT_SYMBOL(param_get_short);
702EXPORT_SYMBOL(param_set_ushort);
703EXPORT_SYMBOL(param_get_ushort);
704EXPORT_SYMBOL(param_set_int);
705EXPORT_SYMBOL(param_get_int);
706EXPORT_SYMBOL(param_set_uint);
707EXPORT_SYMBOL(param_get_uint);
708EXPORT_SYMBOL(param_set_long);
709EXPORT_SYMBOL(param_get_long);
710EXPORT_SYMBOL(param_set_ulong);
711EXPORT_SYMBOL(param_get_ulong);
712EXPORT_SYMBOL(param_set_charp);
713EXPORT_SYMBOL(param_get_charp);
714EXPORT_SYMBOL(param_set_bool);
715EXPORT_SYMBOL(param_get_bool);
716EXPORT_SYMBOL(param_set_invbool);
717EXPORT_SYMBOL(param_get_invbool);
718EXPORT_SYMBOL(param_array_set);
719EXPORT_SYMBOL(param_array_get);
720EXPORT_SYMBOL(param_set_copystring);
721EXPORT_SYMBOL(param_get_string);
diff --git a/kernel/pid.c b/kernel/pid.c
new file mode 100644
index 000000000000..edba31c681ac
--- /dev/null
+++ b/kernel/pid.c
@@ -0,0 +1,292 @@
1/*
2 * Generic pidhash and scalable, time-bounded PID allocator
3 *
4 * (C) 2002-2003 William Irwin, IBM
5 * (C) 2004 William Irwin, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain
9 * against. There is very little to them aside from hashing them and
10 * parking tasks using given IDs on a list.
11 *
12 * The hash is always changed with the tasklist_lock write-acquired,
13 * and the hash is only accessed with the tasklist_lock at least
14 * read-acquired, so there's no additional SMP locking needed here.
15 *
16 * We have a list of bitmap pages whose bitmaps represent the PID space.
17 * Allocating and freeing PIDs is completely lockless. The worst-case
18 * allocation scenario, when all but one of the ~1 million possible PIDs
19 * are already allocated, costs a scan of 32 list entries and at most
20 * PAGE_SIZE bytes. The typical fast path is a single set-bit; freeing is O(1).
21 */
22
23#include <linux/mm.h>
24#include <linux/module.h>
25#include <linux/slab.h>
26#include <linux/init.h>
27#include <linux/bootmem.h>
28#include <linux/hash.h>
29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX];
32static int pidhash_shift;
33
34int pid_max = PID_MAX_DEFAULT;
35int last_pid;
36
37#define RESERVED_PIDS 300
38
39int pid_max_min = RESERVED_PIDS + 1;
40int pid_max_max = PID_MAX_LIMIT;
41
42#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
43#define BITS_PER_PAGE (PAGE_SIZE*8)
44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
45#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off))
46#define find_next_offset(map, off) \
47 find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
48
49/*
50 * PID-map pages start out as NULL; they are allocated on
51 * first use and are never deallocated. This way a low pid_max
52 * value does not cause lots of bitmaps to be allocated, yet
53 * the scheme scales up to 4 million PIDs at runtime.
54 */
55typedef struct pidmap {
56 atomic_t nr_free;
57 void *page;
58} pidmap_t;
59
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64
65fastcall void free_pidmap(int pid)
66{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK;
69
70 clear_bit(offset, map->page);
71 atomic_inc(&map->nr_free);
72}
73
74int alloc_pidmap(void)
75{
76 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map;
78
79 pid = last + 1;
80 if (pid >= pid_max)
81 pid = RESERVED_PIDS;
82 offset = pid & BITS_PER_PAGE_MASK;
83 map = &pidmap_array[pid/BITS_PER_PAGE];
84 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
85 for (i = 0; i <= max_scan; ++i) {
86 if (unlikely(!map->page)) {
87 unsigned long page = get_zeroed_page(GFP_KERNEL);
88 /*
89 * Free the page if someone raced with us
90 * installing it:
91 */
92 spin_lock(&pidmap_lock);
93 if (map->page)
94 free_page(page);
95 else
96 map->page = (void *)page;
97 spin_unlock(&pidmap_lock);
98 if (unlikely(!map->page))
99 break;
100 }
101 if (likely(atomic_read(&map->nr_free))) {
102 do {
103 if (!test_and_set_bit(offset, map->page)) {
104 atomic_dec(&map->nr_free);
105 last_pid = pid;
106 return pid;
107 }
108 offset = find_next_offset(map, offset);
109 pid = mk_pid(map, offset);
110 /*
111 * find_next_offset() found a bit, the pid from it
112 * is in-bounds, and if we fell back to the last
113 * bitmap block and the final block was the same
114 * as the starting point, pid is before last_pid.
115 */
116 } while (offset < BITS_PER_PAGE && pid < pid_max &&
117 (i != max_scan || pid < last ||
118 !((last+1) & BITS_PER_PAGE_MASK)));
119 }
120 if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
121 ++map;
122 offset = 0;
123 } else {
124 map = &pidmap_array[0];
125 offset = RESERVED_PIDS;
126 if (unlikely(last == offset))
127 break;
128 }
129 pid = mk_pid(map, offset);
130 }
131 return -1;
132}
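To make the bitmap scheme described in the header comment concrete, here is a small user-space sketch of the same idea (illustrative only: fixed size, no locking, no RESERVED_PIDS window, and plain bit operations instead of the kernel's atomic helpers):

    /* User-space sketch of the bitmap allocator idea used by alloc_pidmap(). */
    #include <limits.h>
    #include <stdio.h>

    #define NBITS   4096                            /* stand-in for BITS_PER_PAGE */
    #define WORD    (sizeof(unsigned long) * CHAR_BIT)

    static unsigned long bitmap[NBITS / WORD];

    static int alloc_id(void)
    {
            for (int i = 0; i < NBITS; i++) {
                    unsigned long mask = 1UL << (i % WORD);

                    if (!(bitmap[i / WORD] & mask)) {
                            bitmap[i / WORD] |= mask;   /* kernel: test_and_set_bit() */
                            return i;
                    }
            }
            return -1;                              /* all IDs in use */
    }

    static void free_id(int id)
    {
            bitmap[id / WORD] &= ~(1UL << (id % WORD));  /* kernel: clear_bit() */
    }

    int main(void)
    {
            int a = alloc_id(), b = alloc_id();

            printf("got %d and %d\n", a, b);        /* 0 and 1 */
            free_id(a);
            printf("got %d again\n", alloc_id());   /* 0 is reusable immediately */
            return 0;
    }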
133
134struct pid * fastcall find_pid(enum pid_type type, int nr)
135{
136 struct hlist_node *elem;
137 struct pid *pid;
138
139 hlist_for_each_entry(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr)
142 return pid;
143 }
144 return NULL;
145}
146
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{
149 struct pid *pid, *task_pid;
150
151 task_pid = &task->pids[type];
152 pid = find_pid(type, nr);
153 if (pid == NULL) {
154 hlist_add_head(&task_pid->pid_chain,
155 &pid_hash[type][pid_hashfn(nr)]);
156 INIT_LIST_HEAD(&task_pid->pid_list);
157 } else {
158 INIT_HLIST_NODE(&task_pid->pid_chain);
159 list_add_tail(&task_pid->pid_list, &pid->pid_list);
160 }
161 task_pid->nr = nr;
162
163 return 0;
164}
165
166static fastcall int __detach_pid(task_t *task, enum pid_type type)
167{
168 struct pid *pid, *pid_next;
169 int nr = 0;
170
171 pid = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) {
173 hlist_del(&pid->pid_chain);
174
175 if (list_empty(&pid->pid_list))
176 nr = pid->nr;
177 else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_add_head(&pid_next->pid_chain,
182 &pid_hash[type][pid_hashfn(pid_next->nr)]);
183 }
184 }
185
186 list_del(&pid->pid_list);
187 pid->nr = 0;
188
189 return nr;
190}
191
192void fastcall detach_pid(task_t *task, enum pid_type type)
193{
194 int tmp, nr;
195
196 nr = __detach_pid(task, type);
197 if (!nr)
198 return;
199
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
201 if (tmp != type && find_pid(tmp, nr))
202 return;
203
204 free_pidmap(nr);
205}
206
207task_t *find_task_by_pid_type(int type, int nr)
208{
209 struct pid *pid;
210
211 pid = find_pid(type, nr);
212 if (!pid)
213 return NULL;
214
215 return pid_task(&pid->pid_list, type);
216}
217
218EXPORT_SYMBOL(find_task_by_pid_type);
219
220/*
221 * This function switches the PIDs if a non-leader thread calls
222 * sys_execve() - this must be done without releasing the PID
223 * (which a detach_pid() would eventually do).
224 */
225void switch_exec_pids(task_t *leader, task_t *thread)
226{
227 __detach_pid(leader, PIDTYPE_PID);
228 __detach_pid(leader, PIDTYPE_TGID);
229 __detach_pid(leader, PIDTYPE_PGID);
230 __detach_pid(leader, PIDTYPE_SID);
231
232 __detach_pid(thread, PIDTYPE_PID);
233 __detach_pid(thread, PIDTYPE_TGID);
234
235 leader->pid = leader->tgid = thread->pid;
236 thread->pid = thread->tgid;
237
238 attach_pid(thread, PIDTYPE_PID, thread->pid);
239 attach_pid(thread, PIDTYPE_TGID, thread->tgid);
240 attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
241 attach_pid(thread, PIDTYPE_SID, thread->signal->session);
242 list_add_tail(&thread->tasks, &init_task.tasks);
243
244 attach_pid(leader, PIDTYPE_PID, leader->pid);
245 attach_pid(leader, PIDTYPE_TGID, leader->tgid);
246 attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
247 attach_pid(leader, PIDTYPE_SID, leader->signal->session);
248}
249
250/*
251 * The pid hash table is scaled according to the amount of memory in the
252 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
253 * more.
254 */
255void __init pidhash_init(void)
256{
257 int i, j, pidhash_size;
258 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
259
260 pidhash_shift = max(4, fls(megabytes * 4));
261 pidhash_shift = min(12, pidhash_shift);
262 pidhash_size = 1 << pidhash_shift;
263
264 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
265 pidhash_size, pidhash_shift,
266 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head));
267
268 for (i = 0; i < PIDTYPE_MAX; i++) {
269 pid_hash[i] = alloc_bootmem(pidhash_size *
270 sizeof(*(pid_hash[i])));
271 if (!pid_hash[i])
272 panic("Could not alloc pidhash!\n");
273 for (j = 0; j < pidhash_size; j++)
274 INIT_HLIST_HEAD(&pid_hash[i][j]);
275 }
276}
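A worked example of the sizing above (memory sizes hypothetical): with roughly 512 MB of kernel pages, megabytes = 512 and fls(512 * 4) = fls(2048) = 12, so pidhash_shift is clamped at 12 and each of the PIDTYPE_MAX tables gets 4096 buckets; with 16 MB, fls(64) = 7, giving 128 buckets per table.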
277
278void __init pidmap_init(void)
279{
280 int i;
281
282 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
283 set_bit(0, pidmap_array->page);
284 atomic_dec(&pidmap_array->nr_free);
285
286 /*
287 * Allocate PID 0, and hash it via all PID types:
288 */
289
290 for (i = 0; i < PIDTYPE_MAX; i++)
291 attach_pid(current, i, 0);
292}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
new file mode 100644
index 000000000000..ad85d3f0dcc4
--- /dev/null
+++ b/kernel/posix-cpu-timers.c
@@ -0,0 +1,1559 @@
1/*
2 * Implement CPU time clocks for the POSIX clock interface.
3 */
4
5#include <linux/sched.h>
6#include <linux/posix-timers.h>
7#include <asm/uaccess.h>
8#include <linux/errno.h>
9
10static int check_clock(clockid_t which_clock)
11{
12 int error = 0;
13 struct task_struct *p;
14 const pid_t pid = CPUCLOCK_PID(which_clock);
15
16 if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
17 return -EINVAL;
18
19 if (pid == 0)
20 return 0;
21
22 read_lock(&tasklist_lock);
23 p = find_task_by_pid(pid);
24 if (!p || (CPUCLOCK_PERTHREAD(which_clock) ?
25 p->tgid != current->tgid : p->tgid != pid)) {
26 error = -EINVAL;
27 }
28 read_unlock(&tasklist_lock);
29
30 return error;
31}
32
33static inline union cpu_time_count
34timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
35{
36 union cpu_time_count ret;
37 ret.sched = 0; /* high half always zero when .cpu used */
38 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
39 ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
40 } else {
41 ret.cpu = timespec_to_cputime(tp);
42 }
43 return ret;
44}
45
46static void sample_to_timespec(clockid_t which_clock,
47 union cpu_time_count cpu,
48 struct timespec *tp)
49{
50 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
51 tp->tv_sec = div_long_long_rem(cpu.sched,
52 NSEC_PER_SEC, &tp->tv_nsec);
53 } else {
54 cputime_to_timespec(cpu.cpu, tp);
55 }
56}
57
58static inline int cpu_time_before(clockid_t which_clock,
59 union cpu_time_count now,
60 union cpu_time_count then)
61{
62 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
63 return now.sched < then.sched;
64 } else {
65 return cputime_lt(now.cpu, then.cpu);
66 }
67}
68static inline void cpu_time_add(clockid_t which_clock,
69 union cpu_time_count *acc,
70 union cpu_time_count val)
71{
72 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
73 acc->sched += val.sched;
74 } else {
75 acc->cpu = cputime_add(acc->cpu, val.cpu);
76 }
77}
78static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
79 union cpu_time_count a,
80 union cpu_time_count b)
81{
82 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
83 a.sched -= b.sched;
84 } else {
85 a.cpu = cputime_sub(a.cpu, b.cpu);
86 }
87 return a;
88}
89
90/*
91 * Update expiry time from increment, and increase overrun count,
92 * given the current clock sample.
93 */
94static inline void bump_cpu_timer(struct k_itimer *timer,
95 union cpu_time_count now)
96{
97 int i;
98
99 if (timer->it.cpu.incr.sched == 0)
100 return;
101
102 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
103 unsigned long long delta, incr;
104
105 if (now.sched < timer->it.cpu.expires.sched)
106 return;
107 incr = timer->it.cpu.incr.sched;
108 delta = now.sched + incr - timer->it.cpu.expires.sched;
109 /* Don't use (incr*2 < delta), incr*2 might overflow. */
110 for (i = 0; incr < delta - incr; i++)
111 incr = incr << 1;
112 for (; i >= 0; incr >>= 1, i--) {
113 if (delta <= incr)
114 continue;
115 timer->it.cpu.expires.sched += incr;
116 timer->it_overrun += 1 << i;
117 delta -= incr;
118 }
119 } else {
120 cputime_t delta, incr;
121
122 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
123 return;
124 incr = timer->it.cpu.incr.cpu;
125 delta = cputime_sub(cputime_add(now.cpu, incr),
126 timer->it.cpu.expires.cpu);
127 /* Don't use (incr*2 < delta), incr*2 might overflow. */
128 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
129 incr = cputime_add(incr, incr);
130 for (; i >= 0; incr = cputime_halve(incr), i--) {
131 if (cputime_le(delta, incr))
132 continue;
133 timer->it.cpu.expires.cpu =
134 cputime_add(timer->it.cpu.expires.cpu, incr);
135 timer->it_overrun += 1 << i;
136 delta = cputime_sub(delta, incr);
137 }
138 }
139}
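The doubling-then-halving loop above counts how many whole periods have elapsed without an explicit division, and without the overflow a naive (incr*2 < delta) test could hit. A worked example (numbers hypothetical): with expires = 10, incr = 3 and now = 20, delta starts at 13, incr is doubled to 12 (i = 2), and the descending loop adds 12 (it_overrun += 4), leaving delta = 1; the timer ends up with expires = 22 and four counted expirations, matching the ones at 10, 13, 16 and 19 that already passed. A user-space sketch of the same arithmetic:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long expires = 10, incr = 3, now = 20, overrun = 0;
            unsigned long long delta = now + incr - expires;
            int i;

            /* Double incr while doubling again still stays below delta
             * (written as incr < delta - incr to avoid overflow). */
            for (i = 0; incr < delta - incr; i++)
                    incr <<= 1;
            /* Walk back down, consuming as many periods as fit at each power of two. */
            for (; i >= 0; incr >>= 1, i--) {
                    if (delta <= incr)
                            continue;
                    expires += incr;
                    overrun += 1ULL << i;
                    delta -= incr;
            }
            printf("expires=%llu overrun=%llu\n", expires, overrun); /* 22 and 4 */
            return 0;
    }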
140
141static inline cputime_t prof_ticks(struct task_struct *p)
142{
143 return cputime_add(p->utime, p->stime);
144}
145static inline cputime_t virt_ticks(struct task_struct *p)
146{
147 return p->utime;
148}
149static inline unsigned long long sched_ns(struct task_struct *p)
150{
151 return (p == current) ? current_sched_time(p) : p->sched_time;
152}
153
154int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
155{
156 int error = check_clock(which_clock);
157 if (!error) {
158 tp->tv_sec = 0;
159 tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
160 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
161 /*
162 * If sched_clock is using a cycle counter, we
163			 * don't have its true resolution exported, but it
164			 * is certainly much finer than 1s/HZ.
165 */
166 tp->tv_nsec = 1;
167 }
168 }
169 return error;
170}
171
172int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
173{
174 /*
175 * You can never reset a CPU clock, but we check for other errors
176 * in the call before failing with EPERM.
177 */
178 int error = check_clock(which_clock);
179 if (error == 0) {
180 error = -EPERM;
181 }
182 return error;
183}
184
185
186/*
187 * Sample a per-thread clock for the given task.
188 */
189static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p,
190 union cpu_time_count *cpu)
191{
192 switch (CPUCLOCK_WHICH(which_clock)) {
193 default:
194 return -EINVAL;
195 case CPUCLOCK_PROF:
196 cpu->cpu = prof_ticks(p);
197 break;
198 case CPUCLOCK_VIRT:
199 cpu->cpu = virt_ticks(p);
200 break;
201 case CPUCLOCK_SCHED:
202 cpu->sched = sched_ns(p);
203 break;
204 }
205 return 0;
206}
207
208/*
209 * Sample a process (thread group) clock for the given group_leader task.
210 * Must be called with tasklist_lock held for reading, and with
211 * p->sighand->siglock held.
212 */
213static int cpu_clock_sample_group_locked(unsigned int clock_idx,
214 struct task_struct *p,
215 union cpu_time_count *cpu)
216{
217 struct task_struct *t = p;
218 switch (clock_idx) {
219 default:
220 return -EINVAL;
221 case CPUCLOCK_PROF:
222 cpu->cpu = cputime_add(p->signal->utime, p->signal->stime);
223 do {
224 cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
225 t = next_thread(t);
226 } while (t != p);
227 break;
228 case CPUCLOCK_VIRT:
229 cpu->cpu = p->signal->utime;
230 do {
231 cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
232 t = next_thread(t);
233 } while (t != p);
234 break;
235 case CPUCLOCK_SCHED:
236 cpu->sched = p->signal->sched_time;
237 /* Add in each other live thread. */
238 while ((t = next_thread(t)) != p) {
239 cpu->sched += t->sched_time;
240 }
241 if (p->tgid == current->tgid) {
242 /*
243 * We're sampling ourselves, so include the
244 * cycles not yet banked. We still omit
245 * other threads running on other CPUs,
246 * so the total can always be behind as
247 * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
248 */
249 cpu->sched += current_sched_time(current);
250 } else {
251 cpu->sched += p->sched_time;
252 }
253 break;
254 }
255 return 0;
256}
257
258/*
259 * Sample a process (thread group) clock for the given group_leader task.
260 * Must be called with tasklist_lock held for reading.
261 */
262static int cpu_clock_sample_group(clockid_t which_clock,
263 struct task_struct *p,
264 union cpu_time_count *cpu)
265{
266 int ret;
267 unsigned long flags;
268 spin_lock_irqsave(&p->sighand->siglock, flags);
269 ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
270 cpu);
271 spin_unlock_irqrestore(&p->sighand->siglock, flags);
272 return ret;
273}
274
275
276int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
277{
278 const pid_t pid = CPUCLOCK_PID(which_clock);
279 int error = -EINVAL;
280 union cpu_time_count rtn;
281
282 if (pid == 0) {
283 /*
284 * Special case constant value for our own clocks.
285 * We don't have to do any lookup to find ourselves.
286 */
287 if (CPUCLOCK_PERTHREAD(which_clock)) {
288 /*
289 * Sampling just ourselves we can do with no locking.
290 */
291 error = cpu_clock_sample(which_clock,
292 current, &rtn);
293 } else {
294 read_lock(&tasklist_lock);
295 error = cpu_clock_sample_group(which_clock,
296 current, &rtn);
297 read_unlock(&tasklist_lock);
298 }
299 } else {
300 /*
301 * Find the given PID, and validate that the caller
302 * should be able to see it.
303 */
304 struct task_struct *p;
305 read_lock(&tasklist_lock);
306 p = find_task_by_pid(pid);
307 if (p) {
308 if (CPUCLOCK_PERTHREAD(which_clock)) {
309 if (p->tgid == current->tgid) {
310 error = cpu_clock_sample(which_clock,
311 p, &rtn);
312 }
313 } else if (p->tgid == pid && p->signal) {
314 error = cpu_clock_sample_group(which_clock,
315 p, &rtn);
316 }
317 }
318 read_unlock(&tasklist_lock);
319 }
320
321 if (error)
322 return error;
323 sample_to_timespec(which_clock, rtn, tp);
324 return 0;
325}
326
327
328/*
329 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
330 * This is called from sys_timer_create with the new timer already locked.
331 */
332int posix_cpu_timer_create(struct k_itimer *new_timer)
333{
334 int ret = 0;
335 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
336 struct task_struct *p;
337
338 if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
339 return -EINVAL;
340
341 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
342 new_timer->it.cpu.incr.sched = 0;
343 new_timer->it.cpu.expires.sched = 0;
344
345 read_lock(&tasklist_lock);
346 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
347 if (pid == 0) {
348 p = current;
349 } else {
350 p = find_task_by_pid(pid);
351 if (p && p->tgid != current->tgid)
352 p = NULL;
353 }
354 } else {
355 if (pid == 0) {
356 p = current->group_leader;
357 } else {
358 p = find_task_by_pid(pid);
359 if (p && p->tgid != pid)
360 p = NULL;
361 }
362 }
363 new_timer->it.cpu.task = p;
364 if (p) {
365 get_task_struct(p);
366 } else {
367 ret = -EINVAL;
368 }
369 read_unlock(&tasklist_lock);
370
371 return ret;
372}
373
374/*
375 * Clean up a CPU-clock timer that is about to be destroyed.
376 * This is called from timer deletion with the timer already locked.
377 * If we return TIMER_RETRY, it's necessary to release the timer's lock
378 * and try again. (This happens when the timer is in the middle of firing.)
379 */
380int posix_cpu_timer_del(struct k_itimer *timer)
381{
382 struct task_struct *p = timer->it.cpu.task;
383
384 if (timer->it.cpu.firing)
385 return TIMER_RETRY;
386
387 if (unlikely(p == NULL))
388 return 0;
389
390 if (!list_empty(&timer->it.cpu.entry)) {
391 read_lock(&tasklist_lock);
392 if (unlikely(p->signal == NULL)) {
393 /*
394 * We raced with the reaping of the task.
395 * The deletion should have cleared us off the list.
396 */
397 BUG_ON(!list_empty(&timer->it.cpu.entry));
398 } else {
399 /*
400 * Take us off the task's timer list.
401 */
402 spin_lock(&p->sighand->siglock);
403 list_del(&timer->it.cpu.entry);
404 spin_unlock(&p->sighand->siglock);
405 }
406 read_unlock(&tasklist_lock);
407 }
408 put_task_struct(p);
409
410 return 0;
411}
412
413/*
414 * Clean out CPU timers still ticking when a thread exited. The task
415 * pointer is cleared, and the expiry time is replaced with the residual
416 * time for later timer_gettime calls to return.
417 * This must be called with the siglock held.
418 */
419static void cleanup_timers(struct list_head *head,
420 cputime_t utime, cputime_t stime,
421 unsigned long long sched_time)
422{
423 struct cpu_timer_list *timer, *next;
424 cputime_t ptime = cputime_add(utime, stime);
425
426 list_for_each_entry_safe(timer, next, head, entry) {
427 timer->task = NULL;
428 list_del_init(&timer->entry);
429 if (cputime_lt(timer->expires.cpu, ptime)) {
430 timer->expires.cpu = cputime_zero;
431 } else {
432 timer->expires.cpu = cputime_sub(timer->expires.cpu,
433 ptime);
434 }
435 }
436
437 ++head;
438 list_for_each_entry_safe(timer, next, head, entry) {
439 timer->task = NULL;
440 list_del_init(&timer->entry);
441 if (cputime_lt(timer->expires.cpu, utime)) {
442 timer->expires.cpu = cputime_zero;
443 } else {
444 timer->expires.cpu = cputime_sub(timer->expires.cpu,
445 utime);
446 }
447 }
448
449 ++head;
450 list_for_each_entry_safe(timer, next, head, entry) {
451 timer->task = NULL;
452 list_del_init(&timer->entry);
453 if (timer->expires.sched < sched_time) {
454 timer->expires.sched = 0;
455 } else {
456 timer->expires.sched -= sched_time;
457 }
458 }
459}
460
461/*
462 * These are both called with the siglock held, when the current thread
463 * is being reaped. When the final (leader) thread in the group is reaped,
464 * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
465 */
466void posix_cpu_timers_exit(struct task_struct *tsk)
467{
468 cleanup_timers(tsk->cpu_timers,
469 tsk->utime, tsk->stime, tsk->sched_time);
470
471}
472void posix_cpu_timers_exit_group(struct task_struct *tsk)
473{
474 cleanup_timers(tsk->signal->cpu_timers,
475 cputime_add(tsk->utime, tsk->signal->utime),
476 cputime_add(tsk->stime, tsk->signal->stime),
477 tsk->sched_time + tsk->signal->sched_time);
478}
479
480
481/*
482 * Set the expiry times of all the threads in the process so one of them
483 * will go off before the process cumulative expiry total is reached.
484 */
485static void process_timer_rebalance(struct task_struct *p,
486 unsigned int clock_idx,
487 union cpu_time_count expires,
488 union cpu_time_count val)
489{
490 cputime_t ticks, left;
491 unsigned long long ns, nsleft;
492 struct task_struct *t = p;
493 unsigned int nthreads = atomic_read(&p->signal->live);
494
495 switch (clock_idx) {
496 default:
497 BUG();
498 break;
499 case CPUCLOCK_PROF:
500 left = cputime_div(cputime_sub(expires.cpu, val.cpu),
501 nthreads);
502 do {
503 if (!unlikely(t->exit_state)) {
504 ticks = cputime_add(prof_ticks(t), left);
505 if (cputime_eq(t->it_prof_expires,
506 cputime_zero) ||
507 cputime_gt(t->it_prof_expires, ticks)) {
508 t->it_prof_expires = ticks;
509 }
510 }
511 t = next_thread(t);
512 } while (t != p);
513 break;
514 case CPUCLOCK_VIRT:
515 left = cputime_div(cputime_sub(expires.cpu, val.cpu),
516 nthreads);
517 do {
518 if (!unlikely(t->exit_state)) {
519 ticks = cputime_add(virt_ticks(t), left);
520 if (cputime_eq(t->it_virt_expires,
521 cputime_zero) ||
522 cputime_gt(t->it_virt_expires, ticks)) {
523 t->it_virt_expires = ticks;
524 }
525 }
526 t = next_thread(t);
527 } while (t != p);
528 break;
529 case CPUCLOCK_SCHED:
530 nsleft = expires.sched - val.sched;
531 do_div(nsleft, nthreads);
532 do {
533 if (!unlikely(t->exit_state)) {
534 ns = t->sched_time + nsleft;
535 if (t->it_sched_expires == 0 ||
536 t->it_sched_expires > ns) {
537 t->it_sched_expires = ns;
538 }
539 }
540 t = next_thread(t);
541 } while (t != p);
542 break;
543 }
544}
545
546static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
547{
548 /*
549 * That's all for this thread or process.
550 * We leave our residual in expires to be reported.
551 */
552 put_task_struct(timer->it.cpu.task);
553 timer->it.cpu.task = NULL;
554 timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
555 timer->it.cpu.expires,
556 now);
557}
558
559/*
560 * Insert the timer on the appropriate list before any timers that
561 * expire later. This must be called with the tasklist_lock held
562 * for reading, and interrupts disabled.
563 */
564static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
565{
566 struct task_struct *p = timer->it.cpu.task;
567 struct list_head *head, *listpos;
568 struct cpu_timer_list *const nt = &timer->it.cpu;
569 struct cpu_timer_list *next;
570 unsigned long i;
571
572 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
573 p->cpu_timers : p->signal->cpu_timers);
574 head += CPUCLOCK_WHICH(timer->it_clock);
575
576 BUG_ON(!irqs_disabled());
577 spin_lock(&p->sighand->siglock);
578
579 listpos = head;
580 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
581 list_for_each_entry(next, head, entry) {
582 if (next->expires.sched > nt->expires.sched) {
583 listpos = &next->entry;
584 break;
585 }
586 }
587 } else {
588 list_for_each_entry(next, head, entry) {
589 if (cputime_gt(next->expires.cpu, nt->expires.cpu)) {
590 listpos = &next->entry;
591 break;
592 }
593 }
594 }
595 list_add(&nt->entry, listpos);
596
597 if (listpos == head) {
598 /*
599 * We are the new earliest-expiring timer.
600 * If we are a thread timer, there can always
601 * be a process timer telling us to stop earlier.
602 */
603
604 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
605 switch (CPUCLOCK_WHICH(timer->it_clock)) {
606 default:
607 BUG();
608 case CPUCLOCK_PROF:
609 if (cputime_eq(p->it_prof_expires,
610 cputime_zero) ||
611 cputime_gt(p->it_prof_expires,
612 nt->expires.cpu))
613 p->it_prof_expires = nt->expires.cpu;
614 break;
615 case CPUCLOCK_VIRT:
616 if (cputime_eq(p->it_virt_expires,
617 cputime_zero) ||
618 cputime_gt(p->it_virt_expires,
619 nt->expires.cpu))
620 p->it_virt_expires = nt->expires.cpu;
621 break;
622 case CPUCLOCK_SCHED:
623 if (p->it_sched_expires == 0 ||
624 p->it_sched_expires > nt->expires.sched)
625 p->it_sched_expires = nt->expires.sched;
626 break;
627 }
628 } else {
629 /*
630 * For a process timer, we must balance
631 * all the live threads' expirations.
632 */
633 switch (CPUCLOCK_WHICH(timer->it_clock)) {
634 default:
635 BUG();
636 case CPUCLOCK_VIRT:
637 if (!cputime_eq(p->signal->it_virt_expires,
638 cputime_zero) &&
639 cputime_lt(p->signal->it_virt_expires,
640 timer->it.cpu.expires.cpu))
641 break;
642 goto rebalance;
643 case CPUCLOCK_PROF:
644 if (!cputime_eq(p->signal->it_prof_expires,
645 cputime_zero) &&
646 cputime_lt(p->signal->it_prof_expires,
647 timer->it.cpu.expires.cpu))
648 break;
649 i = p->signal->rlim[RLIMIT_CPU].rlim_cur;
650 if (i != RLIM_INFINITY &&
651 i <= cputime_to_secs(timer->it.cpu.expires.cpu))
652 break;
653 goto rebalance;
654 case CPUCLOCK_SCHED:
655 rebalance:
656 process_timer_rebalance(
657 timer->it.cpu.task,
658 CPUCLOCK_WHICH(timer->it_clock),
659 timer->it.cpu.expires, now);
660 break;
661 }
662 }
663 }
664
665 spin_unlock(&p->sighand->siglock);
666}
667
668/*
669 * The timer is locked, fire it and arrange for its reload.
670 */
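/*
 * Three cases are handled below: a sigq-less timer backing
 * clock_nanosleep (just wake the sleeper), a one-shot timer (deliver
 * the event and clear it), and a periodic timer whose signal is being
 * ignored (reschedule it ourselves, since no do_schedule_next_timer
 * callback will arrive to do the reload).
 */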
671static void cpu_timer_fire(struct k_itimer *timer)
672{
673 if (unlikely(timer->sigq == NULL)) {
674 /*
675 * This a special case for clock_nanosleep,
676 * not a normal timer from sys_timer_create.
677 */
678 wake_up_process(timer->it_process);
679 timer->it.cpu.expires.sched = 0;
680 } else if (timer->it.cpu.incr.sched == 0) {
681 /*
682 * One-shot timer. Clear it as soon as it's fired.
683 */
684 posix_timer_event(timer, 0);
685 timer->it.cpu.expires.sched = 0;
686 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
687 /*
688 * The signal did not get queued because the signal
689 * was ignored, so we won't get any callback to
690 * reload the timer. But we need to keep it
691 * ticking in case the signal is deliverable next time.
692 */
693 posix_cpu_timer_schedule(timer);
694 }
695}
696
697/*
698 * Guts of sys_timer_settime for CPU timers.
699 * This is called with the timer locked and interrupts disabled.
700 * If we return TIMER_RETRY, it's necessary to release the timer's lock
701 * and try again. (This happens when the timer is in the middle of firing.)
702 */
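/*
 * Seen from userspace this is reached via timer_settime() on a CPU
 * clock timer; a minimal sketch, error handling omitted:
 *
 *	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
 *				.sigev_signo  = SIGPROF };
 *	struct itimerspec its = { .it_value = { .tv_sec = 1 } };
 *	timer_t tid;
 *
 *	timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid);
 *	timer_settime(tid, 0, &its, NULL);
 */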
703int posix_cpu_timer_set(struct k_itimer *timer, int flags,
704 struct itimerspec *new, struct itimerspec *old)
705{
706 struct task_struct *p = timer->it.cpu.task;
707 union cpu_time_count old_expires, new_expires, val;
708 int ret;
709
710 if (unlikely(p == NULL)) {
711 /*
712 * Timer refers to a dead task's clock.
713 */
714 return -ESRCH;
715 }
716
717 new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
718
719 read_lock(&tasklist_lock);
720 /*
721 * We need the tasklist_lock to protect against reaping that
722 * clears p->signal. If p has just been reaped, we can no
723 * longer get any information about it at all.
724 */
725 if (unlikely(p->signal == NULL)) {
726 read_unlock(&tasklist_lock);
727 put_task_struct(p);
728 timer->it.cpu.task = NULL;
729 return -ESRCH;
730 }
731
732 /*
733 * Disarm any old timer after extracting its expiry time.
734 */
735 BUG_ON(!irqs_disabled());
736 spin_lock(&p->sighand->siglock);
737 old_expires = timer->it.cpu.expires;
738 list_del_init(&timer->it.cpu.entry);
739 spin_unlock(&p->sighand->siglock);
740
741 /*
742 * We need to sample the current value to convert the new
743	 * value from relative to absolute, and to convert the
744 * old value from absolute to relative. To set a process
745 * timer, we need a sample to balance the thread expiry
746 * times (in arm_timer). With an absolute time, we must
747 * check if it's already passed. In short, we need a sample.
748 */
749 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
750 cpu_clock_sample(timer->it_clock, p, &val);
751 } else {
752 cpu_clock_sample_group(timer->it_clock, p, &val);
753 }
754
755 if (old) {
756 if (old_expires.sched == 0) {
757 old->it_value.tv_sec = 0;
758 old->it_value.tv_nsec = 0;
759 } else {
760 /*
761 * Update the timer in case it has
762 * overrun already. If it has,
763 * we'll report it as having overrun
764 * and with the next reloaded timer
765 * already ticking, though we are
766 * swallowing that pending
767 * notification here to install the
768 * new setting.
769 */
770 bump_cpu_timer(timer, val);
771 if (cpu_time_before(timer->it_clock, val,
772 timer->it.cpu.expires)) {
773 old_expires = cpu_time_sub(
774 timer->it_clock,
775 timer->it.cpu.expires, val);
776 sample_to_timespec(timer->it_clock,
777 old_expires,
778 &old->it_value);
779 } else {
780 old->it_value.tv_nsec = 1;
781 old->it_value.tv_sec = 0;
782 }
783 }
784 }
785
786 if (unlikely(timer->it.cpu.firing)) {
787 /*
788 * We are colliding with the timer actually firing.
789 * Punt after filling in the timer's old value, and
790 * disable this firing since we are already reporting
791 * it as an overrun (thanks to bump_cpu_timer above).
792 */
793 read_unlock(&tasklist_lock);
794 timer->it.cpu.firing = -1;
795 ret = TIMER_RETRY;
796 goto out;
797 }
798
799 if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
800 cpu_time_add(timer->it_clock, &new_expires, val);
801 }
802
803 /*
804 * Install the new expiry time (or zero).
805 * For a timer with no notification action, we don't actually
806 * arm the timer (we'll just fake it for timer_gettime).
807 */
808 timer->it.cpu.expires = new_expires;
809 if (new_expires.sched != 0 &&
810 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
811 cpu_time_before(timer->it_clock, val, new_expires)) {
812 arm_timer(timer, val);
813 }
814
815 read_unlock(&tasklist_lock);
816
817 /*
818 * Install the new reload setting, and
819 * set up the signal and overrun bookkeeping.
820 */
821 timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
822 &new->it_interval);
823
824 /*
825 * This acts as a modification timestamp for the timer,
826 * so any automatic reload attempt will punt on seeing
827 * that we have reset the timer manually.
828 */
829 timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
830 ~REQUEUE_PENDING;
831 timer->it_overrun_last = 0;
832 timer->it_overrun = -1;
833
834 if (new_expires.sched != 0 &&
835 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
836 !cpu_time_before(timer->it_clock, val, new_expires)) {
837 /*
838 * The designated time already passed, so we notify
839 * immediately, even if the thread never runs to
840 * accumulate more time on this clock.
841 */
842 cpu_timer_fire(timer);
843 }
844
845 ret = 0;
846 out:
847 if (old) {
848 sample_to_timespec(timer->it_clock,
849 timer->it.cpu.incr, &old->it_interval);
850 }
851 return ret;
852}
853
854void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
855{
856 union cpu_time_count now;
857 struct task_struct *p = timer->it.cpu.task;
858 int clear_dead;
859
860 /*
861 * Easy part: convert the reload time.
862 */
863 sample_to_timespec(timer->it_clock,
864 timer->it.cpu.incr, &itp->it_interval);
865
866 if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */
867 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
868 return;
869 }
870
871 if (unlikely(p == NULL)) {
872 /*
873 * This task already died and the timer will never fire.
874 * In this case, expires is actually the dead value.
875 */
876 dead:
877 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
878 &itp->it_value);
879 return;
880 }
881
882 /*
883 * Sample the clock to take the difference with the expiry time.
884 */
885 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
886 cpu_clock_sample(timer->it_clock, p, &now);
887 clear_dead = p->exit_state;
888 } else {
889 read_lock(&tasklist_lock);
890 if (unlikely(p->signal == NULL)) {
891 /*
892 * The process has been reaped.
893 * We can't even collect a sample any more.
894 * Call the timer disarmed, nothing else to do.
895 */
896 put_task_struct(p);
897 timer->it.cpu.task = NULL;
898 timer->it.cpu.expires.sched = 0;
899 read_unlock(&tasklist_lock);
900 goto dead;
901 } else {
902 cpu_clock_sample_group(timer->it_clock, p, &now);
903 clear_dead = (unlikely(p->exit_state) &&
904 thread_group_empty(p));
905 }
906 read_unlock(&tasklist_lock);
907 }
908
909 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
910 if (timer->it.cpu.incr.sched == 0 &&
911 cpu_time_before(timer->it_clock,
912 timer->it.cpu.expires, now)) {
913 /*
914 * Do-nothing timer expired and has no reload,
915 * so it's as if it was never set.
916 */
917 timer->it.cpu.expires.sched = 0;
918 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
919 return;
920 }
921 /*
922 * Account for any expirations and reloads that should
923 * have happened.
924 */
925 bump_cpu_timer(timer, now);
926 }
927
928 if (unlikely(clear_dead)) {
929 /*
930 * We've noticed that the thread is dead, but
931 * not yet reaped. Take this opportunity to
932 * drop our task ref.
933 */
934 clear_dead_task(timer, now);
935 goto dead;
936 }
937
938 if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
939 sample_to_timespec(timer->it_clock,
940 cpu_time_sub(timer->it_clock,
941 timer->it.cpu.expires, now),
942 &itp->it_value);
943 } else {
944 /*
945 * The timer should have expired already, but the firing
946 * hasn't taken place yet. Say it's just about to expire.
947 */
948 itp->it_value.tv_nsec = 1;
949 itp->it_value.tv_sec = 0;
950 }
951}
952
953/*
954 * Check for any per-thread CPU timers that have fired and move them off
955 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
956 * tsk->it_*_expires values to reflect the remaining thread CPU timers.
957 */
958static void check_thread_timers(struct task_struct *tsk,
959 struct list_head *firing)
960{
961 struct list_head *timers = tsk->cpu_timers;
962
963 tsk->it_prof_expires = cputime_zero;
964 while (!list_empty(timers)) {
965 struct cpu_timer_list *t = list_entry(timers->next,
966 struct cpu_timer_list,
967 entry);
968 if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
969 tsk->it_prof_expires = t->expires.cpu;
970 break;
971 }
972 t->firing = 1;
973 list_move_tail(&t->entry, firing);
974 }
975
976 ++timers;
977 tsk->it_virt_expires = cputime_zero;
978 while (!list_empty(timers)) {
979 struct cpu_timer_list *t = list_entry(timers->next,
980 struct cpu_timer_list,
981 entry);
982 if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
983 tsk->it_virt_expires = t->expires.cpu;
984 break;
985 }
986 t->firing = 1;
987 list_move_tail(&t->entry, firing);
988 }
989
990 ++timers;
991 tsk->it_sched_expires = 0;
992 while (!list_empty(timers)) {
993 struct cpu_timer_list *t = list_entry(timers->next,
994 struct cpu_timer_list,
995 entry);
996 if (tsk->sched_time < t->expires.sched) {
997 tsk->it_sched_expires = t->expires.sched;
998 break;
999 }
1000 t->firing = 1;
1001 list_move_tail(&t->entry, firing);
1002 }
1003}
1004
1005/*
1006 * Check for any per-process CPU timers that have fired and move them
1007 * off the tsk->signal->cpu_timers[N] lists onto the firing list. The
1008 * per-thread timers have already been taken off.
1009 */
1010static void check_process_timers(struct task_struct *tsk,
1011 struct list_head *firing)
1012{
1013 struct signal_struct *const sig = tsk->signal;
1014 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1015 unsigned long long sched_time, sched_expires;
1016 struct task_struct *t;
1017 struct list_head *timers = sig->cpu_timers;
1018
1019 /*
1020 * Don't sample the current process CPU clocks if there are no timers.
1021 */
1022 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1023 cputime_eq(sig->it_prof_expires, cputime_zero) &&
1024 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1025 list_empty(&timers[CPUCLOCK_VIRT]) &&
1026 cputime_eq(sig->it_virt_expires, cputime_zero) &&
1027 list_empty(&timers[CPUCLOCK_SCHED]))
1028 return;
1029
1030 /*
1031 * Collect the current process totals.
1032 */
1033 utime = sig->utime;
1034 stime = sig->stime;
1035 sched_time = sig->sched_time;
1036 t = tsk;
1037 do {
1038 utime = cputime_add(utime, t->utime);
1039 stime = cputime_add(stime, t->stime);
1040 sched_time += t->sched_time;
1041 t = next_thread(t);
1042 } while (t != tsk);
1043 ptime = cputime_add(utime, stime);
1044
1045 prof_expires = cputime_zero;
1046 while (!list_empty(timers)) {
1047 struct cpu_timer_list *t = list_entry(timers->next,
1048 struct cpu_timer_list,
1049 entry);
1050 if (cputime_lt(ptime, t->expires.cpu)) {
1051 prof_expires = t->expires.cpu;
1052 break;
1053 }
1054 t->firing = 1;
1055 list_move_tail(&t->entry, firing);
1056 }
1057
1058 ++timers;
1059 virt_expires = cputime_zero;
1060 while (!list_empty(timers)) {
1061 struct cpu_timer_list *t = list_entry(timers->next,
1062 struct cpu_timer_list,
1063 entry);
1064 if (cputime_lt(utime, t->expires.cpu)) {
1065 virt_expires = t->expires.cpu;
1066 break;
1067 }
1068 t->firing = 1;
1069 list_move_tail(&t->entry, firing);
1070 }
1071
1072 ++timers;
1073 sched_expires = 0;
1074 while (!list_empty(timers)) {
1075 struct cpu_timer_list *t = list_entry(timers->next,
1076 struct cpu_timer_list,
1077 entry);
1078 if (sched_time < t->expires.sched) {
1079 sched_expires = t->expires.sched;
1080 break;
1081 }
1082 t->firing = 1;
1083 list_move_tail(&t->entry, firing);
1084 }
1085
1086 /*
1087 * Check for the special case process timers.
1088 */
1089 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
1090 if (cputime_ge(ptime, sig->it_prof_expires)) {
1091 /* ITIMER_PROF fires and reloads. */
1092 sig->it_prof_expires = sig->it_prof_incr;
1093 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
1094 sig->it_prof_expires = cputime_add(
1095 sig->it_prof_expires, ptime);
1096 }
1097 __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
1098 }
1099 if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
1100 (cputime_eq(prof_expires, cputime_zero) ||
1101 cputime_lt(sig->it_prof_expires, prof_expires))) {
1102 prof_expires = sig->it_prof_expires;
1103 }
1104 }
1105 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1106 if (cputime_ge(utime, sig->it_virt_expires)) {
1107 /* ITIMER_VIRTUAL fires and reloads. */
1108 sig->it_virt_expires = sig->it_virt_incr;
1109 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1110 sig->it_virt_expires = cputime_add(
1111 sig->it_virt_expires, utime);
1112 }
1113 __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
1114 }
1115 if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
1116 (cputime_eq(virt_expires, cputime_zero) ||
1117 cputime_lt(sig->it_virt_expires, virt_expires))) {
1118 virt_expires = sig->it_virt_expires;
1119 }
1120 }
1121 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
1122 unsigned long psecs = cputime_to_secs(ptime);
1123 cputime_t x;
1124 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) {
1125 /*
1126 * At the hard limit, we just die.
1127 * No need to calculate anything else now.
1128 */
1129 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1130 return;
1131 }
1132 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) {
1133 /*
1134 * At the soft limit, send a SIGXCPU every second.
1135 */
1136 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1137 if (sig->rlim[RLIMIT_CPU].rlim_cur
1138 < sig->rlim[RLIMIT_CPU].rlim_max) {
1139 sig->rlim[RLIMIT_CPU].rlim_cur++;
1140 }
1141 }
1142 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
1143 if (cputime_eq(prof_expires, cputime_zero) ||
1144 cputime_lt(x, prof_expires)) {
1145 prof_expires = x;
1146 }
1147 }
1148
1149 if (!cputime_eq(prof_expires, cputime_zero) ||
1150 !cputime_eq(virt_expires, cputime_zero) ||
1151 sched_expires != 0) {
1152 /*
1153 * Rebalance the threads' expiry times for the remaining
1154 * process CPU timers.
1155 */
1156
1157 cputime_t prof_left, virt_left, ticks;
1158 unsigned long long sched_left, sched;
1159 const unsigned int nthreads = atomic_read(&sig->live);
1160
1161 prof_left = cputime_sub(prof_expires, utime);
1162 prof_left = cputime_sub(prof_left, stime);
1163 prof_left = cputime_div(prof_left, nthreads);
1164 virt_left = cputime_sub(virt_expires, utime);
1165 virt_left = cputime_div(virt_left, nthreads);
1166 if (sched_expires) {
1167 sched_left = sched_expires - sched_time;
1168 do_div(sched_left, nthreads);
1169 } else {
1170 sched_left = 0;
1171 }
1172 t = tsk;
1173 do {
1174 ticks = cputime_add(cputime_add(t->utime, t->stime),
1175 prof_left);
1176 if (!cputime_eq(prof_expires, cputime_zero) &&
1177 (cputime_eq(t->it_prof_expires, cputime_zero) ||
1178 cputime_gt(t->it_prof_expires, ticks))) {
1179 t->it_prof_expires = ticks;
1180 }
1181
1182 ticks = cputime_add(t->utime, virt_left);
1183 if (!cputime_eq(virt_expires, cputime_zero) &&
1184 (cputime_eq(t->it_virt_expires, cputime_zero) ||
1185 cputime_gt(t->it_virt_expires, ticks))) {
1186 t->it_virt_expires = ticks;
1187 }
1188
1189 sched = t->sched_time + sched_left;
1190 if (sched_expires && (t->it_sched_expires == 0 ||
1191 t->it_sched_expires > sched)) {
1192 t->it_sched_expires = sched;
1193 }
1194
1195 do {
1196 t = next_thread(t);
1197 } while (unlikely(t->exit_state));
1198 } while (t != tsk);
1199 }
1200}
1201
1202/*
1203 * This is called from the signal code (via do_schedule_next_timer)
1204 * when the last timer signal was delivered and we have to reload the timer.
1205 */
1206void posix_cpu_timer_schedule(struct k_itimer *timer)
1207{
1208 struct task_struct *p = timer->it.cpu.task;
1209 union cpu_time_count now;
1210
1211 if (unlikely(p == NULL))
1212 /*
1213 * The task was cleaned up already, no future firings.
1214 */
1215 return;
1216
1217 /*
1218 * Fetch the current sample and update the timer's expiry time.
1219 */
1220 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1221 cpu_clock_sample(timer->it_clock, p, &now);
1222 bump_cpu_timer(timer, now);
1223 if (unlikely(p->exit_state)) {
1224 clear_dead_task(timer, now);
1225 return;
1226 }
1227 read_lock(&tasklist_lock); /* arm_timer needs it. */
1228 } else {
1229 read_lock(&tasklist_lock);
1230 if (unlikely(p->signal == NULL)) {
1231 /*
1232 * The process has been reaped.
1233 * We can't even collect a sample any more.
1234 */
1235 put_task_struct(p);
1236 timer->it.cpu.task = p = NULL;
1237 timer->it.cpu.expires.sched = 0;
1238 read_unlock(&tasklist_lock);
1239 return;
1240 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1241 /*
1242 * We've noticed that the thread is dead, but
1243 * not yet reaped. Take this opportunity to
1244 * drop our task ref.
1245 */
1246 clear_dead_task(timer, now);
1247 read_unlock(&tasklist_lock);
1248 return;
1249 }
1250 cpu_clock_sample_group(timer->it_clock, p, &now);
1251 bump_cpu_timer(timer, now);
1252 /* Leave the tasklist_lock locked for the call below. */
1253 }
1254
1255 /*
1256 * Now re-arm for the new expiry time.
1257 */
1258 arm_timer(timer, now);
1259
1260 read_unlock(&tasklist_lock);
1261}
1262
1263/*
1264 * This is called from the timer interrupt handler. The irq handler has
1265 * already updated our counts. We need to check if any timers fire now.
1266 * Interrupts are disabled.
1267 */
1268void run_posix_cpu_timers(struct task_struct *tsk)
1269{
1270 LIST_HEAD(firing);
1271 struct k_itimer *timer, *next;
1272
1273 BUG_ON(!irqs_disabled());
1274
1275#define UNEXPIRED(clock) \
1276 (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
1277 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
1278
1279 if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
1280 (tsk->it_sched_expires == 0 ||
1281 tsk->sched_time < tsk->it_sched_expires))
1282 return;
1283
1284#undef UNEXPIRED
1285
1286 BUG_ON(tsk->exit_state);
1287
1288 /*
1289 * Double-check with locks held.
1290 */
1291 read_lock(&tasklist_lock);
1292 spin_lock(&tsk->sighand->siglock);
1293
1294 /*
1295 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
1296 * all the timers that are firing, and put them on the firing list.
1297 */
1298 check_thread_timers(tsk, &firing);
1299 check_process_timers(tsk, &firing);
1300
1301 /*
1302 * We must release these locks before taking any timer's lock.
1303 * There is a potential race with timer deletion here, as the
1304 * siglock now protects our private firing list. We have set
1305 * the firing flag in each timer, so that a deletion attempt
1306 * that gets the timer lock before we do will give it up and
1307 * spin until we've taken care of that timer below.
1308 */
1309 spin_unlock(&tsk->sighand->siglock);
1310 read_unlock(&tasklist_lock);
1311
1312 /*
1313 * Now that all the timers on our list have the firing flag,
1314	 * no one will touch their list entries but us. We'll take
1315 * each timer's lock before clearing its firing flag, so no
1316 * timer call will interfere.
1317 */
1318 list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
1319 int firing;
1320 spin_lock(&timer->it_lock);
1321 list_del_init(&timer->it.cpu.entry);
1322 firing = timer->it.cpu.firing;
1323 timer->it.cpu.firing = 0;
1324 /*
1325 * The firing flag is -1 if we collided with a reset
1326 * of the timer, which already reported this
1327 * almost-firing as an overrun. So don't generate an event.
1328 */
1329 if (likely(firing >= 0)) {
1330 cpu_timer_fire(timer);
1331 }
1332 spin_unlock(&timer->it_lock);
1333 }
1334}
1335
1336/*
1337 * Set one of the process-wide special case CPU timers.
1338 * The tasklist_lock and tsk->sighand->siglock must be held by the caller.
1339 * The oldval argument is null for the RLIMIT_CPU timer, where *newval is
1340 * absolute; non-null for ITIMER_*, where *newval is relative and we update
1341 * it to be absolute, *oldval is absolute and we update it to be relative.
1342 */
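/*
 * E.g. the ITIMER_PROF/ITIMER_VIRTUAL code in do_setitimer() uses the
 * non-null-oldval, relative form, while the RLIMIT_CPU handling in
 * sys_setrlimit() passes an already-absolute *newval and a null oldval.
 */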
1343void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1344 cputime_t *newval, cputime_t *oldval)
1345{
1346 union cpu_time_count now;
1347 struct list_head *head;
1348
1349 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1350 cpu_clock_sample_group_locked(clock_idx, tsk, &now);
1351
1352 if (oldval) {
1353 if (!cputime_eq(*oldval, cputime_zero)) {
1354 if (cputime_le(*oldval, now.cpu)) {
1355 /* Just about to fire. */
1356 *oldval = jiffies_to_cputime(1);
1357 } else {
1358 *oldval = cputime_sub(*oldval, now.cpu);
1359 }
1360 }
1361
1362 if (cputime_eq(*newval, cputime_zero))
1363 return;
1364 *newval = cputime_add(*newval, now.cpu);
1365
1366 /*
1367 * If the RLIMIT_CPU timer will expire before the
1368 * ITIMER_PROF timer, we have nothing else to do.
1369 */
1370 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1371 < cputime_to_secs(*newval))
1372 return;
1373 }
1374
1375 /*
1376 * Check whether there are any process timers already set to fire
1377 * before this one. If so, we don't have anything more to do.
1378 */
1379 head = &tsk->signal->cpu_timers[clock_idx];
1380 if (list_empty(head) ||
1381 cputime_ge(list_entry(head->next,
1382 struct cpu_timer_list, entry)->expires.cpu,
1383 *newval)) {
1384 /*
1385 * Rejigger each thread's expiry time so that one will
1386 * notice before we hit the process-cumulative expiry time.
1387 */
1388 union cpu_time_count expires = { .sched = 0 };
1389 expires.cpu = *newval;
1390 process_timer_rebalance(tsk, clock_idx, expires, now);
1391 }
1392}
1393
1394static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
1395
1396int posix_cpu_nsleep(clockid_t which_clock, int flags,
1397 struct timespec *rqtp)
1398{
1399 struct restart_block *restart_block =
1400 &current_thread_info()->restart_block;
1401 struct k_itimer timer;
1402 int error;
1403
1404 /*
1405 * Diagnose required errors first.
1406 */
1407 if (CPUCLOCK_PERTHREAD(which_clock) &&
1408 (CPUCLOCK_PID(which_clock) == 0 ||
1409 CPUCLOCK_PID(which_clock) == current->pid))
1410 return -EINVAL;
1411
1412 /*
1413 * Set up a temporary timer and then wait for it to go off.
1414 */
1415 memset(&timer, 0, sizeof timer);
1416 spin_lock_init(&timer.it_lock);
1417 timer.it_clock = which_clock;
1418 timer.it_overrun = -1;
1419 error = posix_cpu_timer_create(&timer);
1420 timer.it_process = current;
1421 if (!error) {
1422 struct timespec __user *rmtp;
1423 static struct itimerspec zero_it;
1424 struct itimerspec it = { .it_value = *rqtp,
1425 .it_interval = {} };
1426
1427 spin_lock_irq(&timer.it_lock);
1428 error = posix_cpu_timer_set(&timer, flags, &it, NULL);
1429 if (error) {
1430 spin_unlock_irq(&timer.it_lock);
1431 return error;
1432 }
1433
1434 while (!signal_pending(current)) {
1435 if (timer.it.cpu.expires.sched == 0) {
1436 /*
1437 * Our timer fired and was reset.
1438 */
1439 spin_unlock_irq(&timer.it_lock);
1440 return 0;
1441 }
1442
1443 /*
1444 * Block until cpu_timer_fire (or a signal) wakes us.
1445 */
1446 __set_current_state(TASK_INTERRUPTIBLE);
1447 spin_unlock_irq(&timer.it_lock);
1448 schedule();
1449 spin_lock_irq(&timer.it_lock);
1450 }
1451
1452 /*
1453 * We were interrupted by a signal.
1454 */
1455 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1456 posix_cpu_timer_set(&timer, 0, &zero_it, &it);
1457 spin_unlock_irq(&timer.it_lock);
1458
1459 if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
1460 /*
1461 * It actually did fire already.
1462 */
1463 return 0;
1464 }
1465
1466 /*
1467 * Report back to the user the time still remaining.
1468 */
1469 rmtp = (struct timespec __user *) restart_block->arg1;
1470 if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
1471 copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1472 return -EFAULT;
1473
1474 restart_block->fn = posix_cpu_clock_nanosleep_restart;
1475 /* Caller already set restart_block->arg1 */
1476 restart_block->arg0 = which_clock;
1477 restart_block->arg2 = rqtp->tv_sec;
1478 restart_block->arg3 = rqtp->tv_nsec;
1479
1480 error = -ERESTART_RESTARTBLOCK;
1481 }
1482
1483 return error;
1484}
1485
1486static long
1487posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
1488{
1489 clockid_t which_clock = restart_block->arg0;
1490 struct timespec t = { .tv_sec = restart_block->arg2,
1491 .tv_nsec = restart_block->arg3 };
1492 restart_block->fn = do_no_restart_syscall;
1493 return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t);
1494}
1495
1496
1497#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1498#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1499
1500static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
1501{
1502 return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1503}
1504static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
1505{
1506 return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1507}
1508static int process_cpu_timer_create(struct k_itimer *timer)
1509{
1510 timer->it_clock = PROCESS_CLOCK;
1511 return posix_cpu_timer_create(timer);
1512}
1513static int process_cpu_nsleep(clockid_t which_clock, int flags,
1514 struct timespec *rqtp)
1515{
1516 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
1517}
1518static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
1519{
1520 return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1521}
1522static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
1523{
1524 return posix_cpu_clock_get(THREAD_CLOCK, tp);
1525}
1526static int thread_cpu_timer_create(struct k_itimer *timer)
1527{
1528 timer->it_clock = THREAD_CLOCK;
1529 return posix_cpu_timer_create(timer);
1530}
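/*
 * Sleeping on the calling thread's own CPU-time clock could never make
 * progress: that clock only advances while the thread runs, and the
 * thread does not run while it sleeps. So it is simply rejected.
 */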
1531static int thread_cpu_nsleep(clockid_t which_clock, int flags,
1532 struct timespec *rqtp)
1533{
1534 return -EINVAL;
1535}
1536
1537static __init int init_posix_cpu_timers(void)
1538{
1539 struct k_clock process = {
1540 .clock_getres = process_cpu_clock_getres,
1541 .clock_get = process_cpu_clock_get,
1542 .clock_set = do_posix_clock_nosettime,
1543 .timer_create = process_cpu_timer_create,
1544 .nsleep = process_cpu_nsleep,
1545 };
1546 struct k_clock thread = {
1547 .clock_getres = thread_cpu_clock_getres,
1548 .clock_get = thread_cpu_clock_get,
1549 .clock_set = do_posix_clock_nosettime,
1550 .timer_create = thread_cpu_timer_create,
1551 .nsleep = thread_cpu_nsleep,
1552 };
1553
1554 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1555 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1556
1557 return 0;
1558}
1559__initcall(init_posix_cpu_timers);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
new file mode 100644
index 000000000000..fd316c272260
--- /dev/null
+++ b/kernel/posix-timers.c
@@ -0,0 +1,1584 @@
1/*
2 * linux/kernel/posix_timers.c
3 *
4 *
5 * 2002-10-15 Posix Clocks & timers
6 * by George Anzinger george@mvista.com
7 *
8 * Copyright (C) 2002 2003 by MontaVista Software.
9 *
10 * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
11 * Copyright (C) 2004 Boris Hu
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or (at
16 * your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
28 */
29
30/* These are all the functions necessary to implement
31 * POSIX clocks & timers
32 */
33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/interrupt.h>
36#include <linux/slab.h>
37#include <linux/time.h>
38
39#include <asm/uaccess.h>
40#include <asm/semaphore.h>
41#include <linux/list.h>
42#include <linux/init.h>
43#include <linux/compiler.h>
44#include <linux/idr.h>
45#include <linux/posix-timers.h>
46#include <linux/syscalls.h>
47#include <linux/wait.h>
48#include <linux/workqueue.h>
49#include <linux/module.h>
50
51#ifndef div_long_long_rem
52#include <asm/div64.h>
53
54#define div_long_long_rem(dividend,divisor,remainder) ({ \
55 u64 result = dividend; \
56 *remainder = do_div(result,divisor); \
57 result; })
58
59#endif
60#define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */
61
62static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2)
63{
64 return (u64)mpy1 * mpy2;
65}
66/*
67 * Management arrays for POSIX timers. Timers are kept in slab memory
68 * Timer ids are allocated by an external routine that keeps track of the
69 * id and the timer. The external interface is:
70 *
71 * void *idr_find(struct idr *idp, int id); to find timer_id <id>
72 * int idr_get_new(struct idr *idp, void *ptr); to get a new id and
73 * relate it to <ptr>
74 * void idr_remove(struct idr *idp, int id); to release <id>
75 * void idr_init(struct idr *idp); to initialize <idp>
76 * which we supply.
77 * The idr_get_new *may* call slab for more memory so it must not be
78 * called under a spin lock. Likewise idr_remove may release memory
79 * (but it may be ok to do this under a lock...).
80 * idr_find is just a memory look up and is quite fast. A -1 return
81 * indicates that the requested id does not exist.
82 */
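/*
 * The resulting calling pattern (see sys_timer_create below) is to
 * preallocate outside the lock and take the id under it, roughly:
 *
 *	if (!idr_pre_get(&posix_timers_id, GFP_KERNEL))
 *		return -EAGAIN;
 *	spin_lock_irq(&idr_lock);
 *	error = idr_get_new(&posix_timers_id, timer, &id);
 *	spin_unlock_irq(&idr_lock);
 */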
83
84/*
85 * Let's keep our timers in a slab cache :-)
86 */
87static kmem_cache_t *posix_timers_cache;
88static struct idr posix_timers_id;
89static DEFINE_SPINLOCK(idr_lock);
90
91/*
92 * Just because the timer is not in the timer list does NOT mean it is
93 * inactive. It could be in the "fire" routine getting a new expire time.
94 */
95#define TIMER_INACTIVE 1
96
97#ifdef CONFIG_SMP
98# define timer_active(tmr) \
99 ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
100# define set_timer_inactive(tmr) \
101 do { \
102 (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
103 } while (0)
104#else
105# define timer_active(tmr) BARFY // error to use outside of SMP
106# define set_timer_inactive(tmr) do { } while (0)
107#endif
108/*
109 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
110 * SIGEV values. Here we put out an error if this assumption fails.
111 */
112#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
113 ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
114#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
115#endif
116
117
118/*
119 * The timer ID is turned into a timer address by idr_find().
120 * Verifying a valid ID consists of:
121 *
122 * a) checking that idr_find() returns other than -1.
123 * b) checking that the timer id matches the one in the timer itself.
124 * c) that the timer owner is in the callers thread group.
125 */
126
127/*
128 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
129 * to implement others. This structure defines the various
130 * clocks and allows the possibility of adding others. We
131 * provide an interface to add clocks to the table and expect
132 * the "arch" code to add at least one clock that is high
133 * resolution. Here we define the standard CLOCK_REALTIME as a
134 * 1/HZ resolution clock.
135 *
136 * RESOLUTION: Clock resolution is used to round up timer and interval
137 * times, NOT to report clock times, which are reported with as
138 * much resolution as the system can muster. In some cases this
139 * resolution may depend on the underlying clock hardware and
140 * may not be quantifiable until run time, and only then is the
141 * necessary code written. The standard says we should say
142 * something about this issue in the documentation...
143 *
144 * FUNCTIONS: The CLOCKs structure defines possible functions to handle
145 * various clock functions. For clocks that use the standard
146 * system timer code these entries should be NULL. This will
147 * allow dispatch without the overhead of indirect function
148 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
149 * must supply functions here, even if the function just returns
150 * ENOSYS. The standard POSIX timer management code assumes the
151 * following: 1.) The k_itimer struct (sched.h) is used for the
152 * timer. 2.) The list, it_lock, it_clock, it_id and it_process
153 * fields are not modified by timer code.
154 *
155 * At this time all functions EXCEPT clock_nanosleep can be
156 * redirected by the CLOCKS structure. Clock_nanosleep is in
157 * there, but the code ignores it.
158 *
159 * Permissions: It is assumed that the clock_settime() function defined
160 * for each clock will take care of permission checks. Some
161 * clocks may be settable by any user (i.e. local process
162 * clocks), others not. Currently the only settable clock we
163 * have is CLOCK_REALTIME and its high-res counterpart, both of
164 * which we beg off on and pass to do_sys_settimeofday().
165 */
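/*
 * A clock with no special behaviour can be registered with most hooks
 * left NULL and the common_* defaults used via CLOCK_DISPATCH, e.g.:
 *
 *	struct k_clock clk = { .res = CLOCK_REALTIME_RES };
 *	register_posix_clock(CLOCK_REALTIME, &clk);
 *
 * which is essentially what init_posix_timers() below does.
 */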
166
167static struct k_clock posix_clocks[MAX_CLOCKS];
168/*
169 * We only have one real clock that can be set so we need only one abs list,
170 * even if we should want to have several clocks with differing resolutions.
171 */
172static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list),
173 .lock = SPIN_LOCK_UNLOCKED};
174
175static void posix_timer_fn(unsigned long);
176static u64 do_posix_clock_monotonic_gettime_parts(
177 struct timespec *tp, struct timespec *mo);
178int do_posix_clock_monotonic_gettime(struct timespec *tp);
179static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp);
180
181static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
182
183static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
184{
185 spin_unlock_irqrestore(&timr->it_lock, flags);
186}
187
188/*
189 * Call the k_clock hook function if non-null, or the default function.
190 */
191#define CLOCK_DISPATCH(clock, call, arglist) \
192 ((clock) < 0 ? posix_cpu_##call arglist : \
193 (posix_clocks[clock].call != NULL \
194 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
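/*
 * E.g. CLOCK_DISPATCH(which_clock, clock_get, (which_clock, tp)) ends
 * up in posix_cpu_clock_get() for a negative (CPU) clock id, in the
 * clock's own clock_get hook if it registered one, and otherwise in
 * common_clock_get() below.
 */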
195
196/*
197 * Default clock hook functions when the struct k_clock passed
198 * to register_posix_clock leaves a function pointer null.
199 *
200 * The function common_CALL is the default implementation for
201 * the function pointer CALL in struct k_clock.
202 */
203
204static inline int common_clock_getres(clockid_t which_clock,
205 struct timespec *tp)
206{
207 tp->tv_sec = 0;
208 tp->tv_nsec = posix_clocks[which_clock].res;
209 return 0;
210}
211
212static inline int common_clock_get(clockid_t which_clock, struct timespec *tp)
213{
214 getnstimeofday(tp);
215 return 0;
216}
217
218static inline int common_clock_set(clockid_t which_clock, struct timespec *tp)
219{
220 return do_sys_settimeofday(tp, NULL);
221}
222
223static inline int common_timer_create(struct k_itimer *new_timer)
224{
225 INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry);
226 init_timer(&new_timer->it.real.timer);
227 new_timer->it.real.timer.data = (unsigned long) new_timer;
228 new_timer->it.real.timer.function = posix_timer_fn;
229 set_timer_inactive(new_timer);
230 return 0;
231}
232
233/*
234 * These ones are defined below.
235 */
236static int common_nsleep(clockid_t, int flags, struct timespec *t);
237static void common_timer_get(struct k_itimer *, struct itimerspec *);
238static int common_timer_set(struct k_itimer *, int,
239 struct itimerspec *, struct itimerspec *);
240static int common_timer_del(struct k_itimer *timer);
241
242/*
243 * Return nonzero iff we know a priori this clockid_t value is bogus.
244 */
245static inline int invalid_clockid(clockid_t which_clock)
246{
247 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
248 return 0;
249 if ((unsigned) which_clock >= MAX_CLOCKS)
250 return 1;
251 if (posix_clocks[which_clock].clock_getres != NULL)
252 return 0;
253#ifndef CLOCK_DISPATCH_DIRECT
254 if (posix_clocks[which_clock].res != 0)
255 return 0;
256#endif
257 return 1;
258}
259
260
261/*
262 * Initialize everything, well, just everything in Posix clocks/timers ;)
263 */
264static __init int init_posix_timers(void)
265{
266 struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES,
267 .abs_struct = &abs_list
268 };
269 struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES,
270 .abs_struct = NULL,
271 .clock_get = do_posix_clock_monotonic_get,
272 .clock_set = do_posix_clock_nosettime
273 };
274
275 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
276 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
277
278 posix_timers_cache = kmem_cache_create("posix_timers_cache",
279 sizeof (struct k_itimer), 0, 0, NULL, NULL);
280 idr_init(&posix_timers_id);
281 return 0;
282}
283
284__initcall(init_posix_timers);
285
286static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
287{
288 long sec = tp->tv_sec;
289 long nsec = tp->tv_nsec + res - 1;
290
291 if (nsec > NSEC_PER_SEC) {
292 sec++;
293 nsec -= NSEC_PER_SEC;
294 }
295
296 /*
297 * The scaling constants are defined in <linux/time.h>
298 * The difference between there and here is that we do the
299 * res rounding and compute a 64-bit result (well, so does that code,
300 * but it then throws away the high bits).
301 */
302 *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) +
303 (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >>
304 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
305}
306
307/*
308 * This function adjusts the timer as needed as a result of the clock
309 * being set. It should only be called for absolute timers, and then
310 * under the abs_list lock. It computes the time difference and sets
311 * the new jiffies value in the timer. It also updates the timers
312 * reference wall_to_monotonic value. It is complicated by the fact
313 * that tstojiffies() only handles positive times and it needs to work
314 * with both positive and negative times. Also, for negative offsets,
315 * we need to defeat the res round up.
316 *
317 * Return is true if there is a new time, else false.
318 */
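/*
 * E.g. if settimeofday() moves the wall clock back five seconds,
 * wall_to_monotonic grows by five seconds, the delta here is positive,
 * and the absolute timer's jiffies expiry is pushed out by the same
 * amount so it still fires at the wall-clock time that was asked for.
 */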
319static long add_clockset_delta(struct k_itimer *timr,
320 struct timespec *new_wall_to)
321{
322 struct timespec delta;
323 int sign = 0;
324 u64 exp;
325
326 set_normalized_timespec(&delta,
327 new_wall_to->tv_sec -
328 timr->it.real.wall_to_prev.tv_sec,
329 new_wall_to->tv_nsec -
330 timr->it.real.wall_to_prev.tv_nsec);
331 if (likely(!(delta.tv_sec | delta.tv_nsec)))
332 return 0;
333 if (delta.tv_sec < 0) {
334 set_normalized_timespec(&delta,
335 -delta.tv_sec,
336 1 - delta.tv_nsec -
337 posix_clocks[timr->it_clock].res);
338 sign++;
339 }
340 tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp);
341 timr->it.real.wall_to_prev = *new_wall_to;
342 timr->it.real.timer.expires += (sign ? -exp : exp);
343 return 1;
344}
345
346static void remove_from_abslist(struct k_itimer *timr)
347{
348 if (!list_empty(&timr->it.real.abs_timer_entry)) {
349 spin_lock(&abs_list.lock);
350 list_del_init(&timr->it.real.abs_timer_entry);
351 spin_unlock(&abs_list.lock);
352 }
353}
354
355static void schedule_next_timer(struct k_itimer *timr)
356{
357 struct timespec new_wall_to;
358 struct now_struct now;
359 unsigned long seq;
360
361 /*
362 * Set up the timer for the next interval (if there is one).
363 * Note: this code uses the abs_timer_lock to protect
364 * it.real.wall_to_prev and must hold it until exp is set, not exactly
365 * obvious...
366
367 * This function is used for CLOCK_REALTIME* and
368 * CLOCK_MONOTONIC* timers. If we ever want to handle other
369 * CLOCKs, the calling code (do_schedule_next_timer) would need
370 * to pull the "clock" info from the timer and dispatch the
371 * "other" CLOCKs "next timer" code (which, I suppose should
372 * also be added to the k_clock structure).
373 */
374 if (!timr->it.real.incr)
375 return;
376
377 do {
378 seq = read_seqbegin(&xtime_lock);
379 new_wall_to = wall_to_monotonic;
380 posix_get_now(&now);
381 } while (read_seqretry(&xtime_lock, seq));
382
383 if (!list_empty(&timr->it.real.abs_timer_entry)) {
384 spin_lock(&abs_list.lock);
385 add_clockset_delta(timr, &new_wall_to);
386
387 posix_bump_timer(timr, now);
388
389 spin_unlock(&abs_list.lock);
390 } else {
391 posix_bump_timer(timr, now);
392 }
393 timr->it_overrun_last = timr->it_overrun;
394 timr->it_overrun = -1;
395 ++timr->it_requeue_pending;
396 add_timer(&timr->it.real.timer);
397}
398
399/*
400 * This function is exported for use by the signal deliver code. It is
401 * called just prior to the info block being released and passes that
402 * block to us. Its function is to update the overrun entry AND to
403 * restart the timer. It should only be called if the timer is to be
404 * restarted (i.e. we have flagged this in the sys_private entry of the
405 * info block).
406 *
407 * To protect against the timer going away while the interrupt is queued,
408 * we require that the it_requeue_pending flag be set.
409 */
410void do_schedule_next_timer(struct siginfo *info)
411{
412 struct k_itimer *timr;
413 unsigned long flags;
414
415 timr = lock_timer(info->si_tid, &flags);
416
417 if (!timr || timr->it_requeue_pending != info->si_sys_private)
418 goto exit;
419
420 if (timr->it_clock < 0) /* CPU clock */
421 posix_cpu_timer_schedule(timr);
422 else
423 schedule_next_timer(timr);
424 info->si_overrun = timr->it_overrun_last;
425exit:
426 if (timr)
427 unlock_timer(timr, flags);
428}
429
430int posix_timer_event(struct k_itimer *timr,int si_private)
431{
432 memset(&timr->sigq->info, 0, sizeof(siginfo_t));
433 timr->sigq->info.si_sys_private = si_private;
434 /*
435 * Send signal to the process that owns this timer.
436
437 * This code assumes that all the possible abs_lists share the
438 * same lock (there is only one list at this time). If this is
439 * not the case, the CLOCK info would need to be used to find
440 * the proper abs list lock.
441 */
442
443 timr->sigq->info.si_signo = timr->it_sigev_signo;
444 timr->sigq->info.si_errno = 0;
445 timr->sigq->info.si_code = SI_TIMER;
446 timr->sigq->info.si_tid = timr->it_id;
447 timr->sigq->info.si_value = timr->it_sigev_value;
448 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
449 if (unlikely(timr->it_process->flags & PF_EXITING)) {
450 timr->it_sigev_notify = SIGEV_SIGNAL;
451 put_task_struct(timr->it_process);
452 timr->it_process = timr->it_process->group_leader;
453 goto group;
454 }
455 return send_sigqueue(timr->it_sigev_signo, timr->sigq,
456 timr->it_process);
457 }
458 else {
459 group:
460 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
461 timr->it_process);
462 }
463}
464EXPORT_SYMBOL_GPL(posix_timer_event);
465
466/*
467 * This function gets called when a POSIX.1b interval timer expires. It
468 * is used as a callback from the kernel internal timer. The
469 * run_timer_list code ALWAYS calls with interrupts on.
470
471 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
472 */
473static void posix_timer_fn(unsigned long __data)
474{
475 struct k_itimer *timr = (struct k_itimer *) __data;
476 unsigned long flags;
477 unsigned long seq;
478 struct timespec delta, new_wall_to;
479 u64 exp = 0;
480 int do_notify = 1;
481
482 spin_lock_irqsave(&timr->it_lock, flags);
483 set_timer_inactive(timr);
484 if (!list_empty(&timr->it.real.abs_timer_entry)) {
485 spin_lock(&abs_list.lock);
486 do {
487 seq = read_seqbegin(&xtime_lock);
488 new_wall_to = wall_to_monotonic;
489 } while (read_seqretry(&xtime_lock, seq));
490 set_normalized_timespec(&delta,
491 new_wall_to.tv_sec -
492 timr->it.real.wall_to_prev.tv_sec,
493 new_wall_to.tv_nsec -
494 timr->it.real.wall_to_prev.tv_nsec);
495 if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) {
496 /* do nothing, timer is on time */
497 } else if (delta.tv_sec < 0) {
498 /* do nothing, timer is already late */
499 } else {
500 /* timer is early due to a clock set */
501 tstojiffie(&delta,
502 posix_clocks[timr->it_clock].res,
503 &exp);
504 timr->it.real.wall_to_prev = new_wall_to;
505 timr->it.real.timer.expires += exp;
506 add_timer(&timr->it.real.timer);
507 do_notify = 0;
508 }
509 spin_unlock(&abs_list.lock);
510
511 }
512 if (do_notify) {
513 int si_private=0;
514
515 if (timr->it.real.incr)
516 si_private = ++timr->it_requeue_pending;
517 else {
518 remove_from_abslist(timr);
519 }
520
521 if (posix_timer_event(timr, si_private))
522 /*
523			 * signal was not sent because the signal is ignored, so
524			 * we will not get a callback to restart it AND
525 * it should be restarted.
526 */
527 schedule_next_timer(timr);
528 }
529 unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */
530}
531
532
533static inline struct task_struct * good_sigevent(sigevent_t * event)
534{
535 struct task_struct *rtn = current->group_leader;
536
537 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
538 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
539 rtn->tgid != current->tgid ||
540 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
541 return NULL;
542
543 if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
544 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
545 return NULL;
546
547 return rtn;
548}
549
550void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock)
551{
552 if ((unsigned) clock_id >= MAX_CLOCKS) {
553 printk("POSIX clock register failed for clock_id %d\n",
554 clock_id);
555 return;
556 }
557
558 posix_clocks[clock_id] = *new_clock;
559}
560EXPORT_SYMBOL_GPL(register_posix_clock);
561
562static struct k_itimer * alloc_posix_timer(void)
563{
564 struct k_itimer *tmr;
565 tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL);
566 if (!tmr)
567 return tmr;
568 memset(tmr, 0, sizeof (struct k_itimer));
569 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
570 kmem_cache_free(posix_timers_cache, tmr);
571 tmr = NULL;
572 }
573 return tmr;
574}
575
576#define IT_ID_SET 1
577#define IT_ID_NOT_SET 0
578static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
579{
580 if (it_id_set) {
581 unsigned long flags;
582 spin_lock_irqsave(&idr_lock, flags);
583 idr_remove(&posix_timers_id, tmr->it_id);
584 spin_unlock_irqrestore(&idr_lock, flags);
585 }
586 sigqueue_free(tmr->sigq);
587 if (unlikely(tmr->it_process) &&
588 tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
589 put_task_struct(tmr->it_process);
590 kmem_cache_free(posix_timers_cache, tmr);
591}
592
593/* Create a POSIX.1b interval timer. */
594
595asmlinkage long
596sys_timer_create(clockid_t which_clock,
597 struct sigevent __user *timer_event_spec,
598 timer_t __user * created_timer_id)
599{
600 int error = 0;
601 struct k_itimer *new_timer = NULL;
602 int new_timer_id;
603 struct task_struct *process = NULL;
604 unsigned long flags;
605 sigevent_t event;
606 int it_id_set = IT_ID_NOT_SET;
607
608 if (invalid_clockid(which_clock))
609 return -EINVAL;
610
611 new_timer = alloc_posix_timer();
612 if (unlikely(!new_timer))
613 return -EAGAIN;
614
615 spin_lock_init(&new_timer->it_lock);
616 retry:
617 if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
618 error = -EAGAIN;
619 goto out;
620 }
621 spin_lock_irq(&idr_lock);
622 error = idr_get_new(&posix_timers_id,
623 (void *) new_timer,
624 &new_timer_id);
625 spin_unlock_irq(&idr_lock);
626 if (error == -EAGAIN)
627 goto retry;
628 else if (error) {
629 /*
630		 * Weird looking, but we return EAGAIN if the IDR is
631 * full (proper POSIX return value for this)
632 */
633 error = -EAGAIN;
634 goto out;
635 }
636
637 it_id_set = IT_ID_SET;
638 new_timer->it_id = (timer_t) new_timer_id;
639 new_timer->it_clock = which_clock;
640 new_timer->it_overrun = -1;
641 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
642 if (error)
643 goto out;
644
645 /*
646 * return the timer_id now. The next step is hard to
647 * back out if there is an error.
648 */
649 if (copy_to_user(created_timer_id,
650 &new_timer_id, sizeof (new_timer_id))) {
651 error = -EFAULT;
652 goto out;
653 }
654 if (timer_event_spec) {
655 if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
656 error = -EFAULT;
657 goto out;
658 }
659 new_timer->it_sigev_notify = event.sigev_notify;
660 new_timer->it_sigev_signo = event.sigev_signo;
661 new_timer->it_sigev_value = event.sigev_value;
662
663 read_lock(&tasklist_lock);
664 if ((process = good_sigevent(&event))) {
665 /*
666 * We may be setting up this process for another
667 * thread. It may be exiting. To catch this
668			 * case we check the PF_EXITING flag. If
669			 * the flag is not set, the siglock will catch
670			 * it before it is too late (in exit_itimers).
671 *
672			 * The exec case is a bit more involved but easy
673 * to code. If the process is in our thread
674 * group (and it must be or we would not allow
675 * it here) and is doing an exec, it will cause
676 * us to be killed. In this case it will wait
677 * for us to die which means we can finish this
678 * linkage with our last gasp. I.e. no code :)
679 */
680 spin_lock_irqsave(&process->sighand->siglock, flags);
681 if (!(process->flags & PF_EXITING)) {
682 new_timer->it_process = process;
683 list_add(&new_timer->list,
684 &process->signal->posix_timers);
685 spin_unlock_irqrestore(&process->sighand->siglock, flags);
686 if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
687 get_task_struct(process);
688 } else {
689 spin_unlock_irqrestore(&process->sighand->siglock, flags);
690 process = NULL;
691 }
692 }
693 read_unlock(&tasklist_lock);
694 if (!process) {
695 error = -EINVAL;
696 goto out;
697 }
698 } else {
699 new_timer->it_sigev_notify = SIGEV_SIGNAL;
700 new_timer->it_sigev_signo = SIGALRM;
701 new_timer->it_sigev_value.sival_int = new_timer->it_id;
702 process = current->group_leader;
703 spin_lock_irqsave(&process->sighand->siglock, flags);
704 new_timer->it_process = process;
705 list_add(&new_timer->list, &process->signal->posix_timers);
706 spin_unlock_irqrestore(&process->sighand->siglock, flags);
707 }
708
709 /*
710 * In the case of the timer belonging to another task, after
711 * the task is unlocked, the timer is owned by the other task
712 * and may cease to exist at any time. Don't use or modify
713 * new_timer after the unlock call.
714 */
715
716out:
717 if (error)
718 release_posix_timer(new_timer, it_id_set);
719
720 return error;
721}
722
723/*
724 * good_timespec
725 *
726 * This function checks the elements of a timespec structure.
727 *
728 * Arguments:
729 * ts : Pointer to the timespec structure to check
730 *
731 * Return value:
732 * If a NULL pointer was passed in, or the tv_nsec field was less than 0
733 * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0,
734 * this function returns 0. Otherwise it returns 1.
735 */
736static int good_timespec(const struct timespec *ts)
737{
738 if ((!ts) || (ts->tv_sec < 0) ||
739 ((unsigned) ts->tv_nsec >= NSEC_PER_SEC))
740 return 0;
741 return 1;
742}
743
744/*
745 * Locking issues: We need to protect the result of the id look up until
746 * we get the timer locked down so it is not deleted under us. The
747 * removal is done under the idr spinlock so we use that here to bridge
748 * the find to the timer lock. To avoid a deadlock, the timer id MUST
749 * be released without holding the timer lock.
750 */
751static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
752{
753 struct k_itimer *timr;
754 /*
755 * Watch out here. We do a irqsave on the idr_lock and pass the
756 * flags part over to the timer lock. Must not let interrupts in
757 * while we are moving the lock.
758 */
759
760 spin_lock_irqsave(&idr_lock, *flags);
761 timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
762 if (timr) {
763 spin_lock(&timr->it_lock);
764 spin_unlock(&idr_lock);
765
766 if ((timr->it_id != timer_id) || !(timr->it_process) ||
767 timr->it_process->tgid != current->tgid) {
768 unlock_timer(timr, *flags);
769 timr = NULL;
770 }
771 } else
772 spin_unlock_irqrestore(&idr_lock, *flags);
773
774 return timr;
775}
776
777/*
778 * Get the time remaining on a POSIX.1b interval timer. This function
779 * is ALWAYS called with spin_lock_irq on the timer, thus it must not
780 * mess with irq.
781 *
782 * We have a couple of messes to clean up here. First there is the case
783 * of a timer that has a requeue pending. These timers should appear to
784 * be in the timer list with an expiry as if we were to requeue them
785 * now.
786 *
787 * The second issue is the SIGEV_NONE timer which may be active but is
788 * not really ever put in the timer list (to save system resources).
789 * This timer may be expired, and if so, we will do it here. Otherwise
790 * it is the same as a requeue pending timer WRT what we should
791 * report.
792 */
793static void
794common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
795{
796 unsigned long expires;
797 struct now_struct now;
798
799 do
800 expires = timr->it.real.timer.expires;
801 while ((volatile long) (timr->it.real.timer.expires) != expires);
802
803 posix_get_now(&now);
804
805 if (expires &&
806 ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) &&
807 !timr->it.real.incr &&
808 posix_time_before(&timr->it.real.timer, &now))
809 timr->it.real.timer.expires = expires = 0;
810 if (expires) {
811 if (timr->it_requeue_pending & REQUEUE_PENDING ||
812 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
813 posix_bump_timer(timr, now);
814 expires = timr->it.real.timer.expires;
815 }
816 else
817 if (!timer_pending(&timr->it.real.timer))
818 expires = 0;
819 if (expires)
820 expires -= now.jiffies;
821 }
822 jiffies_to_timespec(expires, &cur_setting->it_value);
823 jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval);
824
825 if (cur_setting->it_value.tv_sec < 0) {
826 cur_setting->it_value.tv_nsec = 1;
827 cur_setting->it_value.tv_sec = 0;
828 }
829}
830
831/* Get the time remaining on a POSIX.1b interval timer. */
832asmlinkage long
833sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
834{
835 struct k_itimer *timr;
836 struct itimerspec cur_setting;
837 unsigned long flags;
838
839 timr = lock_timer(timer_id, &flags);
840 if (!timr)
841 return -EINVAL;
842
843 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting));
844
845 unlock_timer(timr, flags);
846
847 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
848 return -EFAULT;
849
850 return 0;
851}
852/*
853 * Get the number of overruns of a POSIX.1b interval timer. This is the
854 * overrun count of the timer last delivered. At the same time we are
855 * accumulating overruns on the next timer. The overrun is frozen when
856 * the signal is delivered, either at the notify time (if the info block
857 * is not queued) or at the actual delivery time (as we are informed by
858 * the callback to do_schedule_next_timer()). So all we need to do is
859 * pick up the frozen overrun.
860 */
861
862asmlinkage long
863sys_timer_getoverrun(timer_t timer_id)
864{
865 struct k_itimer *timr;
866 int overrun;
867 long flags;
868
869 timr = lock_timer(timer_id, &flags);
870 if (!timr)
871 return -EINVAL;
872
873 overrun = timr->it_overrun_last;
874 unlock_timer(timr, flags);
875
876 return overrun;
877}
878/*
879 * Adjust for absolute time
880 *
881 * If absolute time is given and it is not CLOCK_MONOTONIC, we need to
882 * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and
883 * whatever clock is being used.
884 *
885 * If it is relative time, we need to add the current (CLOCK_MONOTONIC)
886 * time to it to get the proper time for the timer.
887 */
888static int adjust_abs_time(struct k_clock *clock, struct timespec *tp,
889 int abs, u64 *exp, struct timespec *wall_to)
890{
891 struct timespec now;
892 struct timespec oc = *tp;
893 u64 jiffies_64_f;
894 int rtn = 0;
895
896 if (abs) {
897 /*
898 * The mask picks up the 4 basic clocks
899 */
900 if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) {
901 jiffies_64_f = do_posix_clock_monotonic_gettime_parts(
902 &now, wall_to);
903 /*
904 * If we are doing a MONOTONIC clock
905 */
906 if ((clock - &posix_clocks[0]) & CLOCKS_MONO) {
907 now.tv_sec += wall_to->tv_sec;
908 now.tv_nsec += wall_to->tv_nsec;
909 }
910 } else {
911 /*
912 * Not one of the basic clocks
913 */
914 clock->clock_get(clock - posix_clocks, &now);
915 jiffies_64_f = get_jiffies_64();
916 }
917 /*
918 * Take away now to get delta
919 */
920 oc.tv_sec -= now.tv_sec;
921 oc.tv_nsec -= now.tv_nsec;
922 /*
923 * Normalize...
924 */
925 while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) {
926 oc.tv_nsec -= NSEC_PER_SEC;
927 oc.tv_sec++;
928 }
929 while ((oc.tv_nsec) < 0) {
930 oc.tv_nsec += NSEC_PER_SEC;
931 oc.tv_sec--;
932 }
933 } else {
934 jiffies_64_f = get_jiffies_64();
935 }
936 /*
937 * Check if the requested time is prior to now (if so set now)
938 */
939 if (oc.tv_sec < 0)
940 oc.tv_sec = oc.tv_nsec = 0;
941
942 if (oc.tv_sec | oc.tv_nsec)
943 set_normalized_timespec(&oc, oc.tv_sec,
944 oc.tv_nsec + clock->res);
945 tstojiffie(&oc, clock->res, exp);
946
947 /*
948 * Check if the requested time is more than the timer code
949 * can handle (if so we error out but return the value too).
950 */
951 if (*exp > ((u64)MAX_JIFFY_OFFSET))
952 /*
953 * This is a considered response, not exactly in
954 * line with the standard (in fact it is silent on
955 * possible overflows). We assume such a large
956 * value is ALMOST always a programming error and
957 * try not to compound it by setting a really dumb
958 * value.
959 */
960 rtn = -EINVAL;
961 /*
962 * return the actual jiffies expire time, full 64 bits
963 */
964 *exp += jiffies_64_f;
965 return rtn;
966}
967
968/* Set a POSIX.1b interval timer. */
969/* timr->it_lock is taken. */
970static inline int
971common_timer_set(struct k_itimer *timr, int flags,
972 struct itimerspec *new_setting, struct itimerspec *old_setting)
973{
974 struct k_clock *clock = &posix_clocks[timr->it_clock];
975 u64 expire_64;
976
977 if (old_setting)
978 common_timer_get(timr, old_setting);
979
980 /* disable the timer */
981 timr->it.real.incr = 0;
982 /*
983 * Careful here. On SMP we could be in the "fire" routine, which will
984 * be spinning as we hold the lock. But this is ONLY an SMP issue.
985 */
986#ifdef CONFIG_SMP
987 if (timer_active(timr) && !del_timer(&timr->it.real.timer))
988 /*
989 * It can only be active if on another cpu. Since
990 * we have cleared the interval stuff above, it should
991 * clear once we release the spin lock. Of course once
992 * we do that anything could happen, including the
993 * complete meltdown of the timer. So return with
994 * a "retry" exit status.
995 */
996 return TIMER_RETRY;
997
998 set_timer_inactive(timr);
999#else
1000 del_timer(&timr->it.real.timer);
1001#endif
1002 remove_from_abslist(timr);
1003
1004 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
1005 ~REQUEUE_PENDING;
1006 timr->it_overrun_last = 0;
1007 timr->it_overrun = -1;
1008 /*
1009 * switch off the timer when it_value is zero
1010 */
1011 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) {
1012 timr->it.real.timer.expires = 0;
1013 return 0;
1014 }
1015
1016 if (adjust_abs_time(clock,
1017 &new_setting->it_value, flags & TIMER_ABSTIME,
1018 &expire_64, &(timr->it.real.wall_to_prev))) {
1019 return -EINVAL;
1020 }
1021 timr->it.real.timer.expires = (unsigned long)expire_64;
1022 tstojiffie(&new_setting->it_interval, clock->res, &expire_64);
1023 timr->it.real.incr = (unsigned long)expire_64;
1024
1025 /*
1026 * We do not even queue SIGEV_NONE timers! But we do put them
1027 * in the abs list so we can do that right.
1028 */
1029 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE))
1030 add_timer(&timr->it.real.timer);
1031
1032 if (flags & TIMER_ABSTIME && clock->abs_struct) {
1033 spin_lock(&clock->abs_struct->lock);
1034 list_add_tail(&(timr->it.real.abs_timer_entry),
1035 &(clock->abs_struct->list));
1036 spin_unlock(&clock->abs_struct->lock);
1037 }
1038 return 0;
1039}
1040
1041/* Set a POSIX.1b interval timer */
1042asmlinkage long
1043sys_timer_settime(timer_t timer_id, int flags,
1044 const struct itimerspec __user *new_setting,
1045 struct itimerspec __user *old_setting)
1046{
1047 struct k_itimer *timr;
1048 struct itimerspec new_spec, old_spec;
1049 int error = 0;
1050 long flag;
1051 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
1052
1053 if (!new_setting)
1054 return -EINVAL;
1055
1056 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
1057 return -EFAULT;
1058
1059 if ((!good_timespec(&new_spec.it_interval)) ||
1060 (!good_timespec(&new_spec.it_value)))
1061 return -EINVAL;
1062retry:
1063 timr = lock_timer(timer_id, &flag);
1064 if (!timr)
1065 return -EINVAL;
1066
1067 error = CLOCK_DISPATCH(timr->it_clock, timer_set,
1068 (timr, flags, &new_spec, rtn));
1069
1070 unlock_timer(timr, flag);
1071 if (error == TIMER_RETRY) {
1072 rtn = NULL; // We already got the old time...
1073 goto retry;
1074 }
1075
1076 if (old_setting && !error && copy_to_user(old_setting,
1077 &old_spec, sizeof (old_spec)))
1078 error = -EFAULT;
1079
1080 return error;
1081}
1082
1083static inline int common_timer_del(struct k_itimer *timer)
1084{
1085 timer->it.real.incr = 0;
1086#ifdef CONFIG_SMP
1087 if (timer_active(timer) && !del_timer(&timer->it.real.timer))
1088 /*
1089 * It can only be active if on another cpu. Since
1090 * we have cleared the interval stuff above, it should
1091 * clear once we release the spin lock. Of course once
1092 * we do that anything could happen, including the
1093 * complete meltdown of the timer. So return with
1094 * a "retry" exit status.
1095 */
1096 return TIMER_RETRY;
1097#else
1098 del_timer(&timer->it.real.timer);
1099#endif
1100 remove_from_abslist(timer);
1101
1102 return 0;
1103}
1104
1105static inline int timer_delete_hook(struct k_itimer *timer)
1106{
1107 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer));
1108}
1109
1110/* Delete a POSIX.1b interval timer. */
1111asmlinkage long
1112sys_timer_delete(timer_t timer_id)
1113{
1114 struct k_itimer *timer;
1115 long flags;
1116
1117#ifdef CONFIG_SMP
1118 int error;
1119retry_delete:
1120#endif
1121 timer = lock_timer(timer_id, &flags);
1122 if (!timer)
1123 return -EINVAL;
1124
1125#ifdef CONFIG_SMP
1126 error = timer_delete_hook(timer);
1127
1128 if (error == TIMER_RETRY) {
1129 unlock_timer(timer, flags);
1130 goto retry_delete;
1131 }
1132#else
1133 timer_delete_hook(timer);
1134#endif
1135 spin_lock(&current->sighand->siglock);
1136 list_del(&timer->list);
1137 spin_unlock(&current->sighand->siglock);
1138 /*
1139 * This keeps any tasks waiting on the spin lock from thinking
1140 * they got something (see the lock code above).
1141 */
1142 if (timer->it_process) {
1143 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
1144 put_task_struct(timer->it_process);
1145 timer->it_process = NULL;
1146 }
1147 unlock_timer(timer, flags);
1148 release_posix_timer(timer, IT_ID_SET);
1149 return 0;
1150}
1151/*
1152 * Delete a timer owned by the process; used by exit_itimers.
1153 */
1154static inline void itimer_delete(struct k_itimer *timer)
1155{
1156 unsigned long flags;
1157
1158#ifdef CONFIG_SMP
1159 int error;
1160retry_delete:
1161#endif
1162 spin_lock_irqsave(&timer->it_lock, flags);
1163
1164#ifdef CONFIG_SMP
1165 error = timer_delete_hook(timer);
1166
1167 if (error == TIMER_RETRY) {
1168 unlock_timer(timer, flags);
1169 goto retry_delete;
1170 }
1171#else
1172 timer_delete_hook(timer);
1173#endif
1174 list_del(&timer->list);
1175 /*
1176 * This keeps any tasks waiting on the spin lock from thinking
1177 * they got something (see the lock code above).
1178 */
1179 if (timer->it_process) {
1180 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
1181 put_task_struct(timer->it_process);
1182 timer->it_process = NULL;
1183 }
1184 unlock_timer(timer, flags);
1185 release_posix_timer(timer, IT_ID_SET);
1186}
1187
1188/*
1189 * This is called by __exit_signal, only when there are no more
1190 * references to the shared signal_struct.
1191 */
1192void exit_itimers(struct signal_struct *sig)
1193{
1194 struct k_itimer *tmr;
1195
1196 while (!list_empty(&sig->posix_timers)) {
1197 tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
1198 itimer_delete(tmr);
1199 }
1200}
1201
1202/*
1203 * And now for the "clock" calls
1204 *
1205 * These functions are called both from timer functions (with the timer
1206 * spin_lock_irq() held) and from clock calls with no locking. They must
1207 * use the save flags versions of locks.
1208 */
1209
1210/*
1211 * We do ticks here to avoid the irq lock (they take sooo long).
1212 * The seqlock is great here. Since we are a reader, we don't really care
1213 * if we are interrupted since we don't take any lock that will stall us or
1214 * any other cpu. Voila, no irq lock is needed.
1215 *
1216 */
1217
1218static u64 do_posix_clock_monotonic_gettime_parts(
1219 struct timespec *tp, struct timespec *mo)
1220{
1221 u64 jiff;
1222 unsigned int seq;
1223
1224 do {
1225 seq = read_seqbegin(&xtime_lock);
1226 getnstimeofday(tp);
1227 *mo = wall_to_monotonic;
1228 jiff = jiffies_64;
1229
1230 } while(read_seqretry(&xtime_lock, seq));
1231
1232 return jiff;
1233}
1234
1235static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
1236{
1237 struct timespec wall_to_mono;
1238
1239 do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
1240
1241 tp->tv_sec += wall_to_mono.tv_sec;
1242 tp->tv_nsec += wall_to_mono.tv_nsec;
1243
1244 if ((tp->tv_nsec - NSEC_PER_SEC) > 0) {
1245 tp->tv_nsec -= NSEC_PER_SEC;
1246 tp->tv_sec++;
1247 }
1248 return 0;
1249}
1250
1251int do_posix_clock_monotonic_gettime(struct timespec *tp)
1252{
1253 return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp);
1254}
1255
1256int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp)
1257{
1258 return -EINVAL;
1259}
1260EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
1261
1262int do_posix_clock_notimer_create(struct k_itimer *timer)
1263{
1264 return -EINVAL;
1265}
1266EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
1267
1268int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t)
1269{
1270#ifndef ENOTSUP
1271 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
1272#else /* parisc does define it separately. */
1273 return -ENOTSUP;
1274#endif
1275}
1276EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
1277
1278asmlinkage long
1279sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp)
1280{
1281 struct timespec new_tp;
1282
1283 if (invalid_clockid(which_clock))
1284 return -EINVAL;
1285 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
1286 return -EFAULT;
1287
1288 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
1289}
1290
1291asmlinkage long
1292sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp)
1293{
1294 struct timespec kernel_tp;
1295 int error;
1296
1297 if (invalid_clockid(which_clock))
1298 return -EINVAL;
1299 error = CLOCK_DISPATCH(which_clock, clock_get,
1300 (which_clock, &kernel_tp));
1301 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
1302 error = -EFAULT;
1303
1304 return error;
1305
1306}
1307
1308asmlinkage long
1309sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
1310{
1311 struct timespec rtn_tp;
1312 int error;
1313
1314 if (invalid_clockid(which_clock))
1315 return -EINVAL;
1316
1317 error = CLOCK_DISPATCH(which_clock, clock_getres,
1318 (which_clock, &rtn_tp));
1319
1320 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
1321 error = -EFAULT;
1322 }
1323
1324 return error;
1325}
1326
1327static void nanosleep_wake_up(unsigned long __data)
1328{
1329 struct task_struct *p = (struct task_struct *) __data;
1330
1331 wake_up_process(p);
1332}
1333
1334/*
1335 * The standard says that an absolute nanosleep call MUST wake up at
1336 * the requested time in spite of clock settings. Here is what we do:
1337 * For each nanosleep call that needs it (only absolute and not on
1338 * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure
1339 * into the "nanosleep_abs_list". All we need is the task_struct pointer.
1340 * Whenever the clock is set we just wake up all those tasks. The rest
1341 * is done by the while loop in clock_nanosleep().
1342 *
1343 * On locking, clock_was_set() is called from update_wall_clock which
1344 * holds (or has held for it) a write_lock_irq(xtime_lock) and is
1345 * called from the timer bh code. Thus we need the irq save locks.
1346 *
1347 * Also, on the call from update_wall_clock, that is done as part of a
1348 * softirq thing. We don't want to delay the system that much (possibly
1349 * long list of timers to fix), so we defer that work to keventd.
1350 */
1351
1352static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue);
1353static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL);
1354
1355static DECLARE_MUTEX(clock_was_set_lock);
1356
1357void clock_was_set(void)
1358{
1359 struct k_itimer *timr;
1360 struct timespec new_wall_to;
1361 LIST_HEAD(cws_list);
1362 unsigned long seq;
1363
1364
1365 if (unlikely(in_interrupt())) {
1366 schedule_work(&clock_was_set_work);
1367 return;
1368 }
1369 wake_up_all(&nanosleep_abs_wqueue);
1370
1371 /*
1372 * Check if there exist TIMER_ABSTIME timers to correct.
1373 *
1374 * Notes on locking: This code is run in task context with irq
1375 * on. We CAN be interrupted! All other usage of the abs list
1376 * lock is under the timer lock which holds the irq lock as
1377 * well. We REALLY don't want to scan the whole list with the
1378 * interrupt system off, AND we would like a sequence lock on
1379 * this code as well. Since we assume that the clock will not
1380 * be set often, it seems ok to take and release the irq lock
1381 * for each timer. In fact add_timer will do this, so this is
1382 * not an issue. So we know when we are done, we will move the
1383 * whole list to a new location. Then as we process each entry,
1384 * we will move it to the actual list again. This way, when our
1385 * copy is empty, we are done. We are not all that concerned
1386 * about preemption so we will use a semaphore lock to protect
1387 * against reentry. This way we will not stall another
1388 * processor. It is possible that this may delay some timers
1389 * that should have expired, given the new clock, but even this
1390 * will be minimal as we will always update to the current time,
1391 * even if it was set by a task that is waiting for entry to
1392 * this code. Timers that expire too early will be caught by
1393 * the expire code and restarted.
1394
1395 * Absolute timers that repeat are left in the abs list while
1396 * waiting for the task to pick up the signal. This means we
1397 * may find timers that are not in the "add_timer" list, but are
1398 * in the abs list. We do the same thing for these, save
1399 * putting them back in the "add_timer" list. (Note, these are
1400 * left in the abs list mainly to indicate that they are
1401 * ABSOLUTE timers, a fact that is used by the re-arm code, and
1402 * for which we have no other flag.)
1403
1404 */
1405
1406 down(&clock_was_set_lock);
1407 spin_lock_irq(&abs_list.lock);
1408 list_splice_init(&abs_list.list, &cws_list);
1409 spin_unlock_irq(&abs_list.lock);
1410 do {
1411 do {
1412 seq = read_seqbegin(&xtime_lock);
1413 new_wall_to = wall_to_monotonic;
1414 } while (read_seqretry(&xtime_lock, seq));
1415
1416 spin_lock_irq(&abs_list.lock);
1417 if (list_empty(&cws_list)) {
1418 spin_unlock_irq(&abs_list.lock);
1419 break;
1420 }
1421 timr = list_entry(cws_list.next, struct k_itimer,
1422 it.real.abs_timer_entry);
1423
1424 list_del_init(&timr->it.real.abs_timer_entry);
1425 if (add_clockset_delta(timr, &new_wall_to) &&
1426 del_timer(&timr->it.real.timer)) /* timer run yet? */
1427 add_timer(&timr->it.real.timer);
1428 list_add(&timr->it.real.abs_timer_entry, &abs_list.list);
1429 spin_unlock_irq(&abs_list.lock);
1430 } while (1);
1431
1432 up(&clock_was_set_lock);
1433}
1434
1435long clock_nanosleep_restart(struct restart_block *restart_block);
1436
1437asmlinkage long
1438sys_clock_nanosleep(clockid_t which_clock, int flags,
1439 const struct timespec __user *rqtp,
1440 struct timespec __user *rmtp)
1441{
1442 struct timespec t;
1443 struct restart_block *restart_block =
1444 &(current_thread_info()->restart_block);
1445 int ret;
1446
1447 if (invalid_clockid(which_clock))
1448 return -EINVAL;
1449
1450 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1451 return -EFAULT;
1452
1453 if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0)
1454 return -EINVAL;
1455
1456 /*
1457 * Do this here as the nsleep function does not have the real address.
1458 */
1459 restart_block->arg1 = (unsigned long)rmtp;
1460
1461 ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t));
1462
1463 if ((ret == -ERESTART_RESTARTBLOCK) && rmtp &&
1464 copy_to_user(rmtp, &t, sizeof (t)))
1465 return -EFAULT;
1466 return ret;
1467}
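
For orientation only (not part of this commit), the absolute-time handling discussed above is what userspace reaches through clock_nanosleep() with TIMER_ABSTIME; glibc/librt of this period routes the call to sys_clock_nanosleep(). A minimal hedged sketch, with the helper name chosen purely for illustration:

    /* Illustrative only: sleep until an absolute CLOCK_MONOTONIC deadline.
     * Compile with -lrt.                                                   */
    #include <errno.h>
    #include <time.h>

    static void sleep_until(const struct timespec *deadline)
    {
            /* With TIMER_ABSTIME an interrupted sleep is simply retried with
             * the same deadline; there is no remaining time to carry over.  */
            while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
                                   deadline, NULL) == EINTR)
                    ;
    }
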
1468
1469
1470static int common_nsleep(clockid_t which_clock,
1471 int flags, struct timespec *tsave)
1472{
1473 struct timespec t, dum;
1474 struct timer_list new_timer;
1475 DECLARE_WAITQUEUE(abs_wqueue, current);
1476 u64 rq_time = (u64)0;
1477 s64 left;
1478 int abs;
1479 struct restart_block *restart_block =
1480 &current_thread_info()->restart_block;
1481
1482 abs_wqueue.flags = 0;
1483 init_timer(&new_timer);
1484 new_timer.expires = 0;
1485 new_timer.data = (unsigned long) current;
1486 new_timer.function = nanosleep_wake_up;
1487 abs = flags & TIMER_ABSTIME;
1488
1489 if (restart_block->fn == clock_nanosleep_restart) {
1490 /*
1491 * Interrupted by a non-delivered signal, pick up remaining
1492 * time and continue. Remaining time is in arg2 & 3.
1493 */
1494 restart_block->fn = do_no_restart_syscall;
1495
1496 rq_time = restart_block->arg3;
1497 rq_time = (rq_time << 32) + restart_block->arg2;
1498 if (!rq_time)
1499 return -EINTR;
1500 left = rq_time - get_jiffies_64();
1501 if (left <= (s64)0)
1502 return 0; /* Already passed */
1503 }
1504
1505 if (abs && (posix_clocks[which_clock].clock_get !=
1506 posix_clocks[CLOCK_MONOTONIC].clock_get))
1507 add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue);
1508
1509 do {
1510 t = *tsave;
1511 if (abs || !rq_time) {
1512 adjust_abs_time(&posix_clocks[which_clock], &t, abs,
1513 &rq_time, &dum);
1514 }
1515
1516 left = rq_time - get_jiffies_64();
1517 if (left >= (s64)MAX_JIFFY_OFFSET)
1518 left = (s64)MAX_JIFFY_OFFSET;
1519 if (left < (s64)0)
1520 break;
1521
1522 new_timer.expires = jiffies + left;
1523 __set_current_state(TASK_INTERRUPTIBLE);
1524 add_timer(&new_timer);
1525
1526 schedule();
1527
1528 del_timer_sync(&new_timer);
1529 left = rq_time - get_jiffies_64();
1530 } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
1531
1532 if (abs_wqueue.task_list.next)
1533 finish_wait(&nanosleep_abs_wqueue, &abs_wqueue);
1534
1535 if (left > (s64)0) {
1536
1537 /*
1538 * Always restart abs calls from scratch to pick up any
1539 * clock shifting that happened while we were away.
1540 */
1541 if (abs)
1542 return -ERESTARTNOHAND;
1543
1544 left *= TICK_NSEC;
1545 tsave->tv_sec = div_long_long_rem(left,
1546 NSEC_PER_SEC,
1547 &tsave->tv_nsec);
1548 /*
1549 * Restart works by saving the time remaining in
1550 * arg2 & 3 (it is 64-bits of jiffies). The other
1551 * info we need is the clock_id (saved in arg0).
1552 * The sys_call interface needs the user's
1553 * timespec return address which _it_ saves in arg1.
1554 * Since we have cast the nanosleep call to a clock_nanosleep
1555 * both can be restarted with the same code.
1556 */
1557 restart_block->fn = clock_nanosleep_restart;
1558 restart_block->arg0 = which_clock;
1559 /*
1560 * Caller sets arg1
1561 */
1562 restart_block->arg2 = rq_time & 0xffffffffLL;
1563 restart_block->arg3 = rq_time >> 32;
1564
1565 return -ERESTART_RESTARTBLOCK;
1566 }
1567
1568 return 0;
1569}
1570/*
1571 * This will restart clock_nanosleep.
1572 */
1573long
1574clock_nanosleep_restart(struct restart_block *restart_block)
1575{
1576 struct timespec t;
1577 int ret = common_nsleep(restart_block->arg0, 0, &t);
1578
1579 if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 &&
1580 copy_to_user((struct timespec __user *)(restart_block->arg1), &t,
1581 sizeof (t)))
1582 return -EFAULT;
1583 return ret;
1584}
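
As a quick orientation (not part of this commit), the syscalls above back the userspace timer_*() and clock_*() functions provided by librt. A minimal hedged sketch of exercising them follows; the signal number, the 100 ms intervals, and the trimmed error handling are illustrative assumptions only:

    /* Illustrative userspace use of the interfaces implemented above.
     * Compile with -lrt. Not part of the kernel patch.                */
    #include <signal.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static void tick(int sig) { (void)sig; }  /* empty handler: delivery just interrupts pause() */

    int main(void)
    {
            timer_t tid;
            struct sigevent sev = { 0 };
            struct itimerspec its = { 0 };

            signal(SIGRTMIN, tick);
            sev.sigev_notify = SIGEV_SIGNAL;              /* deliver a signal on expiry      */
            sev.sigev_signo  = SIGRTMIN;
            its.it_value.tv_nsec    = 100 * 1000 * 1000;  /* first expiry after 100 ms       */
            its.it_interval.tv_nsec = 100 * 1000 * 1000;  /* then every 100 ms               */

            if (timer_create(CLOCK_REALTIME, &sev, &tid) ||  /* -> sys_timer_create           */
                timer_settime(tid, 0, &its, NULL))           /* -> sys_timer_settime, relative */
                    return 1;

            pause();                                          /* wait for one expiry           */
            printf("overruns: %d\n", timer_getoverrun(tid));  /* -> sys_timer_getoverrun       */
            timer_delete(tid);                                /* -> sys_timer_delete           */
            return 0;
    }
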
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
new file mode 100644
index 000000000000..696387ffe49c
--- /dev/null
+++ b/kernel/power/Kconfig
@@ -0,0 +1,74 @@
1config PM
2 bool "Power Management support"
3 ---help---
4 "Power Management" means that parts of your computer are shut
5 off or put into a power conserving "sleep" mode if they are not
6 being used. There are two competing standards for doing this: APM
7 and ACPI. If you want to use either one, say Y here and then also
8 to the requisite support below.
9
10 Power Management is most important for battery powered laptop
11 computers; if you have a laptop, check out the Linux Laptop home
12 page on the WWW at <http://www.linux-on-laptops.com/> or
13 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
14 and the Battery Powered Linux mini-HOWTO, available from
15 <http://www.tldp.org/docs.html#howto>.
16
17 Note that, even if you say N here, Linux on the x86 architecture
18 will issue the hlt instruction if nothing is to be done, thereby
19 sending the processor to sleep and saving power.
20
21config PM_DEBUG
22 bool "Power Management Debug Support"
23 depends on PM
24 ---help---
25 This option enables verbose debugging support in the Power Management
26 code. This is helpful when debugging and reporting various PM bugs,
27 like suspend support.
28
29config SOFTWARE_SUSPEND
30 bool "Software Suspend (EXPERIMENTAL)"
31 depends on EXPERIMENTAL && PM && SWAP
32 ---help---
33 Enable the possibility of suspending the machine.
34 It doesn't need APM.
35 You may suspend your machine by 'swsusp' or 'shutdown -z <time>'
36 (patch for sysvinit needed).
37
38 It creates an image which is saved in your active swap. Upon next
39 boot, pass the 'resume=/dev/swappartition' argument to the kernel to
40 have it detect the saved image, restore memory state from it, and
41 continue to run as before. If you do not want the previous state to
42 be reloaded, then use the 'noresume' kernel argument. However, note
43 that your partitions will be fsck'd and you must re-mkswap your swap
44 partitions. It does not work with swap files.
45
46 Right now you may boot without resuming and then later resume but
47 in the meantime you cannot use those swap partitions/files which were
48 involved in suspending. Also in this case there is a risk that buffers
49 on disk won't match the saved ones.
50
51 For more information take a look at <file:Documentation/power/swsusp.txt>.
52
53config PM_STD_PARTITION
54 string "Default resume partition"
55 depends on SOFTWARE_SUSPEND
56 default ""
57 ---help---
58 The default resume partition is the partition where the suspend-
59 to-disk implementation will look for a suspended disk image.
60
61 The partition specified here will be different for almost every user.
62 It should be a valid swap partition (at least for now) that is turned
63 on before suspending.
64
65 The partition specified can be overridden by specifying:
66
67 resume=/dev/<other device>
68
69 which will set the resume partition to the device specified.
70
71 Note that there is currently no way to specify which device to save the
72 suspended image to. It will simply pick the first available swap
73 device.
74
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
new file mode 100644
index 000000000000..fbdc634135a7
--- /dev/null
+++ b/kernel/power/Makefile
@@ -0,0 +1,11 @@
1
2ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG
4endif
5
6swsusp-smp-$(CONFIG_SMP) += smp.o
7
8obj-y := main.o process.o console.o pm.o
9obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o
10
11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
new file mode 100644
index 000000000000..7ff375e7c95f
--- /dev/null
+++ b/kernel/power/console.c
@@ -0,0 +1,58 @@
1/*
2 * kernel/power/console.c - Functions for saving/restoring the console.
3 *
4 * Originally from swsusp.
5 */
6
7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h>
9#include <linux/console.h>
10#include "power.h"
11
12static int new_loglevel = 10;
13static int orig_loglevel;
14#ifdef SUSPEND_CONSOLE
15static int orig_fgconsole, orig_kmsg;
16#endif
17
18int pm_prepare_console(void)
19{
20 orig_loglevel = console_loglevel;
21 console_loglevel = new_loglevel;
22
23#ifdef SUSPEND_CONSOLE
24 acquire_console_sem();
25
26 orig_fgconsole = fg_console;
27
28 if (vc_allocate(SUSPEND_CONSOLE)) {
29 /* we can't get a free VC right now. Too bad,
30 * we don't want to mess up the screen for now. */
31 release_console_sem();
32 return 1;
33 }
34
35 set_console(SUSPEND_CONSOLE);
36 release_console_sem();
37
38 if (vt_waitactive(SUSPEND_CONSOLE)) {
39 pr_debug("Suspend: Can't switch VCs.");
40 return 1;
41 }
42 orig_kmsg = kmsg_redirect;
43 kmsg_redirect = SUSPEND_CONSOLE;
44#endif
45 return 0;
46}
47
48void pm_restore_console(void)
49{
50 console_loglevel = orig_loglevel;
51#ifdef SUSPEND_CONSOLE
52 acquire_console_sem();
53 set_console(orig_fgconsole);
54 release_console_sem();
55 kmsg_redirect = orig_kmsg;
56#endif
57 return;
58}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
new file mode 100644
index 000000000000..02b6764034dc
--- /dev/null
+++ b/kernel/power/disk.c
@@ -0,0 +1,431 @@
1/*
2 * kernel/power/disk.c - Suspend-to-disk support.
3 *
4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
7 *
8 * This file is released under the GPLv2.
9 *
10 */
11
12#include <linux/suspend.h>
13#include <linux/syscalls.h>
14#include <linux/reboot.h>
15#include <linux/string.h>
16#include <linux/device.h>
17#include <linux/delay.h>
18#include <linux/fs.h>
19#include "power.h"
20
21
22extern suspend_disk_method_t pm_disk_mode;
23extern struct pm_ops * pm_ops;
24
25extern int swsusp_suspend(void);
26extern int swsusp_write(void);
27extern int swsusp_check(void);
28extern int swsusp_read(void);
29extern void swsusp_close(void);
30extern int swsusp_resume(void);
31extern int swsusp_free(void);
32
33
34static int noresume = 0;
35char resume_file[256] = CONFIG_PM_STD_PARTITION;
36dev_t swsusp_resume_device;
37
38/**
39 * power_down - Shut machine down for hibernate.
40 * @mode: Suspend-to-disk mode
41 *
42 * Use the platform driver, if configured so, and return gracefully if it
43 * fails.
44 * Otherwise, try to power off and reboot. If they fail, halt the machine,
45 * there ain't no turning back.
46 */
47
48static void power_down(suspend_disk_method_t mode)
49{
50 unsigned long flags;
51 int error = 0;
52
53 local_irq_save(flags);
54 switch(mode) {
55 case PM_DISK_PLATFORM:
56 device_shutdown();
57 error = pm_ops->enter(PM_SUSPEND_DISK);
58 break;
59 case PM_DISK_SHUTDOWN:
60 printk("Powering off system\n");
61 device_shutdown();
62 machine_power_off();
63 break;
64 case PM_DISK_REBOOT:
65 device_shutdown();
66 machine_restart(NULL);
67 break;
68 }
69 machine_halt();
70 /* A valid image is on the disk; if we continue we risk serious data corruption
71 after resume. */
72 printk(KERN_CRIT "Please power me down manually\n");
73 while(1);
74}
75
76
77static int in_suspend __nosavedata = 0;
78
79
80/**
81 * free_some_memory - Try to free as much memory as possible
82 *
83 * ... but do not OOM-kill anyone
84 *
85 * Notice: all userland should be stopped at this point, or
86 * livelock is possible.
87 */
88
89static void free_some_memory(void)
90{
91 unsigned int i = 0;
92 unsigned int tmp;
93 unsigned long pages = 0;
94 char *p = "-\\|/";
95
96 printk("Freeing memory... ");
97 while ((tmp = shrink_all_memory(10000))) {
98 pages += tmp;
99 printk("\b%c", p[i]);
100 i++;
101 if (i > 3)
102 i = 0;
103 }
104 printk("\bdone (%li pages freed)\n", pages);
105}
106
107
108static inline void platform_finish(void)
109{
110 if (pm_disk_mode == PM_DISK_PLATFORM) {
111 if (pm_ops && pm_ops->finish)
112 pm_ops->finish(PM_SUSPEND_DISK);
113 }
114}
115
116static void finish(void)
117{
118 device_resume();
119 platform_finish();
120 enable_nonboot_cpus();
121 thaw_processes();
122 pm_restore_console();
123}
124
125
126static int prepare_processes(void)
127{
128 int error;
129
130 pm_prepare_console();
131
132 sys_sync();
133
134 if (freeze_processes()) {
135 error = -EBUSY;
136 return error;
137 }
138
139 if (pm_disk_mode == PM_DISK_PLATFORM) {
140 if (pm_ops && pm_ops->prepare) {
141 if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
142 return error;
143 }
144 }
145
146 /* Free memory before shutting down devices. */
147 free_some_memory();
148
149 return 0;
150}
151
152static void unprepare_processes(void)
153{
154 enable_nonboot_cpus();
155 thaw_processes();
156 pm_restore_console();
157}
158
159static int prepare_devices(void)
160{
161 int error;
162
163 disable_nonboot_cpus();
164 if ((error = device_suspend(PMSG_FREEZE))) {
165 printk("Some devices failed to suspend\n");
166 platform_finish();
167 enable_nonboot_cpus();
168 return error;
169 }
170
171 return 0;
172}
173
174/**
175 * pm_suspend_disk - The grandpappy of power management.
176 *
177 * If we're going through the firmware, then get it over with quickly.
178 *
179 * If not, then call swsusp to do its thing, then figure out how
180 * to power down the system.
181 */
182
183int pm_suspend_disk(void)
184{
185 int error;
186
187 error = prepare_processes();
188 if (!error) {
189 error = prepare_devices();
190 }
191
192 if (error) {
193 unprepare_processes();
194 return error;
195 }
196
197 pr_debug("PM: Attempting to suspend to disk.\n");
198 if (pm_disk_mode == PM_DISK_FIRMWARE)
199 return pm_ops->enter(PM_SUSPEND_DISK);
200
201 pr_debug("PM: snapshotting memory.\n");
202 in_suspend = 1;
203 if ((error = swsusp_suspend()))
204 goto Done;
205
206 if (in_suspend) {
207 pr_debug("PM: writing image.\n");
208 error = swsusp_write();
209 if (!error)
210 power_down(pm_disk_mode);
211 } else
212 pr_debug("PM: Image restored successfully.\n");
213 swsusp_free();
214 Done:
215 finish();
216 return error;
217}
218
219
220/**
221 * software_resume - Resume from a saved image.
222 *
223 * Called as a late_initcall (so all devices are discovered and
224 * initialized), we call swsusp to see if we have a saved image or not.
225 * If so, we quiesce devices, then restore the saved image. We will
226 * return above (in pm_suspend_disk()) if everything goes well.
227 * Otherwise, we fail gracefully and return to the normally
228 * scheduled program.
229 *
230 */
231
232static int software_resume(void)
233{
234 int error;
235
236 if (noresume) {
237 /**
238 * FIXME: If noresume is specified, we need to find the partition
239 * and reset it back to normal swap space.
240 */
241 return 0;
242 }
243
244 pr_debug("PM: Checking swsusp image.\n");
245
246 if ((error = swsusp_check()))
247 goto Done;
248
249 pr_debug("PM: Preparing processes for restore.\n");
250
251 if ((error = prepare_processes())) {
252 swsusp_close();
253 goto Cleanup;
254 }
255
256 pr_debug("PM: Reading swsusp image.\n");
257
258 if ((error = swsusp_read()))
259 goto Cleanup;
260
261 pr_debug("PM: Preparing devices for restore.\n");
262
263 if ((error = prepare_devices()))
264 goto Free;
265
266 mb();
267
268 pr_debug("PM: Restoring saved image.\n");
269 swsusp_resume();
270 pr_debug("PM: Restore failed, recovering.\n");
271 finish();
272 Free:
273 swsusp_free();
274 Cleanup:
275 unprepare_processes();
276 Done:
277 pr_debug("PM: Resume from disk failed.\n");
278 return 0;
279}
280
281late_initcall(software_resume);
282
283
284static char * pm_disk_modes[] = {
285 [PM_DISK_FIRMWARE] = "firmware",
286 [PM_DISK_PLATFORM] = "platform",
287 [PM_DISK_SHUTDOWN] = "shutdown",
288 [PM_DISK_REBOOT] = "reboot",
289};
290
291/**
292 * disk - Control suspend-to-disk mode
293 *
294 * Suspend-to-disk can be handled in several ways. The greatest
295 * distinction is who writes memory to disk - the firmware or the OS.
296 * If the firmware does it, we assume that it also handles suspending
297 * the system.
298 * If the OS does it, then we have three options for putting the system
299 * to sleep - using the platform driver (e.g. ACPI or other PM registers),
300 * powering off the system or rebooting the system (for testing).
301 *
302 * The system will support either 'firmware' or 'platform', and that is
303 * known a priori (and encoded in pm_ops). But, the user may choose
304 * 'shutdown' or 'reboot' as alternatives.
305 *
306 * show() will display what the mode is currently set to.
307 * store() will accept one of
308 *
309 * 'firmware'
310 * 'platform'
311 * 'shutdown'
312 * 'reboot'
313 *
314 * It will only change to 'firmware' or 'platform' if the system
315 * supports it (as determined from pm_ops->pm_disk_mode).
316 */
317
318static ssize_t disk_show(struct subsystem * subsys, char * buf)
319{
320 return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
321}
322
323
324static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
325{
326 int error = 0;
327 int i;
328 int len;
329 char *p;
330 suspend_disk_method_t mode = 0;
331
332 p = memchr(buf, '\n', n);
333 len = p ? p - buf : n;
334
335 down(&pm_sem);
336 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
337 if (!strncmp(buf, pm_disk_modes[i], len)) {
338 mode = i;
339 break;
340 }
341 }
342 if (mode) {
343 if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT)
344 pm_disk_mode = mode;
345 else {
346 if (pm_ops && pm_ops->enter &&
347 (mode == pm_ops->pm_disk_mode))
348 pm_disk_mode = mode;
349 else
350 error = -EINVAL;
351 }
352 } else
353 error = -EINVAL;
354
355 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
356 pm_disk_modes[mode]);
357 up(&pm_sem);
358 return error ? error : n;
359}
360
361power_attr(disk);
362
363static ssize_t resume_show(struct subsystem * subsys, char *buf)
364{
365 return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
366 MINOR(swsusp_resume_device));
367}
368
369static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n)
370{
371 int len;
372 char *p;
373 unsigned int maj, min;
374 int error = -EINVAL;
375 dev_t res;
376
377 p = memchr(buf, '\n', n);
378 len = p ? p - buf : n;
379
380 if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
381 res = MKDEV(maj,min);
382 if (maj == MAJOR(res) && min == MINOR(res)) {
383 swsusp_resume_device = res;
384 printk("Attempting manual resume\n");
385 noresume = 0;
386 error = software_resume();
387 }
388 }
389
390 return error >= 0 ? n : error;
391}
392
393power_attr(resume);
394
395static struct attribute * g[] = {
396 &disk_attr.attr,
397 &resume_attr.attr,
398 NULL,
399};
400
401
402static struct attribute_group attr_group = {
403 .attrs = g,
404};
405
406
407static int __init pm_disk_init(void)
408{
409 return sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
410}
411
412core_initcall(pm_disk_init);
413
414
415static int __init resume_setup(char *str)
416{
417 if (noresume)
418 return 1;
419
420 strncpy( resume_file, str, 255 );
421 return 1;
422}
423
424static int __init noresume_setup(char *str)
425{
426 noresume = 1;
427 return 1;
428}
429
430__setup("noresume", noresume_setup);
431__setup("resume=", resume_setup);
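
The disk and resume attributes created above surface through sysfs under the power subsystem, conventionally /sys/power/disk and /sys/power/resume once sysfs is mounted there (the mount point is an assumption, as is the 8:3 device in the sketch). A hedged userspace sketch of driving them, not part of this commit:

    /* Illustrative only: select the "shutdown" suspend-to-disk mode and
     * point the resume attribute at a swap device. "8:3" is an arbitrary
     * example of the major:minor pair parsed by resume_store() above.    */
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    static int write_attr(const char *path, const char *val)
    {
            int fd = open(path, O_WRONLY);
            ssize_t n;

            if (fd < 0)
                    return -1;
            n = write(fd, val, strlen(val));   /* handled by disk_store()/resume_store() */
            close(fd);
            return n < 0 ? -1 : 0;
    }

    int main(void)
    {
            write_attr("/sys/power/disk", "shutdown\n");  /* one of pm_disk_modes[]            */
            write_attr("/sys/power/resume", "8:3\n");     /* triggers a software_resume() try  */
            return 0;
    }
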
diff --git a/kernel/power/main.c b/kernel/power/main.c
new file mode 100644
index 000000000000..7960ddf04a57
--- /dev/null
+++ b/kernel/power/main.c
@@ -0,0 +1,269 @@
1/*
2 * kernel/power/main.c - PM subsystem core functionality.
3 *
4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab
6 *
7 * This file is released under the GPLv2
8 *
9 */
10
11#include <linux/suspend.h>
12#include <linux/kobject.h>
13#include <linux/string.h>
14#include <linux/delay.h>
15#include <linux/errno.h>
16#include <linux/init.h>
17#include <linux/pm.h>
18
19
20#include "power.h"
21
22DECLARE_MUTEX(pm_sem);
23
24struct pm_ops * pm_ops = NULL;
25suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
26
27/**
28 * pm_set_ops - Set the global power method table.
29 * @ops: Pointer to ops structure.
30 */
31
32void pm_set_ops(struct pm_ops * ops)
33{
34 down(&pm_sem);
35 pm_ops = ops;
36 up(&pm_sem);
37}
38
39
40/**
41 * suspend_prepare - Do prep work before entering low-power state.
42 * @state: State we're entering.
43 *
44 * This is common code that is called for each state that we're
45 * entering. Allocate a console, stop all processes, then make sure
46 * the platform can enter the requested state.
47 */
48
49static int suspend_prepare(suspend_state_t state)
50{
51 int error = 0;
52
53 if (!pm_ops || !pm_ops->enter)
54 return -EPERM;
55
56 pm_prepare_console();
57
58 if (freeze_processes()) {
59 error = -EAGAIN;
60 goto Thaw;
61 }
62
63 if (pm_ops->prepare) {
64 if ((error = pm_ops->prepare(state)))
65 goto Thaw;
66 }
67
68 if ((error = device_suspend(PMSG_SUSPEND))) {
69 printk(KERN_ERR "Some devices failed to suspend\n");
70 goto Finish;
71 }
72 return 0;
73 Finish:
74 if (pm_ops->finish)
75 pm_ops->finish(state);
76 Thaw:
77 thaw_processes();
78 pm_restore_console();
79 return error;
80}
81
82
83static int suspend_enter(suspend_state_t state)
84{
85 int error = 0;
86 unsigned long flags;
87
88 local_irq_save(flags);
89
90 if ((error = device_power_down(PMSG_SUSPEND))) {
91 printk(KERN_ERR "Some devices failed to power down\n");
92 goto Done;
93 }
94 error = pm_ops->enter(state);
95 device_power_up();
96 Done:
97 local_irq_restore(flags);
98 return error;
99}
100
101
102/**
103 * suspend_finish - Do final work before exiting suspend sequence.
104 * @state: State we're coming out of.
105 *
106 * Call platform code to clean up, restart processes, and free the
107 * console that we've allocated. This is not called for suspend-to-disk.
108 */
109
110static void suspend_finish(suspend_state_t state)
111{
112 device_resume();
113 if (pm_ops && pm_ops->finish)
114 pm_ops->finish(state);
115 thaw_processes();
116 pm_restore_console();
117}
118
119
120
121
122static char * pm_states[] = {
123 [PM_SUSPEND_STANDBY] = "standby",
124 [PM_SUSPEND_MEM] = "mem",
125 [PM_SUSPEND_DISK] = "disk",
126 NULL,
127};
128
129
130/**
131 * enter_state - Do common work of entering low-power state.
132 * @state: pm_state structure for state we're entering.
133 *
134 * Make sure we're the only ones trying to enter a sleep state. Fail
135 * if someone has beaten us to it, since we don't want anything weird to
136 * happen when we wake up.
137 * Then, do the setup for suspend, enter the state, and clean up (after
138 * we've woken up).
139 */
140
141static int enter_state(suspend_state_t state)
142{
143 int error;
144
145 if (down_trylock(&pm_sem))
146 return -EBUSY;
147
148 if (state == PM_SUSPEND_DISK) {
149 error = pm_suspend_disk();
150 goto Unlock;
151 }
152
153 /* Suspend is hard to get right on SMP. */
154 if (num_online_cpus() != 1) {
155 error = -EPERM;
156 goto Unlock;
157 }
158
159 pr_debug("PM: Preparing system for suspend\n");
160 if ((error = suspend_prepare(state)))
161 goto Unlock;
162
163 pr_debug("PM: Entering state.\n");
164 error = suspend_enter(state);
165
166 pr_debug("PM: Finishing up.\n");
167 suspend_finish(state);
168 Unlock:
169 up(&pm_sem);
170 return error;
171}
172
173/*
174 * This is the main interface to the outside world. It needs to be
175 * called from process context.
176 */
177int software_suspend(void)
178{
179 return enter_state(PM_SUSPEND_DISK);
180}
181
182
183/**
184 * pm_suspend - Externally visible function for suspending the system.
185 * @state: Enumerated value of state to enter.
186 *
187 * Determine whether or not the value is within range, get the state
188 * structure, and enter (above).
189 */
190
191int pm_suspend(suspend_state_t state)
192{
193 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
194 return enter_state(state);
195 return -EINVAL;
196}
197
198
199
200decl_subsys(power,NULL,NULL);
201
202
203/**
204 * state - control system power state.
205 *
206 * show() returns what states are supported, which is hard-coded to
207 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
208 * 'disk' (Suspend-to-Disk).
209 *
210 * store() accepts one of those strings, translates it into the
211 * proper enumerated value, and initiates a suspend transition.
212 */
213
214static ssize_t state_show(struct subsystem * subsys, char * buf)
215{
216 int i;
217 char * s = buf;
218
219 for (i = 0; i < PM_SUSPEND_MAX; i++) {
220 if (pm_states[i])
221 s += sprintf(s,"%s ",pm_states[i]);
222 }
223 s += sprintf(s,"\n");
224 return (s - buf);
225}
226
227static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
228{
229 suspend_state_t state = PM_SUSPEND_STANDBY;
230 char ** s;
231 char *p;
232 int error;
233 int len;
234
235 p = memchr(buf, '\n', n);
236 len = p ? p - buf : n;
237
238 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
239 if (*s && !strncmp(buf, *s, len))
240 break;
241 }
242 if (*s)
243 error = enter_state(state);
244 else
245 error = -EINVAL;
246 return error ? error : n;
247}
248
249power_attr(state);
250
251static struct attribute * g[] = {
252 &state_attr.attr,
253 NULL,
254};
255
256static struct attribute_group attr_group = {
257 .attrs = g,
258};
259
260
261static int __init pm_init(void)
262{
263 int error = subsystem_register(&power_subsys);
264 if (!error)
265 error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
266 return error;
267}
268
269core_initcall(pm_init);
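
A write to the state attribute (conventionally /sys/power/state; the sysfs mount point is an assumption) lands in state_store() above, so initiating a suspend from userspace reduces to writing one of the pm_states[] strings. A minimal hedged sketch, not part of this commit:

    /* Illustrative only: request suspend-to-RAM, the C equivalent of
     * `echo mem > /sys/power/state`.                                  */
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/power/state", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, "mem\n", 4);   /* parsed by state_store(), which calls enter_state() */
            close(fd);
            return 0;
    }
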
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
new file mode 100644
index 000000000000..61deda04e39e
--- /dev/null
+++ b/kernel/power/pm.c
@@ -0,0 +1,265 @@
1/*
2 * pm.c - Power management interface
3 *
4 * Copyright (C) 2000 Andrew Henroid
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20#include <linux/init.h>
21#include <linux/module.h>
22#include <linux/spinlock.h>
23#include <linux/mm.h>
24#include <linux/slab.h>
25#include <linux/pm.h>
26#include <linux/interrupt.h>
27
28int pm_active;
29
30/*
31 * Locking notes:
32 * pm_devs_lock can be a semaphore providing pm ops are not called
33 * from an interrupt handler (already a bad idea so no change here). Each
34 * change must be protected so that an unlink of an entry doesn't clash
35 * with a pm send - which is permitted to sleep in the current architecture
36 *
37 * Module unloads clashing with pm events now work out safely, the module
38 * unload path will block until the event has been sent. It may well block
39 * until a resume but that will be fine.
40 */
41
42static DECLARE_MUTEX(pm_devs_lock);
43static LIST_HEAD(pm_devs);
44
45/**
46 * pm_register - register a device with power management
47 * @type: device type
48 * @id: device ID
49 * @callback: callback function
50 *
51 * Add a device to the list of devices that wish to be notified about
52 * power management events. A &pm_dev structure is returned on success,
53 * on failure the return is %NULL.
54 *
55 * The callback function will be called in process context and
56 * it may sleep.
57 */
58
59struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id,
61 pm_callback callback)
62{
63 struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) {
65 memset(dev, 0, sizeof(*dev));
66 dev->type = type;
67 dev->id = id;
68 dev->callback = callback;
69
70 down(&pm_devs_lock);
71 list_add(&dev->entry, &pm_devs);
72 up(&pm_devs_lock);
73 }
74 return dev;
75}
76
77/**
78 * pm_unregister - unregister a device with power management
79 * @dev: device to unregister
80 *
81 * Remove a device from the power management notification lists. The
82 * dev passed must be a handle previously returned by pm_register.
83 */
84
85void pm_unregister(struct pm_dev *dev)
86{
87 if (dev) {
88 down(&pm_devs_lock);
89 list_del(&dev->entry);
90 up(&pm_devs_lock);
91
92 kfree(dev);
93 }
94}
95
96static void __pm_unregister(struct pm_dev *dev)
97{
98 if (dev) {
99 list_del(&dev->entry);
100 kfree(dev);
101 }
102}
103
104/**
105 * pm_unregister_all - unregister all devices with matching callback
106 * @callback: callback function pointer
107 *
108 * Unregister every device that would call the callback passed. This
109 * is primarily meant as a helper function for loadable modules. It
110 * enables a module to give up all its managed devices without keeping
111 * its own private list.
112 */
113
114void pm_unregister_all(pm_callback callback)
115{
116 struct list_head *entry;
117
118 if (!callback)
119 return;
120
121 down(&pm_devs_lock);
122 entry = pm_devs.next;
123 while (entry != &pm_devs) {
124 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
125 entry = entry->next;
126 if (dev->callback == callback)
127 __pm_unregister(dev);
128 }
129 up(&pm_devs_lock);
130}
131
132/**
133 * pm_send - send request to a single device
134 * @dev: device to send to
135 * @rqst: power management request
136 * @data: data for the callback
137 *
138 * Issue a power management request to a given device. The
139 * %PM_SUSPEND and %PM_RESUME events are handled specially. The
140 * data field must hold the intended next state. No call is made
141 * if the state matches.
142 *
143 * BUGS: what stops two power management requests occurring in parallel
144 * and conflicting?
145 *
146 * WARNING: Calling pm_send directly is not generally recommended, in
147 * particular there is no locking against the pm_dev going away. The
148 * caller must maintain all needed locking or have 'inside knowledge'
149 * on the safety. Also remember that this function is not locked against
150 * pm_unregister. This means that you must handle SMP races on callback
151 * execution and unload yourself.
152 */
153
154static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
155{
156 int status = 0;
157 unsigned long prev_state, next_state;
158
159 if (in_interrupt())
160 BUG();
161
162 switch (rqst) {
163 case PM_SUSPEND:
164 case PM_RESUME:
165 prev_state = dev->state;
166 next_state = (unsigned long) data;
167 if (prev_state != next_state) {
168 if (dev->callback)
169 status = (*dev->callback)(dev, rqst, data);
170 if (!status) {
171 dev->state = next_state;
172 dev->prev_state = prev_state;
173 }
174 }
175 else {
176 dev->prev_state = prev_state;
177 }
178 break;
179 default:
180 if (dev->callback)
181 status = (*dev->callback)(dev, rqst, data);
182 break;
183 }
184 return status;
185}
186
187/*
188 * Undo incomplete request
189 */
190static void pm_undo_all(struct pm_dev *last)
191{
192 struct list_head *entry = last->entry.prev;
193 while (entry != &pm_devs) {
194 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
195 if (dev->state != dev->prev_state) {
196 /* previous state was zero (running) resume or
197 * previous state was non-zero (suspended) suspend
198 */
199 pm_request_t undo = (dev->prev_state
200 ? PM_SUSPEND:PM_RESUME);
201 pm_send(dev, undo, (void*) dev->prev_state);
202 }
203 entry = entry->prev;
204 }
205}
206
207/**
208 * pm_send_all - send request to all managed devices
209 * @rqst: power management request
210 * @data: data for the callback
211 *
212 * Issue a power management request to all devices. The
213 * %PM_SUSPEND events are handled specially. Any device is
214 * permitted to fail a suspend by returning a non zero (error)
215 * value from its callback function. If any device vetoes a
216 * suspend request then all other devices that have suspended
217 * during the processing of this request are restored to their
218 * previous state.
219 *
220 * WARNING: This function takes the pm_devs_lock. The lock is not dropped until
221 * the callbacks have completed. This prevents races against pm locking
222 * functions and races against the module unload pm_unregister code. It does
223 * mean however that you must not issue pm_ functions within the callback
224 * or you will deadlock and users will hate you.
225 *
226 * Zero is returned on success. If a suspend fails then the status
227 * from the device that vetoes the suspend is returned.
228 *
229 * BUGS: what stops two power management requests occurring in parallel
230 * and conflicting?
231 */
232
233int pm_send_all(pm_request_t rqst, void *data)
234{
235 struct list_head *entry;
236
237 down(&pm_devs_lock);
238 entry = pm_devs.next;
239 while (entry != &pm_devs) {
240 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
241 if (dev->callback) {
242 int status = pm_send(dev, rqst, data);
243 if (status) {
244 /* return devices to previous state on
245 * failed suspend request
246 */
247 if (rqst == PM_SUSPEND)
248 pm_undo_all(dev);
249 up(&pm_devs_lock);
250 return status;
251 }
252 }
253 entry = entry->next;
254 }
255 up(&pm_devs_lock);
256 return 0;
257}
258
259EXPORT_SYMBOL(pm_register);
260EXPORT_SYMBOL(pm_unregister);
261EXPORT_SYMBOL(pm_unregister_all);
262EXPORT_SYMBOL(pm_send_all);
263EXPORT_SYMBOL(pm_active);
264
265
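
For context, this is the legacy pm_register()/pm_send() interface; a driver-side client of it looks roughly like the sketch below. The PM_SYS_DEV device-type constant is assumed to come from <linux/pm.h> of this era, and the my_* names are hypothetical, not part of this patch:

    /* Hypothetical driver using the legacy PM interface above. */
    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/pm.h>

    static struct pm_dev *my_pm;

    static int my_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
    {
            switch (rqst) {
            case PM_SUSPEND:
                    /* quiesce the hardware; (unsigned long)data is the target state */
                    break;
            case PM_RESUME:
                    /* bring the hardware back up */
                    break;
            }
            return 0;   /* non-zero from PM_SUSPEND vetoes the transition in pm_send_all() */
    }

    static int __init my_driver_init(void)
    {
            my_pm = pm_register(PM_SYS_DEV, 0, my_pm_callback);  /* may return NULL */
            return 0;
    }

    static void __exit my_driver_exit(void)
    {
            pm_unregister(my_pm);   /* NULL-safe per the code above */
    }

    module_init(my_driver_init);
    module_exit(my_driver_exit);
    MODULE_LICENSE("GPL");
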
diff --git a/kernel/power/power.h b/kernel/power/power.h
new file mode 100644
index 000000000000..cd6a3493cc0d
--- /dev/null
+++ b/kernel/power/power.h
@@ -0,0 +1,52 @@
1#include <linux/suspend.h>
2#include <linux/utsname.h>
3
4/* With SUSPEND_CONSOLE defined, suspend looks *really* cool, but
5 we probably do not take enough locks for switching consoles, etc,
6 so bad things might happen.
7*/
8#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
10#endif
11
12
13struct swsusp_info {
14 struct new_utsname uts;
15 u32 version_code;
16 unsigned long num_physpages;
17 int cpus;
18 unsigned long image_pages;
19 unsigned long pagedir_pages;
20 suspend_pagedir_t * suspend_pagedir;
21 swp_entry_t pagedir[768];
22} __attribute__((aligned(PAGE_SIZE)));
23
24
25
26#ifdef CONFIG_SOFTWARE_SUSPEND
27extern int pm_suspend_disk(void);
28
29#else
30static inline int pm_suspend_disk(void)
31{
32 return -EPERM;
33}
34#endif
35extern struct semaphore pm_sem;
36#define power_attr(_name) \
37static struct subsys_attribute _name##_attr = { \
38 .attr = { \
39 .name = __stringify(_name), \
40 .mode = 0644, \
41 }, \
42 .show = _name##_show, \
43 .store = _name##_store, \
44}
45
46extern struct subsystem power_subsys;
47
48extern int freeze_processes(void);
49extern void thaw_processes(void);
50
51extern int pm_prepare_console(void);
52extern void pm_restore_console(void);
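
To make the macro above concrete, power_attr(state) (as used in main.c) expands to approximately the following; this is just the preprocessed form written out, not additional code:

    static struct subsys_attribute state_attr = {
            .attr = {
                    .name = "state",     /* __stringify(state) */
                    .mode = 0644,
            },
            .show  = state_show,
            .store = state_store,
    };
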
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
new file mode 100644
index 000000000000..715081b2d829
--- /dev/null
+++ b/kernel/power/poweroff.c
@@ -0,0 +1,45 @@
1/*
2 * poweroff.c - sysrq handler to gracefully power down machine.
3 *
4 * This file is released under the GPL v2
5 */
6
7#include <linux/kernel.h>
8#include <linux/sysrq.h>
9#include <linux/init.h>
10#include <linux/pm.h>
11#include <linux/workqueue.h>
12
13/*
14 * When the user hits Sys-Rq o to power down the machine this is the
15 * callback we use.
16 */
17
18static void do_poweroff(void *dummy)
19{
20 if (pm_power_off)
21 pm_power_off();
22}
23
24static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
25
26static void handle_poweroff(int key, struct pt_regs *pt_regs,
27 struct tty_struct *tty)
28{
29 schedule_work(&poweroff_work);
30}
31
32static struct sysrq_key_op sysrq_poweroff_op = {
33 .handler = handle_poweroff,
34 .help_msg = "powerOff",
35 .action_msg = "Power Off",
36 .enable_mask = SYSRQ_ENABLE_BOOT,
37};
38
39static int pm_sysrq_init(void)
40{
41 register_sysrq_key('o', &sysrq_poweroff_op);
42 return 0;
43}
44
45subsys_initcall(pm_sysrq_init);
diff --git a/kernel/power/process.c b/kernel/power/process.c
new file mode 100644
index 000000000000..78d92dc6a1ed
--- /dev/null
+++ b/kernel/power/process.c
@@ -0,0 +1,121 @@
1/*
2 * kernel/power/process.c - Functions for starting/stopping processes on
3 * suspend transitions.
4 *
5 * Originally from swsusp.
6 */
7
8
9#undef DEBUG
10
11#include <linux/smp_lock.h>
12#include <linux/interrupt.h>
13#include <linux/suspend.h>
14#include <linux/module.h>
15
16/*
17 * Timeout for stopping processes
18 */
19#define TIMEOUT (6 * HZ)
20
21
22static inline int freezeable(struct task_struct * p)
23{
24 if ((p == current) ||
25 (p->flags & PF_NOFREEZE) ||
26 (p->exit_state == EXIT_ZOMBIE) ||
27 (p->exit_state == EXIT_DEAD) ||
28 (p->state == TASK_STOPPED) ||
29 (p->state == TASK_TRACED))
30 return 0;
31 return 1;
32}
33
34/* Refrigerator is the place where frozen processes are stored :-). */
35void refrigerator(unsigned long flag)
36{
37 /* Hmm, should we be allowed to suspend when there are realtime
38 processes around? */
39 long save;
40 save = current->state;
41 current->state = TASK_UNINTERRUPTIBLE;
42 pr_debug("%s entered refrigerator\n", current->comm);
43 printk("=");
44 current->flags &= ~PF_FREEZE;
45
46 spin_lock_irq(&current->sighand->siglock);
47 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock);
49
50 current->flags |= PF_FROZEN;
51 while (current->flags & PF_FROZEN)
52 schedule();
53 pr_debug("%s left refrigerator\n", current->comm);
54 current->state = save;
55}
56
57/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void)
59{
60 int todo;
61 unsigned long start_time;
62 struct task_struct *g, *p;
63
64 printk( "Stopping tasks: " );
65 start_time = jiffies;
66 do {
67 todo = 0;
68 read_lock(&tasklist_lock);
69 do_each_thread(g, p) {
70 unsigned long flags;
71 if (!freezeable(p))
72 continue;
73 if ((p->flags & PF_FROZEN) ||
74 (p->state == TASK_TRACED) ||
75 (p->state == TASK_STOPPED))
76 continue;
77
78 /* FIXME: smp problem here: we may not access other process' flags
79 without locking */
80 p->flags |= PF_FREEZE;
81 spin_lock_irqsave(&p->sighand->siglock, flags);
82 signal_wake_up(p, 0);
83 spin_unlock_irqrestore(&p->sighand->siglock, flags);
84 todo++;
85 } while_each_thread(g, p);
86 read_unlock(&tasklist_lock);
87 yield(); /* Yield is okay here */
88 if (time_after(jiffies, start_time + TIMEOUT)) {
89 printk( "\n" );
90 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
91 return todo;
92 }
93 } while(todo);
94
95 printk( "|\n" );
96 BUG_ON(in_atomic());
97 return 0;
98}
99
100void thaw_processes(void)
101{
102 struct task_struct *g, *p;
103
104 printk( "Restarting tasks..." );
105 read_lock(&tasklist_lock);
106 do_each_thread(g, p) {
107 if (!freezeable(p))
108 continue;
109 if (p->flags & PF_FROZEN) {
110 p->flags &= ~PF_FROZEN;
111 wake_up_process(p);
112 } else
113 printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
114 } while_each_thread(g, p);
115
116 read_unlock(&tasklist_lock);
117 schedule();
118 printk( " done\n" );
119}
120
121EXPORT_SYMBOL(refrigerator);
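
For context, a kernel thread that wants to cooperate with freeze_processes() has to notice PF_FREEZE and park itself in refrigerator(). Below is a minimal sketch in the 2.6.12-era style; the thread function name is made up for illustration and the include list is a best guess:

    #include <linux/kthread.h>
    #include <linux/sched.h>
    #include <linux/suspend.h>

    static int example_thread(void *unused)
    {
            while (!kthread_should_stop()) {
                    /* Park here while the suspend code has frozen us */
                    if (current->flags & PF_FREEZE)
                            refrigerator(PF_FREEZE);

                    /* ... do one unit of work ... */

                    set_current_state(TASK_INTERRUPTIBLE);
                    schedule_timeout(HZ);
            }
            return 0;
    }

Threads that must keep running during suspend (for example the ones doing the suspend I/O itself) set PF_NOFREEZE instead, which makes freezeable() above return 0 for them.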
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
new file mode 100644
index 000000000000..7fa7f6e2b7fb
--- /dev/null
+++ b/kernel/power/smp.c
@@ -0,0 +1,85 @@
1/*
2 * kernel/power/smp.c - Functions for stopping other CPUs.
3 *
4 * Copyright 2004 Pavel Machek <pavel@suse.cz>
5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#undef DEBUG
11
12#include <linux/smp_lock.h>
13#include <linux/interrupt.h>
14#include <linux/suspend.h>
15#include <linux/module.h>
16#include <asm/atomic.h>
17#include <asm/tlbflush.h>
18
19static atomic_t cpu_counter, freeze;
20
21
22static void smp_pause(void * data)
23{
24 struct saved_context ctxt;
25 __save_processor_state(&ctxt);
26 printk("Sleeping in:\n");
27 dump_stack();
28 atomic_inc(&cpu_counter);
29 while (atomic_read(&freeze)) {
30 /* FIXME: the restore takes place at some random point inside this loop.
31 This should probably be written in assembly, and should
32 preserve general-purpose registers, too.
33
34 What about the stack? We may need to move to a new stack here.
35
36 This would be better run with interrupts disabled.
37 */
38 cpu_relax();
39 barrier();
40 }
41 atomic_dec(&cpu_counter);
42 __restore_processor_state(&ctxt);
43}
44
45static cpumask_t oldmask;
46
47void disable_nonboot_cpus(void)
48{
49 printk("Freezing CPUs (at %d)", smp_processor_id());
50 oldmask = current->cpus_allowed;
51 set_cpus_allowed(current, cpumask_of_cpu(0));
52 current->state = TASK_INTERRUPTIBLE;
53 schedule_timeout(HZ);
54 printk("...");
55 BUG_ON(smp_processor_id() != 0);
56
57 /* FIXME: for this to work, all the CPUs must be running
58 * "idle" thread (or we deadlock). Is that guaranteed? */
59
60 atomic_set(&cpu_counter, 0);
61 atomic_set(&freeze, 1);
62 smp_call_function(smp_pause, NULL, 0, 0);
63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
64 cpu_relax();
65 barrier();
66 }
67 printk("ok\n");
68}
69
70void enable_nonboot_cpus(void)
71{
72 printk("Restarting CPUs");
73 atomic_set(&freeze, 0);
74 while (atomic_read(&cpu_counter)) {
75 cpu_relax();
76 barrier();
77 }
78 printk("...");
79 set_cpus_allowed(current, oldmask);
80 schedule();
81 printk("ok\n");
82
83}
84
85
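
The handshake above is just two shared counters: the boot CPU raises freeze, pushes every other CPU into smp_pause() via smp_call_function(), and waits for cpu_counter to reach num_online_cpus() - 1; releasing them is the mirror image. A minimal user-space analogue of the same pattern, using C11 atomics and pthreads purely for illustration (thread count and names are made up):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define NWORKERS 3

    static atomic_int counter;
    static atomic_int freeze;

    static void *worker(void *arg)
    {
            (void)arg;
            atomic_fetch_add(&counter, 1);           /* "I am parked" */
            while (atomic_load(&freeze))             /* spin until released */
                    ;
            atomic_fetch_sub(&counter, 1);           /* "I have left" */
            return NULL;
    }

    int main(void)
    {
            pthread_t t[NWORKERS];
            int i;

            atomic_store(&freeze, 1);
            for (i = 0; i < NWORKERS; i++)
                    pthread_create(&t[i], NULL, worker, NULL);
            while (atomic_load(&counter) < NWORKERS) /* wait until all are parked */
                    ;
            puts("all workers parked");

            atomic_store(&freeze, 0);                /* release them */
            while (atomic_load(&counter))            /* wait until all have left */
                    ;
            for (i = 0; i < NWORKERS; i++)
                    pthread_join(t[i], NULL);
            puts("all workers released");
            return 0;
    }

Build with cc -std=c11 -pthread. The kernel version additionally has to save and restore per-CPU processor state around the spin, which is what __save_processor_state()/__restore_processor_state() do.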
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
new file mode 100644
index 000000000000..ae5bebc3b18f
--- /dev/null
+++ b/kernel/power/swsusp.c
@@ -0,0 +1,1433 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file implements the architecture-independent
5 * machine suspend feature using almost exclusively high-level routines
6 *
7 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8 * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
9 *
10 * This file is released under the GPLv2.
11 *
12 * I'd like to thank the following people for their work:
13 *
14 * Pavel Machek <pavel@ucw.cz>:
15 * Modifications, pointing out defects, being with me at the very beginning,
16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17 *
18 * Steve Doddi <dirk@loth.demon.co.uk>:
19 * Support the possibility of hardware state restoring.
20 *
21 * Raph <grey.havens@earthling.net>:
22 * Support for preserving states of network devices and virtual console
23 * (including X and svgatextmode)
24 *
25 * Kurt Garloff <garloff@suse.de>:
26 * Straightened the critical function in order to prevent compilers from
27 * playing tricks with local variables.
28 *
29 * Andreas Mohr <a.mohr@mailto.de>
30 *
31 * Alex Badea <vampire@go.ro>:
32 * Fixed runaway init
33 *
34 * More state savers are welcome. Especially for the scsi layer...
35 *
36 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
37 */
38
39#include <linux/module.h>
40#include <linux/mm.h>
41#include <linux/suspend.h>
42#include <linux/smp_lock.h>
43#include <linux/file.h>
44#include <linux/utsname.h>
45#include <linux/version.h>
46#include <linux/delay.h>
47#include <linux/reboot.h>
48#include <linux/bitops.h>
49#include <linux/vt_kern.h>
50#include <linux/kbd_kern.h>
51#include <linux/keyboard.h>
52#include <linux/spinlock.h>
53#include <linux/genhd.h>
54#include <linux/kernel.h>
55#include <linux/major.h>
56#include <linux/swap.h>
57#include <linux/pm.h>
58#include <linux/device.h>
59#include <linux/buffer_head.h>
60#include <linux/swapops.h>
61#include <linux/bootmem.h>
62#include <linux/syscalls.h>
63#include <linux/console.h>
64#include <linux/highmem.h>
65#include <linux/bio.h>
66
67#include <asm/uaccess.h>
68#include <asm/mmu_context.h>
69#include <asm/pgtable.h>
70#include <asm/tlbflush.h>
71#include <asm/io.h>
72
73#include "power.h"
74
75/* References to section boundaries */
76extern const void __nosave_begin, __nosave_end;
77
78/* Variables to be preserved over suspend */
79static int nr_copy_pages_check;
80
81extern char resume_file[];
82
83/* Local variables that should not be affected by save */
84unsigned int nr_copy_pages __nosavedata = 0;
85
86/* The suspend pagedir is allocated before the final copy, therefore it
87 must be freed after resume.
88
89 Warning: this is evil. There are actually two pagedirs at resume
90 time. One is "pagedir_save", an empty frame allocated at suspend
91 time, which must be freed. The second is "pagedir_nosave",
92 allocated at resume time, which travels through memory so as not to
93 collide with anything.
94
95 Warning: this is even more evil than it seems. Pagedirs this file
96 talks about are completely different from page directories used by
97 MMU hardware.
98 */
99suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
100static suspend_pagedir_t *pagedir_save;
101
102#define SWSUSP_SIG "S1SUSPEND"
103
104static struct swsusp_header {
105 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
106 swp_entry_t swsusp_info;
107 char orig_sig[10];
108 char sig[10];
109} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
110
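The layout of swsusp_header is chosen so that it spans exactly one page and sig lands in the last 10 bytes of it. As a worked example (illustrative values; swp_entry_t is architecture dependent), with PAGE_SIZE = 4096 and sizeof(swp_entry_t) = 4: reserved covers bytes 0..4071, swsusp_info bytes 4072..4075, orig_sig bytes 4076..4085 and sig bytes 4086..4095. Those final 10 bytes are where mkswap stores its "SWAP-SPACE"/"SWAPSPACE2" signature, which is why mark_swapfiles() below can read page 0 of the swap device into this struct, preserve the original signature in orig_sig, and overwrite sig with "S1SUSPEND".
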
111static struct swsusp_info swsusp_info;
112
113/*
114 * XXX: We try to keep some more pages free so that I/O operations succeed
115 * without paging. Might this need to be more?
116 */
117#define PAGES_FOR_IO 512
118
119/*
120 * Saving part...
121 */
122
123/* We record in swapfile_used which swap devices are used for suspend */
124#define SWAPFILE_UNUSED 0
125#define SWAPFILE_SUSPEND 1 /* This is the suspending device */
126#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */
127
128static unsigned short swapfile_used[MAX_SWAPFILES];
129static unsigned short root_swap;
130
131static int mark_swapfiles(swp_entry_t prev)
132{
133 int error;
134
135 rw_swap_page_sync(READ,
136 swp_entry(root_swap, 0),
137 virt_to_page((unsigned long)&swsusp_header));
138 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
139 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
140 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
141 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
142 swsusp_header.swsusp_info = prev;
143 error = rw_swap_page_sync(WRITE,
144 swp_entry(root_swap, 0),
145 virt_to_page((unsigned long)
146 &swsusp_header));
147 } else {
148 pr_debug("swsusp: Partition is not swap space.\n");
149 error = -ENODEV;
150 }
151 return error;
152}
153
154/*
155 * Check whether the swap device is the specified resume
156 * device, irrespective of whether they are specified by
157 * identical names.
158 *
159 * (Thus, device inode aliasing is allowed. You can say /dev/hda4
160 * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
161 * and they'll be considered the same device. This is *necessary* for
162 * devfs, since the resume code can only recognize the form /dev/hda4,
163 * but the suspend code would see the long name.)
164 */
165static int is_resume_device(const struct swap_info_struct *swap_info)
166{
167 struct file *file = swap_info->swap_file;
168 struct inode *inode = file->f_dentry->d_inode;
169
170 return S_ISBLK(inode->i_mode) &&
171 swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
172}
173
174static int swsusp_swap_check(void) /* This is called before saving image */
175{
176 int i, len;
177
178 len=strlen(resume_file);
179 root_swap = 0xFFFF;
180
181 swap_list_lock();
182 for(i=0; i<MAX_SWAPFILES; i++) {
183 if (swap_info[i].flags == 0) {
184 swapfile_used[i]=SWAPFILE_UNUSED;
185 } else {
186 if(!len) {
187 printk(KERN_WARNING "resume= option should be used to set suspend device" );
188 if(root_swap == 0xFFFF) {
189 swapfile_used[i] = SWAPFILE_SUSPEND;
190 root_swap = i;
191 } else
192 swapfile_used[i] = SWAPFILE_IGNORED;
193 } else {
194 /* we ignore all swap devices that are not the resume_file */
195 if (is_resume_device(&swap_info[i])) {
196 swapfile_used[i] = SWAPFILE_SUSPEND;
197 root_swap = i;
198 } else {
199 swapfile_used[i] = SWAPFILE_IGNORED;
200 }
201 }
202 }
203 }
204 swap_list_unlock();
205 return (root_swap != 0xffff) ? 0 : -ENODEV;
206}
207
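In practice the suspend device is selected with the resume= boot parameter, for example resume=/dev/hda4 on the kernel command line (resume_file is filled in from that option elsewhere). If no resume= is given, the first active entry in swap_info[] is used and the warning above is printed; all other swap devices end up marked SWAPFILE_IGNORED and are locked out by lock_swapdevices() while the image is written.
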
208/**
209 * This is called after saving the image, so any modification
210 * will be lost after resume... and that's what we want:
211 * we make the devices unusable. A further call to
212 * lock_swapdevices() can unlock the devices again.
213 */
214static void lock_swapdevices(void)
215{
216 int i;
217
218 swap_list_lock();
219 for(i = 0; i< MAX_SWAPFILES; i++)
220 if(swapfile_used[i] == SWAPFILE_IGNORED) {
221 swap_info[i].flags ^= 0xFF;
222 }
223 swap_list_unlock();
224}
225
226/**
227 * write_page - Write one page to a fresh swap location.
228 * @addr: Address we're writing.
229 * @loc: Place to store the entry we used.
230 *
231 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
232 * errors. That is an artifact left over from swsusp. It did not
233 * check the return of rw_swap_page_sync() at all, since most pages
234 * written back to swap would return -EIO.
235 * This is a partial improvement, since we will at least return other
236 * errors, though we need to eventually fix the damn code.
237 */
238static int write_page(unsigned long addr, swp_entry_t * loc)
239{
240 swp_entry_t entry;
241 int error = 0;
242
243 entry = get_swap_page();
244 if (swp_offset(entry) &&
245 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
246 error = rw_swap_page_sync(WRITE, entry,
247 virt_to_page(addr));
248 if (error == -EIO)
249 error = 0;
250 if (!error)
251 *loc = entry;
252 } else
253 error = -ENOSPC;
254 return error;
255}
256
257/**
258 * data_free - Free the swap entries used by the saved image.
259 *
260 * Walk the list of used swap entries and free each one.
261 * This is only used for cleanup when suspend fails.
262 */
263static void data_free(void)
264{
265 swp_entry_t entry;
266 int i;
267
268 for (i = 0; i < nr_copy_pages; i++) {
269 entry = (pagedir_nosave + i)->swap_address;
270 if (entry.val)
271 swap_free(entry);
272 else
273 break;
274 (pagedir_nosave + i)->swap_address = (swp_entry_t){0};
275 }
276}
277
278/**
279 * data_write - Write saved image to swap.
280 *
281 * Walk the list of pages in the image and sync each one to swap.
282 */
283static int data_write(void)
284{
285 int error = 0, i = 0;
286 unsigned int mod = nr_copy_pages / 100;
287 struct pbe *p;
288
289 if (!mod)
290 mod = 1;
291
292 printk( "Writing data to swap (%d pages)... ", nr_copy_pages );
293 for_each_pbe(p, pagedir_nosave) {
294 if (!(i%mod))
295 printk( "\b\b\b\b%3d%%", i / mod );
296 if ((error = write_page(p->address, &(p->swap_address))))
297 return error;
298 i++;
299 }
300 printk("\b\b\b\bdone\n");
301 return error;
302}
303
304static void dump_info(void)
305{
306 pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
307 pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
308 pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
309 pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
310 pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
311 pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
312 pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
313 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
314 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
315 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
316 pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages);
317}
318
319static void init_header(void)
320{
321 memset(&swsusp_info, 0, sizeof(swsusp_info));
322 swsusp_info.version_code = LINUX_VERSION_CODE;
323 swsusp_info.num_physpages = num_physpages;
324 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
325
326 swsusp_info.suspend_pagedir = pagedir_nosave;
327 swsusp_info.cpus = num_online_cpus();
328 swsusp_info.image_pages = nr_copy_pages;
329}
330
331static int close_swap(void)
332{
333 swp_entry_t entry;
334 int error;
335
336 dump_info();
337 error = write_page((unsigned long)&swsusp_info, &entry);
338 if (!error) {
339 printk( "S" );
340 error = mark_swapfiles(entry);
341 printk( "|\n" );
342 }
343 return error;
344}
345
346/**
347 * free_pagedir_entries - Free pages used by the page directory.
348 *
349 * This is used during suspend for error recovery.
350 */
351
352static void free_pagedir_entries(void)
353{
354 int i;
355
356 for (i = 0; i < swsusp_info.pagedir_pages; i++)
357 swap_free(swsusp_info.pagedir[i]);
358}
359
360
361/**
362 * write_pagedir - Write the array of pages holding the page directory.
363 * The swap entries used are recorded in swsusp_info.pagedir[] for the header.
364 */
365
366static int write_pagedir(void)
367{
368 int error = 0;
369 unsigned n = 0;
370 struct pbe * pbe;
371
372 printk( "Writing pagedir...");
373 for_each_pb_page(pbe, pagedir_nosave) {
374 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
375 return error;
376 }
377
378 swsusp_info.pagedir_pages = n;
379 printk("done (%u pages)\n", n);
380 return error;
381}
382
383/**
384 * write_suspend_image - Write entire image and metadata.
385 *
386 */
387
388static int write_suspend_image(void)
389{
390 int error;
391
392 init_header();
393 if ((error = data_write()))
394 goto FreeData;
395
396 if ((error = write_pagedir()))
397 goto FreePagedir;
398
399 if ((error = close_swap()))
400 goto FreePagedir;
401 Done:
402 return error;
403 FreePagedir:
404 free_pagedir_entries();
405 FreeData:
406 data_free();
407 goto Done;
408}
409
410
411#ifdef CONFIG_HIGHMEM
412struct highmem_page {
413 char *data;
414 struct page *page;
415 struct highmem_page *next;
416};
417
418static struct highmem_page *highmem_copy;
419
420static int save_highmem_zone(struct zone *zone)
421{
422 unsigned long zone_pfn;
423 mark_free_pages(zone);
424 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
425 struct page *page;
426 struct highmem_page *save;
427 void *kaddr;
428 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
429
430 if (!(pfn%1000))
431 printk(".");
432 if (!pfn_valid(pfn))
433 continue;
434 page = pfn_to_page(pfn);
435 /*
436 * This condition results from rvmalloc() sans vmalloc_32()
437 * and architectural memory reservations. This should be
438 * corrected eventually when the cases giving rise to this
439 * are better understood.
440 */
441 if (PageReserved(page)) {
442 printk("highmem reserved page?!\n");
443 continue;
444 }
445 BUG_ON(PageNosave(page));
446 if (PageNosaveFree(page))
447 continue;
448 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
449 if (!save)
450 return -ENOMEM;
451 save->next = highmem_copy;
452 save->page = page;
453 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
454 if (!save->data) {
455 kfree(save);
456 return -ENOMEM;
457 }
458 kaddr = kmap_atomic(page, KM_USER0);
459 memcpy(save->data, kaddr, PAGE_SIZE);
460 kunmap_atomic(kaddr, KM_USER0);
461 highmem_copy = save;
462 }
463 return 0;
464}
465#endif /* CONFIG_HIGHMEM */
466
467
468static int save_highmem(void)
469{
470#ifdef CONFIG_HIGHMEM
471 struct zone *zone;
472 int res = 0;
473
474 pr_debug("swsusp: Saving Highmem\n");
475 for_each_zone(zone) {
476 if (is_highmem(zone))
477 res = save_highmem_zone(zone);
478 if (res)
479 return res;
480 }
481#endif
482 return 0;
483}
484
485static int restore_highmem(void)
486{
487#ifdef CONFIG_HIGHMEM
488 printk("swsusp: Restoring Highmem\n");
489 while (highmem_copy) {
490 struct highmem_page *save = highmem_copy;
491 void *kaddr;
492 highmem_copy = save->next;
493
494 kaddr = kmap_atomic(save->page, KM_USER0);
495 memcpy(kaddr, save->data, PAGE_SIZE);
496 kunmap_atomic(kaddr, KM_USER0);
497 free_page((long) save->data);
498 kfree(save);
499 }
500#endif
501 return 0;
502}
503
504
505static int pfn_is_nosave(unsigned long pfn)
506{
507 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
508 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
509 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
510}
511
512/**
513 * saveable - Determine whether a page should be cloned or not.
514 * @pfn: The page
515 *
516 * We save a page if it's Reserved, and not in the range of pages
517 * statically defined as 'unsaveable', or if it isn't reserved, and
518 * isn't part of a free chunk of pages.
519 */
520
521static int saveable(struct zone * zone, unsigned long * zone_pfn)
522{
523 unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
524 struct page * page;
525
526 if (!pfn_valid(pfn))
527 return 0;
528
529 page = pfn_to_page(pfn);
530 BUG_ON(PageReserved(page) && PageNosave(page));
531 if (PageNosave(page))
532 return 0;
533 if (PageReserved(page) && pfn_is_nosave(pfn)) {
534 pr_debug("[nosave pfn 0x%lx]", pfn);
535 return 0;
536 }
537 if (PageNosaveFree(page))
538 return 0;
539
540 return 1;
541}
542
543static void count_data_pages(void)
544{
545 struct zone *zone;
546 unsigned long zone_pfn;
547
548 nr_copy_pages = 0;
549
550 for_each_zone(zone) {
551 if (is_highmem(zone))
552 continue;
553 mark_free_pages(zone);
554 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
555 nr_copy_pages += saveable(zone, &zone_pfn);
556 }
557}
558
559
560static void copy_data_pages(void)
561{
562 struct zone *zone;
563 unsigned long zone_pfn;
564 struct pbe * pbe = pagedir_nosave;
565
566 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
567 for_each_zone(zone) {
568 if (is_highmem(zone))
569 continue;
570 mark_free_pages(zone);
571 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
572 if (saveable(zone, &zone_pfn)) {
573 struct page * page;
574 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
575 BUG_ON(!pbe);
576 pbe->orig_address = (long) page_address(page);
577 /* copy_page is not usable for copying task structs. */
578 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
579 pbe = pbe->next;
580 }
581 }
582 }
583 BUG_ON(pbe);
584}
585
586
587/**
588 * calc_nr - Determine the number of pages needed for a pbe list.
589 */
590
591static int calc_nr(int nr_copy)
592{
593 int extra = 0;
594 int mod = !!(nr_copy % PBES_PER_PAGE);
595 int diff = (nr_copy / PBES_PER_PAGE) + mod;
596
597 do {
598 extra += diff;
599 nr_copy += diff;
600 mod = !!(nr_copy % PBES_PER_PAGE);
601 diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
602 } while (diff > 0);
603
604 return nr_copy;
605}
606
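As a worked example of the fixed point calc_nr() converges to, assume (purely for illustration) PBES_PER_PAGE = 127 and nr_copy = 10000. The first pass needs ceil(10000/127) = 79 pagedir pages, so extra = 79 and nr_copy becomes 10079; those 79 pages need PBEs of their own, so the next pass adds ceil(10079/127) - 79 = 1, giving extra = 80 and nr_copy = 10080; the pass after that adds ceil(10080/127) - 80 = 0 and the loop terminates. The result, 10080, is the 10000 data pages plus the 80 pagedir pages, and those 80 pages can describe 80 * 127 = 10160 >= 10080 pages, so the allocation is self-consistent.
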
607/**
608 * free_pagedir - free pages allocated with alloc_pagedir()
609 */
610
611static inline void free_pagedir(struct pbe *pblist)
612{
613 struct pbe *pbe;
614
615 while (pblist) {
616 pbe = (pblist + PB_PAGE_SKIP)->next;
617 free_page((unsigned long)pblist);
618 pblist = pbe;
619 }
620}
621
622/**
623 * fill_pb_page - Create a list of PBEs on a given memory page
624 */
625
626static inline void fill_pb_page(struct pbe *pbpage)
627{
628 struct pbe *p;
629
630 p = pbpage;
631 pbpage += PB_PAGE_SKIP;
632 do
633 p->next = p + 1;
634 while (++p < pbpage);
635}
636
637/**
638 * create_pbe_list - Create a list of PBEs on top of a given chain
639 * of memory pages allocated with alloc_pagedir()
640 */
641
642static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
643{
644 struct pbe *pbpage, *p;
645 unsigned num = PBES_PER_PAGE;
646
647 for_each_pb_page (pbpage, pblist) {
648 if (num >= nr_pages)
649 break;
650
651 fill_pb_page(pbpage);
652 num += PBES_PER_PAGE;
653 }
654 if (pbpage) {
655 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
656 p->next = p + 1;
657 p->next = NULL;
658 }
659 pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
660}
661
662/**
663 * alloc_pagedir - Allocate the page directory.
664 *
665 * First, determine exactly how many pages we need and
666 * allocate them.
667 *
668 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
669 * struct pbe elements (pbes) and the last element in the page points
670 * to the next page.
671 *
672 * On each page we set up a list of struct_pbe elements.
673 */
674
675static struct pbe * alloc_pagedir(unsigned nr_pages)
676{
677 unsigned num;
678 struct pbe *pblist, *pbe;
679
680 if (!nr_pages)
681 return NULL;
682
683 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
684 pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
685 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
686 pbe = pbe->next, num += PBES_PER_PAGE) {
687 pbe += PB_PAGE_SKIP;
688 pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
689 }
690 if (!pbe) { /* get_zeroed_page() failed */
691 free_pagedir(pblist);
692 pblist = NULL;
693 }
694 return pblist;
695}
696
697/**
698 * free_image_pages - Free pages allocated for snapshot
699 */
700
701static void free_image_pages(void)
702{
703 struct pbe * p;
704
705 for_each_pbe(p, pagedir_save) {
706 if (p->address) {
707 ClearPageNosave(virt_to_page(p->address));
708 free_page(p->address);
709 p->address = 0;
710 }
711 }
712}
713
714/**
715 * alloc_image_pages - Allocate pages for the snapshot.
716 */
717
718static int alloc_image_pages(void)
719{
720 struct pbe * p;
721
722 for_each_pbe(p, pagedir_save) {
723 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
724 if (!p->address)
725 return -ENOMEM;
726 SetPageNosave(virt_to_page(p->address));
727 }
728 return 0;
729}
730
731void swsusp_free(void)
732{
733 BUG_ON(PageNosave(virt_to_page(pagedir_save)));
734 BUG_ON(PageNosaveFree(virt_to_page(pagedir_save)));
735 free_image_pages();
736 free_pagedir(pagedir_save);
737}
738
739
740/**
741 * enough_free_mem - Make sure we have enough free memory to snapshot.
742 *
743 * Returns TRUE or FALSE after checking the number of available
744 * free pages.
745 */
746
747static int enough_free_mem(void)
748{
749 if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) {
750 pr_debug("swsusp: Not enough free pages: Have %d\n",
751 nr_free_pages());
752 return 0;
753 }
754 return 1;
755}
756
757
758/**
759 * enough_swap - Make sure we have enough swap to save the image.
760 *
761 * Returns TRUE or FALSE after checking the total amount of swap
762 * space available.
763 *
764 * FIXME: si_swapinfo(&i) returns all swap devices information.
765 * We should only consider resume_device.
766 */
767
768static int enough_swap(void)
769{
770 struct sysinfo i;
771
772 si_swapinfo(&i);
773 if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) {
774 pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap);
775 return 0;
776 }
777 return 1;
778}
779
780static int swsusp_alloc(void)
781{
782 int error;
783
784 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
785 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
786
787 pagedir_nosave = NULL;
788 if (!enough_free_mem())
789 return -ENOMEM;
790
791 if (!enough_swap())
792 return -ENOSPC;
793
794 nr_copy_pages = calc_nr(nr_copy_pages);
795
796 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
797 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
798 return -ENOMEM;
799 }
800 create_pbe_list(pagedir_save, nr_copy_pages);
801 pagedir_nosave = pagedir_save;
802 if ((error = alloc_image_pages())) {
803 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
804 swsusp_free();
805 return error;
806 }
807
808 nr_copy_pages_check = nr_copy_pages;
809 return 0;
810}
811
812static int suspend_prepare_image(void)
813{
814 int error;
815
816 pr_debug("swsusp: critical section: \n");
817 if (save_highmem()) {
818 printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n");
819 restore_highmem();
820 return -ENOMEM;
821 }
822
823 drain_local_pages();
824 count_data_pages();
825 printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
826
827 error = swsusp_alloc();
828 if (error)
829 return error;
830
831 /* During allocating of suspend pagedir, new cold pages may appear.
832 * Kill them.
833 */
834 drain_local_pages();
835 copy_data_pages();
836
837 /*
838 * End of critical section. From now on, we can write to memory,
839 * but we should not touch disk. This specially means we must _not_
840 * touch swap space! Except we must write out our image of course.
841 */
842
843 printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
844 return 0;
845}
846
847
848/* It is important _NOT_ to unmount filesystems at this point. We want
849 * them synced (in case something goes wrong), but we do NOT want to mark
850 * the filesystems clean: they are not. (And it does not matter: if we
851 * resume correctly, we'll mark the system clean anyway.)
852 */
853int swsusp_write(void)
854{
855 int error;
856 device_resume();
857 lock_swapdevices();
858 error = write_suspend_image();
859 /* This will unlock ignored swap devices since writing is finished */
860 lock_swapdevices();
861 return error;
862
863}
864
865
866extern asmlinkage int swsusp_arch_suspend(void);
867extern asmlinkage int swsusp_arch_resume(void);
868
869
870asmlinkage int swsusp_save(void)
871{
872 int error = 0;
873
874 if ((error = swsusp_swap_check())) {
875 printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
876 "swapon -a!\n");
877 return error;
878 }
879 return suspend_prepare_image();
880}
881
882int swsusp_suspend(void)
883{
884 int error;
885 if ((error = arch_prepare_suspend()))
886 return error;
887 local_irq_disable();
888 /* At this point, device_suspend() has been called, but *not*
889 * device_power_down(). We *must* device_power_down() now.
890 * Otherwise, drivers for some devices (e.g. interrupt controllers)
891 * become desynchronized with the actual state of the hardware
892 * at resume time, and evil weirdness ensues.
893 */
894 if ((error = device_power_down(PMSG_FREEZE))) {
895 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
896 local_irq_enable();
897 swsusp_free();
898 return error;
899 }
900 save_processor_state();
901 if ((error = swsusp_arch_suspend()))
902 swsusp_free();
903 /* Restore control flow magically appears here */
904 restore_processor_state();
905 BUG_ON (nr_copy_pages_check != nr_copy_pages);
906 restore_highmem();
907 device_power_up();
908 local_irq_enable();
909 return error;
910}
911
912int swsusp_resume(void)
913{
914 int error;
915 local_irq_disable();
916 if (device_power_down(PMSG_FREEZE))
917 printk(KERN_ERR "Some devices failed to power down, very bad\n");
918 /* We'll ignore saved state, but this gets preempt count (etc) right */
919 save_processor_state();
920 error = swsusp_arch_resume();
921 /* Code below is only ever reached in case of failure. Otherwise
922 * execution continues at place where swsusp_arch_suspend was called
923 */
924 BUG_ON(!error);
925 restore_processor_state();
926 restore_highmem();
927 device_power_up();
928 local_irq_enable();
929 return error;
930}
931
932/* More restore stuff */
933
934/*
935 * Returns true if given address/order collides with any orig_address
936 */
937static int does_collide_order(unsigned long addr, int order)
938{
939 int i;
940
941 for (i=0; i < (1<<order); i++)
942 if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
943 return 1;
944 return 0;
945}
946
947/**
948 * On resume, for storing the PBE list and the image,
949 * we can only use memory pages that do not conflict with the pages
950 * which had been used before suspend.
951 *
952 * We don't know which pages are usable until we allocate them.
953 *
954 * Allocated but unusable (i.e. eaten) memory pages are linked together
955 * into a list, so that we can free them easily.
956 *
957 * We could have used a type other than (void *)
958 * for this purpose, but ...
959 */
960static void **eaten_memory = NULL;
961
962static inline void eat_page(void *page)
963{
964 void **c;
965
966 c = eaten_memory;
967 eaten_memory = page;
968 *eaten_memory = c;
969}
970
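In other words, eat_page() threads the unusable pages into a LIFO list stored inside the pages themselves: the first word of each eaten page holds the previous list head. After eating page A and then page B, eaten_memory points to B, *B points to A, and *A still holds the original NULL, so free_eaten_memory() below can walk the chain and free_page() every element without any extra bookkeeping allocations.
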
971static unsigned long get_usable_page(unsigned gfp_mask)
972{
973 unsigned long m;
974
975 m = get_zeroed_page(gfp_mask);
976 while (does_collide_order(m, 0)) {
977 eat_page((void *)m);
978 m = get_zeroed_page(gfp_mask);
979 if (!m)
980 break;
981 }
982 return m;
983}
984
985static void free_eaten_memory(void)
986{
987 unsigned long m;
988 void **c;
989 int i = 0;
990
991 c = eaten_memory;
992 while (c) {
993 m = (unsigned long)c;
994 c = *c;
995 free_page(m);
996 i++;
997 }
998 eaten_memory = NULL;
999 pr_debug("swsusp: %d unused pages freed\n", i);
1000}
1001
1002/**
1003 * check_pagedir - Make sure that the pages the PBEs point to won't
1004 * collide with the pages we're going to restore the loaded image
1005 * into later.
1006 */
1007
1008static int check_pagedir(struct pbe *pblist)
1009{
1010 struct pbe *p;
1011
1012 /* This is necessary, so that we can free allocated pages
1013 * in case of failure
1014 */
1015 for_each_pbe (p, pblist)
1016 p->address = 0UL;
1017
1018 for_each_pbe (p, pblist) {
1019 p->address = get_usable_page(GFP_ATOMIC);
1020 if (!p->address)
1021 return -ENOMEM;
1022 }
1023 return 0;
1024}
1025
1026/**
1027 * swsusp_pagedir_relocate - It is possible that some memory pages
1028 * occupied by the list of PBEs collide with pages we're going to
1029 * restore the loaded image into later. We relocate them here.
1030 */
1031
1032static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1033{
1034 struct zone *zone;
1035 unsigned long zone_pfn;
1036 struct pbe *pbpage, *tail, *p;
1037 void *m;
1038 int rel = 0, error = 0;
1039
1040 if (!pblist) /* a sanity check */
1041 return NULL;
1042
1043 pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
1044 swsusp_info.pagedir_pages);
1045
1046 /* Set page flags */
1047
1048 for_each_zone(zone) {
1049 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
1050 SetPageNosaveFree(pfn_to_page(zone_pfn +
1051 zone->zone_start_pfn));
1052 }
1053
1054 /* Clear orig addresses */
1055
1056 for_each_pbe (p, pblist)
1057 ClearPageNosaveFree(virt_to_page(p->orig_address));
1058
1059 tail = pblist + PB_PAGE_SKIP;
1060
1061 /* Relocate colliding pages */
1062
1063 for_each_pb_page (pbpage, pblist) {
1064 if (does_collide_order((unsigned long)pbpage, 0)) {
1065 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
1066 if (!m) {
1067 error = -ENOMEM;
1068 break;
1069 }
1070 memcpy(m, (void *)pbpage, PAGE_SIZE);
1071 if (pbpage == pblist)
1072 pblist = (struct pbe *)m;
1073 else
1074 tail->next = (struct pbe *)m;
1075
1076 eat_page((void *)pbpage);
1077 pbpage = (struct pbe *)m;
1078
1079 /* We have to link the PBEs again */
1080
1081 for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
1082 if (p->next) /* needed to save the end */
1083 p->next = p + 1;
1084
1085 rel++;
1086 }
1087 tail = pbpage + PB_PAGE_SKIP;
1088 }
1089
1090 if (error) {
1091 printk("\nswsusp: Out of memory\n\n");
1092 free_pagedir(pblist);
1093 free_eaten_memory();
1094 pblist = NULL;
1095 }
1096 else
1097 printk("swsusp: Relocated %d pages\n", rel);
1098
1099 return pblist;
1100}
1101
1102/**
1103 * Using bio to read from swap.
1104 * This code requires a bit more work than just using buffer heads
1105 * but it is the recommended way for 2.5/2.6.
1106 * The following are to signal the beginning and end of I/O. Bios
1107 * finish asynchronously, while we want them to happen synchronously.
1108 * A simple atomic_t and a wait loop take care of this problem.
1109 */
1110
1111static atomic_t io_done = ATOMIC_INIT(0);
1112
1113static int end_io(struct bio * bio, unsigned int num, int err)
1114{
1115 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1116 panic("I/O error reading memory image");
1117 atomic_set(&io_done, 0);
1118 return 0;
1119}
1120
1121static struct block_device * resume_bdev;
1122
1123/**
1124 * submit - submit BIO request.
1125 * @rw: READ or WRITE.
1126 * @page_off: physical offset of the page.
1127 * @page: page we're reading or writing.
1128 *
1129 * Straight from the textbook - allocate and initialize the bio.
1130 * If we're writing, make sure the page is marked as dirty.
1131 * Then submit it and wait.
1132 */
1133
1134static int submit(int rw, pgoff_t page_off, void * page)
1135{
1136 int error = 0;
1137 struct bio * bio;
1138
1139 bio = bio_alloc(GFP_ATOMIC, 1);
1140 if (!bio)
1141 return -ENOMEM;
1142 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
1143 bio_get(bio);
1144 bio->bi_bdev = resume_bdev;
1145 bio->bi_end_io = end_io;
1146
1147 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
1148 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
1149 error = -EFAULT;
1150 goto Done;
1151 }
1152
1153 if (rw == WRITE)
1154 bio_set_pages_dirty(bio);
1155
1156 atomic_set(&io_done, 1);
1157 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
1158 while (atomic_read(&io_done))
1159 yield();
1160
1161 Done:
1162 bio_put(bio);
1163 return error;
1164}
1165
1166static int bio_read_page(pgoff_t page_off, void * page)
1167{
1168 return submit(READ, page_off, page);
1169}
1170
1171static int bio_write_page(pgoff_t page_off, void * page)
1172{
1173 return submit(WRITE, page_off, page);
1174}
1175
1176/*
1177 * Sanity-check whether this image makes sense with this kernel/swap context.
1178 * I really don't think that it's foolproof, but it's better than nothing.
1179 */
1180
1181static const char * sanity_check(void)
1182{
1183 dump_info();
1184 if(swsusp_info.version_code != LINUX_VERSION_CODE)
1185 return "kernel version";
1186 if(swsusp_info.num_physpages != num_physpages)
1187 return "memory size";
1188 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
1189 return "system type";
1190 if (strcmp(swsusp_info.uts.release,system_utsname.release))
1191 return "kernel release";
1192 if (strcmp(swsusp_info.uts.version,system_utsname.version))
1193 return "version";
1194 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1195 return "machine";
1196 if(swsusp_info.cpus != num_online_cpus())
1197 return "number of cpus";
1198 return NULL;
1199}
1200
1201
1202static int check_header(void)
1203{
1204 const char * reason = NULL;
1205 int error;
1206
1207 if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
1208 return error;
1209
1210 /* Is this same machine? */
1211 if ((reason = sanity_check())) {
1212 printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
1213 return -EPERM;
1214 }
1215 nr_copy_pages = swsusp_info.image_pages;
1216 return error;
1217}
1218
1219static int check_sig(void)
1220{
1221 int error;
1222
1223 memset(&swsusp_header, 0, sizeof(swsusp_header));
1224 if ((error = bio_read_page(0, &swsusp_header)))
1225 return error;
1226 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1227 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1228
1229 /*
1230 * Reset swap signature now.
1231 */
1232 error = bio_write_page(0, &swsusp_header);
1233 } else {
1234 printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
1235 return -EINVAL;
1236 }
1237 if (!error)
1238 pr_debug("swsusp: Signature found, resuming\n");
1239 return error;
1240}
1241
1242/**
1243 * data_read - Read image pages from swap.
1244 *
1245 * You do not need to check for overlaps, check_pagedir()
1246 * already did that.
1247 */
1248
1249static int data_read(struct pbe *pblist)
1250{
1251 struct pbe * p;
1252 int error = 0;
1253 int i = 0;
1254 int mod = swsusp_info.image_pages / 100;
1255
1256 if (!mod)
1257 mod = 1;
1258
1259 printk("swsusp: Reading image data (%lu pages): ",
1260 swsusp_info.image_pages);
1261
1262 for_each_pbe (p, pblist) {
1263 if (!(i % mod))
1264 printk("\b\b\b\b%3d%%", i / mod);
1265
1266 error = bio_read_page(swp_offset(p->swap_address),
1267 (void *)p->address);
1268 if (error)
1269 return error;
1270
1271 i++;
1272 }
1273 printk("\b\b\b\bdone\n");
1274 return error;
1275}
1276
1277extern dev_t name_to_dev_t(const char *line);
1278
1279/**
1280 * read_pagedir - Read page backup list pages from swap
1281 */
1282
1283static int read_pagedir(struct pbe *pblist)
1284{
1285 struct pbe *pbpage, *p;
1286 unsigned i = 0;
1287 int error;
1288
1289 if (!pblist)
1290 return -EFAULT;
1291
1292 printk("swsusp: Reading pagedir (%lu pages)\n",
1293 swsusp_info.pagedir_pages);
1294
1295 for_each_pb_page (pbpage, pblist) {
1296 unsigned long offset = swp_offset(swsusp_info.pagedir[i++]);
1297
1298 error = -EFAULT;
1299 if (offset) {
1300 p = (pbpage + PB_PAGE_SKIP)->next;
1301 error = bio_read_page(offset, (void *)pbpage);
1302 (pbpage + PB_PAGE_SKIP)->next = p;
1303 }
1304 if (error)
1305 break;
1306 }
1307
1308 if (error)
1309 free_page((unsigned long)pblist);
1310
1311 BUG_ON(i != swsusp_info.pagedir_pages);
1312
1313 return error;
1314}
1315
1316
1317static int check_suspend_image(void)
1318{
1319 int error = 0;
1320
1321 if ((error = check_sig()))
1322 return error;
1323
1324 if ((error = check_header()))
1325 return error;
1326
1327 return 0;
1328}
1329
1330static int read_suspend_image(void)
1331{
1332 int error = 0;
1333 struct pbe *p;
1334
1335 if (!(p = alloc_pagedir(nr_copy_pages)))
1336 return -ENOMEM;
1337
1338 if ((error = read_pagedir(p)))
1339 return error;
1340
1341 create_pbe_list(p, nr_copy_pages);
1342
1343 if (!(pagedir_nosave = swsusp_pagedir_relocate(p)))
1344 return -ENOMEM;
1345
1346 /* Allocate memory for the image and read the data from swap */
1347
1348 error = check_pagedir(pagedir_nosave);
1349 free_eaten_memory();
1350 if (!error)
1351 error = data_read(pagedir_nosave);
1352
1353 if (error) { /* We fail cleanly */
1354 for_each_pbe (p, pagedir_nosave)
1355 if (p->address) {
1356 free_page(p->address);
1357 p->address = 0UL;
1358 }
1359 free_pagedir(pagedir_nosave);
1360 }
1361 return error;
1362}
1363
1364/**
1365 * swsusp_check - Check for saved image in swap
1366 */
1367
1368int swsusp_check(void)
1369{
1370 int error;
1371
1372 if (!swsusp_resume_device) {
1373 if (!strlen(resume_file))
1374 return -ENOENT;
1375 swsusp_resume_device = name_to_dev_t(resume_file);
1376 pr_debug("swsusp: Resume From Partition %s\n", resume_file);
1377 } else {
1378 pr_debug("swsusp: Resume From Partition %d:%d\n",
1379 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
1380 }
1381
1382 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1383 if (!IS_ERR(resume_bdev)) {
1384 set_blocksize(resume_bdev, PAGE_SIZE);
1385 error = check_suspend_image();
1386 if (error)
1387 blkdev_put(resume_bdev);
1388 } else
1389 error = PTR_ERR(resume_bdev);
1390
1391 if (!error)
1392 pr_debug("swsusp: resume file found\n");
1393 else
1394 pr_debug("swsusp: Error %d checking for resume file\n", error);
1395 return error;
1396}
1397
1398/**
1399 * swsusp_read - Read saved image from swap.
1400 */
1401
1402int swsusp_read(void)
1403{
1404 int error;
1405
1406 if (IS_ERR(resume_bdev)) {
1407 pr_debug("swsusp: block device not initialised\n");
1408 return PTR_ERR(resume_bdev);
1409 }
1410
1411 error = read_suspend_image();
1412 blkdev_put(resume_bdev);
1413
1414 if (!error)
1415 pr_debug("swsusp: Reading resume file was successful\n");
1416 else
1417 pr_debug("swsusp: Error %d resuming\n", error);
1418 return error;
1419}
1420
1421/**
1422 * swsusp_close - close swap device.
1423 */
1424
1425void swsusp_close(void)
1426{
1427 if (IS_ERR(resume_bdev)) {
1428 pr_debug("swsusp: block device not initialised\n");
1429 return;
1430 }
1431
1432 blkdev_put(resume_bdev);
1433}
diff --git a/kernel/printk.c b/kernel/printk.c
new file mode 100644
index 000000000000..1498689548d1
--- /dev/null
+++ b/kernel/printk.c
@@ -0,0 +1,996 @@
1/*
2 * linux/kernel/printk.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Modified to make sys_syslog() more flexible: added commands to
7 * return the last 4k of kernel messages, regardless of whether
8 * they've been read or not. Added option to suppress kernel printk's
9 * to the console. Added hook for sending the console messages
10 * elsewhere, in preparation for a serial line console (someday).
11 * Ted Ts'o, 2/11/93.
12 * Modified for sysctl support, 1/8/97, Chris Horn.
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfreds@colorfullife.com
15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au>
17 */
18
19#include <linux/kernel.h>
20#include <linux/mm.h>
21#include <linux/tty.h>
22#include <linux/tty_driver.h>
23#include <linux/smp_lock.h>
24#include <linux/console.h>
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/interrupt.h> /* For in_interrupt() */
28#include <linux/config.h>
29#include <linux/delay.h>
30#include <linux/smp.h>
31#include <linux/security.h>
32#include <linux/bootmem.h>
33#include <linux/syscalls.h>
34
35#include <asm/uaccess.h>
36
37#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
38
39/* printk's without a loglevel use this.. */
40#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
41
42/* We show everything that is MORE important than this.. */
43#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
44#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
45
46DECLARE_WAIT_QUEUE_HEAD(log_wait);
47
48int console_printk[4] = {
49 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
50 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
51 MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
53};
54
55EXPORT_SYMBOL(console_printk);
56
57/*
58 * Low-level drivers may need this to know whether they can schedule in
59 * their unblank() callback or not. So let's export it.
60 */
61int oops_in_progress;
62EXPORT_SYMBOL(oops_in_progress);
63
64/*
65 * console_sem protects the console_drivers list, and also
66 * provides serialisation for access to the entire console
67 * driver system.
68 */
69static DECLARE_MUTEX(console_sem);
70struct console *console_drivers;
71/*
72 * This is used for debugging the mess that is the VT code, by
73 * keeping track of whether we have the console semaphore held. It's
74 * definitely not the perfect debug tool (we don't know if _WE_
75 * hold it and are racing), but it helps track down those weird code
76 * paths in the console code where we end up in places I want
77 * locked without the console semaphore held.
78 */
79static int console_locked;
80
81/*
82 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
83 * It is also used in interesting ways to provide interlocking in
84 * release_console_sem().
85 */
86static DEFINE_SPINLOCK(logbuf_lock);
87
88static char __log_buf[__LOG_BUF_LEN];
89static char *log_buf = __log_buf;
90static int log_buf_len = __LOG_BUF_LEN;
91
92#define LOG_BUF_MASK (log_buf_len-1)
93#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
94
95/*
96 * The indices into log_buf are not constrained to log_buf_len - they
97 * must be masked before subscripting
98 */
99static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */
100static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */
101static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */
102static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
103
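For example, with log_buf_len = 16384 (CONFIG_LOG_BUF_SHIFT = 14) the indices simply keep incrementing: log_end = 20000 refers to byte 20000 & 16383 = 3616 of log_buf, and the amount of unread data is always log_end - log_start regardless of how many times the ring has wrapped, which is exactly what do_syslog() relies on below.
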
104/*
105 * Array of consoles built from command line options (console=)
106 */
107struct console_cmdline
108{
109 char name[8]; /* Name of the driver */
110 int index; /* Minor dev. to use */
111 char *options; /* Options for the driver */
112};
113
114#define MAX_CMDLINECONSOLES 8
115
116static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
117static int selected_console = -1;
118static int preferred_console = -1;
119
120/* Flag: console code may call schedule() */
121static int console_may_schedule;
122
123/*
124 * Setup a list of consoles. Called from init/main.c
125 */
126static int __init console_setup(char *str)
127{
128 char name[sizeof(console_cmdline[0].name)];
129 char *s, *options;
130 int idx;
131
132 /*
133 * Decode str into name, index, options.
134 */
135 if (str[0] >= '0' && str[0] <= '9') {
136 strcpy(name, "ttyS");
137 strncpy(name + 4, str, sizeof(name) - 5);
138 } else
139 strncpy(name, str, sizeof(name) - 1);
140 name[sizeof(name) - 1] = 0;
141 if ((options = strchr(str, ',')) != NULL)
142 *(options++) = 0;
143#ifdef __sparc__
144 if (!strcmp(str, "ttya"))
145 strcpy(name, "ttyS0");
146 if (!strcmp(str, "ttyb"))
147 strcpy(name, "ttyS1");
148#endif
149 for(s = name; *s; s++)
150 if ((*s >= '0' && *s <= '9') || *s == ',')
151 break;
152 idx = simple_strtoul(s, NULL, 10);
153 *s = 0;
154
155 add_preferred_console(name, idx, options);
156 return 1;
157}
158
159__setup("console=", console_setup);
160
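For example, console=ttyS0,115200 decodes to name "ttyS", idx 0 and options "115200", while a value that begins with a digit, such as console=0, is treated as shorthand for ttyS0. When several console= options are given, the last one added becomes the preferred console (see add_preferred_console() below).
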
161/**
162 * add_preferred_console - add a device to the list of preferred consoles.
163 *
164 * The last preferred console added will be used for kernel messages
165 * and stdin/out/err for init. Normally this is used by console_setup
166 * above to handle user-supplied console arguments; however it can also
167 * be used by arch-specific code either to override the user or more
168 * commonly to provide a default console (ie from PROM variables) when
169 * the user has not supplied one.
170 */
171int __init add_preferred_console(char *name, int idx, char *options)
172{
173 struct console_cmdline *c;
174 int i;
175
176 /*
177 * See if this tty is not yet registered, and
178 * if we have a slot free.
179 */
180 for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
181 if (strcmp(console_cmdline[i].name, name) == 0 &&
182 console_cmdline[i].index == idx) {
183 selected_console = i;
184 return 0;
185 }
186 if (i == MAX_CMDLINECONSOLES)
187 return -E2BIG;
188 selected_console = i;
189 c = &console_cmdline[i];
190 memcpy(c->name, name, sizeof(c->name));
191 c->name[sizeof(c->name) - 1] = 0;
192 c->options = options;
193 c->index = idx;
194 return 0;
195}
196
197static int __init log_buf_len_setup(char *str)
198{
199 unsigned long size = memparse(str, &str);
200 unsigned long flags;
201
202 if (size)
203 size = roundup_pow_of_two(size);
204 if (size > log_buf_len) {
205 unsigned long start, dest_idx, offset;
206 char * new_log_buf;
207
208 new_log_buf = alloc_bootmem(size);
209 if (!new_log_buf) {
210 printk("log_buf_len: allocation failed\n");
211 goto out;
212 }
213
214 spin_lock_irqsave(&logbuf_lock, flags);
215 log_buf_len = size;
216 log_buf = new_log_buf;
217
218 offset = start = min(con_start, log_start);
219 dest_idx = 0;
220 while (start != log_end) {
221 log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)];
222 start++;
223 dest_idx++;
224 }
225 log_start -= offset;
226 con_start -= offset;
227 log_end -= offset;
228 spin_unlock_irqrestore(&logbuf_lock, flags);
229
230 printk("log_buf_len: %d\n", log_buf_len);
231 }
232out:
233
234 return 1;
235}
236
237__setup("log_buf_len=", log_buf_len_setup);
238
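For example, booting with log_buf_len=100k makes memparse() return 102400, which roundup_pow_of_two() rounds to 131072; a 128 KB buffer is then allocated from bootmem and the contents of the static __log_buf are copied into it. Sizes that are not larger than the built-in buffer (1 << CONFIG_LOG_BUF_SHIFT) are silently ignored.
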
239/*
240 * Commands to do_syslog:
241 *
242 * 0 -- Close the log. Currently a NOP.
243 * 1 -- Open the log. Currently a NOP.
244 * 2 -- Read from the log.
245 * 3 -- Read all messages remaining in the ring buffer.
246 * 4 -- Read and clear all messages remaining in the ring buffer
247 * 5 -- Clear ring buffer.
248 * 6 -- Disable printk's to console
249 * 7 -- Enable printk's to console
250 * 8 -- Set level of messages printed to console
251 * 9 -- Return number of unread characters in the log buffer
252 * 10 -- Return size of the log buffer
253 */
254int do_syslog(int type, char __user * buf, int len)
255{
256 unsigned long i, j, limit, count;
257 int do_clear = 0;
258 char c;
259 int error = 0;
260
261 error = security_syslog(type);
262 if (error)
263 return error;
264
265 switch (type) {
266 case 0: /* Close log */
267 break;
268 case 1: /* Open log */
269 break;
270 case 2: /* Read from log */
271 error = -EINVAL;
272 if (!buf || len < 0)
273 goto out;
274 error = 0;
275 if (!len)
276 goto out;
277 if (!access_ok(VERIFY_WRITE, buf, len)) {
278 error = -EFAULT;
279 goto out;
280 }
281 error = wait_event_interruptible(log_wait, (log_start - log_end));
282 if (error)
283 goto out;
284 i = 0;
285 spin_lock_irq(&logbuf_lock);
286 while (!error && (log_start != log_end) && i < len) {
287 c = LOG_BUF(log_start);
288 log_start++;
289 spin_unlock_irq(&logbuf_lock);
290 error = __put_user(c,buf);
291 buf++;
292 i++;
293 cond_resched();
294 spin_lock_irq(&logbuf_lock);
295 }
296 spin_unlock_irq(&logbuf_lock);
297 if (!error)
298 error = i;
299 break;
300 case 4: /* Read/clear last kernel messages */
301 do_clear = 1;
302 /* FALL THRU */
303 case 3: /* Read last kernel messages */
304 error = -EINVAL;
305 if (!buf || len < 0)
306 goto out;
307 error = 0;
308 if (!len)
309 goto out;
310 if (!access_ok(VERIFY_WRITE, buf, len)) {
311 error = -EFAULT;
312 goto out;
313 }
314 count = len;
315 if (count > log_buf_len)
316 count = log_buf_len;
317 spin_lock_irq(&logbuf_lock);
318 if (count > logged_chars)
319 count = logged_chars;
320 if (do_clear)
321 logged_chars = 0;
322 limit = log_end;
323 /*
324 * __put_user() could sleep, and while we sleep
325 * printk() could overwrite the messages
326 * we try to copy to user space. Therefore
327 * the messages are copied in reverse. <manfreds>
328 */
329 for(i = 0; i < count && !error; i++) {
330 j = limit-1-i;
331 if (j + log_buf_len < log_end)
332 break;
333 c = LOG_BUF(j);
334 spin_unlock_irq(&logbuf_lock);
335 error = __put_user(c,&buf[count-1-i]);
336 cond_resched();
337 spin_lock_irq(&logbuf_lock);
338 }
339 spin_unlock_irq(&logbuf_lock);
340 if (error)
341 break;
342 error = i;
343 if(i != count) {
344 int offset = count-error;
345 /* buffer overflow during copy, correct user buffer. */
346 for(i=0;i<error;i++) {
347 if (__get_user(c,&buf[i+offset]) ||
348 __put_user(c,&buf[i])) {
349 error = -EFAULT;
350 break;
351 }
352 cond_resched();
353 }
354 }
355 break;
356 case 5: /* Clear ring buffer */
357 logged_chars = 0;
358 break;
359 case 6: /* Disable logging to console */
360 console_loglevel = minimum_console_loglevel;
361 break;
362 case 7: /* Enable logging to console */
363 console_loglevel = default_console_loglevel;
364 break;
365 case 8: /* Set level of messages printed to console */
366 error = -EINVAL;
367 if (len < 1 || len > 8)
368 goto out;
369 if (len < minimum_console_loglevel)
370 len = minimum_console_loglevel;
371 console_loglevel = len;
372 error = 0;
373 break;
374 case 9: /* Number of chars in the log buffer */
375 error = log_end - log_start;
376 break;
377 case 10: /* Size of the log buffer */
378 error = log_buf_len;
379 break;
380 default:
381 error = -EINVAL;
382 break;
383 }
384out:
385 return error;
386}
387
388asmlinkage long sys_syslog(int type, char __user * buf, int len)
389{
390 return do_syslog(type, buf, len);
391}
392
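From user space this interface is the syslog(2) system call, which glibc exposes as klogctl(). A small self-contained sketch (not part of this file) that dumps the whole ring buffer using command 10 (buffer size) and command 3 (read all), normally requiring root:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/klog.h>

    int main(void)
    {
            int len = klogctl(10, NULL, 0);   /* command 10: size of log_buf */
            char *buf;
            int n;

            if (len <= 0)
                    len = 1 << 14;            /* fall back to a sane default */
            buf = malloc(len);
            if (!buf)
                    return 1;

            n = klogctl(3, buf, len);         /* command 3: read all messages */
            if (n < 0) {
                    perror("klogctl");
                    free(buf);
                    return 1;
            }
            fwrite(buf, 1, (size_t)n, stdout);
            free(buf);
            return 0;
    }
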
393/*
394 * Call the console drivers on a range of log_buf
395 */
396static void __call_console_drivers(unsigned long start, unsigned long end)
397{
398 struct console *con;
399
400 for (con = console_drivers; con; con = con->next) {
401 if ((con->flags & CON_ENABLED) && con->write)
402 con->write(con, &LOG_BUF(start), end - start);
403 }
404}
405
406/*
407 * Write out chars from start to end - 1 inclusive
408 */
409static void _call_console_drivers(unsigned long start,
410 unsigned long end, int msg_log_level)
411{
412 if (msg_log_level < console_loglevel &&
413 console_drivers && start != end) {
414 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
415 /* wrapped write */
416 __call_console_drivers(start & LOG_BUF_MASK,
417 log_buf_len);
418 __call_console_drivers(0, end & LOG_BUF_MASK);
419 } else {
420 __call_console_drivers(start, end);
421 }
422 }
423}
424
425/*
426 * Call the console drivers, asking them to write out
427 * log_buf[start] to log_buf[end - 1].
428 * The console_sem must be held.
429 */
430static void call_console_drivers(unsigned long start, unsigned long end)
431{
432 unsigned long cur_index, start_print;
433 static int msg_level = -1;
434
435 if (((long)(start - end)) > 0)
436 BUG();
437
438 cur_index = start;
439 start_print = start;
440 while (cur_index != end) {
441 if ( msg_level < 0 &&
442 ((end - cur_index) > 2) &&
443 LOG_BUF(cur_index + 0) == '<' &&
444 LOG_BUF(cur_index + 1) >= '0' &&
445 LOG_BUF(cur_index + 1) <= '7' &&
446 LOG_BUF(cur_index + 2) == '>')
447 {
448 msg_level = LOG_BUF(cur_index + 1) - '0';
449 cur_index += 3;
450 start_print = cur_index;
451 }
452 while (cur_index != end) {
453 char c = LOG_BUF(cur_index);
454 cur_index++;
455
456 if (c == '\n') {
457 if (msg_level < 0) {
458 /*
459 * printk() has already given us loglevel tags in
460 * the buffer. This code is here in case the
461 * log buffer has wrapped right round and scribbled
462 * on those tags
463 */
464 msg_level = default_message_loglevel;
465 }
466 _call_console_drivers(start_print, cur_index, msg_level);
467 msg_level = -1;
468 start_print = cur_index;
469 break;
470 }
471 }
472 }
473 _call_console_drivers(start_print, end, msg_level);
474}
475
476static void emit_log_char(char c)
477{
478 LOG_BUF(log_end) = c;
479 log_end++;
480 if (log_end - log_start > log_buf_len)
481 log_start = log_end - log_buf_len;
482 if (log_end - con_start > log_buf_len)
483 con_start = log_end - log_buf_len;
484 if (logged_chars < log_buf_len)
485 logged_chars++;
486}
487
488/*
489 * Zap console-related locks when oopsing. Only zap at most once
490 * every 30 seconds, to leave time for slow consoles to print a
491 * full oops.
492 */
493static void zap_locks(void)
494{
495 static unsigned long oops_timestamp;
496
497 if (time_after_eq(jiffies, oops_timestamp) &&
498 !time_after(jiffies, oops_timestamp + 30*HZ))
499 return;
500
501 oops_timestamp = jiffies;
502
503 /* If a crash is occurring, make sure we can't deadlock */
504 spin_lock_init(&logbuf_lock);
505 /* And make sure that we print immediately */
506 init_MUTEX(&console_sem);
507}
508
509#if defined(CONFIG_PRINTK_TIME)
510static int printk_time = 1;
511#else
512static int printk_time = 0;
513#endif
514
515static int __init printk_time_setup(char *str)
516{
517 if (*str)
518 return 0;
519 printk_time = 1;
520 return 1;
521}
522
523__setup("time", printk_time_setup);
524
525/*
526 * This is printk. It can be called from any context. We want it to work.
527 *
528 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
529 * call the console drivers. If we fail to get the semaphore we place the output
530 * into the log buffer and return. The current holder of the console_sem will
531 * notice the new output in release_console_sem() and will send it to the
532 * consoles before releasing the semaphore.
533 *
534 * One effect of this deferred printing is that code which calls printk() and
535 * then changes console_loglevel may break. This is because console_loglevel
536 * is inspected when the actual printing occurs.
537 */
538asmlinkage int printk(const char *fmt, ...)
539{
540 va_list args;
541 int r;
542
543 va_start(args, fmt);
544 r = vprintk(fmt, args);
545 va_end(args);
546
547 return r;
548}
549
550asmlinkage int vprintk(const char *fmt, va_list args)
551{
552 unsigned long flags;
553 int printed_len;
554 char *p;
555 static char printk_buf[1024];
556 static int log_level_unknown = 1;
557
558 if (unlikely(oops_in_progress))
559 zap_locks();
560
561 /* This stops the holder of console_sem just where we want him */
562 spin_lock_irqsave(&logbuf_lock, flags);
563
564 /* Emit the output into the temporary buffer */
565 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
566
567 /*
568 * Copy the output into log_buf. If the caller didn't provide
569 * appropriate log level tags, we insert them here
570 */
571 for (p = printk_buf; *p; p++) {
572 if (log_level_unknown) {
573 /* log_level_unknown signals the start of a new line */
574 if (printk_time) {
575 int loglev_char;
576 char tbuf[50], *tp;
577 unsigned tlen;
578 unsigned long long t;
579 unsigned long nanosec_rem;
580
581 /*
582 * force the log level token to be
583 * before the time output.
584 */
585 if (p[0] == '<' && p[1] >='0' &&
586 p[1] <= '7' && p[2] == '>') {
587 loglev_char = p[1];
588 p += 3;
589 printed_len += 3;
590 } else {
591 loglev_char = default_message_loglevel
592 + '0';
593 }
594 t = sched_clock();
595 nanosec_rem = do_div(t, 1000000000);
596 tlen = sprintf(tbuf,
597 "<%c>[%5lu.%06lu] ",
598 loglev_char,
599 (unsigned long)t,
600 nanosec_rem/1000);
601
602 for (tp = tbuf; tp < tbuf + tlen; tp++)
603 emit_log_char(*tp);
604 printed_len += tlen - 3;
605 } else {
606 if (p[0] != '<' || p[1] < '0' ||
607 p[1] > '7' || p[2] != '>') {
608 emit_log_char('<');
609 emit_log_char(default_message_loglevel
610 + '0');
611 emit_log_char('>');
612 }
613 printed_len += 3;
614 }
615 log_level_unknown = 0;
616 if (!*p)
617 break;
618 }
619 emit_log_char(*p);
620 if (*p == '\n')
621 log_level_unknown = 1;
622 }
623
624 if (!cpu_online(smp_processor_id()) &&
625 system_state != SYSTEM_RUNNING) {
626 /*
627 * Some console drivers may assume that per-cpu resources have
628 * been allocated. So don't allow them to be called by this
629 * CPU until it is officially up. We shouldn't be calling into
630 * random console drivers on a CPU which doesn't exist yet..
631 */
632 spin_unlock_irqrestore(&logbuf_lock, flags);
633 goto out;
634 }
635 if (!down_trylock(&console_sem)) {
636 console_locked = 1;
637 /*
638 * We own the drivers. We can drop the spinlock and let
639 * release_console_sem() print the text
640 */
641 spin_unlock_irqrestore(&logbuf_lock, flags);
642 console_may_schedule = 0;
643 release_console_sem();
644 } else {
645 /*
646 * Someone else owns the drivers. We drop the spinlock, which
647 * allows the semaphore holder to proceed and to call the
648 * console drivers with the output which we just produced.
649 */
650 spin_unlock_irqrestore(&logbuf_lock, flags);
651 }
652out:
653 return printed_len;
654}
655EXPORT_SYMBOL(printk);
656EXPORT_SYMBOL(vprintk);
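
The comment above printk() describes a try-lock hand-off: output is always appended to the log buffer first, and whoever manages to take console_sem is responsible for flushing everything that accumulated, including text queued by callers that failed the trylock. A rough userspace analogue using POSIX threads (hypothetical deferred_log()/flush_pending() names; a mutex stands in for console_sem and a flat string buffer for the log buffer):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t console_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
static char pending[4096];               /* stands in for the log buffer */

/* Called with console_lock held: emit and clear everything buffered so far. */
static void flush_pending(void)
{
	for (;;) {
		char out[4096];

		pthread_mutex_lock(&buf_lock);
		if (!pending[0]) {
			pthread_mutex_unlock(&buf_lock);
			return;
		}
		strcpy(out, pending);
		pending[0] = '\0';
		pthread_mutex_unlock(&buf_lock);

		fputs(out, stdout);      /* "call the console drivers" */
	}
}

/* Analogue of printk(): always buffer, flush only if we get the lock. */
static void deferred_log(const char *msg)
{
	pthread_mutex_lock(&buf_lock);
	strncat(pending, msg, sizeof(pending) - strlen(pending) - 1);
	pthread_mutex_unlock(&buf_lock);

	if (pthread_mutex_trylock(&console_lock) == 0) {
		flush_pending();         /* we own the console: drain the buffer */
		pthread_mutex_unlock(&console_lock);
	}
	/* else: the current owner will notice our text and print it */
}

int main(void)
{
	deferred_log("first line\n");
	deferred_log("second line\n");
	return 0;
}

Unlike release_console_sem(), this sketch leaves a small window where text queued just as the owner drops the lock waits for the next call; the kernel closes that window by releasing console_sem while still holding logbuf_lock, having just confirmed the buffer is drained.
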
657
658/**
659 * acquire_console_sem - lock the console system for exclusive use.
660 *
661 * Acquires a semaphore which guarantees that the caller has
662 * exclusive access to the console system and the console_drivers list.
663 *
664 * Can sleep, returns nothing.
665 */
666void acquire_console_sem(void)
667{
668 if (in_interrupt())
669 BUG();
670 down(&console_sem);
671 console_locked = 1;
672 console_may_schedule = 1;
673}
674EXPORT_SYMBOL(acquire_console_sem);
675
676int try_acquire_console_sem(void)
677{
678 if (down_trylock(&console_sem))
679 return -1;
680 console_locked = 1;
681 console_may_schedule = 0;
682 return 0;
683}
684EXPORT_SYMBOL(try_acquire_console_sem);
685
686int is_console_locked(void)
687{
688 return console_locked;
689}
690EXPORT_SYMBOL(is_console_locked);
691
692/**
693 * release_console_sem - unlock the console system
694 *
695 * Releases the semaphore which the caller holds on the console system
696 * and the console driver list.
697 *
698 * While the semaphore was held, console output may have been buffered
699 * by printk(). If this is the case, release_console_sem() emits
700 * the output prior to releasing the semaphore.
701 *
702 * If there is output waiting for klogd, we wake it up.
703 *
704 * release_console_sem() may be called from any context.
705 */
706void release_console_sem(void)
707{
708 unsigned long flags;
709 unsigned long _con_start, _log_end;
710 unsigned long wake_klogd = 0;
711
712 for ( ; ; ) {
713 spin_lock_irqsave(&logbuf_lock, flags);
714 wake_klogd |= log_start - log_end;
715 if (con_start == log_end)
716 break; /* Nothing to print */
717 _con_start = con_start;
718 _log_end = log_end;
719 con_start = log_end; /* Flush */
720 spin_unlock(&logbuf_lock);
721 call_console_drivers(_con_start, _log_end);
722 local_irq_restore(flags);
723 }
724 console_locked = 0;
725 console_may_schedule = 0;
726 up(&console_sem);
727 spin_unlock_irqrestore(&logbuf_lock, flags);
728 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
729 wake_up_interruptible(&log_wait);
730}
731EXPORT_SYMBOL(release_console_sem);
732
733/**
734 * console_conditional_schedule - yield the CPU if required
735 * If the console code is currently allowed to sleep, and
736 * if this CPU should yield the CPU to another task, do
737 * so here.
738 *
739 * Must be called within acquire_console_sem().
740 */
741void __sched console_conditional_schedule(void)
742{
743 if (console_may_schedule)
744 cond_resched();
745}
746EXPORT_SYMBOL(console_conditional_schedule);
747
748void console_print(const char *s)
749{
750 printk(KERN_EMERG "%s", s);
751}
752EXPORT_SYMBOL(console_print);
753
754void console_unblank(void)
755{
756 struct console *c;
757
758 /*
759 * console_unblank can no longer be called in interrupt context unless
760 * oops_in_progress is set to 1..
761 */
762 if (oops_in_progress) {
763 if (down_trylock(&console_sem) != 0)
764 return;
765 } else
766 acquire_console_sem();
767
768 console_locked = 1;
769 console_may_schedule = 0;
770 for (c = console_drivers; c != NULL; c = c->next)
771 if ((c->flags & CON_ENABLED) && c->unblank)
772 c->unblank();
773 release_console_sem();
774}
775EXPORT_SYMBOL(console_unblank);
776
777/*
778 * Return the console tty driver structure and its associated index
779 */
780struct tty_driver *console_device(int *index)
781{
782 struct console *c;
783 struct tty_driver *driver = NULL;
784
785 acquire_console_sem();
786 for (c = console_drivers; c != NULL; c = c->next) {
787 if (!c->device)
788 continue;
789 driver = c->device(c, index);
790 if (driver)
791 break;
792 }
793 release_console_sem();
794 return driver;
795}
796
797/*
798 * Prevent further output on the passed console device so that (for example)
799 * serial drivers can disable console output before suspending a port, and can
800 * re-enable output afterwards.
801 */
802void console_stop(struct console *console)
803{
804 acquire_console_sem();
805 console->flags &= ~CON_ENABLED;
806 release_console_sem();
807}
808EXPORT_SYMBOL(console_stop);
809
810void console_start(struct console *console)
811{
812 acquire_console_sem();
813 console->flags |= CON_ENABLED;
814 release_console_sem();
815}
816EXPORT_SYMBOL(console_start);
817
818/*
819 * The console driver calls this routine during kernel initialization
820 * to register the console printing procedure with printk() and to
821 * print any messages that were printed by the kernel before the
822 * console driver was initialized.
823 */
824void register_console(struct console * console)
825{
826 int i;
827 unsigned long flags;
828
829 if (preferred_console < 0)
830 preferred_console = selected_console;
831
832 /*
833 * See if we want to use this console driver. If we
834 * didn't select a console, we take the first one
835 * that registers here.
836 */
837 if (preferred_console < 0) {
838 if (console->index < 0)
839 console->index = 0;
840 if (console->setup == NULL ||
841 console->setup(console, NULL) == 0) {
842 console->flags |= CON_ENABLED | CON_CONSDEV;
843 preferred_console = 0;
844 }
845 }
846
847 /*
848 * See if this console matches one we selected on
849 * the command line.
850 */
851 for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) {
852 if (strcmp(console_cmdline[i].name, console->name) != 0)
853 continue;
854 if (console->index >= 0 &&
855 console->index != console_cmdline[i].index)
856 continue;
857 if (console->index < 0)
858 console->index = console_cmdline[i].index;
859 if (console->setup &&
860 console->setup(console, console_cmdline[i].options) != 0)
861 break;
862 console->flags |= CON_ENABLED;
863 console->index = console_cmdline[i].index;
864 if (i == preferred_console)
865 console->flags |= CON_CONSDEV;
866 break;
867 }
868
869 if (!(console->flags & CON_ENABLED))
870 return;
871
872 if (console_drivers && (console_drivers->flags & CON_BOOT)) {
873 unregister_console(console_drivers);
874 console->flags &= ~CON_PRINTBUFFER;
875 }
876
877 /*
878 * Put this console in the list - keep the
879 * preferred driver at the head of the list.
880 */
881 acquire_console_sem();
882 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
883 console->next = console_drivers;
884 console_drivers = console;
885 } else {
886 console->next = console_drivers->next;
887 console_drivers->next = console;
888 }
889 if (console->flags & CON_PRINTBUFFER) {
890 /*
891 * release_console_sem() will print out the buffered messages
892 * for us.
893 */
894 spin_lock_irqsave(&logbuf_lock, flags);
895 con_start = log_start;
896 spin_unlock_irqrestore(&logbuf_lock, flags);
897 }
898 release_console_sem();
899}
900EXPORT_SYMBOL(register_console);
901
902int unregister_console(struct console * console)
903{
904 struct console *a,*b;
905 int res = 1;
906
907 acquire_console_sem();
908 if (console_drivers == console) {
909 console_drivers=console->next;
910 res = 0;
911 } else {
912 for (a=console_drivers->next, b=console_drivers ;
913 a; b=a, a=b->next) {
914 if (a == console) {
915 b->next = a->next;
916 res = 0;
917 break;
918 }
919 }
920 }
921
922 /* If last console is removed, we re-enable picking the first
923 * one that gets registered. Without that, pmac early boot console
924 * would prevent fbcon from taking over.
925 */
926 if (console_drivers == NULL)
927 preferred_console = selected_console;
928
929
930 release_console_sem();
931 return res;
932}
933EXPORT_SYMBOL(unregister_console);
934
935/**
936 * tty_write_message - write a message to a certain tty, not just the console.
937 *
938 * This is used for messages that need to be redirected to a specific tty.
939 * We don't put it into the syslog queue right now; maybe in the future if
940 * it's really needed.
941 */
942void tty_write_message(struct tty_struct *tty, char *msg)
943{
944 if (tty && tty->driver->write)
945 tty->driver->write(tty, msg, strlen(msg));
946 return;
947}
948
949/*
950 * printk rate limiting, lifted from the networking subsystem.
951 *
952 * This enforces a rate limit: not more than one kernel message
953 * every printk_ratelimit_jiffies to make a denial-of-service
954 * attack impossible.
955 */
956int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
957{
958 static DEFINE_SPINLOCK(ratelimit_lock);
959 static unsigned long toks = 10*5*HZ;
960 static unsigned long last_msg;
961 static int missed;
962 unsigned long flags;
963 unsigned long now = jiffies;
964
965 spin_lock_irqsave(&ratelimit_lock, flags);
966 toks += now - last_msg;
967 last_msg = now;
968 if (toks > (ratelimit_burst * ratelimit_jiffies))
969 toks = ratelimit_burst * ratelimit_jiffies;
970 if (toks >= ratelimit_jiffies) {
971 int lost = missed;
972 missed = 0;
973 toks -= ratelimit_jiffies;
974 spin_unlock_irqrestore(&ratelimit_lock, flags);
975 if (lost)
976 printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
977 return 1;
978 }
979 missed++;
980 spin_unlock_irqrestore(&ratelimit_lock, flags);
981 return 0;
982}
983EXPORT_SYMBOL(__printk_ratelimit);
984
985/* minimum time in jiffies between messages */
986int printk_ratelimit_jiffies = 5*HZ;
987
988/* number of messages we send before ratelimiting */
989int printk_ratelimit_burst = 10;
990
991int printk_ratelimit(void)
992{
993 return __printk_ratelimit(printk_ratelimit_jiffies,
994 printk_ratelimit_burst);
995}
996EXPORT_SYMBOL(printk_ratelimit);
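
As the comment above says, this is the token-bucket limiter lifted from the networking code: elapsed jiffies replenish tokens, each emitted message spends ratelimit_jiffies of them, and the burst cap bounds how far the bucket can fill. A standalone C sketch with the same shape (hypothetical should_print() helper, whole seconds instead of jiffies, single-threaded so no lock) is:

#include <stdio.h>
#include <time.h>

#define INTERVAL 5      /* seconds between messages once the burst is used up */
#define BURST    10     /* messages allowed before limiting kicks in */

/* Token bucket: one token per elapsed second, INTERVAL tokens per message. */
static int should_print(void)
{
	static time_t last;
	static long toks = BURST * INTERVAL;
	static int missed;
	time_t now = time(NULL);

	toks += now - last;
	last = now;
	if (toks > BURST * INTERVAL)
		toks = BURST * INTERVAL;
	if (toks >= INTERVAL) {
		toks -= INTERVAL;
		if (missed) {
			printf("ratelimit: %d messages suppressed\n", missed);
			missed = 0;
		}
		return 1;
	}
	missed++;
	return 0;
}

int main(void)
{
	for (int i = 0; i < 20; i++)
		if (should_print())
			printf("message %d\n", i);
	return 0;
}
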
diff --git a/kernel/profile.c b/kernel/profile.c
new file mode 100644
index 000000000000..a38fa70075fe
--- /dev/null
+++ b/kernel/profile.c
@@ -0,0 +1,563 @@
1/*
2 * linux/kernel/profile.c
3 * Simple profiling. Manages a direct-mapped profile hit count buffer,
4 * with configurable resolution, support for restricting the cpus on
5 * which profiling is done, and switching between cpu time and
6 * schedule() calls via kernel command line parameters passed at boot.
7 *
8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
9 * Red Hat, July 2004
10 * Consolidation of architecture support code for profiling,
11 * William Irwin, Oracle, July 2004
12 * Amortized hit count accounting via per-cpu open-addressed hashtables
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/profile.h>
19#include <linux/bootmem.h>
20#include <linux/notifier.h>
21#include <linux/mm.h>
22#include <linux/cpumask.h>
23#include <linux/cpu.h>
24#include <linux/profile.h>
25#include <linux/highmem.h>
26#include <asm/sections.h>
27#include <asm/semaphore.h>
28
29struct profile_hit {
30 u32 pc, hits;
31};
32#define PROFILE_GRPSHIFT 3
33#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
34#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
35#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
36
37/* Oprofile timer tick hook */
38int (*timer_hook)(struct pt_regs *);
39
40static atomic_t *prof_buffer;
41static unsigned long prof_len, prof_shift;
42static int prof_on;
43static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
44#ifdef CONFIG_SMP
45static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
46static DEFINE_PER_CPU(int, cpu_profile_flip);
47static DECLARE_MUTEX(profile_flip_mutex);
48#endif /* CONFIG_SMP */
49
50static int __init profile_setup(char * str)
51{
52 int par;
53
54 if (!strncmp(str, "schedule", 8)) {
55 prof_on = SCHED_PROFILING;
56 printk(KERN_INFO "kernel schedule profiling enabled\n");
57 if (str[7] == ',')
58 str += 8;
59 }
60 if (get_option(&str,&par)) {
61 prof_shift = par;
62 prof_on = CPU_PROFILING;
63 printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
64 prof_shift);
65 }
66 return 1;
67}
68__setup("profile=", profile_setup);
69
70
71void __init profile_init(void)
72{
73 if (!prof_on)
74 return;
75
76 /* only text is profiled */
77 prof_len = (_etext - _stext) >> prof_shift;
78 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
79}
80
81/* Profile event notifications */
82
83#ifdef CONFIG_PROFILING
84
85static DECLARE_RWSEM(profile_rwsem);
86static DEFINE_RWLOCK(handoff_lock);
87static struct notifier_block * task_exit_notifier;
88static struct notifier_block * task_free_notifier;
89static struct notifier_block * munmap_notifier;
90
91void profile_task_exit(struct task_struct * task)
92{
93 down_read(&profile_rwsem);
94 notifier_call_chain(&task_exit_notifier, 0, task);
95 up_read(&profile_rwsem);
96}
97
98int profile_handoff_task(struct task_struct * task)
99{
100 int ret;
101 read_lock(&handoff_lock);
102 ret = notifier_call_chain(&task_free_notifier, 0, task);
103 read_unlock(&handoff_lock);
104 return (ret == NOTIFY_OK) ? 1 : 0;
105}
106
107void profile_munmap(unsigned long addr)
108{
109 down_read(&profile_rwsem);
110 notifier_call_chain(&munmap_notifier, 0, (void *)addr);
111 up_read(&profile_rwsem);
112}
113
114int task_handoff_register(struct notifier_block * n)
115{
116 int err = -EINVAL;
117
118 write_lock(&handoff_lock);
119 err = notifier_chain_register(&task_free_notifier, n);
120 write_unlock(&handoff_lock);
121 return err;
122}
123
124int task_handoff_unregister(struct notifier_block * n)
125{
126 int err = -EINVAL;
127
128 write_lock(&handoff_lock);
129 err = notifier_chain_unregister(&task_free_notifier, n);
130 write_unlock(&handoff_lock);
131 return err;
132}
133
134int profile_event_register(enum profile_type type, struct notifier_block * n)
135{
136 int err = -EINVAL;
137
138 down_write(&profile_rwsem);
139
140 switch (type) {
141 case PROFILE_TASK_EXIT:
142 err = notifier_chain_register(&task_exit_notifier, n);
143 break;
144 case PROFILE_MUNMAP:
145 err = notifier_chain_register(&munmap_notifier, n);
146 break;
147 }
148
149 up_write(&profile_rwsem);
150
151 return err;
152}
153
154
155int profile_event_unregister(enum profile_type type, struct notifier_block * n)
156{
157 int err = -EINVAL;
158
159 down_write(&profile_rwsem);
160
161 switch (type) {
162 case PROFILE_TASK_EXIT:
163 err = notifier_chain_unregister(&task_exit_notifier, n);
164 break;
165 case PROFILE_MUNMAP:
166 err = notifier_chain_unregister(&munmap_notifier, n);
167 break;
168 }
169
170 up_write(&profile_rwsem);
171 return err;
172}
173
174int register_timer_hook(int (*hook)(struct pt_regs *))
175{
176 if (timer_hook)
177 return -EBUSY;
178 timer_hook = hook;
179 return 0;
180}
181
182void unregister_timer_hook(int (*hook)(struct pt_regs *))
183{
184 WARN_ON(hook != timer_hook);
185 timer_hook = NULL;
186 /* make sure all CPUs see the NULL hook */
187 synchronize_kernel();
188}
189
190EXPORT_SYMBOL_GPL(register_timer_hook);
191EXPORT_SYMBOL_GPL(unregister_timer_hook);
192EXPORT_SYMBOL_GPL(task_handoff_register);
193EXPORT_SYMBOL_GPL(task_handoff_unregister);
194
195#endif /* CONFIG_PROFILING */
196
197EXPORT_SYMBOL_GPL(profile_event_register);
198EXPORT_SYMBOL_GPL(profile_event_unregister);
199
200#ifdef CONFIG_SMP
201/*
202 * Each cpu has a pair of open-addressed hashtables for pending
203 * profile hits. read_profile() IPI's all cpus to request them
204 * to flip buffers and flushes their contents to prof_buffer itself.
205 * Flip requests are serialized by the profile_flip_mutex. The sole
206 * use of having a second hashtable is for avoiding cacheline
207 * contention that would otherwise happen during flushes of pending
208 * profile hits required for the accuracy of reported profile hits
209 * and so resurrect the interrupt livelock issue.
210 *
211 * The open-addressed hashtables are indexed by profile buffer slot
212 * and hold the number of pending hits to that profile buffer slot on
213 * a cpu in an entry. When the hashtable overflows, all pending hits
214 * are accounted to their corresponding profile buffer slots with
215 * atomic_add() and the hashtable emptied. As numerous pending hits
216 * may be accounted to a profile buffer slot in a hashtable entry,
217 * this amortizes a number of atomic profile buffer increments likely
218 * to be far larger than the number of entries in the hashtable,
219 * particularly given that the number of distinct profile buffer
220 * positions to which hits are accounted during short intervals (e.g.
221 * several seconds) is usually very small. Exclusion from buffer
222 * flipping is provided by interrupt disablement (note that for
223 * SCHED_PROFILING profile_hit() may be called from process context).
224 * The hash function is meant to be lightweight as opposed to strong,
225 * and was vaguely inspired by ppc64 firmware-supported inverted
226 * pagetable hash functions, but uses a full hashtable of finite
227 * collision chains, not just pairs of them.
228 *
229 * -- wli
230 */
231static void __profile_flip_buffers(void *unused)
232{
233 int cpu = smp_processor_id();
234
235 per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
236}
237
238static void profile_flip_buffers(void)
239{
240 int i, j, cpu;
241
242 down(&profile_flip_mutex);
243 j = per_cpu(cpu_profile_flip, get_cpu());
244 put_cpu();
245 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
246 for_each_online_cpu(cpu) {
247 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
248 for (i = 0; i < NR_PROFILE_HIT; ++i) {
249 if (!hits[i].hits) {
250 if (hits[i].pc)
251 hits[i].pc = 0;
252 continue;
253 }
254 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
255 hits[i].hits = hits[i].pc = 0;
256 }
257 }
258 up(&profile_flip_mutex);
259}
260
261static void profile_discard_flip_buffers(void)
262{
263 int i, cpu;
264
265 down(&profile_flip_mutex);
266 i = per_cpu(cpu_profile_flip, get_cpu());
267 put_cpu();
268 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
269 for_each_online_cpu(cpu) {
270 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
271 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
272 }
273 up(&profile_flip_mutex);
274}
275
276void profile_hit(int type, void *__pc)
277{
278 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
279 int i, j, cpu;
280 struct profile_hit *hits;
281
282 if (prof_on != type || !prof_buffer)
283 return;
284 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
285 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
286 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
287 cpu = get_cpu();
288 hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
289 if (!hits) {
290 put_cpu();
291 return;
292 }
293 local_irq_save(flags);
294 do {
295 for (j = 0; j < PROFILE_GRPSZ; ++j) {
296 if (hits[i + j].pc == pc) {
297 hits[i + j].hits++;
298 goto out;
299 } else if (!hits[i + j].hits) {
300 hits[i + j].pc = pc;
301 hits[i + j].hits = 1;
302 goto out;
303 }
304 }
305 i = (i + secondary) & (NR_PROFILE_HIT - 1);
306 } while (i != primary);
307 atomic_inc(&prof_buffer[pc]);
308 for (i = 0; i < NR_PROFILE_HIT; ++i) {
309 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
310 hits[i].pc = hits[i].hits = 0;
311 }
312out:
313 local_irq_restore(flags);
314 put_cpu();
315}
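
To spell out the probing in profile_hit(): the primary index selects a group of PROFILE_GRPSZ slots from the low bits of the pc, and when a group is full the search advances by the pc-derived secondary stride, i.e. open addressing with double hashing; only when the whole table is exhausted does the hit go straight to prof_buffer and the table get flushed. A simplified, self-contained C sketch of that accumulation scheme (hypothetical hit_account()/flush_table() names, plain counters instead of atomic_t, no per-cpu or irq handling) is:

#include <stdio.h>

#define NR_SLOTS   64                     /* buckets to accumulate into */
#define GRPSHIFT   3
#define GRPSZ      (1 << GRPSHIFT)
#define NR_HIT     32                     /* open-addressed table entries */
#define NR_GRP     (NR_HIT / GRPSZ)

struct hit { unsigned pc, hits; };

static unsigned long slot[NR_SLOTS];      /* "prof_buffer" */
static struct hit table[NR_HIT];          /* hash table stand-in */

static void flush_table(void)
{
	for (int i = 0; i < NR_HIT; i++) {
		slot[table[i].pc] += table[i].hits;
		table[i].pc = table[i].hits = 0;
	}
}

/* Account one hit on bucket pc, amortizing updates through the hash table. */
static void hit_account(unsigned pc)
{
	unsigned primary = (pc & (NR_GRP - 1)) << GRPSHIFT;
	unsigned secondary = (~(pc << 1) & (NR_GRP - 1)) << GRPSHIFT;
	unsigned i = primary;

	do {
		for (int j = 0; j < GRPSZ; j++) {
			if (table[i + j].pc == pc) {
				table[i + j].hits++;
				return;
			} else if (!table[i + j].hits) {
				table[i + j].pc = pc;
				table[i + j].hits = 1;
				return;
			}
		}
		i = (i + secondary) & (NR_HIT - 1);
	} while (i != primary);

	slot[pc]++;                       /* table full: account directly... */
	flush_table();                    /* ...and drain everything pending */
}

int main(void)
{
	for (int n = 0; n < 10000; n++)
		hit_account(n % NR_SLOTS);
	flush_table();

	unsigned long total = 0;
	for (int i = 0; i < NR_SLOTS; i++)
		total += slot[i];
	printf("total hits accounted: %lu (expected 10000)\n", total);
	return 0;
}
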
316
317#ifdef CONFIG_HOTPLUG_CPU
318static int __devinit profile_cpu_callback(struct notifier_block *info,
319 unsigned long action, void *__cpu)
320{
321 int node, cpu = (unsigned long)__cpu;
322 struct page *page;
323
324 switch (action) {
325 case CPU_UP_PREPARE:
326 node = cpu_to_node(cpu);
327 per_cpu(cpu_profile_flip, cpu) = 0;
328 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
329 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
330 if (!page)
331 return NOTIFY_BAD;
332 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
333 }
334 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
335 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
336 if (!page)
337 goto out_free;
338 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
339 }
340 break;
341 out_free:
342 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
343 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
344 __free_page(page);
345 return NOTIFY_BAD;
346 case CPU_ONLINE:
347 cpu_set(cpu, prof_cpu_mask);
348 break;
349 case CPU_UP_CANCELED:
350 case CPU_DEAD:
351 cpu_clear(cpu, prof_cpu_mask);
352 if (per_cpu(cpu_profile_hits, cpu)[0]) {
353 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
354 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
355 __free_page(page);
356 }
357 if (per_cpu(cpu_profile_hits, cpu)[1]) {
358 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
359 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
360 __free_page(page);
361 }
362 break;
363 }
364 return NOTIFY_OK;
365}
366#endif /* CONFIG_HOTPLUG_CPU */
367#else /* !CONFIG_SMP */
368#define profile_flip_buffers() do { } while (0)
369#define profile_discard_flip_buffers() do { } while (0)
370
371void profile_hit(int type, void *__pc)
372{
373 unsigned long pc;
374
375 if (prof_on != type || !prof_buffer)
376 return;
377 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
378 atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
379}
380#endif /* !CONFIG_SMP */
381
382void profile_tick(int type, struct pt_regs *regs)
383{
384 if (type == CPU_PROFILING && timer_hook)
385 timer_hook(regs);
386 if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
387 profile_hit(type, (void *)profile_pc(regs));
388}
389
390#ifdef CONFIG_PROC_FS
391#include <linux/proc_fs.h>
392#include <asm/uaccess.h>
393#include <asm/ptrace.h>
394
395static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
396 int count, int *eof, void *data)
397{
398 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
399 if (count - len < 2)
400 return -EINVAL;
401 len += sprintf(page + len, "\n");
402 return len;
403}
404
405static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
406 unsigned long count, void *data)
407{
408 cpumask_t *mask = (cpumask_t *)data;
409 unsigned long full_count = count, err;
410 cpumask_t new_value;
411
412 err = cpumask_parse(buffer, count, new_value);
413 if (err)
414 return err;
415
416 *mask = new_value;
417 return full_count;
418}
419
420void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
421{
422 struct proc_dir_entry *entry;
423
424 /* create /proc/irq/prof_cpu_mask */
425 if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
426 return;
427 entry->nlink = 1;
428 entry->data = (void *)&prof_cpu_mask;
429 entry->read_proc = prof_cpu_mask_read_proc;
430 entry->write_proc = prof_cpu_mask_write_proc;
431}
432
433/*
434 * This function accesses profiling information. The returned data is
435 * binary: the sampling step and the actual contents of the profile
436 * buffer. Use of the program readprofile is recommended in order to
437 * get meaningful info out of these data.
438 */
439static ssize_t
440read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
441{
442 unsigned long p = *ppos;
443 ssize_t read;
444 char * pnt;
445 unsigned int sample_step = 1 << prof_shift;
446
447 profile_flip_buffers();
448 if (p >= (prof_len+1)*sizeof(unsigned int))
449 return 0;
450 if (count > (prof_len+1)*sizeof(unsigned int) - p)
451 count = (prof_len+1)*sizeof(unsigned int) - p;
452 read = 0;
453
454 while (p < sizeof(unsigned int) && count > 0) {
455 put_user(*((char *)(&sample_step)+p),buf);
456 buf++; p++; count--; read++;
457 }
458 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
459 if (copy_to_user(buf,(void *)pnt,count))
460 return -EFAULT;
461 read += count;
462 *ppos += read;
463 return read;
464}
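
The format read_profile() produces is simple enough to consume without readprofile(8): the first sizeof(unsigned int) bytes are the sample step (1 << prof_shift), followed by one counter per text bucket. A hedged userspace reader (assumes the kernel was booted with profile= so /proc/profile exists, and treats each atomic_t as a plain unsigned int, which holds on the common architectures):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/profile", "rb");
	unsigned int step, count;
	unsigned long buckets = 0, nonzero = 0;

	if (!f) {
		perror("/proc/profile");
		return 1;
	}
	if (fread(&step, sizeof(step), 1, f) != 1) {
		fclose(f);
		return 1;
	}
	printf("sample step: %u bytes of text per bucket\n", step);

	while (fread(&count, sizeof(count), 1, f) == 1) {
		if (count)
			nonzero++;
		buckets++;
	}
	printf("%lu buckets, %lu with hits\n", buckets, nonzero);

	fclose(f);
	return 0;
}
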
465
466/*
467 * Writing to /proc/profile resets the counters
468 *
469 * Writing a 'profiling multiplier' value into it also re-sets the profiling
470 * interrupt frequency, on architectures that support this.
471 */
472static ssize_t write_profile(struct file *file, const char __user *buf,
473 size_t count, loff_t *ppos)
474{
475#ifdef CONFIG_SMP
476 extern int setup_profiling_timer (unsigned int multiplier);
477
478 if (count == sizeof(int)) {
479 unsigned int multiplier;
480
481 if (copy_from_user(&multiplier, buf, sizeof(int)))
482 return -EFAULT;
483
484 if (setup_profiling_timer(multiplier))
485 return -EINVAL;
486 }
487#endif
488 profile_discard_flip_buffers();
489 memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
490 return count;
491}
492
493static struct file_operations proc_profile_operations = {
494 .read = read_profile,
495 .write = write_profile,
496};
497
498#ifdef CONFIG_SMP
499static void __init profile_nop(void *unused)
500{
501}
502
503static int __init create_hash_tables(void)
504{
505 int cpu;
506
507 for_each_online_cpu(cpu) {
508 int node = cpu_to_node(cpu);
509 struct page *page;
510
511 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
512 if (!page)
513 goto out_cleanup;
514 per_cpu(cpu_profile_hits, cpu)[1]
515 = (struct profile_hit *)page_address(page);
516 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
517 if (!page)
518 goto out_cleanup;
519 per_cpu(cpu_profile_hits, cpu)[0]
520 = (struct profile_hit *)page_address(page);
521 }
522 return 0;
523out_cleanup:
524 prof_on = 0;
525 mb();
526 on_each_cpu(profile_nop, NULL, 0, 1);
527 for_each_online_cpu(cpu) {
528 struct page *page;
529
530 if (per_cpu(cpu_profile_hits, cpu)[0]) {
531 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
532 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
533 __free_page(page);
534 }
535 if (per_cpu(cpu_profile_hits, cpu)[1]) {
536 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
537 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
538 __free_page(page);
539 }
540 }
541 return -1;
542}
543#else
544#define create_hash_tables() ({ 0; })
545#endif
546
547static int __init create_proc_profile(void)
548{
549 struct proc_dir_entry *entry;
550
551 if (!prof_on)
552 return 0;
553 if (create_hash_tables())
554 return -1;
555 if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
556 return 0;
557 entry->proc_fops = &proc_profile_operations;
558 entry->size = (1+prof_len) * sizeof(atomic_t);
559 hotcpu_notifier(profile_cpu_callback, 0);
560 return 0;
561}
562module_init(create_proc_profile);
563#endif /* CONFIG_PROC_FS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
new file mode 100644
index 000000000000..88b306c4e841
--- /dev/null
+++ b/kernel/ptrace.c
@@ -0,0 +1,389 @@
1/*
2 * linux/kernel/ptrace.c
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Common interfaces for "ptrace()" which we do not want
7 * to continually duplicate across every architecture.
8 */
9
10#include <linux/module.h>
11#include <linux/sched.h>
12#include <linux/errno.h>
13#include <linux/mm.h>
14#include <linux/highmem.h>
15#include <linux/pagemap.h>
16#include <linux/smp_lock.h>
17#include <linux/ptrace.h>
18#include <linux/security.h>
19
20#include <asm/pgtable.h>
21#include <asm/uaccess.h>
22
23/*
24 * ptrace a task: make the debugger its new parent and
25 * move it to the ptrace list.
26 *
27 * Must be called with the tasklist lock write-held.
28 */
29void __ptrace_link(task_t *child, task_t *new_parent)
30{
31 if (!list_empty(&child->ptrace_list))
32 BUG();
33 if (child->parent == new_parent)
34 return;
35 list_add(&child->ptrace_list, &child->parent->ptrace_children);
36 REMOVE_LINKS(child);
37 child->parent = new_parent;
38 SET_LINKS(child);
39}
40
41/*
42 * Turn a tracing stop into a normal stop now, since with no tracer there
43 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
44 * signal sent that would resume the child, but didn't because it was in
45 * TASK_TRACED, resume it now.
46 * Requires that irqs be disabled.
47 */
48void ptrace_untrace(task_t *child)
49{
50 spin_lock(&child->sighand->siglock);
51 if (child->state == TASK_TRACED) {
52 if (child->signal->flags & SIGNAL_STOP_STOPPED) {
53 child->state = TASK_STOPPED;
54 } else {
55 signal_wake_up(child, 1);
56 }
57 }
58 spin_unlock(&child->sighand->siglock);
59}
60
61/*
62 * unptrace a task: move it back to its original parent and
63 * remove it from the ptrace list.
64 *
65 * Must be called with the tasklist lock write-held.
66 */
67void __ptrace_unlink(task_t *child)
68{
69 if (!child->ptrace)
70 BUG();
71 child->ptrace = 0;
72 if (!list_empty(&child->ptrace_list)) {
73 list_del_init(&child->ptrace_list);
74 REMOVE_LINKS(child);
75 child->parent = child->real_parent;
76 SET_LINKS(child);
77 }
78
79 if (child->state == TASK_TRACED)
80 ptrace_untrace(child);
81}
82
83/*
84 * Check that we have indeed attached to the thing..
85 */
86int ptrace_check_attach(struct task_struct *child, int kill)
87{
88 int ret = -ESRCH;
89
90 /*
91 * We take the read lock around doing both checks to close a
92 * possible race where someone else was tracing our child and
93 * detached between these two checks. After this locked check,
94 * we are sure that this is our traced child, and that it can only
95 * be changed by us, so it's not changing right after this.
96 */
97 read_lock(&tasklist_lock);
98 if ((child->ptrace & PT_PTRACED) && child->parent == current &&
99 (!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
100 && child->signal != NULL) {
101 ret = 0;
102 spin_lock_irq(&child->sighand->siglock);
103 if (child->state == TASK_STOPPED) {
104 child->state = TASK_TRACED;
105 } else if (child->state != TASK_TRACED && !kill) {
106 ret = -ESRCH;
107 }
108 spin_unlock_irq(&child->sighand->siglock);
109 }
110 read_unlock(&tasklist_lock);
111
112 if (!ret && !kill) {
113 wait_task_inactive(child);
114 }
115
116 /* All systems go.. */
117 return ret;
118}
119
120int ptrace_attach(struct task_struct *task)
121{
122 int retval;
123 task_lock(task);
124 retval = -EPERM;
125 if (task->pid <= 1)
126 goto bad;
127 if (task == current)
128 goto bad;
129 if (!task->mm)
130 goto bad;
131 if(((current->uid != task->euid) ||
132 (current->uid != task->suid) ||
133 (current->uid != task->uid) ||
134 (current->gid != task->egid) ||
135 (current->gid != task->sgid) ||
136 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
137 goto bad;
138 rmb();
139 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
140 goto bad;
141 /* the same process cannot be attached many times */
142 if (task->ptrace & PT_PTRACED)
143 goto bad;
144 retval = security_ptrace(current, task);
145 if (retval)
146 goto bad;
147
148 /* Go */
149 task->ptrace |= PT_PTRACED | ((task->real_parent != current)
150 ? PT_ATTACHED : 0);
151 if (capable(CAP_SYS_PTRACE))
152 task->ptrace |= PT_PTRACE_CAP;
153 task_unlock(task);
154
155 write_lock_irq(&tasklist_lock);
156 __ptrace_link(task, current);
157 write_unlock_irq(&tasklist_lock);
158
159 force_sig_specific(SIGSTOP, task);
160 return 0;
161
162bad:
163 task_unlock(task);
164 return retval;
165}
166
167int ptrace_detach(struct task_struct *child, unsigned int data)
168{
169 if ((unsigned long) data > _NSIG)
170 return -EIO;
171
172 /* Architecture-specific hardware disable .. */
173 ptrace_disable(child);
174
175 /* .. re-parent .. */
176 child->exit_code = data;
177
178 write_lock_irq(&tasklist_lock);
179 __ptrace_unlink(child);
180 /* .. and wake it up. */
181 if (child->exit_state != EXIT_ZOMBIE)
182 wake_up_process(child);
183 write_unlock_irq(&tasklist_lock);
184
185 return 0;
186}
187
188/*
189 * Access another process' address space.
190 * Source/target buffer must be kernel space,
191 * Do not walk the page table directly, use get_user_pages
192 */
193
194int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
195{
196 struct mm_struct *mm;
197 struct vm_area_struct *vma;
198 struct page *page;
199 void *old_buf = buf;
200
201 mm = get_task_mm(tsk);
202 if (!mm)
203 return 0;
204
205 down_read(&mm->mmap_sem);
206	/* ignore errors, just check how much was successfully transferred */
207 while (len) {
208 int bytes, ret, offset;
209 void *maddr;
210
211 ret = get_user_pages(tsk, mm, addr, 1,
212 write, 1, &page, &vma);
213 if (ret <= 0)
214 break;
215
216 bytes = len;
217 offset = addr & (PAGE_SIZE-1);
218 if (bytes > PAGE_SIZE-offset)
219 bytes = PAGE_SIZE-offset;
220
221 maddr = kmap(page);
222 if (write) {
223 copy_to_user_page(vma, page, addr,
224 maddr + offset, buf, bytes);
225 set_page_dirty_lock(page);
226 } else {
227 copy_from_user_page(vma, page, addr,
228 buf, maddr + offset, bytes);
229 }
230 kunmap(page);
231 page_cache_release(page);
232 len -= bytes;
233 buf += bytes;
234 addr += bytes;
235 }
236 up_read(&mm->mmap_sem);
237 mmput(mm);
238
239 return buf - old_buf;
240}
241
242int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
243{
244 int copied = 0;
245
246 while (len > 0) {
247 char buf[128];
248 int this_len, retval;
249
250 this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
251 retval = access_process_vm(tsk, src, buf, this_len, 0);
252 if (!retval) {
253 if (copied)
254 break;
255 return -EIO;
256 }
257 if (copy_to_user(dst, buf, retval))
258 return -EFAULT;
259 copied += retval;
260 src += retval;
261 dst += retval;
262 len -= retval;
263 }
264 return copied;
265}
266
267int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
268{
269 int copied = 0;
270
271 while (len > 0) {
272 char buf[128];
273 int this_len, retval;
274
275 this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
276 if (copy_from_user(buf, src, this_len))
277 return -EFAULT;
278 retval = access_process_vm(tsk, dst, buf, this_len, 1);
279 if (!retval) {
280 if (copied)
281 break;
282 return -EIO;
283 }
284 copied += retval;
285 src += retval;
286 dst += retval;
287 len -= retval;
288 }
289 return copied;
290}
291
292static int ptrace_setoptions(struct task_struct *child, long data)
293{
294 child->ptrace &= ~PT_TRACE_MASK;
295
296 if (data & PTRACE_O_TRACESYSGOOD)
297 child->ptrace |= PT_TRACESYSGOOD;
298
299 if (data & PTRACE_O_TRACEFORK)
300 child->ptrace |= PT_TRACE_FORK;
301
302 if (data & PTRACE_O_TRACEVFORK)
303 child->ptrace |= PT_TRACE_VFORK;
304
305 if (data & PTRACE_O_TRACECLONE)
306 child->ptrace |= PT_TRACE_CLONE;
307
308 if (data & PTRACE_O_TRACEEXEC)
309 child->ptrace |= PT_TRACE_EXEC;
310
311 if (data & PTRACE_O_TRACEVFORKDONE)
312 child->ptrace |= PT_TRACE_VFORK_DONE;
313
314 if (data & PTRACE_O_TRACEEXIT)
315 child->ptrace |= PT_TRACE_EXIT;
316
317 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
318}
319
320static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
321{
322 siginfo_t lastinfo;
323 int error = -ESRCH;
324
325 read_lock(&tasklist_lock);
326 if (likely(child->sighand != NULL)) {
327 error = -EINVAL;
328 spin_lock_irq(&child->sighand->siglock);
329 if (likely(child->last_siginfo != NULL)) {
330 lastinfo = *child->last_siginfo;
331 error = 0;
332 }
333 spin_unlock_irq(&child->sighand->siglock);
334 }
335 read_unlock(&tasklist_lock);
336 if (!error)
337 return copy_siginfo_to_user(data, &lastinfo);
338 return error;
339}
340
341static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
342{
343 siginfo_t newinfo;
344 int error = -ESRCH;
345
346 if (copy_from_user(&newinfo, data, sizeof (siginfo_t)))
347 return -EFAULT;
348
349 read_lock(&tasklist_lock);
350 if (likely(child->sighand != NULL)) {
351 error = -EINVAL;
352 spin_lock_irq(&child->sighand->siglock);
353 if (likely(child->last_siginfo != NULL)) {
354 *child->last_siginfo = newinfo;
355 error = 0;
356 }
357 spin_unlock_irq(&child->sighand->siglock);
358 }
359 read_unlock(&tasklist_lock);
360 return error;
361}
362
363int ptrace_request(struct task_struct *child, long request,
364 long addr, long data)
365{
366 int ret = -EIO;
367
368 switch (request) {
369#ifdef PTRACE_OLDSETOPTIONS
370 case PTRACE_OLDSETOPTIONS:
371#endif
372 case PTRACE_SETOPTIONS:
373 ret = ptrace_setoptions(child, data);
374 break;
375 case PTRACE_GETEVENTMSG:
376 ret = put_user(child->ptrace_message, (unsigned long __user *) data);
377 break;
378 case PTRACE_GETSIGINFO:
379 ret = ptrace_getsiginfo(child, (siginfo_t __user *) data);
380 break;
381 case PTRACE_SETSIGINFO:
382 ret = ptrace_setsiginfo(child, (siginfo_t __user *) data);
383 break;
384 default:
385 break;
386 }
387
388 return ret;
389}
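
All of the above is the kernel half of the interface; ptrace_attach(), access_process_vm() and ptrace_detach() are normally exercised from userspace through the ptrace(2) system call. A hedged illustration of that flow (attach to a PID given on the command line, peek one word at a caller-supplied address, detach; minimal error handling, and the address argument is purely illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <hex-address>\n", argv[0]);
		return 1;
	}

	pid_t pid = (pid_t)atoi(argv[1]);
	unsigned long addr = strtoul(argv[2], NULL, 16);

	/* Kernel side: ptrace_attach() reparents the task and sends SIGSTOP. */
	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
		perror("PTRACE_ATTACH");
		return 1;
	}
	waitpid(pid, NULL, 0);            /* wait until the target has stopped */

	/* Kernel side: the peek path ends up copying via access_process_vm(). */
	errno = 0;
	long word = ptrace(PTRACE_PEEKDATA, pid, (void *)addr, NULL);
	if (word == -1 && errno)
		perror("PTRACE_PEEKDATA");
	else
		printf("word at 0x%lx: 0x%lx\n", addr, (unsigned long)word);

	/* Kernel side: ptrace_detach() re-parents and wakes the target. */
	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return 0;
}
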
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
new file mode 100644
index 000000000000..d00eded75d71
--- /dev/null
+++ b/kernel/rcupdate.c
@@ -0,0 +1,470 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * http://lse.sourceforge.net/locking/rcupdate.html
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/interrupt.h>
39#include <linux/sched.h>
40#include <asm/atomic.h>
41#include <linux/bitops.h>
42#include <linux/module.h>
43#include <linux/completion.h>
44#include <linux/moduleparam.h>
45#include <linux/percpu.h>
46#include <linux/notifier.h>
47#include <linux/rcupdate.h>
48#include <linux/cpu.h>
49
50/* Definition for rcupdate control block. */
51struct rcu_ctrlblk rcu_ctrlblk =
52 { .cur = -300, .completed = -300 };
53struct rcu_ctrlblk rcu_bh_ctrlblk =
54 { .cur = -300, .completed = -300 };
55
56/* Bookkeeping of the progress of the grace period */
57struct rcu_state {
58 spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */
59 cpumask_t cpumask; /* CPUs that need to switch in order */
60 /* for current batch to proceed. */
61};
62
63static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
64 {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
65static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
66 {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
67
68DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
69DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
70
71/* Fake initialization required by compiler */
72static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
73static int maxbatch = 10;
74
75/**
76 * call_rcu - Queue an RCU callback for invocation after a grace period.
77 * @head: structure to be used for queueing the RCU updates.
78 * @func: actual update function to be invoked after the grace period
79 *
80 * The update function will be invoked some time after a full grace
81 * period elapses, in other words after all currently executing RCU
82 * read-side critical sections have completed. RCU read-side critical
83 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
84 * and may be nested.
85 */
86void fastcall call_rcu(struct rcu_head *head,
87 void (*func)(struct rcu_head *rcu))
88{
89 unsigned long flags;
90 struct rcu_data *rdp;
91
92 head->func = func;
93 head->next = NULL;
94 local_irq_save(flags);
95 rdp = &__get_cpu_var(rcu_data);
96 *rdp->nxttail = head;
97 rdp->nxttail = &head->next;
98 local_irq_restore(flags);
99}
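
One idiom worth noting in call_rcu() above: rdp->nxttail points at the next field of the last callback (or at the list head when the list is empty), so appending is two stores with no list walk and no empty-list special case. The same pattern in plain, self-contained C (hypothetical node/queue names):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int value;
	struct node *next;
};

struct queue {
	struct node *head;
	struct node **tail;      /* points at head, or at the last node's next */
};

static void queue_init(struct queue *q)
{
	q->head = NULL;
	q->tail = &q->head;
}

/* O(1) append, no special case for an empty list. */
static void queue_add(struct queue *q, struct node *n)
{
	n->next = NULL;
	*q->tail = n;            /* link after the current last element */
	q->tail = &n->next;      /* the new element is now the last one */
}

int main(void)
{
	struct queue q;
	queue_init(&q);

	for (int i = 0; i < 5; i++) {
		struct node *n = malloc(sizeof(*n));
		n->value = i;
		queue_add(&q, n);
	}

	for (struct node *n = q.head; n; ) {
		struct node *next = n->next;
		printf("%d\n", n->value);
		free(n);
		n = next;
	}
	return 0;
}
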
100
101/**
102 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
103 * @head: structure to be used for queueing the RCU updates.
104 * @func: actual update function to be invoked after the grace period
105 *
106 * The update function will be invoked some time after a full grace
107 * period elapses, in other words after all currently executing RCU
108 * read-side critical sections have completed. call_rcu_bh() assumes
109 * that the read-side critical sections end on completion of a softirq
110 * handler. This means that read-side critical sections in process
111 * context must not be interrupted by softirqs. This interface is to be
112 * used when most of the read-side critical sections are in softirq context.
113 * RCU read-side critical sections are delimited by rcu_read_lock() and
114 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
115 * and rcu_read_unlock_bh(), if in process context. These may be nested.
116 */
117void fastcall call_rcu_bh(struct rcu_head *head,
118 void (*func)(struct rcu_head *rcu))
119{
120 unsigned long flags;
121 struct rcu_data *rdp;
122
123 head->func = func;
124 head->next = NULL;
125 local_irq_save(flags);
126 rdp = &__get_cpu_var(rcu_bh_data);
127 *rdp->nxttail = head;
128 rdp->nxttail = &head->next;
129 local_irq_restore(flags);
130}
131
132/*
133 * Invoke the completed RCU callbacks. They are expected to be in
134 * a per-cpu list.
135 */
136static void rcu_do_batch(struct rcu_data *rdp)
137{
138 struct rcu_head *next, *list;
139 int count = 0;
140
141 list = rdp->donelist;
142 while (list) {
143 next = rdp->donelist = list->next;
144 list->func(list);
145 list = next;
146 if (++count >= maxbatch)
147 break;
148 }
149 if (!rdp->donelist)
150 rdp->donetail = &rdp->donelist;
151 else
152 tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
153}
154
155/*
156 * Grace period handling:
157 * The grace period handling consists out of two steps:
158 * - A new grace period is started.
159 * This is done by rcu_start_batch. The start is not broadcasted to
160 * all cpus, they must pick this up by comparing rcp->cur with
161 * rdp->quiescbatch. All cpus are recorded in the
162 * rcu_state.cpumask bitmap.
163 * - All cpus must go through a quiescent state.
164 * Since the start of the grace period is not broadcasted, at least two
165 * calls to rcu_check_quiescent_state are required:
166 * The first call just notices that a new grace period is running. The
167 * following calls check if there was a quiescent state since the beginning
168 * of the grace period. If so, it updates rcu_state.cpumask. If
169 * the bitmap is empty, then the grace period is completed.
170 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
171 * period (if necessary).
172 */
173/*
174 * Register a new batch of callbacks, and start it up if there is currently no
175 * active batch and the batch to be registered has not already occurred.
176 * Caller must hold rcu_state.lock.
177 */
178static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp,
179 int next_pending)
180{
181 if (next_pending)
182 rcp->next_pending = 1;
183
184 if (rcp->next_pending &&
185 rcp->completed == rcp->cur) {
186 /* Can't change, since spin lock held. */
187 cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
188
189 rcp->next_pending = 0;
190 /* next_pending == 0 must be visible in __rcu_process_callbacks()
191 * before it can see new value of cur.
192 */
193 smp_wmb();
194 rcp->cur++;
195 }
196}
197
198/*
199 * cpu went through a quiescent state since the beginning of the grace period.
200 * Clear it from the cpu mask and complete the grace period if it was the last
201 * cpu. Start another grace period if someone has further entries pending
202 */
203static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
204{
205 cpu_clear(cpu, rsp->cpumask);
206 if (cpus_empty(rsp->cpumask)) {
207 /* batch completed ! */
208 rcp->completed = rcp->cur;
209 rcu_start_batch(rcp, rsp, 0);
210 }
211}
212
213/*
214 * Check if the cpu has gone through a quiescent state (say context
215 * switch). If so and if it already hasn't done so in this RCU
216 * quiescent cycle, then indicate that it has done so.
217 */
218static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
219 struct rcu_state *rsp, struct rcu_data *rdp)
220{
221 if (rdp->quiescbatch != rcp->cur) {
222 /* start new grace period: */
223 rdp->qs_pending = 1;
224 rdp->passed_quiesc = 0;
225 rdp->quiescbatch = rcp->cur;
226 return;
227 }
228
229 /* Grace period already completed for this cpu?
230 * qs_pending is checked instead of the actual bitmap to avoid
231 * cacheline thrashing.
232 */
233 if (!rdp->qs_pending)
234 return;
235
236 /*
237 * Was there a quiescent state since the beginning of the grace
238 * period? If no, then exit and wait for the next call.
239 */
240 if (!rdp->passed_quiesc)
241 return;
242 rdp->qs_pending = 0;
243
244 spin_lock(&rsp->lock);
245 /*
246 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
247 * during cpu startup. Ignore the quiescent state.
248 */
249 if (likely(rdp->quiescbatch == rcp->cur))
250 cpu_quiet(rdp->cpu, rcp, rsp);
251
252 spin_unlock(&rsp->lock);
253}
254
255
256#ifdef CONFIG_HOTPLUG_CPU
257
258/* Warning! Helper for rcu_offline_cpu. Do not use elsewhere without reviewing
259 * the locking requirements; the list it's pulling from has to belong to a cpu
260 * which is dead and hence not processing interrupts.
261 */
262static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
263 struct rcu_head **tail)
264{
265 local_irq_disable();
266 *this_rdp->nxttail = list;
267 if (list)
268 this_rdp->nxttail = tail;
269 local_irq_enable();
270}
271
272static void __rcu_offline_cpu(struct rcu_data *this_rdp,
273 struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp)
274{
275 /* if the cpu going offline owns the grace period
276 * we can block indefinitely waiting for it, so flush
277 * it here
278 */
279 spin_lock_bh(&rsp->lock);
280 if (rcp->cur != rcp->completed)
281 cpu_quiet(rdp->cpu, rcp, rsp);
282 spin_unlock_bh(&rsp->lock);
283 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
284 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
285
286}
287static void rcu_offline_cpu(int cpu)
288{
289 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
290 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
291
292 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state,
293 &per_cpu(rcu_data, cpu));
294 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state,
295 &per_cpu(rcu_bh_data, cpu));
296 put_cpu_var(rcu_data);
297 put_cpu_var(rcu_bh_data);
298 tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
299}
300
301#else
302
303static void rcu_offline_cpu(int cpu)
304{
305}
306
307#endif
308
309/*
310 * This does the RCU processing work from tasklet context.
311 */
312static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
313 struct rcu_state *rsp, struct rcu_data *rdp)
314{
315 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
316 *rdp->donetail = rdp->curlist;
317 rdp->donetail = rdp->curtail;
318 rdp->curlist = NULL;
319 rdp->curtail = &rdp->curlist;
320 }
321
322 local_irq_disable();
323 if (rdp->nxtlist && !rdp->curlist) {
324 rdp->curlist = rdp->nxtlist;
325 rdp->curtail = rdp->nxttail;
326 rdp->nxtlist = NULL;
327 rdp->nxttail = &rdp->nxtlist;
328 local_irq_enable();
329
330 /*
331 * start the next batch of callbacks
332 */
333
334 /* determine batch number */
335 rdp->batch = rcp->cur + 1;
336 /* see the comment and corresponding wmb() in
337 * the rcu_start_batch()
338 */
339 smp_rmb();
340
341 if (!rcp->next_pending) {
342 /* and start it/schedule start if it's a new batch */
343 spin_lock(&rsp->lock);
344 rcu_start_batch(rcp, rsp, 1);
345 spin_unlock(&rsp->lock);
346 }
347 } else {
348 local_irq_enable();
349 }
350 rcu_check_quiescent_state(rcp, rsp, rdp);
351 if (rdp->donelist)
352 rcu_do_batch(rdp);
353}
354
355static void rcu_process_callbacks(unsigned long unused)
356{
357 __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state,
358 &__get_cpu_var(rcu_data));
359 __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state,
360 &__get_cpu_var(rcu_bh_data));
361}
362
363void rcu_check_callbacks(int cpu, int user)
364{
365 if (user ||
366 (idle_cpu(cpu) && !in_softirq() &&
367 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
368 rcu_qsctr_inc(cpu);
369 rcu_bh_qsctr_inc(cpu);
370 } else if (!in_softirq())
371 rcu_bh_qsctr_inc(cpu);
372 tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
373}
374
375static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
376 struct rcu_data *rdp)
377{
378 memset(rdp, 0, sizeof(*rdp));
379 rdp->curtail = &rdp->curlist;
380 rdp->nxttail = &rdp->nxtlist;
381 rdp->donetail = &rdp->donelist;
382 rdp->quiescbatch = rcp->completed;
383 rdp->qs_pending = 0;
384 rdp->cpu = cpu;
385}
386
387static void __devinit rcu_online_cpu(int cpu)
388{
389 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
390 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
391
392 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
393 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
394 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
395}
396
397static int __devinit rcu_cpu_notify(struct notifier_block *self,
398 unsigned long action, void *hcpu)
399{
400 long cpu = (long)hcpu;
401 switch (action) {
402 case CPU_UP_PREPARE:
403 rcu_online_cpu(cpu);
404 break;
405 case CPU_DEAD:
406 rcu_offline_cpu(cpu);
407 break;
408 default:
409 break;
410 }
411 return NOTIFY_OK;
412}
413
414static struct notifier_block __devinitdata rcu_nb = {
415 .notifier_call = rcu_cpu_notify,
416};
417
418/*
419 * Initializes the RCU mechanism. Assumed to be called early,
420 * that is, before the local timer (SMP) or jiffies timer (UP) is set up.
421 * Note that rcu_qsctr and friends are implicitly
422 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
423 */
424void __init rcu_init(void)
425{
426 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
427 (void *)(long)smp_processor_id());
428 /* Register notifier for non-boot CPUs */
429 register_cpu_notifier(&rcu_nb);
430}
431
432struct rcu_synchronize {
433 struct rcu_head head;
434 struct completion completion;
435};
436
437/* Because of FASTCALL declaration of complete, we use this wrapper */
438static void wakeme_after_rcu(struct rcu_head *head)
439{
440 struct rcu_synchronize *rcu;
441
442 rcu = container_of(head, struct rcu_synchronize, head);
443 complete(&rcu->completion);
444}
445
446/**
447 * synchronize_kernel - wait until a grace period has elapsed.
448 *
449 * Control will return to the caller some time after a full grace
450 * period has elapsed, in other words after all currently executing RCU
451 * read-side critical sections have completed. RCU read-side critical
452 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
453 * and may be nested.
454 */
455void synchronize_kernel(void)
456{
457 struct rcu_synchronize rcu;
458
459 init_completion(&rcu.completion);
460 /* Will wake me after RCU finished */
461 call_rcu(&rcu.head, wakeme_after_rcu);
462
463 /* Wait for it */
464 wait_for_completion(&rcu.completion);
465}
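
synchronize_kernel() turns the asynchronous call_rcu() interface into a synchronous wait by queueing a callback whose only job is to complete a completion the caller blocks on. A userspace rendering of the same pattern, with a pthreads condition variable standing in for struct completion and an ordinary thread standing in for the grace-period machinery (hypothetical wake_waiter()/worker() names):

#include <pthread.h>
#include <stdio.h>

struct waiter {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

/* The "callback": runs in another context and wakes whoever is waiting. */
static void wake_waiter(struct waiter *w)
{
	pthread_mutex_lock(&w->lock);
	w->done = 1;
	pthread_cond_signal(&w->cond);
	pthread_mutex_unlock(&w->lock);
}

static void *worker(void *arg)
{
	/* Stand-in for "a grace period elapsed": just invoke the callback. */
	wake_waiter(arg);
	return NULL;
}

int main(void)
{
	struct waiter w = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	pthread_t t;

	pthread_create(&t, NULL, worker, &w);

	/* Synchronous wrapper around the asynchronous callback. */
	pthread_mutex_lock(&w.lock);
	while (!w.done)
		pthread_cond_wait(&w.cond, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_join(t, NULL);
	printf("callback ran, waiter released\n");
	return 0;
}
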
466
467module_param(maxbatch, int, 0);
468EXPORT_SYMBOL_GPL(call_rcu);
469EXPORT_SYMBOL_GPL(call_rcu_bh);
470EXPORT_SYMBOL_GPL(synchronize_kernel);
diff --git a/kernel/resource.c b/kernel/resource.c
new file mode 100644
index 000000000000..35c99ac02c7c
--- /dev/null
+++ b/kernel/resource.c
@@ -0,0 +1,551 @@
1/*
2 * linux/kernel/resource.c
3 *
4 * Copyright (C) 1999 Linus Torvalds
5 * Copyright (C) 1999 Martin Mares <mj@ucw.cz>
6 *
7 * Arbitrary resource management.
8 */
9
10#include <linux/config.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/errno.h>
14#include <linux/ioport.h>
15#include <linux/init.h>
16#include <linux/slab.h>
17#include <linux/spinlock.h>
18#include <linux/fs.h>
19#include <linux/proc_fs.h>
20#include <linux/seq_file.h>
21#include <asm/io.h>
22
23
24struct resource ioport_resource = {
25 .name = "PCI IO",
26 .start = 0x0000,
27 .end = IO_SPACE_LIMIT,
28 .flags = IORESOURCE_IO,
29};
30
31EXPORT_SYMBOL(ioport_resource);
32
33struct resource iomem_resource = {
34 .name = "PCI mem",
35 .start = 0UL,
36 .end = ~0UL,
37 .flags = IORESOURCE_MEM,
38};
39
40EXPORT_SYMBOL(iomem_resource);
41
42static DEFINE_RWLOCK(resource_lock);
43
44#ifdef CONFIG_PROC_FS
45
46enum { MAX_IORES_LEVEL = 5 };
47
48static void *r_next(struct seq_file *m, void *v, loff_t *pos)
49{
50 struct resource *p = v;
51 (*pos)++;
52 if (p->child)
53 return p->child;
54 while (!p->sibling && p->parent)
55 p = p->parent;
56 return p->sibling;
57}
58
59static void *r_start(struct seq_file *m, loff_t *pos)
60 __acquires(resource_lock)
61{
62 struct resource *p = m->private;
63 loff_t l = 0;
64 read_lock(&resource_lock);
65 for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
66 ;
67 return p;
68}
69
70static void r_stop(struct seq_file *m, void *v)
71 __releases(resource_lock)
72{
73 read_unlock(&resource_lock);
74}
75
76static int r_show(struct seq_file *m, void *v)
77{
78 struct resource *root = m->private;
79 struct resource *r = v, *p;
80 int width = root->end < 0x10000 ? 4 : 8;
81 int depth;
82
83 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
84 if (p->parent == root)
85 break;
86 seq_printf(m, "%*s%0*lx-%0*lx : %s\n",
87 depth * 2, "",
88 width, r->start,
89 width, r->end,
90 r->name ? r->name : "<BAD>");
91 return 0;
92}
93
94static struct seq_operations resource_op = {
95 .start = r_start,
96 .next = r_next,
97 .stop = r_stop,
98 .show = r_show,
99};
100
101static int ioports_open(struct inode *inode, struct file *file)
102{
103 int res = seq_open(file, &resource_op);
104 if (!res) {
105 struct seq_file *m = file->private_data;
106 m->private = &ioport_resource;
107 }
108 return res;
109}
110
111static int iomem_open(struct inode *inode, struct file *file)
112{
113 int res = seq_open(file, &resource_op);
114 if (!res) {
115 struct seq_file *m = file->private_data;
116 m->private = &iomem_resource;
117 }
118 return res;
119}
120
121static struct file_operations proc_ioports_operations = {
122 .open = ioports_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = seq_release,
126};
127
128static struct file_operations proc_iomem_operations = {
129 .open = iomem_open,
130 .read = seq_read,
131 .llseek = seq_lseek,
132 .release = seq_release,
133};
134
135static int __init ioresources_init(void)
136{
137 struct proc_dir_entry *entry;
138
139 entry = create_proc_entry("ioports", 0, NULL);
140 if (entry)
141 entry->proc_fops = &proc_ioports_operations;
142 entry = create_proc_entry("iomem", 0, NULL);
143 if (entry)
144 entry->proc_fops = &proc_iomem_operations;
145 return 0;
146}
147__initcall(ioresources_init);
148
149#endif /* CONFIG_PROC_FS */
150
151/* Return the conflict entry if you can't request it */
152static struct resource * __request_resource(struct resource *root, struct resource *new)
153{
154 unsigned long start = new->start;
155 unsigned long end = new->end;
156 struct resource *tmp, **p;
157
158 if (end < start)
159 return root;
160 if (start < root->start)
161 return root;
162 if (end > root->end)
163 return root;
164 p = &root->child;
165 for (;;) {
166 tmp = *p;
167 if (!tmp || tmp->start > end) {
168 new->sibling = tmp;
169 *p = new;
170 new->parent = root;
171 return NULL;
172 }
173 p = &tmp->sibling;
174 if (tmp->end < start)
175 continue;
176 return tmp;
177 }
178}
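
__request_resource() keeps each parent's children in a singly linked list sorted by start address, walks it with a pointer-to-pointer so no separate 'previous' pointer is needed, and returns the first overlapping entry as the conflict. A freestanding C sketch of that insert-or-report-conflict step (hypothetical range/insert_range names, inclusive [start, end] ranges as in struct resource, no root bounds check):

#include <stdio.h>

struct range {
	unsigned long start, end;     /* inclusive, like struct resource */
	struct range *sibling;
	const char *name;
};

/*
 * Insert 'new' into the sorted sibling list rooted at *head.
 * Returns NULL on success, or the first conflicting entry.
 */
static struct range *insert_range(struct range **head, struct range *new)
{
	struct range *tmp, **p = head;

	for (;;) {
		tmp = *p;
		if (!tmp || tmp->start > new->end) {   /* slot found: link it in */
			new->sibling = tmp;
			*p = new;
			return NULL;
		}
		p = &tmp->sibling;
		if (tmp->end < new->start)             /* entirely before us: keep going */
			continue;
		return tmp;                            /* overlap: report the conflict */
	}
}

int main(void)
{
	static struct range a = { 0x100, 0x1ff, NULL, "a" };
	static struct range b = { 0x300, 0x3ff, NULL, "b" };
	static struct range c = { 0x180, 0x2ff, NULL, "c" };   /* overlaps a */
	struct range *head = NULL, *conflict;

	insert_range(&head, &a);
	insert_range(&head, &b);
	conflict = insert_range(&head, &c);
	if (conflict)
		printf("conflict with %s\n", conflict->name);

	for (struct range *r = head; r; r = r->sibling)
		printf("[%#lx-%#lx] %s\n", r->start, r->end, r->name);
	return 0;
}
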
179
180static int __release_resource(struct resource *old)
181{
182 struct resource *tmp, **p;
183
184 p = &old->parent->child;
185 for (;;) {
186 tmp = *p;
187 if (!tmp)
188 break;
189 if (tmp == old) {
190 *p = tmp->sibling;
191 old->parent = NULL;
192 return 0;
193 }
194 p = &tmp->sibling;
195 }
196 return -EINVAL;
197}
198
199int request_resource(struct resource *root, struct resource *new)
200{
201 struct resource *conflict;
202
203 write_lock(&resource_lock);
204 conflict = __request_resource(root, new);
205 write_unlock(&resource_lock);
206 return conflict ? -EBUSY : 0;
207}
208
209EXPORT_SYMBOL(request_resource);
210
211struct resource *____request_resource(struct resource *root, struct resource *new)
212{
213 struct resource *conflict;
214
215 write_lock(&resource_lock);
216 conflict = __request_resource(root, new);
217 write_unlock(&resource_lock);
218 return conflict;
219}
220
221EXPORT_SYMBOL(____request_resource);
222
223int release_resource(struct resource *old)
224{
225 int retval;
226
227 write_lock(&resource_lock);
228 retval = __release_resource(old);
229 write_unlock(&resource_lock);
230 return retval;
231}
232
233EXPORT_SYMBOL(release_resource);
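/*
 * A minimal usage sketch (hypothetical driver, made-up addresses): claim a
 * fixed MMIO window directly under iomem_resource and give it back when done.
 *
 *	static struct resource mydev_res = {
 *		.name	= "mydev",
 *		.start	= 0xfe000000,
 *		.end	= 0xfe000fff,
 *		.flags	= IORESOURCE_MEM,
 *	};
 *
 *	if (request_resource(&iomem_resource, &mydev_res))
 *		return -EBUSY;		// range already claimed
 *	...
 *	release_resource(&mydev_res);
 */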
234
235/*
236 * Find empty slot in the resource tree given range and alignment.
237 */
238static int find_resource(struct resource *root, struct resource *new,
239 unsigned long size,
240 unsigned long min, unsigned long max,
241 unsigned long align,
242 void (*alignf)(void *, struct resource *,
243 unsigned long, unsigned long),
244 void *alignf_data)
245{
246 struct resource *this = root->child;
247
248 new->start = root->start;
249 /*
250 * Skip past an allocated resource that starts at 0, since the assignment
251 * of this->start - 1 to new->end below would cause an underflow.
252 */
253 if (this && this->start == 0) {
254 new->start = this->end + 1;
255 this = this->sibling;
256 }
257 for(;;) {
258 if (this)
259 new->end = this->start - 1;
260 else
261 new->end = root->end;
262 if (new->start < min)
263 new->start = min;
264 if (new->end > max)
265 new->end = max;
266 new->start = (new->start + align - 1) & ~(align - 1);
267 if (alignf)
268 alignf(alignf_data, new, size, align);
269 if (new->start < new->end && new->end - new->start + 1 >= size) {
270 new->end = new->start + size - 1;
271 return 0;
272 }
273 if (!this)
274 break;
275 new->start = this->end + 1;
276 this = this->sibling;
277 }
278 return -EBUSY;
279}
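/*
 * The rounding above, (start + align - 1) & ~(align - 1), bumps the candidate
 * start up to the next multiple of 'align' (align is assumed to be a power of
 * two).  For example, with align == 0x1000 a start of 0x10020 becomes
 * 0x11000, while 0x11000 is left unchanged.
 */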
280
281/*
282 * Allocate empty slot in the resource tree given range and alignment.
283 */
284int allocate_resource(struct resource *root, struct resource *new,
285 unsigned long size,
286 unsigned long min, unsigned long max,
287 unsigned long align,
288 void (*alignf)(void *, struct resource *,
289 unsigned long, unsigned long),
290 void *alignf_data)
291{
292 int err;
293
294 write_lock(&resource_lock);
295 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
296 if (err >= 0 && __request_resource(root, new))
297 err = -EBUSY;
298 write_unlock(&resource_lock);
299 return err;
300}
301
302EXPORT_SYMBOL(allocate_resource);
303
304/**
305 * insert_resource - Inserts a resource in the resource tree
306 * @parent: parent of the new resource
307 * @new: new resource to insert
308 *
309 * Returns 0 on success, -EBUSY if the resource can't be inserted.
310 *
311 * This function is equivalent to request_resource() when no conflict
312 * happens. If a conflict happens, and the conflicting resources
313 * entirely fit within the range of the new resource, then the new
314 * resource is inserted and the conflicting resources become children of
315 * the new resource. Otherwise the new resource becomes the child of
316 * the conflicting resource.
317 */
318int insert_resource(struct resource *parent, struct resource *new)
319{
320 int result;
321 struct resource *first, *next;
322
323 write_lock(&resource_lock);
324 begin:
325 result = 0;
326 first = __request_resource(parent, new);
327 if (!first)
328 goto out;
329
330 result = -EBUSY;
331 if (first == parent)
332 goto out;
333
334 /* Resource fully contained by the clashing resource? Recurse into it */
335 if (first->start <= new->start && first->end >= new->end) {
336 parent = first;
337 goto begin;
338 }
339
340 for (next = first; ; next = next->sibling) {
341 /* Partial overlap? Bad, and unfixable */
342 if (next->start < new->start || next->end > new->end)
343 goto out;
344 if (!next->sibling)
345 break;
346 if (next->sibling->start > new->end)
347 break;
348 }
349
350 result = 0;
351
352 new->parent = parent;
353 new->sibling = next->sibling;
354 new->child = first;
355
356 next->sibling = NULL;
357 for (next = first; next; next = next->sibling)
358 next->parent = new;
359
360 if (parent->child == first) {
361 parent->child = new;
362 } else {
363 next = parent->child;
364 while (next->sibling != first)
365 next = next->sibling;
366 next->sibling = new;
367 }
368
369 out:
370 write_unlock(&resource_lock);
371 return result;
372}
373
374EXPORT_SYMBOL(insert_resource);
375
376/*
377 * Given an existing resource, change its start and size to match the
378 * arguments. Returns -EBUSY if it can't fit. Existing children of
379 * the resource are assumed to be immutable.
380 */
381int adjust_resource(struct resource *res, unsigned long start, unsigned long size)
382{
383 struct resource *tmp, *parent = res->parent;
384 unsigned long end = start + size - 1;
385 int result = -EBUSY;
386
387 write_lock(&resource_lock);
388
389 if ((start < parent->start) || (end > parent->end))
390 goto out;
391
392 for (tmp = res->child; tmp; tmp = tmp->sibling) {
393 if ((tmp->start < start) || (tmp->end > end))
394 goto out;
395 }
396
397 if (res->sibling && (res->sibling->start <= end))
398 goto out;
399
400 tmp = parent->child;
401 if (tmp != res) {
402 while (tmp->sibling != res)
403 tmp = tmp->sibling;
404 if (start <= tmp->end)
405 goto out;
406 }
407
408 res->start = start;
409 res->end = end;
410 result = 0;
411
412 out:
413 write_unlock(&resource_lock);
414 return result;
415}
416
417EXPORT_SYMBOL(adjust_resource);
418
419/*
420 * This is compatibility stuff for IO resources.
421 *
422 * Note how this, unlike the above, knows about
423 * the IO flag meanings (busy etc).
424 *
425 * Request-region creates a new busy region.
426 *
427 * Check-region returns non-zero if the area is already busy.
428 *
429 * Release-region releases a matching busy region.
430 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
432{
433 struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
434
435 if (res) {
436 memset(res, 0, sizeof(*res));
437 res->name = name;
438 res->start = start;
439 res->end = start + n - 1;
440 res->flags = IORESOURCE_BUSY;
441
442 write_lock(&resource_lock);
443
444 for (;;) {
445 struct resource *conflict;
446
447 conflict = __request_resource(parent, res);
448 if (!conflict)
449 break;
450 if (conflict != parent) {
451 parent = conflict;
452 if (!(conflict->flags & IORESOURCE_BUSY))
453 continue;
454 }
455
456 /* Uhhuh, that didn't work out.. */
457 kfree(res);
458 res = NULL;
459 break;
460 }
461 write_unlock(&resource_lock);
462 }
463 return res;
464}
465
466EXPORT_SYMBOL(__request_region);
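/*
 * Most callers go through the request_region()/release_region() wrappers in
 * <linux/ioport.h>, which pass ioport_resource as the parent.  Sketch of a
 * hypothetical ISA-style driver (the port values are made up):
 *
 *	if (!request_region(0x300, 8, "mydev"))
 *		return -EBUSY;
 *	...			// poke ports 0x300-0x307
 *	release_region(0x300, 8);
 */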
467
468int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n)
469{
470 struct resource * res;
471
472 res = __request_region(parent, start, n, "check-region");
473 if (!res)
474 return -EBUSY;
475
476 release_resource(res);
477 kfree(res);
478 return 0;
479}
480
481EXPORT_SYMBOL(__check_region);
482
483void __release_region(struct resource *parent, unsigned long start, unsigned long n)
484{
485 struct resource **p;
486 unsigned long end;
487
488 p = &parent->child;
489 end = start + n - 1;
490
491 write_lock(&resource_lock);
492
493 for (;;) {
494 struct resource *res = *p;
495
496 if (!res)
497 break;
498 if (res->start <= start && res->end >= end) {
499 if (!(res->flags & IORESOURCE_BUSY)) {
500 p = &res->child;
501 continue;
502 }
503 if (res->start != start || res->end != end)
504 break;
505 *p = res->sibling;
506 write_unlock(&resource_lock);
507 kfree(res);
508 return;
509 }
510 p = &res->sibling;
511 }
512
513 write_unlock(&resource_lock);
514
515 printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end);
516}
517
518EXPORT_SYMBOL(__release_region);
519
520/*
521 * Called from init/main.c to reserve IO ports.
522 */
523#define MAXRESERVE 4
524static int __init reserve_setup(char *str)
525{
526 static int reserved;
527 static struct resource reserve[MAXRESERVE];
528
529 for (;;) {
530 int io_start, io_num;
531 int x = reserved;
532
533 if (get_option (&str, &io_start) != 2)
534 break;
535 if (get_option (&str, &io_num) == 0)
536 break;
537 if (x < MAXRESERVE) {
538 struct resource *res = reserve + x;
539 res->name = "reserved";
540 res->start = io_start;
541 res->end = io_start + io_num - 1;
542 res->flags = IORESOURCE_BUSY;
543 res->child = NULL;
544 if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
545 reserved = x+1;
546 }
547 }
548 return 1;
549}
550
551__setup("reserve=", reserve_setup);
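/*
 * Example: booting with "reserve=0x300,32" marks I/O ports 0x300-0x31f busy
 * before any driver can claim them.  Pairs may be chained, e.g.
 * "reserve=0x300,32,0x1000000,4096"; start values at or above 0x10000 are
 * treated as memory addresses and reserved from iomem_resource instead.
 * At most MAXRESERVE ranges are honoured.
 */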
diff --git a/kernel/sched.c b/kernel/sched.c
new file mode 100644
index 000000000000..f69c4a5361e3
--- /dev/null
+++ b/kernel/sched.c
@@ -0,0 +1,5004 @@
1/*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
25#include <asm/uaccess.h>
26#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h>
29#include <linux/interrupt.h>
30#include <linux/completion.h>
31#include <linux/kernel_stat.h>
32#include <linux/security.h>
33#include <linux/notifier.h>
34#include <linux/profile.h>
35#include <linux/suspend.h>
36#include <linux/blkdev.h>
37#include <linux/delay.h>
38#include <linux/smp.h>
39#include <linux/threads.h>
40#include <linux/timer.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h>
43#include <linux/cpuset.h>
44#include <linux/percpu.h>
45#include <linux/kthread.h>
46#include <linux/seq_file.h>
47#include <linux/syscalls.h>
48#include <linux/times.h>
49#include <linux/acct.h>
50#include <asm/tlb.h>
51
52#include <asm/unistd.h>
53
54/*
55 * Convert user-nice values [ -20 ... 0 ... 19 ]
56 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
57 * and back.
58 */
59#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
60#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
61#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
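/*
 * With the usual MAX_RT_PRIO of 100 (and hence MAX_PRIO of 140) this gives,
 * for example:
 *
 *	NICE_TO_PRIO(-20) == 100	PRIO_TO_NICE(100) == -20
 *	NICE_TO_PRIO(  0) == 120	PRIO_TO_NICE(120) ==   0
 *	NICE_TO_PRIO( 19) == 139	PRIO_TO_NICE(139) ==  19
 */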
62
63/*
64 * 'User priority' is the nice value converted to something we
65 * can work with better when scaling various scheduler parameters;
66 * it's a [ 0 ... 39 ] range.
67 */
68#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
69#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
70#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
71
72/*
73 * Some helpers for converting nanosecond timing to jiffy resolution
74 */
75#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
76#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
77
78/*
79 * These are the 'tuning knobs' of the scheduler:
80 *
81 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
82 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
83 * Timeslices get refilled after they expire.
84 */
85#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
86#define DEF_TIMESLICE (100 * HZ / 1000)
87#define ON_RUNQUEUE_WEIGHT 30
88#define CHILD_PENALTY 95
89#define PARENT_PENALTY 100
90#define EXIT_WEIGHT 3
91#define PRIO_BONUS_RATIO 25
92#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
93#define INTERACTIVE_DELTA 2
94#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
95#define STARVATION_LIMIT (MAX_SLEEP_AVG)
96#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
97
98/*
99 * If a task is 'interactive' then we reinsert it in the active
100 * array after it has expired its current timeslice. (it will not
101 * continue to run immediately; it will still round-robin with
102 * other interactive tasks.)
103 *
104 * This part scales the interactivity limit depending on niceness.
105 *
106 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
107 * Here are a few examples of different nice levels:
108 *
109 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
110 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
111 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
112 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
113 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
114 *
115 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
116 * priority range a task can explore, a value of '1' means the
117 * task is rated interactive.)
118 *
119 * I.e. nice +19 tasks can never get 'interactive' enough to be
120 * reinserted into the active array. And only heavily CPU-hogging nice -20
121 * tasks will be expired. Default nice 0 tasks are somewhere in between;
122 * it takes some effort for them to get interactive, but it's not
123 * too hard.
124 */
125
126#define CURRENT_BONUS(p) \
127 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
128 MAX_SLEEP_AVG)
129
130#define GRANULARITY (10 * HZ / 1000 ? : 1)
131
132#ifdef CONFIG_SMP
133#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
134 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
135 num_online_cpus())
136#else
137#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
138 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
139#endif
140
141#define SCALE(v1,v1_max,v2_max) \
142 (v1) * (v2_max) / (v1_max)
143
144#define DELTA(p) \
145 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
146
147#define TASK_INTERACTIVE(p) \
148 ((p)->prio <= (p)->static_prio - DELTA(p))
149
150#define INTERACTIVE_SLEEP(p) \
151 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
152 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
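/*
 * Worked example (assuming MAX_USER_PRIO == 40, so MAX_BONUS == 10): for a
 * nice 0 task DELTA() is 0 + INTERACTIVE_DELTA == 2, so TASK_INTERACTIVE()
 * holds once the dynamic priority is at least 2 below static_prio, i.e. once
 * CURRENT_BONUS() reaches 7 (a sleep_avg of roughly 70% of MAX_SLEEP_AVG).
 * For nice +19, DELTA() is 4 + 2 == 6, which exceeds the maximum possible
 * bonus of 5 - such tasks can never be rated interactive, matching the
 * all-zero row in the table above.
 */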
153
154#define TASK_PREEMPTS_CURR(p, rq) \
155 ((p)->prio < (rq)->curr->prio)
156
157/*
158 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
159 * to time slice values: [800ms ... 100ms ... 5ms]
160 *
161 * The higher a thread's priority, the bigger timeslices
162 * it gets during one round of execution. But even the lowest
163 * priority thread gets MIN_TIMESLICE worth of execution time.
164 */
165
166#define SCALE_PRIO(x, prio) \
167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168
169static inline unsigned int task_timeslice(task_t *p)
170{
171 if (p->static_prio < NICE_TO_PRIO(0))
172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
173 else
174 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
175}
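/*
 * Worked example, assuming HZ == 1000 and the usual MAX_PRIO of 140:
 * a nice 0 task (static_prio 120) gets SCALE_PRIO(100, 120) == 100ms,
 * a nice -20 task (static_prio 100) gets SCALE_PRIO(400, 100) == 800ms,
 * and a nice +19 task (static_prio 139) bottoms out at MIN_TIMESLICE (5ms),
 * matching the [800ms ... 100ms ... 5ms] range quoted above.
 */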
176#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
177 < (long long) (sd)->cache_hot_time)
178
179/*
180 * These are the runqueue data structures:
181 */
182
183#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
184
185typedef struct runqueue runqueue_t;
186
187struct prio_array {
188 unsigned int nr_active;
189 unsigned long bitmap[BITMAP_SIZE];
190 struct list_head queue[MAX_PRIO];
191};
192
193/*
194 * This is the main, per-CPU runqueue data structure.
195 *
196 * Locking rule: in places that need to lock multiple runqueues
197 * (such as the load balancing or the thread migration code), lock
198 * acquire operations must be ordered by ascending &runqueue.
199 */
200struct runqueue {
201 spinlock_t lock;
202
203 /*
204 * nr_running and cpu_load should be in the same cacheline because
205 * remote CPUs use both these fields when doing load calculation.
206 */
207 unsigned long nr_running;
208#ifdef CONFIG_SMP
209 unsigned long cpu_load;
210#endif
211 unsigned long long nr_switches;
212
213 /*
214 * This is part of a global counter where only the total sum
215 * over all CPUs matters. A task can increase this counter on
216 * one CPU and if it got migrated afterwards it may decrease
217 * it on another CPU. Always updated under the runqueue lock:
218 */
219 unsigned long nr_uninterruptible;
220
221 unsigned long expired_timestamp;
222 unsigned long long timestamp_last_tick;
223 task_t *curr, *idle;
224 struct mm_struct *prev_mm;
225 prio_array_t *active, *expired, arrays[2];
226 int best_expired_prio;
227 atomic_t nr_iowait;
228
229#ifdef CONFIG_SMP
230 struct sched_domain *sd;
231
232 /* For active balancing */
233 int active_balance;
234 int push_cpu;
235
236 task_t *migration_thread;
237 struct list_head migration_queue;
238#endif
239
240#ifdef CONFIG_SCHEDSTATS
241 /* latency stats */
242 struct sched_info rq_sched_info;
243
244 /* sys_sched_yield() stats */
245 unsigned long yld_exp_empty;
246 unsigned long yld_act_empty;
247 unsigned long yld_both_empty;
248 unsigned long yld_cnt;
249
250 /* schedule() stats */
251 unsigned long sched_switch;
252 unsigned long sched_cnt;
253 unsigned long sched_goidle;
254
255 /* try_to_wake_up() stats */
256 unsigned long ttwu_cnt;
257 unsigned long ttwu_local;
258#endif
259};
260
261static DEFINE_PER_CPU(struct runqueue, runqueues);
262
263#define for_each_domain(cpu, domain) \
264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
265
266#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
267#define this_rq() (&__get_cpu_var(runqueues))
268#define task_rq(p) cpu_rq(task_cpu(p))
269#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
270
271/*
272 * Default context-switch locking:
273 */
274#ifndef prepare_arch_switch
275# define prepare_arch_switch(rq, next) do { } while (0)
276# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
277# define task_running(rq, p) ((rq)->curr == (p))
278#endif
279
280/*
281 * task_rq_lock - lock the runqueue a given task resides on and disable
282 * interrupts. Note the ordering: we can safely lookup the task_rq without
283 * explicitly disabling preemption.
284 */
285static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
286 __acquires(rq->lock)
287{
288 struct runqueue *rq;
289
290repeat_lock_task:
291 local_irq_save(*flags);
292 rq = task_rq(p);
293 spin_lock(&rq->lock);
294 if (unlikely(rq != task_rq(p))) {
295 spin_unlock_irqrestore(&rq->lock, *flags);
296 goto repeat_lock_task;
297 }
298 return rq;
299}
300
301static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
302 __releases(rq->lock)
303{
304 spin_unlock_irqrestore(&rq->lock, *flags);
305}
306
307#ifdef CONFIG_SCHEDSTATS
308/*
309 * bump this up when changing the output format or the meaning of an existing
310 * format, so that tools can adapt (or abort)
311 */
312#define SCHEDSTAT_VERSION 11
313
314static int show_schedstat(struct seq_file *seq, void *v)
315{
316 int cpu;
317
318 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
319 seq_printf(seq, "timestamp %lu\n", jiffies);
320 for_each_online_cpu(cpu) {
321 runqueue_t *rq = cpu_rq(cpu);
322#ifdef CONFIG_SMP
323 struct sched_domain *sd;
324 int dcnt = 0;
325#endif
326
327 /* runqueue-specific stats */
328 seq_printf(seq,
329 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
330 cpu, rq->yld_both_empty,
331 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
332 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
333 rq->ttwu_cnt, rq->ttwu_local,
334 rq->rq_sched_info.cpu_time,
335 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
336
337 seq_printf(seq, "\n");
338
339#ifdef CONFIG_SMP
340 /* domain-specific stats */
341 for_each_domain(cpu, sd) {
342 enum idle_type itype;
343 char mask_str[NR_CPUS];
344
345 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
346 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
347 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
348 itype++) {
349 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
350 sd->lb_cnt[itype],
351 sd->lb_balanced[itype],
352 sd->lb_failed[itype],
353 sd->lb_imbalance[itype],
354 sd->lb_gained[itype],
355 sd->lb_hot_gained[itype],
356 sd->lb_nobusyq[itype],
357 sd->lb_nobusyg[itype]);
358 }
359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361 sd->sbe_pushed, sd->sbe_attempts,
362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
363 }
364#endif
365 }
366 return 0;
367}
368
369static int schedstat_open(struct inode *inode, struct file *file)
370{
371 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
372 char *buf = kmalloc(size, GFP_KERNEL);
373 struct seq_file *m;
374 int res;
375
376 if (!buf)
377 return -ENOMEM;
378 res = single_open(file, show_schedstat, NULL);
379 if (!res) {
380 m = file->private_data;
381 m->buf = buf;
382 m->size = size;
383 } else
384 kfree(buf);
385 return res;
386}
387
388struct file_operations proc_schedstat_operations = {
389 .open = schedstat_open,
390 .read = seq_read,
391 .llseek = seq_lseek,
392 .release = single_release,
393};
394
395# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
396# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
397#else /* !CONFIG_SCHEDSTATS */
398# define schedstat_inc(rq, field) do { } while (0)
399# define schedstat_add(rq, field, amt) do { } while (0)
400#endif
401
402/*
403 * this_rq_lock - lock this CPU's runqueue and disable interrupts.
404 */
405static inline runqueue_t *this_rq_lock(void)
406 __acquires(rq->lock)
407{
408 runqueue_t *rq;
409
410 local_irq_disable();
411 rq = this_rq();
412 spin_lock(&rq->lock);
413
414 return rq;
415}
416
417#ifdef CONFIG_SCHED_SMT
418static int cpu_and_siblings_are_idle(int cpu)
419{
420 int sib;
421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
422 if (idle_cpu(sib))
423 continue;
424 return 0;
425 }
426
427 return 1;
428}
429#else
430#define cpu_and_siblings_are_idle(A) idle_cpu(A)
431#endif
432
433#ifdef CONFIG_SCHEDSTATS
434/*
435 * Called when a process is dequeued from the active array and given
436 * the cpu. We should note that with the exception of interactive
437 * tasks, the expired queue will become the active queue after the active
438 * queue is empty, without explicitly dequeuing and requeuing tasks in the
439 * expired queue. (Interactive tasks may be requeued directly to the
440 * active queue, thus delaying tasks in the expired queue from running;
441 * see scheduler_tick()).
442 *
443 * This function is only called from sched_info_arrive(), rather than
444 * dequeue_task(). Even though a task may be queued and dequeued multiple
445 * times as it is shuffled about, we're really interested in knowing how
446 * long it was from the *first* time it was queued to the time that it
447 * finally hit a cpu.
448 */
449static inline void sched_info_dequeued(task_t *t)
450{
451 t->sched_info.last_queued = 0;
452}
453
454/*
455 * Called when a task finally hits the cpu. We can now calculate how
456 * long it was waiting to run. We also note when it began so that we
457 * can keep stats on how long its timeslice is.
458 */
459static inline void sched_info_arrive(task_t *t)
460{
461 unsigned long now = jiffies, diff = 0;
462 struct runqueue *rq = task_rq(t);
463
464 if (t->sched_info.last_queued)
465 diff = now - t->sched_info.last_queued;
466 sched_info_dequeued(t);
467 t->sched_info.run_delay += diff;
468 t->sched_info.last_arrival = now;
469 t->sched_info.pcnt++;
470
471 if (!rq)
472 return;
473
474 rq->rq_sched_info.run_delay += diff;
475 rq->rq_sched_info.pcnt++;
476}
477
478/*
479 * Called when a process is queued into either the active or expired
480 * array. The time is noted and later used to determine how long the
481 * task had to wait to reach the cpu. Since the expired queue will
482 * become the active queue after the active queue is empty, without dequeuing
483 * and requeuing any tasks, we are interested in queuing to either. It
484 * is unusual but not impossible for tasks to be dequeued and immediately
485 * requeued in the same or another array: this can happen in sched_yield(),
486 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
487 * to runqueue.
488 *
489 * This function is only called from enqueue_task(), and it only updates
490 * the timestamp if it is not already set. It's assumed that
491 * sched_info_dequeued() will clear that stamp when appropriate.
492 */
493static inline void sched_info_queued(task_t *t)
494{
495 if (!t->sched_info.last_queued)
496 t->sched_info.last_queued = jiffies;
497}
498
499/*
500 * Called when a process ceases being the active-running process, either
501 * voluntarily or involuntarily. Now we can calculate how long we ran.
502 */
503static inline void sched_info_depart(task_t *t)
504{
505 struct runqueue *rq = task_rq(t);
506 unsigned long diff = jiffies - t->sched_info.last_arrival;
507
508 t->sched_info.cpu_time += diff;
509
510 if (rq)
511 rq->rq_sched_info.cpu_time += diff;
512}
513
514/*
515 * Called when tasks are switched involuntarily due, typically, to expiring
516 * their time slice. (This may also be called when switching to or from
517 * the idle task.) We are only called when prev != next.
518 */
519static inline void sched_info_switch(task_t *prev, task_t *next)
520{
521 struct runqueue *rq = task_rq(prev);
522
523 /*
524 * prev now departs the cpu. It's not interesting to record
525 * stats about how efficient we were at scheduling the idle
526 * process, however.
527 */
528 if (prev != rq->idle)
529 sched_info_depart(prev);
530
531 if (next != rq->idle)
532 sched_info_arrive(next);
533}
534#else
535#define sched_info_queued(t) do { } while (0)
536#define sched_info_switch(t, next) do { } while (0)
537#endif /* CONFIG_SCHEDSTATS */
538
539/*
540 * Adding/removing a task to/from a priority array:
541 */
542static void dequeue_task(struct task_struct *p, prio_array_t *array)
543{
544 array->nr_active--;
545 list_del(&p->run_list);
546 if (list_empty(array->queue + p->prio))
547 __clear_bit(p->prio, array->bitmap);
548}
549
550static void enqueue_task(struct task_struct *p, prio_array_t *array)
551{
552 sched_info_queued(p);
553 list_add_tail(&p->run_list, array->queue + p->prio);
554 __set_bit(p->prio, array->bitmap);
555 array->nr_active++;
556 p->array = array;
557}
558
559/*
560 * Put task to the end of the run list without the overhead of dequeue
561 * followed by enqueue.
562 */
563static void requeue_task(struct task_struct *p, prio_array_t *array)
564{
565 list_move_tail(&p->run_list, array->queue + p->prio);
566}
567
568static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
569{
570 list_add(&p->run_list, array->queue + p->prio);
571 __set_bit(p->prio, array->bitmap);
572 array->nr_active++;
573 p->array = array;
574}
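/*
 * These helpers keep the per-priority lists and the bitmap in sync so that
 * picking the next runnable task stays O(1); the scheduler core consumes
 * the arrays roughly as:
 *
 *	idx = sched_find_first_bit(array->bitmap);
 *	queue = array->queue + idx;
 *	next = list_entry(queue->next, task_t, run_list);
 */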
575
576/*
577 * effective_prio - return the priority that is based on the static
578 * priority but is modified by bonuses/penalties.
579 *
580 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
581 * into the -5 ... 0 ... +5 bonus/penalty range.
582 *
583 * We use 25% of the full 0...39 priority range so that:
584 *
585 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
586 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
587 *
588 * Both properties are important to certain workloads.
589 */
590static int effective_prio(task_t *p)
591{
592 int bonus, prio;
593
594 if (rt_task(p))
595 return p->prio;
596
597 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
598
599 prio = p->static_prio - bonus;
600 if (prio < MAX_RT_PRIO)
601 prio = MAX_RT_PRIO;
602 if (prio > MAX_PRIO-1)
603 prio = MAX_PRIO-1;
604 return prio;
605}
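/*
 * Example (with the usual MAX_RT_PRIO of 100): a nice 0 task has static_prio
 * 120.  With a full sleep_avg, CURRENT_BONUS() is 10, the bonus is +5 and the
 * task runs at prio 115; with a sleep_avg of 0 the bonus is -5 and it runs at
 * prio 125.  The result is always clamped to [MAX_RT_PRIO, MAX_PRIO-1].
 */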
606
607/*
608 * __activate_task - move a task to the runqueue.
609 */
610static inline void __activate_task(task_t *p, runqueue_t *rq)
611{
612 enqueue_task(p, rq->active);
613 rq->nr_running++;
614}
615
616/*
617 * __activate_idle_task - move idle task to the _front_ of runqueue.
618 */
619static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
620{
621 enqueue_task_head(p, rq->active);
622 rq->nr_running++;
623}
624
625static void recalc_task_prio(task_t *p, unsigned long long now)
626{
627 /* Caller must always ensure 'now >= p->timestamp' */
628 unsigned long long __sleep_time = now - p->timestamp;
629 unsigned long sleep_time;
630
631 if (__sleep_time > NS_MAX_SLEEP_AVG)
632 sleep_time = NS_MAX_SLEEP_AVG;
633 else
634 sleep_time = (unsigned long)__sleep_time;
635
636 if (likely(sleep_time > 0)) {
637 /*
638 * User tasks that sleep a long time are categorised as
639 * idle; they are given only interactive status, enough to stay
640 * active while preventing them from suddenly becoming cpu hogs
641 * and starving other processes.
642 */
643 if (p->mm && p->activated != -1 &&
644 sleep_time > INTERACTIVE_SLEEP(p)) {
645 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
646 DEF_TIMESLICE);
647 } else {
648 /*
649 * The lower the sleep avg a task has the more
650 * rapidly it will rise with sleep time.
651 */
652 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
653
654 /*
655 * Tasks waking from uninterruptible sleep are
656 * limited in their sleep_avg rise as they
657 * are likely to be waiting on I/O
658 */
659 if (p->activated == -1 && p->mm) {
660 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
661 sleep_time = 0;
662 else if (p->sleep_avg + sleep_time >=
663 INTERACTIVE_SLEEP(p)) {
664 p->sleep_avg = INTERACTIVE_SLEEP(p);
665 sleep_time = 0;
666 }
667 }
668
669 /*
670 * This code gives a bonus to interactive tasks.
671 *
672 * The boost works by updating the 'average sleep time'
673 * value here, based on ->timestamp. The more time a
674 * task spends sleeping, the higher the average gets -
675 * and the higher the priority boost gets as well.
676 */
677 p->sleep_avg += sleep_time;
678
679 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
680 p->sleep_avg = NS_MAX_SLEEP_AVG;
681 }
682 }
683
684 p->prio = effective_prio(p);
685}
686
687/*
688 * activate_task - move a task to the runqueue and do priority recalculation
689 *
690 * Update all the scheduling statistics stuff. (sleep average
691 * calculation, priority modifiers, etc.)
692 */
693static void activate_task(task_t *p, runqueue_t *rq, int local)
694{
695 unsigned long long now;
696
697 now = sched_clock();
698#ifdef CONFIG_SMP
699 if (!local) {
700 /* Compensate for drifting sched_clock */
701 runqueue_t *this_rq = this_rq();
702 now = (now - this_rq->timestamp_last_tick)
703 + rq->timestamp_last_tick;
704 }
705#endif
706
707 recalc_task_prio(p, now);
708
709 /*
710 * This checks to make sure it's not an uninterruptible task
711 * that is now waking up.
712 */
713 if (!p->activated) {
714 /*
715 * Tasks which were woken up by interrupts (ie. hw events)
716 * are most likely of interactive nature. So we give them
717 * the credit of extending their sleep time to the period
718 * of time they spend on the runqueue, waiting for execution
719 * on a CPU, first time around:
720 */
721 if (in_interrupt())
722 p->activated = 2;
723 else {
724 /*
725 * Normal first-time wakeups get a credit too for
726 * on-runqueue time, but it will be weighted down:
727 */
728 p->activated = 1;
729 }
730 }
731 p->timestamp = now;
732
733 __activate_task(p, rq);
734}
735
736/*
737 * deactivate_task - remove a task from the runqueue.
738 */
739static void deactivate_task(struct task_struct *p, runqueue_t *rq)
740{
741 rq->nr_running--;
742 dequeue_task(p, p->array);
743 p->array = NULL;
744}
745
746/*
747 * resched_task - mark a task 'to be rescheduled now'.
748 *
749 * On UP this means the setting of the need_resched flag, on SMP it
750 * might also involve a cross-CPU call to trigger the scheduler on
751 * the target CPU.
752 */
753#ifdef CONFIG_SMP
754static void resched_task(task_t *p)
755{
756 int need_resched, nrpolling;
757
758 assert_spin_locked(&task_rq(p)->lock);
759
760 /* minimise the chance of sending an interrupt to poll_idle() */
761 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
762 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
763 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
764
765 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
766 smp_send_reschedule(task_cpu(p));
767}
768#else
769static inline void resched_task(task_t *p)
770{
771 set_tsk_need_resched(p);
772}
773#endif
774
775/**
776 * task_curr - is this task currently executing on a CPU?
777 * @p: the task in question.
778 */
779inline int task_curr(const task_t *p)
780{
781 return cpu_curr(task_cpu(p)) == p;
782}
783
784#ifdef CONFIG_SMP
785enum request_type {
786 REQ_MOVE_TASK,
787 REQ_SET_DOMAIN,
788};
789
790typedef struct {
791 struct list_head list;
792 enum request_type type;
793
794 /* For REQ_MOVE_TASK */
795 task_t *task;
796 int dest_cpu;
797
798 /* For REQ_SET_DOMAIN */
799 struct sched_domain *sd;
800
801 struct completion done;
802} migration_req_t;
803
804/*
805 * The task's runqueue lock must be held.
806 * Returns true if you have to wait for migration thread.
807 */
808static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
809{
810 runqueue_t *rq = task_rq(p);
811
812 /*
813 * If the task is not on a runqueue (and not running), then
814 * it is sufficient to simply update the task's cpu field.
815 */
816 if (!p->array && !task_running(rq, p)) {
817 set_task_cpu(p, dest_cpu);
818 return 0;
819 }
820
821 init_completion(&req->done);
822 req->type = REQ_MOVE_TASK;
823 req->task = p;
824 req->dest_cpu = dest_cpu;
825 list_add(&req->list, &rq->migration_queue);
826 return 1;
827}
828
829/*
830 * wait_task_inactive - wait for a thread to unschedule.
831 *
832 * The caller must ensure that the task *will* unschedule sometime soon,
833 * else this function might spin for a *long* time. This function can't
834 * be called with interrupts off, or it may introduce deadlock with
835 * smp_call_function() if an IPI is sent by the same process we are
836 * waiting to become inactive.
837 */
838void wait_task_inactive(task_t * p)
839{
840 unsigned long flags;
841 runqueue_t *rq;
842 int preempted;
843
844repeat:
845 rq = task_rq_lock(p, &flags);
846 /* Must be off runqueue entirely, not preempted. */
847 if (unlikely(p->array || task_running(rq, p))) {
848 /* If it's preempted, we yield. It could be a while. */
849 preempted = !task_running(rq, p);
850 task_rq_unlock(rq, &flags);
851 cpu_relax();
852 if (preempted)
853 yield();
854 goto repeat;
855 }
856 task_rq_unlock(rq, &flags);
857}
858
859/***
860 * kick_process - kick a running thread to enter/exit the kernel
861 * @p: the to-be-kicked thread
862 *
863 * Cause a process which is running on another CPU to enter
864 * kernel-mode, without any delay. (to get signals handled.)
865 *
866 * NOTE: this function doesn't have to take the runqueue lock,
867 * because all it wants to ensure is that the remote task enters
868 * the kernel. If the IPI races and the task has been migrated
869 * to another CPU then no harm is done and the purpose has been
870 * achieved as well.
871 */
872void kick_process(task_t *p)
873{
874 int cpu;
875
876 preempt_disable();
877 cpu = task_cpu(p);
878 if ((cpu != smp_processor_id()) && task_curr(p))
879 smp_send_reschedule(cpu);
880 preempt_enable();
881}
882
883/*
884 * Return a low guess at the load of a migration-source cpu.
885 *
886 * We want to under-estimate the load of migration sources, to
887 * balance conservatively.
888 */
889static inline unsigned long source_load(int cpu)
890{
891 runqueue_t *rq = cpu_rq(cpu);
892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
893
894 return min(rq->cpu_load, load_now);
895}
896
897/*
898 * Return a high guess at the load of a migration-target cpu
899 */
900static inline unsigned long target_load(int cpu)
901{
902 runqueue_t *rq = cpu_rq(cpu);
903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
904
905 return max(rq->cpu_load, load_now);
906}
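/*
 * Example: a runqueue with two runnable tasks (load_now of
 * 2*SCHED_LOAD_SCALE) whose decaying cpu_load is still 3*SCHED_LOAD_SCALE
 * reports 2*SCHED_LOAD_SCALE from source_load() but 3*SCHED_LOAD_SCALE from
 * target_load(): as a migration source it looks lighter, as a target it looks
 * heavier, so both directions err on the side of not moving tasks.
 */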
907
908#endif
909
910/*
911 * wake_idle() will wake a task on an idle cpu if task->cpu is
912 * not idle and an idle cpu is available. The span of cpus to
913 * search starts with the closest cpus and widens further out as needed,
914 * so we always favor a closer, idle cpu.
915 *
916 * Returns the CPU we should wake onto.
917 */
918#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
919static int wake_idle(int cpu, task_t *p)
920{
921 cpumask_t tmp;
922 struct sched_domain *sd;
923 int i;
924
925 if (idle_cpu(cpu))
926 return cpu;
927
928 for_each_domain(cpu, sd) {
929 if (sd->flags & SD_WAKE_IDLE) {
930 cpus_and(tmp, sd->span, cpu_online_map);
931 cpus_and(tmp, tmp, p->cpus_allowed);
932 for_each_cpu_mask(i, tmp) {
933 if (idle_cpu(i))
934 return i;
935 }
936 }
937 else break;
938 }
939 return cpu;
940}
941#else
942static inline int wake_idle(int cpu, task_t *p)
943{
944 return cpu;
945}
946#endif
947
948/***
949 * try_to_wake_up - wake up a thread
950 * @p: the to-be-woken-up thread
951 * @state: the mask of task states that can be woken
952 * @sync: do a synchronous wakeup?
953 *
954 * Put it on the run-queue if it's not already there. The "current"
955 * thread is always on the run-queue (except when the actual
956 * re-schedule is in progress), and as such you're allowed to do
957 * the simpler "current->state = TASK_RUNNING" to mark yourself
958 * runnable without the overhead of this.
959 *
960 * returns failure only if the task is already active.
961 */
962static int try_to_wake_up(task_t * p, unsigned int state, int sync)
963{
964 int cpu, this_cpu, success = 0;
965 unsigned long flags;
966 long old_state;
967 runqueue_t *rq;
968#ifdef CONFIG_SMP
969 unsigned long load, this_load;
970 struct sched_domain *sd;
971 int new_cpu;
972#endif
973
974 rq = task_rq_lock(p, &flags);
975 old_state = p->state;
976 if (!(old_state & state))
977 goto out;
978
979 if (p->array)
980 goto out_running;
981
982 cpu = task_cpu(p);
983 this_cpu = smp_processor_id();
984
985#ifdef CONFIG_SMP
986 if (unlikely(task_running(rq, p)))
987 goto out_activate;
988
989#ifdef CONFIG_SCHEDSTATS
990 schedstat_inc(rq, ttwu_cnt);
991 if (cpu == this_cpu) {
992 schedstat_inc(rq, ttwu_local);
993 } else {
994 for_each_domain(this_cpu, sd) {
995 if (cpu_isset(cpu, sd->span)) {
996 schedstat_inc(sd, ttwu_wake_remote);
997 break;
998 }
999 }
1000 }
1001#endif
1002
1003 new_cpu = cpu;
1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu;
1006
1007 load = source_load(cpu);
1008 this_load = target_load(this_cpu);
1009
1010 /*
1011 * If sync wakeup then subtract the (maximum possible) effect of
1012 * the currently running task from the load of the current CPU:
1013 */
1014 if (sync)
1015 this_load -= SCHED_LOAD_SCALE;
1016
1017 /* Don't pull the task off an idle CPU to a busy one */
1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1019 goto out_set_cpu;
1020
1021 new_cpu = this_cpu; /* Wake to this CPU if we can */
1022
1023 /*
1024 * Scan domains for affine wakeup and passive balancing
1025 * possibilities.
1026 */
1027 for_each_domain(this_cpu, sd) {
1028 unsigned int imbalance;
1029 /*
1030 * Start passive balancing when half the imbalance_pct
1031 * limit is reached.
1032 */
1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1034
1035 if ((sd->flags & SD_WAKE_AFFINE) &&
1036 !task_hot(p, rq->timestamp_last_tick, sd)) {
1037 /*
1038 * This domain has SD_WAKE_AFFINE and p is cache cold
1039 * in this domain.
1040 */
1041 if (cpu_isset(cpu, sd->span)) {
1042 schedstat_inc(sd, ttwu_move_affine);
1043 goto out_set_cpu;
1044 }
1045 } else if ((sd->flags & SD_WAKE_BALANCE) &&
1046 imbalance*this_load <= 100*load) {
1047 /*
1048 * This domain has SD_WAKE_BALANCE and there is
1049 * an imbalance.
1050 */
1051 if (cpu_isset(cpu, sd->span)) {
1052 schedstat_inc(sd, ttwu_move_balance);
1053 goto out_set_cpu;
1054 }
1055 }
1056 }
1057
1058 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1059out_set_cpu:
1060 new_cpu = wake_idle(new_cpu, p);
1061 if (new_cpu != cpu) {
1062 set_task_cpu(p, new_cpu);
1063 task_rq_unlock(rq, &flags);
1064 /* might preempt at this point */
1065 rq = task_rq_lock(p, &flags);
1066 old_state = p->state;
1067 if (!(old_state & state))
1068 goto out;
1069 if (p->array)
1070 goto out_running;
1071
1072 this_cpu = smp_processor_id();
1073 cpu = task_cpu(p);
1074 }
1075
1076out_activate:
1077#endif /* CONFIG_SMP */
1078 if (old_state == TASK_UNINTERRUPTIBLE) {
1079 rq->nr_uninterruptible--;
1080 /*
1081 * Tasks on involuntary sleep don't earn
1082 * sleep_avg beyond just interactive state.
1083 */
1084 p->activated = -1;
1085 }
1086
1087 /*
1088 * Sync wakeups (i.e. those types of wakeups where the waker
1089 * has indicated that it will leave the CPU in short order)
1090 * don't trigger a preemption, if the woken up task will run on
1091 * this cpu. (in this case the 'I will reschedule' promise of
1092 * the waker guarantees that the freshly woken up task is going
1093 * to be considered on this CPU.)
1094 */
1095 activate_task(p, rq, cpu == this_cpu);
1096 if (!sync || cpu != this_cpu) {
1097 if (TASK_PREEMPTS_CURR(p, rq))
1098 resched_task(rq->curr);
1099 }
1100 success = 1;
1101
1102out_running:
1103 p->state = TASK_RUNNING;
1104out:
1105 task_rq_unlock(rq, &flags);
1106
1107 return success;
1108}
1109
1110int fastcall wake_up_process(task_t * p)
1111{
1112 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1113 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1114}
1115
1116EXPORT_SYMBOL(wake_up_process);
1117
1118int fastcall wake_up_state(task_t *p, unsigned int state)
1119{
1120 return try_to_wake_up(p, state, 0);
1121}
1122
1123#ifdef CONFIG_SMP
1124static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1125 struct sched_domain *sd);
1126#endif
1127
1128/*
1129 * Perform scheduler related setup for a newly forked process p.
1130 * p is forked by current.
1131 */
1132void fastcall sched_fork(task_t *p)
1133{
1134 /*
1135 * We mark the process as running here, but have not actually
1136 * inserted it onto the runqueue yet. This guarantees that
1137 * nobody will actually run it, and a signal or other external
1138 * event cannot wake it up and insert it on the runqueue either.
1139 */
1140 p->state = TASK_RUNNING;
1141 INIT_LIST_HEAD(&p->run_list);
1142 p->array = NULL;
1143 spin_lock_init(&p->switch_lock);
1144#ifdef CONFIG_SCHEDSTATS
1145 memset(&p->sched_info, 0, sizeof(p->sched_info));
1146#endif
1147#ifdef CONFIG_PREEMPT
1148 /*
1149 * During context-switch we hold precisely one spinlock, which
1150 * schedule_tail drops. (in the common case it's this_rq()->lock,
1151 * but it also can be p->switch_lock.) So we compensate with a count
1152 * of 1. Also, we want to start with kernel preemption disabled.
1153 */
1154 p->thread_info->preempt_count = 1;
1155#endif
1156 /*
1157 * Share the timeslice between parent and child, thus the
1158 * total amount of pending timeslices in the system doesn't change,
1159 * resulting in more scheduling fairness.
1160 */
1161 local_irq_disable();
1162 p->time_slice = (current->time_slice + 1) >> 1;
1163 /*
1164 * The remainder of the first timeslice might be recovered by
1165 * the parent if the child exits early enough.
1166 */
1167 p->first_time_slice = 1;
1168 current->time_slice >>= 1;
1169 p->timestamp = sched_clock();
1170 if (unlikely(!current->time_slice)) {
1171 /*
1172 * This case is rare, it happens when the parent has only
1173 * a single jiffy left from its timeslice. Taking the
1174 * runqueue lock is not a problem.
1175 */
1176 current->time_slice = 1;
1177 preempt_disable();
1178 scheduler_tick();
1179 local_irq_enable();
1180 preempt_enable();
1181 } else
1182 local_irq_enable();
1183}
1184
1185/*
1186 * wake_up_new_task - wake up a newly created task for the first time.
1187 *
1188 * This function will do some initial scheduler statistics housekeeping
1189 * that must be done for every newly created context, then puts the task
1190 * on the runqueue and wakes it.
1191 */
1192void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1193{
1194 unsigned long flags;
1195 int this_cpu, cpu;
1196 runqueue_t *rq, *this_rq;
1197
1198 rq = task_rq_lock(p, &flags);
1199 cpu = task_cpu(p);
1200 this_cpu = smp_processor_id();
1201
1202 BUG_ON(p->state != TASK_RUNNING);
1203
1204 /*
1205 * We decrease the sleep average of forking parents
1206 * and children as well, to keep max-interactive tasks
1207 * from forking tasks that are max-interactive. The parent
1208 * (current) is done further down, under its lock.
1209 */
1210 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1211 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1212
1213 p->prio = effective_prio(p);
1214
1215 if (likely(cpu == this_cpu)) {
1216 if (!(clone_flags & CLONE_VM)) {
1217 /*
1218 * The VM isn't cloned, so we're in a good position to
1219 * do child-runs-first in anticipation of an exec. This
1220 * usually avoids a lot of COW overhead.
1221 */
1222 if (unlikely(!current->array))
1223 __activate_task(p, rq);
1224 else {
1225 p->prio = current->prio;
1226 list_add_tail(&p->run_list, &current->run_list);
1227 p->array = current->array;
1228 p->array->nr_active++;
1229 rq->nr_running++;
1230 }
1231 set_need_resched();
1232 } else
1233 /* Run child last */
1234 __activate_task(p, rq);
1235 /*
1236 * We skip the following code due to cpu == this_cpu
1237 *
1238 * task_rq_unlock(rq, &flags);
1239 * this_rq = task_rq_lock(current, &flags);
1240 */
1241 this_rq = rq;
1242 } else {
1243 this_rq = cpu_rq(this_cpu);
1244
1245 /*
1246 * Not the local CPU - must adjust timestamp. This should
1247 * get optimised away in the !CONFIG_SMP case.
1248 */
1249 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1250 + rq->timestamp_last_tick;
1251 __activate_task(p, rq);
1252 if (TASK_PREEMPTS_CURR(p, rq))
1253 resched_task(rq->curr);
1254
1255 /*
1256 * Parent and child are on different CPUs, now get the
1257 * parent runqueue to update the parent's ->sleep_avg:
1258 */
1259 task_rq_unlock(rq, &flags);
1260 this_rq = task_rq_lock(current, &flags);
1261 }
1262 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1263 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1264 task_rq_unlock(this_rq, &flags);
1265}
1266
1267/*
1268 * Potentially available exiting-child timeslices are
1269 * retrieved here - this way the parent does not get
1270 * penalized for creating too many threads.
1271 *
1272 * (this cannot be used to 'generate' timeslices
1273 * artificially, because any timeslice recovered here
1274 * was given away by the parent in the first place.)
1275 */
1276void fastcall sched_exit(task_t * p)
1277{
1278 unsigned long flags;
1279 runqueue_t *rq;
1280
1281 /*
1282 * If the child was a (relative-) CPU hog then decrease
1283 * the sleep_avg of the parent as well.
1284 */
1285 rq = task_rq_lock(p->parent, &flags);
1286 if (p->first_time_slice) {
1287 p->parent->time_slice += p->time_slice;
1288 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1289 p->parent->time_slice = task_timeslice(p);
1290 }
1291 if (p->sleep_avg < p->parent->sleep_avg)
1292 p->parent->sleep_avg = p->parent->sleep_avg /
1293 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1294 (EXIT_WEIGHT + 1);
1295 task_rq_unlock(rq, &flags);
1296}
1297
1298/**
1299 * finish_task_switch - clean up after a task-switch
1300 * @prev: the thread we just switched away from.
1301 *
1302 * We enter this with the runqueue still locked, and finish_arch_switch()
1303 * will unlock it along with doing any other architecture-specific cleanup
1304 * actions.
1305 *
1306 * Note that we may have delayed dropping an mm in context_switch(). If
1307 * so, we finish that here outside of the runqueue lock. (Doing it
1308 * with the lock held can cause deadlocks; see schedule() for
1309 * details.)
1310 */
1311static inline void finish_task_switch(task_t *prev)
1312 __releases(rq->lock)
1313{
1314 runqueue_t *rq = this_rq();
1315 struct mm_struct *mm = rq->prev_mm;
1316 unsigned long prev_task_flags;
1317
1318 rq->prev_mm = NULL;
1319
1320 /*
1321 * A task struct has one reference for the use as "current".
1322 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1323 * calls schedule one last time. The schedule call will never return,
1324 * and the scheduled task must drop that reference.
1325 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1326 * still held, otherwise prev could be scheduled on another cpu, die
1327 * there before we look at prev->state, and then the reference would
1328 * be dropped twice.
1329 * Manfred Spraul <manfred@colorfullife.com>
1330 */
1331 prev_task_flags = prev->flags;
1332 finish_arch_switch(rq, prev);
1333 if (mm)
1334 mmdrop(mm);
1335 if (unlikely(prev_task_flags & PF_DEAD))
1336 put_task_struct(prev);
1337}
1338
1339/**
1340 * schedule_tail - first thing a freshly forked thread must call.
1341 * @prev: the thread we just switched away from.
1342 */
1343asmlinkage void schedule_tail(task_t *prev)
1344 __releases(rq->lock)
1345{
1346 finish_task_switch(prev);
1347
1348 if (current->set_child_tid)
1349 put_user(current->pid, current->set_child_tid);
1350}
1351
1352/*
1353 * context_switch - switch to the new MM and the new
1354 * thread's register state.
1355 */
1356static inline
1357task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1358{
1359 struct mm_struct *mm = next->mm;
1360 struct mm_struct *oldmm = prev->active_mm;
1361
1362 if (unlikely(!mm)) {
1363 next->active_mm = oldmm;
1364 atomic_inc(&oldmm->mm_count);
1365 enter_lazy_tlb(oldmm, next);
1366 } else
1367 switch_mm(oldmm, mm, next);
1368
1369 if (unlikely(!prev->mm)) {
1370 prev->active_mm = NULL;
1371 WARN_ON(rq->prev_mm);
1372 rq->prev_mm = oldmm;
1373 }
1374
1375 /* Here we just switch the register state and the stack. */
1376 switch_to(prev, next, prev);
1377
1378 return prev;
1379}
1380
1381/*
1382 * nr_running, nr_uninterruptible and nr_context_switches:
1383 *
1384 * externally visible scheduler statistics: current number of runnable
1385 * threads, current number of uninterruptible-sleeping threads, total
1386 * number of context switches performed since bootup.
1387 */
1388unsigned long nr_running(void)
1389{
1390 unsigned long i, sum = 0;
1391
1392 for_each_online_cpu(i)
1393 sum += cpu_rq(i)->nr_running;
1394
1395 return sum;
1396}
1397
1398unsigned long nr_uninterruptible(void)
1399{
1400 unsigned long i, sum = 0;
1401
1402 for_each_cpu(i)
1403 sum += cpu_rq(i)->nr_uninterruptible;
1404
1405 /*
1406 * Since we read the counters lockless, it might be slightly
1407 * inaccurate. Do not allow it to go below zero though:
1408 */
1409 if (unlikely((long)sum < 0))
1410 sum = 0;
1411
1412 return sum;
1413}
1414
1415unsigned long long nr_context_switches(void)
1416{
1417 unsigned long long i, sum = 0;
1418
1419 for_each_cpu(i)
1420 sum += cpu_rq(i)->nr_switches;
1421
1422 return sum;
1423}
1424
1425unsigned long nr_iowait(void)
1426{
1427 unsigned long i, sum = 0;
1428
1429 for_each_cpu(i)
1430 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1431
1432 return sum;
1433}
1434
1435#ifdef CONFIG_SMP
1436
1437/*
1438 * double_rq_lock - safely lock two runqueues
1439 *
1440 * Note this does not disable interrupts like task_rq_lock;
1441 * you need to do so manually before calling.
1442 */
1443static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1444 __acquires(rq1->lock)
1445 __acquires(rq2->lock)
1446{
1447 if (rq1 == rq2) {
1448 spin_lock(&rq1->lock);
1449 __acquire(rq2->lock); /* Fake it out ;) */
1450 } else {
1451 if (rq1 < rq2) {
1452 spin_lock(&rq1->lock);
1453 spin_lock(&rq2->lock);
1454 } else {
1455 spin_lock(&rq2->lock);
1456 spin_lock(&rq1->lock);
1457 }
1458 }
1459}
1460
1461/*
1462 * double_rq_unlock - safely unlock two runqueues
1463 *
1464 * Note this does not restore interrupts like task_rq_unlock;
1465 * you need to do so manually after calling.
1466 */
1467static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1468 __releases(rq1->lock)
1469 __releases(rq2->lock)
1470{
1471 spin_unlock(&rq1->lock);
1472 if (rq1 != rq2)
1473 spin_unlock(&rq2->lock);
1474 else
1475 __release(rq2->lock);
1476}
1477
1478/*
1479 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1480 */
1481static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1482 __releases(this_rq->lock)
1483 __acquires(busiest->lock)
1484 __acquires(this_rq->lock)
1485{
1486 if (unlikely(!spin_trylock(&busiest->lock))) {
1487 if (busiest < this_rq) {
1488 spin_unlock(&this_rq->lock);
1489 spin_lock(&busiest->lock);
1490 spin_lock(&this_rq->lock);
1491 } else
1492 spin_lock(&busiest->lock);
1493 }
1494}
1495
1496/*
1497 * find_idlest_cpu - find the least busy runqueue.
1498 */
1499static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1500 struct sched_domain *sd)
1501{
1502 unsigned long load, min_load, this_load;
1503 int i, min_cpu;
1504 cpumask_t mask;
1505
1506 min_cpu = UINT_MAX;
1507 min_load = ULONG_MAX;
1508
1509 cpus_and(mask, sd->span, p->cpus_allowed);
1510
1511 for_each_cpu_mask(i, mask) {
1512 load = target_load(i);
1513
1514 if (load < min_load) {
1515 min_cpu = i;
1516 min_load = load;
1517
1518 /* break out early on an idle CPU: */
1519 if (!min_load)
1520 break;
1521 }
1522 }
1523
1524 /* add +1 to account for the new task */
1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1526
1527 /*
1528 * With the addition of the new task to the current
1529 * CPU, would there be an imbalance between this
1530 * CPU and the idlest CPU?
1531 *
1532 * Use half of the balancing threshold - new-context is
1533 * a good opportunity to balance.
1534 */
1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1536 return min_cpu;
1537
1538 return this_cpu;
1539}
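/*
 * Example: with a typical imbalance_pct of 125 the test above becomes
 * min_load * 112 < this_load * 100, i.e. the idlest CPU is only chosen when
 * its load is below roughly 89% of this CPU's load (which already includes
 * the +SCHED_LOAD_SCALE added for the new task).
 */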
1540
1541/*
1542 * If dest_cpu is allowed for this process, migrate the task to it.
1543 * This is accomplished by forcing the cpu_allowed mask to only
1544 * allow dest_cpu, which will force the task onto dest_cpu. Then
1545 * the cpu_allowed mask is restored.
1546 */
1547static void sched_migrate_task(task_t *p, int dest_cpu)
1548{
1549 migration_req_t req;
1550 runqueue_t *rq;
1551 unsigned long flags;
1552
1553 rq = task_rq_lock(p, &flags);
1554 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1555 || unlikely(cpu_is_offline(dest_cpu)))
1556 goto out;
1557
1558 /* force the process onto the specified CPU */
1559 if (migrate_task(p, dest_cpu, &req)) {
1560 /* Need to wait for migration thread (might exit: take ref). */
1561 struct task_struct *mt = rq->migration_thread;
1562 get_task_struct(mt);
1563 task_rq_unlock(rq, &flags);
1564 wake_up_process(mt);
1565 put_task_struct(mt);
1566 wait_for_completion(&req.done);
1567 return;
1568 }
1569out:
1570 task_rq_unlock(rq, &flags);
1571}
1572
1573/*
1574 * sched_exec(): find the highest-level, exec-balance-capable
1575 * domain and try to migrate the task to the least loaded CPU.
1576 *
1577 * execve() is a valuable balancing opportunity, because at this point
1578 * the task has the smallest effective memory and cache footprint.
1579 */
1580void sched_exec(void)
1581{
1582 struct sched_domain *tmp, *sd = NULL;
1583 int new_cpu, this_cpu = get_cpu();
1584
1585 /* Prefer the current CPU if there's only this task running */
1586 if (this_rq()->nr_running <= 1)
1587 goto out;
1588
1589 for_each_domain(this_cpu, tmp)
1590 if (tmp->flags & SD_BALANCE_EXEC)
1591 sd = tmp;
1592
1593 if (sd) {
1594 schedstat_inc(sd, sbe_attempts);
1595 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1596 if (new_cpu != this_cpu) {
1597 schedstat_inc(sd, sbe_pushed);
1598 put_cpu();
1599 sched_migrate_task(current, new_cpu);
1600 return;
1601 }
1602 }
1603out:
1604 put_cpu();
1605}
1606
1607/*
1608 * pull_task - move a task from a remote runqueue to the local runqueue.
1609 * Both runqueues must be locked.
1610 */
1611static inline
1612void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1613 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1614{
1615 dequeue_task(p, src_array);
1616 src_rq->nr_running--;
1617 set_task_cpu(p, this_cpu);
1618 this_rq->nr_running++;
1619 enqueue_task(p, this_array);
1620 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1621 + this_rq->timestamp_last_tick;
1622 /*
1623 * Note that idle threads have a prio of MAX_PRIO, so this test
1624 * is always true for them.
1625 */
1626 if (TASK_PREEMPTS_CURR(p, this_rq))
1627 resched_task(this_rq->curr);
1628}
1629
1630/*
1631 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1632 */
1633static inline
1634int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1635 struct sched_domain *sd, enum idle_type idle)
1636{
1637 /*
1638 * We do not migrate tasks that are:
1639 * 1) running (obviously), or
1640	 * 2) not allowed onto this CPU due to cpus_allowed, or
1641	 * 3) cache-hot on their current CPU.
1642 */
1643 if (task_running(rq, p))
1644 return 0;
1645 if (!cpu_isset(this_cpu, p->cpus_allowed))
1646 return 0;
1647
1648 /*
1649 * Aggressive migration if:
1650 * 1) the [whole] cpu is idle, or
1651 * 2) too many balance attempts have failed.
1652 */
1653
1654 if (cpu_and_siblings_are_idle(this_cpu) || \
1655 sd->nr_balance_failed > sd->cache_nice_tries)
1656 return 1;
1657
1658 if (task_hot(p, rq->timestamp_last_tick, sd))
1659 return 0;
1660 return 1;
1661}
1662
1663/*
1664 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1665 * as part of a balancing operation within "domain". Returns the number of
1666 * tasks moved.
1667 *
1668 * Called with both runqueues locked.
1669 */
1670static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1671 unsigned long max_nr_move, struct sched_domain *sd,
1672 enum idle_type idle)
1673{
1674 prio_array_t *array, *dst_array;
1675 struct list_head *head, *curr;
1676 int idx, pulled = 0;
1677 task_t *tmp;
1678
1679 if (max_nr_move <= 0 || busiest->nr_running <= 1)
1680 goto out;
1681
1682 /*
1683 * We first consider expired tasks. Those will likely not be
1684 * executed in the near future, and they are most likely to
1685 * be cache-cold, thus switching CPUs has the least effect
1686 * on them.
1687 */
1688 if (busiest->expired->nr_active) {
1689 array = busiest->expired;
1690 dst_array = this_rq->expired;
1691 } else {
1692 array = busiest->active;
1693 dst_array = this_rq->active;
1694 }
1695
1696new_array:
1697 /* Start searching at priority 0: */
1698 idx = 0;
1699skip_bitmap:
1700 if (!idx)
1701 idx = sched_find_first_bit(array->bitmap);
1702 else
1703 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1704 if (idx >= MAX_PRIO) {
1705 if (array == busiest->expired && busiest->active->nr_active) {
1706 array = busiest->active;
1707 dst_array = this_rq->active;
1708 goto new_array;
1709 }
1710 goto out;
1711 }
1712
1713 head = array->queue + idx;
1714 curr = head->prev;
1715skip_queue:
1716 tmp = list_entry(curr, task_t, run_list);
1717
1718 curr = curr->prev;
1719
1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1721 if (curr != head)
1722 goto skip_queue;
1723 idx++;
1724 goto skip_bitmap;
1725 }
1726
1727#ifdef CONFIG_SCHEDSTATS
1728 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
1729 schedstat_inc(sd, lb_hot_gained[idle]);
1730#endif
1731
1732 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1733 pulled++;
1734
1735 /* We only want to steal up to the prescribed number of tasks. */
1736 if (pulled < max_nr_move) {
1737 if (curr != head)
1738 goto skip_queue;
1739 idx++;
1740 goto skip_bitmap;
1741 }
1742out:
1743 /*
1744 * Right now, this is the only place pull_task() is called,
1745 * so we can safely collect pull_task() stats here rather than
1746 * inside pull_task().
1747 */
1748 schedstat_add(sd, lb_gained[idle], pulled);
1749 return pulled;
1750}
1751
1752/*
1753 * find_busiest_group finds and returns the busiest CPU group within the
1754 * domain. It calculates and returns the number of tasks which should be
1755 * moved to restore balance via the imbalance parameter.
1756 */
1757static struct sched_group *
1758find_busiest_group(struct sched_domain *sd, int this_cpu,
1759 unsigned long *imbalance, enum idle_type idle)
1760{
1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1763
1764 max_load = this_load = total_load = total_pwr = 0;
1765
1766 do {
1767 unsigned long load;
1768 int local_group;
1769 int i;
1770
1771 local_group = cpu_isset(this_cpu, group->cpumask);
1772
1773 /* Tally up the load of all CPUs in the group */
1774 avg_load = 0;
1775
1776 for_each_cpu_mask(i, group->cpumask) {
1777 /* Bias balancing toward cpus of our domain */
1778 if (local_group)
1779 load = target_load(i);
1780 else
1781 load = source_load(i);
1782
1783 avg_load += load;
1784 }
1785
1786 total_load += avg_load;
1787 total_pwr += group->cpu_power;
1788
1789 /* Adjust by relative CPU power of the group */
1790 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1791
1792 if (local_group) {
1793 this_load = avg_load;
1794 this = group;
1795 goto nextgroup;
1796 } else if (avg_load > max_load) {
1797 max_load = avg_load;
1798 busiest = group;
1799 }
1800nextgroup:
1801 group = group->next;
1802 } while (group != sd->groups);
1803
1804 if (!busiest || this_load >= max_load)
1805 goto out_balanced;
1806
1807 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1808
1809 if (this_load >= avg_load ||
1810 100*max_load <= sd->imbalance_pct*this_load)
1811 goto out_balanced;
1812
1813 /*
1814 * We're trying to get all the cpus to the average_load, so we don't
1815 * want to push ourselves above the average load, nor do we wish to
1816 * reduce the max loaded cpu below the average load, as either of these
1817 * actions would just result in more rebalancing later, and ping-pong
1818 * tasks around. Thus we look for the minimum possible imbalance.
1819 * Negative imbalances (*we* are more loaded than anyone else) will
1820 * be counted as no imbalance for these purposes -- we can't fix that
1821 * by pulling tasks to us. Be careful of negative numbers as they'll
1822 * appear as very large values with unsigned longs.
1823 */
1824 /* How much load to actually move to equalise the imbalance */
1825 *imbalance = min((max_load - avg_load) * busiest->cpu_power,
1826 (avg_load - this_load) * this->cpu_power)
1827 / SCHED_LOAD_SCALE;
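	/*
	 * Worked example with illustrative numbers (SCHED_LOAD_SCALE of 128
	 * and a cpu_power of 128 for both groups are assumed): group loads
	 * of 384 and 128 give avg_load == 256, so the two candidate moves
	 * are (384-256)*128 and (256-128)*128 and *imbalance == 16384/128
	 * == 128, exactly one task's worth of load, which the final division
	 * by SCHED_LOAD_SCALE below turns into an imbalance of 1 task.
	 */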
1828
1829 if (*imbalance < SCHED_LOAD_SCALE) {
1830 unsigned long pwr_now = 0, pwr_move = 0;
1831 unsigned long tmp;
1832
1833 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1834 *imbalance = 1;
1835 return busiest;
1836 }
1837
1838 /*
1839 * OK, we don't have enough imbalance to justify moving tasks,
1840 * however we may be able to increase total CPU power used by
1841 * moving them.
1842 */
1843
1844 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1845 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1846 pwr_now /= SCHED_LOAD_SCALE;
1847
1848 /* Amount of load we'd subtract */
1849 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1850 if (max_load > tmp)
1851 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1852 max_load - tmp);
1853
1854 /* Amount of load we'd add */
1855 if (max_load*busiest->cpu_power <
1856 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
1857 tmp = max_load*busiest->cpu_power/this->cpu_power;
1858 else
1859 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1860 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1861 pwr_move /= SCHED_LOAD_SCALE;
1862
1863 /* Move if we gain throughput */
1864 if (pwr_move <= pwr_now)
1865 goto out_balanced;
1866
1867 *imbalance = 1;
1868 return busiest;
1869 }
1870
1871 /* Get rid of the scaling factor, rounding down as we divide */
1872 *imbalance = *imbalance / SCHED_LOAD_SCALE;
1873
1874 return busiest;
1875
1876out_balanced:
1877 if (busiest && (idle == NEWLY_IDLE ||
1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1879 *imbalance = 1;
1880 return busiest;
1881 }
1882
1883 *imbalance = 0;
1884 return NULL;
1885}
1886
1887/*
1888 * find_busiest_queue - find the busiest runqueue among the cpus in group.
1889 */
1890static runqueue_t *find_busiest_queue(struct sched_group *group)
1891{
1892 unsigned long load, max_load = 0;
1893 runqueue_t *busiest = NULL;
1894 int i;
1895
1896 for_each_cpu_mask(i, group->cpumask) {
1897 load = source_load(i);
1898
1899 if (load > max_load) {
1900 max_load = load;
1901 busiest = cpu_rq(i);
1902 }
1903 }
1904
1905 return busiest;
1906}
1907
1908/*
1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move
1910 * tasks if there is an imbalance.
1911 *
1912 * Called with this_rq unlocked.
1913 */
1914static int load_balance(int this_cpu, runqueue_t *this_rq,
1915 struct sched_domain *sd, enum idle_type idle)
1916{
1917 struct sched_group *group;
1918 runqueue_t *busiest;
1919 unsigned long imbalance;
1920 int nr_moved;
1921
1922 spin_lock(&this_rq->lock);
1923 schedstat_inc(sd, lb_cnt[idle]);
1924
1925 group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1926 if (!group) {
1927 schedstat_inc(sd, lb_nobusyg[idle]);
1928 goto out_balanced;
1929 }
1930
1931 busiest = find_busiest_queue(group);
1932 if (!busiest) {
1933 schedstat_inc(sd, lb_nobusyq[idle]);
1934 goto out_balanced;
1935 }
1936
1937 /*
1938 * This should be "impossible", but since load
1939 * balancing is inherently racy and statistical,
1940 * it could happen in theory.
1941 */
1942 if (unlikely(busiest == this_rq)) {
1943 WARN_ON(1);
1944 goto out_balanced;
1945 }
1946
1947 schedstat_add(sd, lb_imbalance[idle], imbalance);
1948
1949 nr_moved = 0;
1950 if (busiest->nr_running > 1) {
1951 /*
1952 * Attempt to move tasks. If find_busiest_group has found
1953 * an imbalance but busiest->nr_running <= 1, the group is
1954 * still unbalanced. nr_moved simply stays zero, so it is
1955 * correctly treated as an imbalance.
1956 */
1957 double_lock_balance(this_rq, busiest);
1958 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1959 imbalance, sd, idle);
1960 spin_unlock(&busiest->lock);
1961 }
1962 spin_unlock(&this_rq->lock);
1963
1964 if (!nr_moved) {
1965 schedstat_inc(sd, lb_failed[idle]);
1966 sd->nr_balance_failed++;
1967
1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1969 int wake = 0;
1970
1971 spin_lock(&busiest->lock);
1972 if (!busiest->active_balance) {
1973 busiest->active_balance = 1;
1974 busiest->push_cpu = this_cpu;
1975 wake = 1;
1976 }
1977 spin_unlock(&busiest->lock);
1978 if (wake)
1979 wake_up_process(busiest->migration_thread);
1980
1981 /*
1982 * We've kicked active balancing, reset the failure
1983 * counter.
1984 */
1985 sd->nr_balance_failed = sd->cache_nice_tries;
1986 }
1987
1988 /*
1989 * We were unbalanced, but unsuccessful in move_tasks(),
1990 * so bump the balance_interval to lessen the lock contention.
1991 */
1992 if (sd->balance_interval < sd->max_interval)
1993 sd->balance_interval++;
1994 } else {
1995 sd->nr_balance_failed = 0;
1996
1997 /* We were unbalanced, so reset the balancing interval */
1998 sd->balance_interval = sd->min_interval;
1999 }
2000
2001 return nr_moved;
2002
2003out_balanced:
2004 spin_unlock(&this_rq->lock);
2005
2006 schedstat_inc(sd, lb_balanced[idle]);
2007
2008 /* tune up the balancing interval */
2009 if (sd->balance_interval < sd->max_interval)
2010 sd->balance_interval *= 2;
2011
2012 return 0;
2013}
2014
2015/*
2016 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2017 * tasks if there is an imbalance.
2018 *
2019 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2020 * this_rq is locked.
2021 */
2022static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2023 struct sched_domain *sd)
2024{
2025 struct sched_group *group;
2026 runqueue_t *busiest = NULL;
2027 unsigned long imbalance;
2028 int nr_moved = 0;
2029
2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2032 if (!group) {
2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2035 goto out;
2036 }
2037
2038 busiest = find_busiest_queue(group);
2039 if (!busiest || busiest == this_rq) {
2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2042 goto out;
2043 }
2044
2045 /* Attempt to move tasks */
2046 double_lock_balance(this_rq, busiest);
2047
2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2049 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2050 imbalance, sd, NEWLY_IDLE);
2051 if (!nr_moved)
2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2053
2054 spin_unlock(&busiest->lock);
2055
2056out:
2057 return nr_moved;
2058}
2059
2060/*
2061 * idle_balance is called by schedule() if this_cpu is about to become
2062 * idle. Attempts to pull tasks from other CPUs.
2063 */
2064static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2065{
2066 struct sched_domain *sd;
2067
2068 for_each_domain(this_cpu, sd) {
2069 if (sd->flags & SD_BALANCE_NEWIDLE) {
2070 if (load_balance_newidle(this_cpu, this_rq, sd)) {
2071 /* We've pulled tasks over so stop searching */
2072 break;
2073 }
2074 }
2075 }
2076}
2077
2078/*
2079 * active_load_balance is run by migration threads. It pushes running tasks
2080 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2081 * running on each physical CPU where possible, and avoids physical /
2082 * logical imbalances.
2083 *
2084 * Called with busiest_rq locked.
2085 */
2086static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2087{
2088 struct sched_domain *sd;
2089 struct sched_group *cpu_group;
2090 runqueue_t *target_rq;
2091 cpumask_t visited_cpus;
2092 int cpu;
2093
2094 /*
2095 * Search for suitable CPUs to push tasks to in successively higher
2096 * domains with SD_LOAD_BALANCE set.
2097 */
2098 visited_cpus = CPU_MASK_NONE;
2099 for_each_domain(busiest_cpu, sd) {
2100 if (!(sd->flags & SD_LOAD_BALANCE))
2101 /* no more domains to search */
2102 break;
2103
2104 schedstat_inc(sd, alb_cnt);
2105
2106 cpu_group = sd->groups;
2107 do {
2108 for_each_cpu_mask(cpu, cpu_group->cpumask) {
2109 if (busiest_rq->nr_running <= 1)
2110 /* no more tasks left to move */
2111 return;
2112 if (cpu_isset(cpu, visited_cpus))
2113 continue;
2114 cpu_set(cpu, visited_cpus);
2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
2116 continue;
2117
2118 target_rq = cpu_rq(cpu);
2119 /*
2120 * This condition is "impossible", if it occurs
2121 * we need to fix it. Originally reported by
2122 * Bjorn Helgaas on a 128-cpu setup.
2123 */
2124 BUG_ON(busiest_rq == target_rq);
2125
2126 /* move a task from busiest_rq to target_rq */
2127 double_lock_balance(busiest_rq, target_rq);
2128 if (move_tasks(target_rq, cpu, busiest_rq,
2129 1, sd, SCHED_IDLE)) {
2130 schedstat_inc(sd, alb_pushed);
2131 } else {
2132 schedstat_inc(sd, alb_failed);
2133 }
2134 spin_unlock(&target_rq->lock);
2135 }
2136 cpu_group = cpu_group->next;
2137 } while (cpu_group != sd->groups);
2138 }
2139}
2140
2141/*
2142 * rebalance_tick will get called every timer tick, on every CPU.
2143 *
2144 * It checks each scheduling domain to see if it is due to be balanced,
2145 * and initiates a balancing operation if so.
2146 *
2147 * Balancing parameters are set up in arch_init_sched_domains.
2148 */
2149
2150/* Don't have all balancing operations going off at once */
2151#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
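/*
 * For example, with HZ of 1000 and NR_CPUS of 4 (illustrative values),
 * CPU_OFFSET() staggers the CPUs at 0, 250, 500 and 750 jiffies so that
 * their balancing checks do not all fire in the same jiffy.
 */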
2152
2153static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2154 enum idle_type idle)
2155{
2156 unsigned long old_load, this_load;
2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2158 struct sched_domain *sd;
2159
2160 /* Update our load */
2161 old_load = this_rq->cpu_load;
2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2163 /*
2164 * Round up the averaging division if load is increasing. This
2165 * prevents us from getting stuck on 9 if the load is 10, for
2166 * example.
2167 */
2168 if (this_load > old_load)
2169 old_load++;
2170 this_rq->cpu_load = (old_load + this_load) / 2;
2171
2172 for_each_domain(this_cpu, sd) {
2173 unsigned long interval;
2174
2175 if (!(sd->flags & SD_LOAD_BALANCE))
2176 continue;
2177
2178 interval = sd->balance_interval;
2179 if (idle != SCHED_IDLE)
2180 interval *= sd->busy_factor;
2181
2182 /* scale ms to jiffies */
2183 interval = msecs_to_jiffies(interval);
2184 if (unlikely(!interval))
2185 interval = 1;
2186
2187 if (j - sd->last_balance >= interval) {
2188 if (load_balance(this_cpu, this_rq, sd, idle)) {
2189 /* We've pulled tasks over so no longer idle */
2190 idle = NOT_IDLE;
2191 }
2192 sd->last_balance += interval;
2193 }
2194 }
2195}
2196#else
2197/*
2198 * on UP we do not need to balance between CPUs:
2199 */
2200static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2201{
2202}
2203static inline void idle_balance(int cpu, runqueue_t *rq)
2204{
2205}
2206#endif
2207
2208static inline int wake_priority_sleeper(runqueue_t *rq)
2209{
2210 int ret = 0;
2211#ifdef CONFIG_SCHED_SMT
2212 spin_lock(&rq->lock);
2213 /*
2214	 * If an SMT sibling task has been put to sleep for priority
2215	 * reasons, reschedule the idle task to see if it can now run.
2216 */
2217 if (rq->nr_running) {
2218 resched_task(rq->idle);
2219 ret = 1;
2220 }
2221 spin_unlock(&rq->lock);
2222#endif
2223 return ret;
2224}
2225
2226DEFINE_PER_CPU(struct kernel_stat, kstat);
2227
2228EXPORT_PER_CPU_SYMBOL(kstat);
2229
2230/*
2231 * This is called on clock ticks and on context switches.
2232 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2233 */
2234static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
2235 unsigned long long now)
2236{
2237 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
2238 p->sched_time += now - last;
2239}
2240
2241/*
2242 * Return current->sched_time plus any more ns on the sched_clock
2243 * that have not yet been banked.
2244 */
2245unsigned long long current_sched_time(const task_t *tsk)
2246{
2247 unsigned long long ns;
2248 unsigned long flags;
2249 local_irq_save(flags);
2250 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
2251 ns = tsk->sched_time + (sched_clock() - ns);
2252 local_irq_restore(flags);
2253 return ns;
2254}
2255
2256/*
2257 * We place interactive tasks back into the active array, if possible.
2258 *
2259 * To guarantee that this does not starve expired tasks we ignore the
2260 * interactivity of a task if the first expired task had to wait more
2261 * than a 'reasonable' amount of time. This deadline timeout is
2262 * load-dependent, as the frequency of array switches decreases with
2263 * an increasing number of running tasks. We also ignore the interactivity
2264 * if a better static_prio task has expired:
2265 */
2266#define EXPIRED_STARVING(rq) \
2267 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2268 (jiffies - (rq)->expired_timestamp >= \
2269 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2270 ((rq)->curr->static_prio > (rq)->best_expired_prio))
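/*
 * Reading the macro with illustrative numbers: if STARVATION_LIMIT is S
 * jiffies and 4 tasks are runnable, the expired array counts as starving
 * once the first task that expired has waited at least 4*S + 1 jiffies,
 * or as soon as a task with a better static_prio than the currently
 * running one has expired.
 */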
2271
2272/*
2273 * Account user cpu time to a process.
2274 * @p: the process that the cpu time gets accounted to
2276 * @cputime: the cpu time spent in user space since the last update
2277 */
2278void account_user_time(struct task_struct *p, cputime_t cputime)
2279{
2280 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2281 cputime64_t tmp;
2282
2283 p->utime = cputime_add(p->utime, cputime);
2284
2285 /* Add user time to cpustat. */
2286 tmp = cputime_to_cputime64(cputime);
2287 if (TASK_NICE(p) > 0)
2288 cpustat->nice = cputime64_add(cpustat->nice, tmp);
2289 else
2290 cpustat->user = cputime64_add(cpustat->user, tmp);
2291}
2292
2293/*
2294 * Account system cpu time to a process.
2295 * @p: the process that the cpu time gets accounted to
2296 * @hardirq_offset: the offset to subtract from hardirq_count()
2297 * @cputime: the cpu time spent in kernel space since the last update
2298 */
2299void account_system_time(struct task_struct *p, int hardirq_offset,
2300 cputime_t cputime)
2301{
2302 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2303 runqueue_t *rq = this_rq();
2304 cputime64_t tmp;
2305
2306 p->stime = cputime_add(p->stime, cputime);
2307
2308 /* Add system time to cpustat. */
2309 tmp = cputime_to_cputime64(cputime);
2310 if (hardirq_count() - hardirq_offset)
2311 cpustat->irq = cputime64_add(cpustat->irq, tmp);
2312 else if (softirq_count())
2313 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2314 else if (p != rq->idle)
2315 cpustat->system = cputime64_add(cpustat->system, tmp);
2316 else if (atomic_read(&rq->nr_iowait) > 0)
2317 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2318 else
2319 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2320 /* Account for system time used */
2321 acct_update_integrals(p);
2322 /* Update rss highwater mark */
2323 update_mem_hiwater(p);
2324}
2325
2326/*
2327 * Account for involuntary wait time.
2328 * @p: the process from which the cpu time has been stolen
2329 * @steal: the cpu time spent in involuntary wait
2330 */
2331void account_steal_time(struct task_struct *p, cputime_t steal)
2332{
2333 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2334 cputime64_t tmp = cputime_to_cputime64(steal);
2335 runqueue_t *rq = this_rq();
2336
2337 if (p == rq->idle) {
2338 p->stime = cputime_add(p->stime, steal);
2339 if (atomic_read(&rq->nr_iowait) > 0)
2340 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2341 else
2342 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2343 } else
2344 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2345}
2346
2347/*
2348 * This function gets called by the timer code, with HZ frequency.
2349 * We call it with interrupts disabled.
2350 *
2351 * It also gets called by the fork code, when changing the parent's
2352 * timeslices.
2353 */
2354void scheduler_tick(void)
2355{
2356 int cpu = smp_processor_id();
2357 runqueue_t *rq = this_rq();
2358 task_t *p = current;
2359 unsigned long long now = sched_clock();
2360
2361 update_cpu_clock(p, rq, now);
2362
2363 rq->timestamp_last_tick = now;
2364
2365 if (p == rq->idle) {
2366 if (wake_priority_sleeper(rq))
2367 goto out;
2368 rebalance_tick(cpu, rq, SCHED_IDLE);
2369 return;
2370 }
2371
2372 /* Task might have expired already, but not scheduled off yet */
2373 if (p->array != rq->active) {
2374 set_tsk_need_resched(p);
2375 goto out;
2376 }
2377 spin_lock(&rq->lock);
2378 /*
2379 * The task was running during this tick - update the
2380 * time slice counter. Note: we do not update a thread's
2381 * priority until it either goes to sleep or uses up its
2382 * timeslice. This makes it possible for interactive tasks
2383 * to use up their timeslices at their highest priority levels.
2384 */
2385 if (rt_task(p)) {
2386 /*
2387 * RR tasks need a special form of timeslice management.
2388 * FIFO tasks have no timeslices.
2389 */
2390 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2391 p->time_slice = task_timeslice(p);
2392 p->first_time_slice = 0;
2393 set_tsk_need_resched(p);
2394
2395 /* put it at the end of the queue: */
2396 requeue_task(p, rq->active);
2397 }
2398 goto out_unlock;
2399 }
2400 if (!--p->time_slice) {
2401 dequeue_task(p, rq->active);
2402 set_tsk_need_resched(p);
2403 p->prio = effective_prio(p);
2404 p->time_slice = task_timeslice(p);
2405 p->first_time_slice = 0;
2406
2407 if (!rq->expired_timestamp)
2408 rq->expired_timestamp = jiffies;
2409 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2410 enqueue_task(p, rq->expired);
2411 if (p->static_prio < rq->best_expired_prio)
2412 rq->best_expired_prio = p->static_prio;
2413 } else
2414 enqueue_task(p, rq->active);
2415 } else {
2416 /*
2417 * Prevent a too long timeslice allowing a task to monopolize
2418 * the CPU. We do this by splitting up the timeslice into
2419 * smaller pieces.
2420 *
2421 * Note: this does not mean the task's timeslices expire or
2422 * get lost in any way, they just might be preempted by
2423 * another task of equal priority. (one with higher
2424 * priority would have preempted this task already.) We
2425 * requeue this task to the end of the list on this priority
2426 * level, which is in essence a round-robin of tasks with
2427 * equal priority.
2428 *
2429 * This only applies to tasks in the interactive
2430 * delta range with at least TIMESLICE_GRANULARITY to requeue.
2431 */
2432 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2433 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2434 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2435 (p->array == rq->active)) {
2436
2437 requeue_task(p, rq->active);
2438 set_tsk_need_resched(p);
2439 }
2440 }
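	/*
	 * Illustrative numbers for the requeue above: assuming, purely for
	 * the example, a 100-tick timeslice and a 25-tick
	 * TIMESLICE_GRANULARITY, an interactive task is requeued after it
	 * has used 25, 50 and 75 ticks (it still has a full granule left
	 * each time); the last 25 ticks end in the expiry path instead.
	 */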
2441out_unlock:
2442 spin_unlock(&rq->lock);
2443out:
2444 rebalance_tick(cpu, rq, NOT_IDLE);
2445}
2446
2447#ifdef CONFIG_SCHED_SMT
2448static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2449{
2450 struct sched_domain *sd = this_rq->sd;
2451 cpumask_t sibling_map;
2452 int i;
2453
2454 if (!(sd->flags & SD_SHARE_CPUPOWER))
2455 return;
2456
2457 /*
2458 * Unlock the current runqueue because we have to lock in
2459 * CPU order to avoid deadlocks. Caller knows that we might
2460 * unlock. We keep IRQs disabled.
2461 */
2462 spin_unlock(&this_rq->lock);
2463
2464 sibling_map = sd->span;
2465
2466 for_each_cpu_mask(i, sibling_map)
2467 spin_lock(&cpu_rq(i)->lock);
2468 /*
2469 * We clear this CPU from the mask. This both simplifies the
2470	 * inner loop and keeps this_rq locked when we exit:
2471 */
2472 cpu_clear(this_cpu, sibling_map);
2473
2474 for_each_cpu_mask(i, sibling_map) {
2475 runqueue_t *smt_rq = cpu_rq(i);
2476
2477 /*
2478 * If an SMT sibling task is sleeping due to priority
2479 * reasons wake it up now.
2480 */
2481 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2482 resched_task(smt_rq->idle);
2483 }
2484
2485 for_each_cpu_mask(i, sibling_map)
2486 spin_unlock(&cpu_rq(i)->lock);
2487 /*
2488 * We exit with this_cpu's rq still held and IRQs
2489 * still disabled:
2490 */
2491}
2492
2493static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2494{
2495 struct sched_domain *sd = this_rq->sd;
2496 cpumask_t sibling_map;
2497 prio_array_t *array;
2498 int ret = 0, i;
2499 task_t *p;
2500
2501 if (!(sd->flags & SD_SHARE_CPUPOWER))
2502 return 0;
2503
2504 /*
2505 * The same locking rules and details apply as for
2506 * wake_sleeping_dependent():
2507 */
2508 spin_unlock(&this_rq->lock);
2509 sibling_map = sd->span;
2510 for_each_cpu_mask(i, sibling_map)
2511 spin_lock(&cpu_rq(i)->lock);
2512 cpu_clear(this_cpu, sibling_map);
2513
2514 /*
2515 * Establish next task to be run - it might have gone away because
2516 * we released the runqueue lock above:
2517 */
2518 if (!this_rq->nr_running)
2519 goto out_unlock;
2520 array = this_rq->active;
2521 if (!array->nr_active)
2522 array = this_rq->expired;
2523 BUG_ON(!array->nr_active);
2524
2525 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2526 task_t, run_list);
2527
2528 for_each_cpu_mask(i, sibling_map) {
2529 runqueue_t *smt_rq = cpu_rq(i);
2530 task_t *smt_curr = smt_rq->curr;
2531
2532 /*
2533 * If a user task with lower static priority than the
2534 * running task on the SMT sibling is trying to schedule,
2535 * delay it till there is proportionately less timeslice
2536 * left of the sibling task to prevent a lower priority
2537 * task from using an unfair proportion of the
2538 * physical cpu's resources. -ck
2539 */
2540 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2541 task_timeslice(p) || rt_task(smt_curr)) &&
2542 p->mm && smt_curr->mm && !rt_task(p))
2543 ret = 1;
2544
2545 /*
2546 * Reschedule a lower priority task on the SMT sibling,
2547 * or wake it up if it has been put to sleep for priority
2548 * reasons.
2549 */
2550 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2551 task_timeslice(smt_curr) || rt_task(p)) &&
2552 smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2553 (smt_curr == smt_rq->idle && smt_rq->nr_running))
2554 resched_task(smt_curr);
2555 }
2556out_unlock:
2557 for_each_cpu_mask(i, sibling_map)
2558 spin_unlock(&cpu_rq(i)->lock);
2559 return ret;
2560}
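/*
 * A worked example of the throttling test in dependent_sleeper() (the
 * per_cpu_gain value is an assumption here, say 25): a sibling's remaining
 * timeslice is discounted to 75%, so with smt_curr holding 100 ticks the
 * scheduling of p is delayed (ret = 1) whenever task_timeslice(p) is below
 * 75 ticks (or smt_curr is an RT task), provided both tasks have an mm and
 * p itself is not an RT task.
 */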
2561#else
2562static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2563{
2564}
2565
2566static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2567{
2568 return 0;
2569}
2570#endif
2571
2572#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2573
2574void fastcall add_preempt_count(int val)
2575{
2576 /*
2577 * Underflow?
2578 */
2579 BUG_ON(((int)preempt_count() < 0));
2580 preempt_count() += val;
2581 /*
2582 * Spinlock count overflowing soon?
2583 */
2584 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2585}
2586EXPORT_SYMBOL(add_preempt_count);
2587
2588void fastcall sub_preempt_count(int val)
2589{
2590 /*
2591 * Underflow?
2592 */
2593 BUG_ON(val > preempt_count());
2594 /*
2595 * Is the spinlock portion underflowing?
2596 */
2597 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2598 preempt_count() -= val;
2599}
2600EXPORT_SYMBOL(sub_preempt_count);
2601
2602#endif
2603
2604/*
2605 * schedule() is the main scheduler function.
2606 */
2607asmlinkage void __sched schedule(void)
2608{
2609 long *switch_count;
2610 task_t *prev, *next;
2611 runqueue_t *rq;
2612 prio_array_t *array;
2613 struct list_head *queue;
2614 unsigned long long now;
2615 unsigned long run_time;
2616 int cpu, idx;
2617
2618 /*
2619 * Test if we are atomic. Since do_exit() needs to call into
2620 * schedule() atomically, we ignore that path for now.
2621 * Otherwise, whine if we are scheduling when we should not be.
2622 */
2623 if (likely(!current->exit_state)) {
2624 if (unlikely(in_atomic())) {
2625 printk(KERN_ERR "scheduling while atomic: "
2626 "%s/0x%08x/%d\n",
2627 current->comm, preempt_count(), current->pid);
2628 dump_stack();
2629 }
2630 }
2631 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2632
2633need_resched:
2634 preempt_disable();
2635 prev = current;
2636 release_kernel_lock(prev);
2637need_resched_nonpreemptible:
2638 rq = this_rq();
2639
2640 /*
2641 * The idle thread is not allowed to schedule!
2642 * Remove this check after it has been exercised a bit.
2643 */
2644 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2645 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2646 dump_stack();
2647 }
2648
2649 schedstat_inc(rq, sched_cnt);
2650 now = sched_clock();
2651 if (likely((long long)now - prev->timestamp < NS_MAX_SLEEP_AVG)) {
2652 run_time = now - prev->timestamp;
2653 if (unlikely((long long)now - prev->timestamp < 0))
2654 run_time = 0;
2655 } else
2656 run_time = NS_MAX_SLEEP_AVG;
2657
2658 /*
2659	 * Tasks are charged proportionately less run_time at high sleep_avg
2660	 * to delay them losing their interactive status
2661 */
2662 run_time /= (CURRENT_BONUS(prev) ? : 1);
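	/*
	 * For instance, assuming a maximum interactivity bonus of 10: a task
	 * at the top of the sleep_avg range is charged only one tenth of the
	 * time it actually ran, while a task with no bonus is charged in
	 * full (the "?: 1" avoids a division by zero).
	 */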
2663
2664 spin_lock_irq(&rq->lock);
2665
2666 if (unlikely(prev->flags & PF_DEAD))
2667 prev->state = EXIT_DEAD;
2668
2669 switch_count = &prev->nivcsw;
2670 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2671 switch_count = &prev->nvcsw;
2672 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2673 unlikely(signal_pending(prev))))
2674 prev->state = TASK_RUNNING;
2675 else {
2676 if (prev->state == TASK_UNINTERRUPTIBLE)
2677 rq->nr_uninterruptible++;
2678 deactivate_task(prev, rq);
2679 }
2680 }
2681
2682 cpu = smp_processor_id();
2683 if (unlikely(!rq->nr_running)) {
2684go_idle:
2685 idle_balance(cpu, rq);
2686 if (!rq->nr_running) {
2687 next = rq->idle;
2688 rq->expired_timestamp = 0;
2689 wake_sleeping_dependent(cpu, rq);
2690 /*
2691 * wake_sleeping_dependent() might have released
2692 * the runqueue, so break out if we got new
2693 * tasks meanwhile:
2694 */
2695 if (!rq->nr_running)
2696 goto switch_tasks;
2697 }
2698 } else {
2699 if (dependent_sleeper(cpu, rq)) {
2700 next = rq->idle;
2701 goto switch_tasks;
2702 }
2703 /*
2704 * dependent_sleeper() releases and reacquires the runqueue
2705 * lock, hence go into the idle loop if the rq went
2706 * empty meanwhile:
2707 */
2708 if (unlikely(!rq->nr_running))
2709 goto go_idle;
2710 }
2711
2712 array = rq->active;
2713 if (unlikely(!array->nr_active)) {
2714 /*
2715 * Switch the active and expired arrays.
2716 */
2717 schedstat_inc(rq, sched_switch);
2718 rq->active = rq->expired;
2719 rq->expired = array;
2720 array = rq->active;
2721 rq->expired_timestamp = 0;
2722 rq->best_expired_prio = MAX_PRIO;
2723 }
2724
2725 idx = sched_find_first_bit(array->bitmap);
2726 queue = array->queue + idx;
2727 next = list_entry(queue->next, task_t, run_list);
2728
2729 if (!rt_task(next) && next->activated > 0) {
2730 unsigned long long delta = now - next->timestamp;
2731 if (unlikely((long long)now - next->timestamp < 0))
2732 delta = 0;
2733
2734 if (next->activated == 1)
2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2736
2737 array = next->array;
2738 dequeue_task(next, array);
2739 recalc_task_prio(next, next->timestamp + delta);
2740 enqueue_task(next, array);
2741 }
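	/*
	 * Example for the scaling above (ON_RUNQUEUE_WEIGHT is assumed to be
	 * 30 here): a task whose wakeup did not come from interrupt context
	 * (activated == 1) and that then waited 100ms on the runqueue has
	 * that wait scaled by 30*128/100/128, so only about 30ms of it is
	 * credited by recalc_task_prio(); interrupt-driven wakeups
	 * (activated == 2) keep the whole delta.
	 */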
2742 next->activated = 0;
2743switch_tasks:
2744 if (next == rq->idle)
2745 schedstat_inc(rq, sched_goidle);
2746 prefetch(next);
2747 clear_tsk_need_resched(prev);
2748 rcu_qsctr_inc(task_cpu(prev));
2749
2750 update_cpu_clock(prev, rq, now);
2751
2752 prev->sleep_avg -= run_time;
2753 if ((long)prev->sleep_avg <= 0)
2754 prev->sleep_avg = 0;
2755 prev->timestamp = prev->last_ran = now;
2756
2757 sched_info_switch(prev, next);
2758 if (likely(prev != next)) {
2759 next->timestamp = now;
2760 rq->nr_switches++;
2761 rq->curr = next;
2762 ++*switch_count;
2763
2764 prepare_arch_switch(rq, next);
2765 prev = context_switch(rq, prev, next);
2766 barrier();
2767
2768 finish_task_switch(prev);
2769 } else
2770 spin_unlock_irq(&rq->lock);
2771
2772 prev = current;
2773 if (unlikely(reacquire_kernel_lock(prev) < 0))
2774 goto need_resched_nonpreemptible;
2775 preempt_enable_no_resched();
2776 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2777 goto need_resched;
2778}
2779
2780EXPORT_SYMBOL(schedule);
2781
2782#ifdef CONFIG_PREEMPT
2783/*
2784 * this is the entry point to schedule() from in-kernel preemption
2785 * off of preempt_enable. Kernel preemptions off of return from
2786 * interrupt occur there and call schedule directly.
2787 */
2788asmlinkage void __sched preempt_schedule(void)
2789{
2790 struct thread_info *ti = current_thread_info();
2791#ifdef CONFIG_PREEMPT_BKL
2792 struct task_struct *task = current;
2793 int saved_lock_depth;
2794#endif
2795 /*
2796 * If there is a non-zero preempt_count or interrupts are disabled,
2797 * we do not want to preempt the current task. Just return..
2798 */
2799 if (unlikely(ti->preempt_count || irqs_disabled()))
2800 return;
2801
2802need_resched:
2803 add_preempt_count(PREEMPT_ACTIVE);
2804 /*
2805 * We keep the big kernel semaphore locked, but we
2806	 * clear ->lock_depth so that schedule() doesn't
2807 * auto-release the semaphore:
2808 */
2809#ifdef CONFIG_PREEMPT_BKL
2810 saved_lock_depth = task->lock_depth;
2811 task->lock_depth = -1;
2812#endif
2813 schedule();
2814#ifdef CONFIG_PREEMPT_BKL
2815 task->lock_depth = saved_lock_depth;
2816#endif
2817 sub_preempt_count(PREEMPT_ACTIVE);
2818
2819 /* we could miss a preemption opportunity between schedule and now */
2820 barrier();
2821 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2822 goto need_resched;
2823}
2824
2825EXPORT_SYMBOL(preempt_schedule);
2826
2827/*
2828 * this is the entry point to schedule() from kernel preemption
2829 * off of irq context.
2830 * Note that this is called and returns with irqs disabled. This will
2831 * protect us against recursive calling from irq.
2832 */
2833asmlinkage void __sched preempt_schedule_irq(void)
2834{
2835 struct thread_info *ti = current_thread_info();
2836#ifdef CONFIG_PREEMPT_BKL
2837 struct task_struct *task = current;
2838 int saved_lock_depth;
2839#endif
2840	/* Catch callers which need to be fixed */
2841 BUG_ON(ti->preempt_count || !irqs_disabled());
2842
2843need_resched:
2844 add_preempt_count(PREEMPT_ACTIVE);
2845 /*
2846 * We keep the big kernel semaphore locked, but we
2847	 * clear ->lock_depth so that schedule() doesn't
2848 * auto-release the semaphore:
2849 */
2850#ifdef CONFIG_PREEMPT_BKL
2851 saved_lock_depth = task->lock_depth;
2852 task->lock_depth = -1;
2853#endif
2854 local_irq_enable();
2855 schedule();
2856 local_irq_disable();
2857#ifdef CONFIG_PREEMPT_BKL
2858 task->lock_depth = saved_lock_depth;
2859#endif
2860 sub_preempt_count(PREEMPT_ACTIVE);
2861
2862 /* we could miss a preemption opportunity between schedule and now */
2863 barrier();
2864 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2865 goto need_resched;
2866}
2867
2868#endif /* CONFIG_PREEMPT */
2869
2870int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2871{
2872 task_t *p = curr->task;
2873 return try_to_wake_up(p, mode, sync);
2874}
2875
2876EXPORT_SYMBOL(default_wake_function);
2877
2878/*
2879 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2880 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2881 * number) then we wake all the non-exclusive tasks and one exclusive task.
2882 *
2883 * There are circumstances in which we can try to wake a task which has already
2884 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2885 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2886 */
2887static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2888 int nr_exclusive, int sync, void *key)
2889{
2890 struct list_head *tmp, *next;
2891
2892 list_for_each_safe(tmp, next, &q->task_list) {
2893 wait_queue_t *curr;
2894 unsigned flags;
2895 curr = list_entry(tmp, wait_queue_t, task_list);
2896 flags = curr->flags;
2897 if (curr->func(curr, mode, sync, key) &&
2898 (flags & WQ_FLAG_EXCLUSIVE) &&
2899 !--nr_exclusive)
2900 break;
2901 }
2902}
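/*
 * Example of the exclusive-wakeup rule above: with nr_exclusive == 1 and a
 * queue holding two non-exclusive waiters followed by two exclusive ones,
 * the loop wakes both non-exclusive waiters and the first exclusive waiter,
 * then stops -- the second exclusive waiter stays asleep.
 */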
2903
2904/**
2905 * __wake_up - wake up threads blocked on a waitqueue.
2906 * @q: the waitqueue
2907 * @mode: which threads
2908 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2909 */
2910void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2911 int nr_exclusive, void *key)
2912{
2913 unsigned long flags;
2914
2915 spin_lock_irqsave(&q->lock, flags);
2916 __wake_up_common(q, mode, nr_exclusive, 0, key);
2917 spin_unlock_irqrestore(&q->lock, flags);
2918}
2919
2920EXPORT_SYMBOL(__wake_up);
2921
2922/*
2923 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2924 */
2925void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2926{
2927 __wake_up_common(q, mode, 1, 0, NULL);
2928}
2929
2930/**
2931 * __wake_up_sync - wake up threads blocked on a waitqueue.
2932 * @q: the waitqueue
2933 * @mode: which threads
2934 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2935 *
2936 * The sync wakeup differs in that the waker knows that it will schedule
2937 * away soon, so while the target thread will be woken up, it will not
2938 * be migrated to another CPU - ie. the two threads are 'synchronized'
2939 * with each other. This can prevent needless bouncing between CPUs.
2940 *
2941 * On UP it can prevent extra preemption.
2942 */
2943void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2944{
2945 unsigned long flags;
2946 int sync = 1;
2947
2948 if (unlikely(!q))
2949 return;
2950
2951 if (unlikely(!nr_exclusive))
2952 sync = 0;
2953
2954 spin_lock_irqsave(&q->lock, flags);
2955 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2956 spin_unlock_irqrestore(&q->lock, flags);
2957}
2958EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
2959
2960void fastcall complete(struct completion *x)
2961{
2962 unsigned long flags;
2963
2964 spin_lock_irqsave(&x->wait.lock, flags);
2965 x->done++;
2966 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2967 1, 0, NULL);
2968 spin_unlock_irqrestore(&x->wait.lock, flags);
2969}
2970EXPORT_SYMBOL(complete);
2971
2972void fastcall complete_all(struct completion *x)
2973{
2974 unsigned long flags;
2975
2976 spin_lock_irqsave(&x->wait.lock, flags);
2977 x->done += UINT_MAX/2;
2978 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2979 0, 0, NULL);
2980 spin_unlock_irqrestore(&x->wait.lock, flags);
2981}
2982EXPORT_SYMBOL(complete_all);
2983
2984void fastcall __sched wait_for_completion(struct completion *x)
2985{
2986 might_sleep();
2987 spin_lock_irq(&x->wait.lock);
2988 if (!x->done) {
2989 DECLARE_WAITQUEUE(wait, current);
2990
2991 wait.flags |= WQ_FLAG_EXCLUSIVE;
2992 __add_wait_queue_tail(&x->wait, &wait);
2993 do {
2994 __set_current_state(TASK_UNINTERRUPTIBLE);
2995 spin_unlock_irq(&x->wait.lock);
2996 schedule();
2997 spin_lock_irq(&x->wait.lock);
2998 } while (!x->done);
2999 __remove_wait_queue(&x->wait, &wait);
3000 }
3001 x->done--;
3002 spin_unlock_irq(&x->wait.lock);
3003}
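/*
 * A minimal usage sketch for the completion API above (illustrative only;
 * the names below are made up):
 *
 *	DECLARE_COMPLETION(my_done);
 *
 *	-- in the worker --
 *	do_the_work();
 *	complete(&my_done);
 *
 *	-- in the waiter --
 *	wait_for_completion(&my_done);
 *
 * wait_for_completion() sleeps in TASK_UNINTERRUPTIBLE and consumes one
 * 'done' count, so each complete() releases exactly one waiter.
 */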
3004EXPORT_SYMBOL(wait_for_completion);
3005
3006unsigned long fastcall __sched
3007wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3008{
3009 might_sleep();
3010
3011 spin_lock_irq(&x->wait.lock);
3012 if (!x->done) {
3013 DECLARE_WAITQUEUE(wait, current);
3014
3015 wait.flags |= WQ_FLAG_EXCLUSIVE;
3016 __add_wait_queue_tail(&x->wait, &wait);
3017 do {
3018 __set_current_state(TASK_UNINTERRUPTIBLE);
3019 spin_unlock_irq(&x->wait.lock);
3020 timeout = schedule_timeout(timeout);
3021 spin_lock_irq(&x->wait.lock);
3022 if (!timeout) {
3023 __remove_wait_queue(&x->wait, &wait);
3024 goto out;
3025 }
3026 } while (!x->done);
3027 __remove_wait_queue(&x->wait, &wait);
3028 }
3029 x->done--;
3030out:
3031 spin_unlock_irq(&x->wait.lock);
3032 return timeout;
3033}
3034EXPORT_SYMBOL(wait_for_completion_timeout);
3035
3036int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3037{
3038 int ret = 0;
3039
3040 might_sleep();
3041
3042 spin_lock_irq(&x->wait.lock);
3043 if (!x->done) {
3044 DECLARE_WAITQUEUE(wait, current);
3045
3046 wait.flags |= WQ_FLAG_EXCLUSIVE;
3047 __add_wait_queue_tail(&x->wait, &wait);
3048 do {
3049 if (signal_pending(current)) {
3050 ret = -ERESTARTSYS;
3051 __remove_wait_queue(&x->wait, &wait);
3052 goto out;
3053 }
3054 __set_current_state(TASK_INTERRUPTIBLE);
3055 spin_unlock_irq(&x->wait.lock);
3056 schedule();
3057 spin_lock_irq(&x->wait.lock);
3058 } while (!x->done);
3059 __remove_wait_queue(&x->wait, &wait);
3060 }
3061 x->done--;
3062out:
3063 spin_unlock_irq(&x->wait.lock);
3064
3065 return ret;
3066}
3067EXPORT_SYMBOL(wait_for_completion_interruptible);
3068
3069unsigned long fastcall __sched
3070wait_for_completion_interruptible_timeout(struct completion *x,
3071 unsigned long timeout)
3072{
3073 might_sleep();
3074
3075 spin_lock_irq(&x->wait.lock);
3076 if (!x->done) {
3077 DECLARE_WAITQUEUE(wait, current);
3078
3079 wait.flags |= WQ_FLAG_EXCLUSIVE;
3080 __add_wait_queue_tail(&x->wait, &wait);
3081 do {
3082 if (signal_pending(current)) {
3083 timeout = -ERESTARTSYS;
3084 __remove_wait_queue(&x->wait, &wait);
3085 goto out;
3086 }
3087 __set_current_state(TASK_INTERRUPTIBLE);
3088 spin_unlock_irq(&x->wait.lock);
3089 timeout = schedule_timeout(timeout);
3090 spin_lock_irq(&x->wait.lock);
3091 if (!timeout) {
3092 __remove_wait_queue(&x->wait, &wait);
3093 goto out;
3094 }
3095 } while (!x->done);
3096 __remove_wait_queue(&x->wait, &wait);
3097 }
3098 x->done--;
3099out:
3100 spin_unlock_irq(&x->wait.lock);
3101 return timeout;
3102}
3103EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3104
3105
3106#define SLEEP_ON_VAR \
3107 unsigned long flags; \
3108 wait_queue_t wait; \
3109 init_waitqueue_entry(&wait, current);
3110
3111#define SLEEP_ON_HEAD \
3112 spin_lock_irqsave(&q->lock,flags); \
3113 __add_wait_queue(q, &wait); \
3114 spin_unlock(&q->lock);
3115
3116#define SLEEP_ON_TAIL \
3117 spin_lock_irq(&q->lock); \
3118 __remove_wait_queue(q, &wait); \
3119 spin_unlock_irqrestore(&q->lock, flags);
3120
3121void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3122{
3123 SLEEP_ON_VAR
3124
3125 current->state = TASK_INTERRUPTIBLE;
3126
3127 SLEEP_ON_HEAD
3128 schedule();
3129 SLEEP_ON_TAIL
3130}
3131
3132EXPORT_SYMBOL(interruptible_sleep_on);
3133
3134long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3135{
3136 SLEEP_ON_VAR
3137
3138 current->state = TASK_INTERRUPTIBLE;
3139
3140 SLEEP_ON_HEAD
3141 timeout = schedule_timeout(timeout);
3142 SLEEP_ON_TAIL
3143
3144 return timeout;
3145}
3146
3147EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3148
3149void fastcall __sched sleep_on(wait_queue_head_t *q)
3150{
3151 SLEEP_ON_VAR
3152
3153 current->state = TASK_UNINTERRUPTIBLE;
3154
3155 SLEEP_ON_HEAD
3156 schedule();
3157 SLEEP_ON_TAIL
3158}
3159
3160EXPORT_SYMBOL(sleep_on);
3161
3162long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3163{
3164 SLEEP_ON_VAR
3165
3166 current->state = TASK_UNINTERRUPTIBLE;
3167
3168 SLEEP_ON_HEAD
3169 timeout = schedule_timeout(timeout);
3170 SLEEP_ON_TAIL
3171
3172 return timeout;
3173}
3174
3175EXPORT_SYMBOL(sleep_on_timeout);
3176
3177void set_user_nice(task_t *p, long nice)
3178{
3179 unsigned long flags;
3180 prio_array_t *array;
3181 runqueue_t *rq;
3182 int old_prio, new_prio, delta;
3183
3184 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3185 return;
3186 /*
3187 * We have to be careful, if called from sys_setpriority(),
3188 * the task might be in the middle of scheduling on another CPU.
3189 */
3190 rq = task_rq_lock(p, &flags);
3191 /*
3192 * The RT priorities are set via sched_setscheduler(), but we still
3193 * allow the 'normal' nice value to be set - but as expected
3194	 * it won't have any effect on scheduling until the task
3195	 * becomes SCHED_NORMAL again:
3196 */
3197 if (rt_task(p)) {
3198 p->static_prio = NICE_TO_PRIO(nice);
3199 goto out_unlock;
3200 }
3201 array = p->array;
3202 if (array)
3203 dequeue_task(p, array);
3204
3205 old_prio = p->prio;
3206 new_prio = NICE_TO_PRIO(nice);
3207 delta = new_prio - old_prio;
3208 p->static_prio = NICE_TO_PRIO(nice);
3209 p->prio += delta;
3210
3211 if (array) {
3212 enqueue_task(p, array);
3213 /*
3214 * If the task increased its priority or is running and
3215 * lowered its priority, then reschedule its CPU:
3216 */
3217 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3218 resched_task(rq->curr);
3219 }
3220out_unlock:
3221 task_rq_unlock(rq, &flags);
3222}
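/*
 * For reference (a MAX_RT_PRIO of 100 is assumed here): NICE_TO_PRIO(nice)
 * is 120 + nice, so nice 0 corresponds to a static_prio of 120 and nice -5
 * to 115, with the full nice range -20..19 mapping onto static priorities
 * 100..139.
 */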
3223
3224EXPORT_SYMBOL(set_user_nice);
3225
3226#ifdef __ARCH_WANT_SYS_NICE
3227
3228/*
3229 * sys_nice - change the priority of the current process.
3230 * @increment: priority increment
3231 *
3232 * sys_setpriority is a more generic, but much slower function that
3233 * does similar things.
3234 */
3235asmlinkage long sys_nice(int increment)
3236{
3237 int retval;
3238 long nice;
3239
3240 /*
3241 * Setpriority might change our priority at the same moment.
3242 * We don't have to worry. Conceptually one call occurs first
3243 * and we have a single winner.
3244 */
3245 if (increment < 0) {
3246 if (!capable(CAP_SYS_NICE))
3247 return -EPERM;
3248 if (increment < -40)
3249 increment = -40;
3250 }
3251 if (increment > 40)
3252 increment = 40;
3253
3254 nice = PRIO_TO_NICE(current->static_prio) + increment;
3255 if (nice < -20)
3256 nice = -20;
3257 if (nice > 19)
3258 nice = 19;
3259
3260 retval = security_task_setnice(current, nice);
3261 if (retval)
3262 return retval;
3263
3264 set_user_nice(current, nice);
3265 return 0;
3266}
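/*
 * Example of the clamping above: a task at nice 10 asking for an increment
 * of -40 (which requires CAP_SYS_NICE) keeps the increment at -40, and the
 * resulting nice of -30 is then clamped to -20 before set_user_nice() is
 * called.
 */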
3267
3268#endif
3269
3270/**
3271 * task_prio - return the priority value of a given task.
3272 * @p: the task in question.
3273 *
3274 * This is the priority value as seen by users in /proc.
3275 * RT tasks are offset by -200. Normal tasks are centered
3276 * around 0, value goes from -16 to +15.
3277 */
3278int task_prio(const task_t *p)
3279{
3280 return p->prio - MAX_RT_PRIO;
3281}
3282
3283/**
3284 * task_nice - return the nice value of a given task.
3285 * @p: the task in question.
3286 */
3287int task_nice(const task_t *p)
3288{
3289 return TASK_NICE(p);
3290}
3291
3292/*
3293 * The only users of task_nice are binfmt_elf and binfmt_elf32.
3294 * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3295 * Therefore, task_nice is needed if there is a compat_mode.
3296 */
3297#ifdef CONFIG_COMPAT
3298EXPORT_SYMBOL_GPL(task_nice);
3299#endif
3300
3301/**
3302 * idle_cpu - is a given cpu idle currently?
3303 * @cpu: the processor in question.
3304 */
3305int idle_cpu(int cpu)
3306{
3307 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3308}
3309
3310EXPORT_SYMBOL_GPL(idle_cpu);
3311
3312/**
3313 * idle_task - return the idle task for a given cpu.
3314 * @cpu: the processor in question.
3315 */
3316task_t *idle_task(int cpu)
3317{
3318 return cpu_rq(cpu)->idle;
3319}
3320
3321/**
3322 * find_process_by_pid - find a process with a matching PID value.
3323 * @pid: the pid in question.
3324 */
3325static inline task_t *find_process_by_pid(pid_t pid)
3326{
3327 return pid ? find_task_by_pid(pid) : current;
3328}
3329
3330/* Actually do priority change: must hold rq lock. */
3331static void __setscheduler(struct task_struct *p, int policy, int prio)
3332{
3333 BUG_ON(p->array);
3334 p->policy = policy;
3335 p->rt_priority = prio;
3336 if (policy != SCHED_NORMAL)
3337 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3338 else
3339 p->prio = p->static_prio;
3340}
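/*
 * Illustration of the mapping above (a MAX_USER_RT_PRIO of 100 is assumed):
 * an RT policy with rt_priority 1 yields prio 98 and rt_priority 99 yields
 * prio 0, the highest possible, while SCHED_NORMAL tasks fall back to their
 * nice-derived static_prio in the 100..139 range.
 */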
3341
3342/**
3343 * sched_setscheduler - change the scheduling policy and/or RT priority of
3344 * a thread.
3345 * @p: the task in question.
3346 * @policy: new policy.
3347 * @param: structure containing the new RT priority.
3348 */
3349int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
3350{
3351 int retval;
3352 int oldprio, oldpolicy = -1;
3353 prio_array_t *array;
3354 unsigned long flags;
3355 runqueue_t *rq;
3356
3357recheck:
3358 /* double check policy once rq lock held */
3359 if (policy < 0)
3360 policy = oldpolicy = p->policy;
3361 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3362 policy != SCHED_NORMAL)
3363 return -EINVAL;
3364 /*
3365 * Valid priorities for SCHED_FIFO and SCHED_RR are
3366 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3367 */
3368 if (param->sched_priority < 0 ||
3369 param->sched_priority > MAX_USER_RT_PRIO-1)
3370 return -EINVAL;
3371 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3372 return -EINVAL;
3373
3374 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3375 !capable(CAP_SYS_NICE))
3376 return -EPERM;
3377 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3378 !capable(CAP_SYS_NICE))
3379 return -EPERM;
3380
3381 retval = security_task_setscheduler(p, policy, param);
3382 if (retval)
3383 return retval;
3384 /*
3385	 * To be able to change p->policy safely, the appropriate
3386 * runqueue lock must be held.
3387 */
3388 rq = task_rq_lock(p, &flags);
3389 /* recheck policy now with rq lock held */
3390 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3391 policy = oldpolicy = -1;
3392 task_rq_unlock(rq, &flags);
3393 goto recheck;
3394 }
3395 array = p->array;
3396 if (array)
3397 deactivate_task(p, rq);
3398 oldprio = p->prio;
3399 __setscheduler(p, policy, param->sched_priority);
3400 if (array) {
3401 __activate_task(p, rq);
3402 /*
3403 * Reschedule if we are currently running on this runqueue and
3404 * our priority decreased, or if we are not currently running on
3405 * this runqueue and our priority is higher than the current's
3406 */
3407 if (task_running(rq, p)) {
3408 if (p->prio > oldprio)
3409 resched_task(rq->curr);
3410 } else if (TASK_PREEMPTS_CURR(p, rq))
3411 resched_task(rq->curr);
3412 }
3413 task_rq_unlock(rq, &flags);
3414 return 0;
3415}
3416EXPORT_SYMBOL_GPL(sched_setscheduler);
3417
3418static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3419{
3420 int retval;
3421 struct sched_param lparam;
3422 struct task_struct *p;
3423
3424 if (!param || pid < 0)
3425 return -EINVAL;
3426 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3427 return -EFAULT;
3428 read_lock_irq(&tasklist_lock);
3429 p = find_process_by_pid(pid);
3430 if (!p) {
3431 read_unlock_irq(&tasklist_lock);
3432 return -ESRCH;
3433 }
3434 retval = sched_setscheduler(p, policy, &lparam);
3435 read_unlock_irq(&tasklist_lock);
3436 return retval;
3437}
3438
3439/**
3440 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3441 * @pid: the pid in question.
3442 * @policy: new policy.
3443 * @param: structure containing the new RT priority.
3444 */
3445asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3446 struct sched_param __user *param)
3447{
3448 return do_sched_setscheduler(pid, policy, param);
3449}
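/*
 * From user space this is normally reached through the glibc wrapper of the
 * same name; a minimal sketch (the priority value is illustrative):
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	struct sched_param sp = { .sched_priority = 50 };
 *	if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0)
 *		perror("sched_setscheduler");
 *
 * A pid of 0 means the calling process; as enforced above, SCHED_FIFO and
 * SCHED_RR require CAP_SYS_NICE.
 */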
3450
3451/**
3452 * sys_sched_setparam - set/change the RT priority of a thread
3453 * @pid: the pid in question.
3454 * @param: structure containing the new RT priority.
3455 */
3456asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3457{
3458 return do_sched_setscheduler(pid, -1, param);
3459}
3460
3461/**
3462 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3463 * @pid: the pid in question.
3464 */
3465asmlinkage long sys_sched_getscheduler(pid_t pid)
3466{
3467 int retval = -EINVAL;
3468 task_t *p;
3469
3470 if (pid < 0)
3471 goto out_nounlock;
3472
3473 retval = -ESRCH;
3474 read_lock(&tasklist_lock);
3475 p = find_process_by_pid(pid);
3476 if (p) {
3477 retval = security_task_getscheduler(p);
3478 if (!retval)
3479 retval = p->policy;
3480 }
3481 read_unlock(&tasklist_lock);
3482
3483out_nounlock:
3484 return retval;
3485}
3486
3487/**
3488 * sys_sched_getparam - get the RT priority of a thread
3489 * @pid: the pid in question.
3490 * @param: structure containing the RT priority.
3491 */
3492asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3493{
3494 struct sched_param lp;
3495 int retval = -EINVAL;
3496 task_t *p;
3497
3498 if (!param || pid < 0)
3499 goto out_nounlock;
3500
3501 read_lock(&tasklist_lock);
3502 p = find_process_by_pid(pid);
3503 retval = -ESRCH;
3504 if (!p)
3505 goto out_unlock;
3506
3507 retval = security_task_getscheduler(p);
3508 if (retval)
3509 goto out_unlock;
3510
3511 lp.sched_priority = p->rt_priority;
3512 read_unlock(&tasklist_lock);
3513
3514 /*
3515	 * This one might sleep, so we cannot do it with a spinlock held ...
3516 */
3517 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3518
3519out_nounlock:
3520 return retval;
3521
3522out_unlock:
3523 read_unlock(&tasklist_lock);
3524 return retval;
3525}
3526
3527long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3528{
3529 task_t *p;
3530 int retval;
3531 cpumask_t cpus_allowed;
3532
3533 lock_cpu_hotplug();
3534 read_lock(&tasklist_lock);
3535
3536 p = find_process_by_pid(pid);
3537 if (!p) {
3538 read_unlock(&tasklist_lock);
3539 unlock_cpu_hotplug();
3540 return -ESRCH;
3541 }
3542
3543 /*
3544 * It is not safe to call set_cpus_allowed with the
3545 * tasklist_lock held. We will bump the task_struct's
3546 * usage count and then drop tasklist_lock.
3547 */
3548 get_task_struct(p);
3549 read_unlock(&tasklist_lock);
3550
3551 retval = -EPERM;
3552 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3553 !capable(CAP_SYS_NICE))
3554 goto out_unlock;
3555
3556 cpus_allowed = cpuset_cpus_allowed(p);
3557 cpus_and(new_mask, new_mask, cpus_allowed);
3558 retval = set_cpus_allowed(p, new_mask);
3559
3560out_unlock:
3561 put_task_struct(p);
3562 unlock_cpu_hotplug();
3563 return retval;
3564}
3565
3566static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3567 cpumask_t *new_mask)
3568{
3569 if (len < sizeof(cpumask_t)) {
3570 memset(new_mask, 0, sizeof(cpumask_t));
3571 } else if (len > sizeof(cpumask_t)) {
3572 len = sizeof(cpumask_t);
3573 }
3574 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3575}
3576
3577/**
3578 * sys_sched_setaffinity - set the cpu affinity of a process
3579 * @pid: pid of the process
3580 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3581 * @user_mask_ptr: user-space pointer to the new cpu mask
3582 */
3583asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3584 unsigned long __user *user_mask_ptr)
3585{
3586 cpumask_t new_mask;
3587 int retval;
3588
3589 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3590 if (retval)
3591 return retval;
3592
3593 return sched_setaffinity(pid, new_mask);
3594}
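/*
 * The matching user-space call, as a sketch using the glibc cpu_set_t
 * helpers (the CPU number is illustrative):
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *
 *	cpu_set_t set;
 *	CPU_ZERO(&set);
 *	CPU_SET(1, &set);
 *	sched_setaffinity(0, sizeof(set), &set);
 *
 * The kernel side above ANDs the supplied mask with the task's cpuset
 * before applying it and checks that the caller may modify the target task.
 */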
3595
3596/*
3597 * Represents all CPUs present in the system.
3598 * In systems capable of hotplug, this map could dynamically grow
3599 * as new CPUs are detected in the system via any platform-specific
3600 * method, such as ACPI.
3601 */
3602
3603cpumask_t cpu_present_map;
3604EXPORT_SYMBOL(cpu_present_map);
3605
3606#ifndef CONFIG_SMP
3607cpumask_t cpu_online_map = CPU_MASK_ALL;
3608cpumask_t cpu_possible_map = CPU_MASK_ALL;
3609#endif
3610
3611long sched_getaffinity(pid_t pid, cpumask_t *mask)
3612{
3613 int retval;
3614 task_t *p;
3615
3616 lock_cpu_hotplug();
3617 read_lock(&tasklist_lock);
3618
3619 retval = -ESRCH;
3620 p = find_process_by_pid(pid);
3621 if (!p)
3622 goto out_unlock;
3623
3624 retval = 0;
3625 cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3626
3627out_unlock:
3628 read_unlock(&tasklist_lock);
3629 unlock_cpu_hotplug();
3630 if (retval)
3631 return retval;
3632
3633 return 0;
3634}
3635
3636/**
3637 * sys_sched_getaffinity - get the cpu affinity of a process
3638 * @pid: pid of the process
3639 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3640 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3641 */
3642asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3643 unsigned long __user *user_mask_ptr)
3644{
3645 int ret;
3646 cpumask_t mask;
3647
3648 if (len < sizeof(cpumask_t))
3649 return -EINVAL;
3650
3651 ret = sched_getaffinity(pid, &mask);
3652 if (ret < 0)
3653 return ret;
3654
3655 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3656 return -EFAULT;
3657
3658 return sizeof(cpumask_t);
3659}
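Both affinity syscalls are normally reached through the glibc cpu_set_t wrappers, which return 0 on success even though the raw sys_sched_getaffinity() above returns sizeof(cpumask_t). A minimal sketch, assuming _GNU_SOURCE and at least one online CPU:

/* Hypothetical userspace sketch: pin the calling process to CPU 0 and
 * read the mask back, using the glibc wrappers for the syscalls above. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);                      /* allow CPU 0 only */
        if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
                perror("sched_setaffinity");
                return 1;
        }

        CPU_ZERO(&mask);
        if (sched_getaffinity(0, sizeof(mask), &mask) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        printf("bound to CPU 0: %s\n", CPU_ISSET(0, &mask) ? "yes" : "no");
        return 0;
}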
3660
3661/**
3662 * sys_sched_yield - yield the current processor to other threads.
3663 *
3664 * this function yields the current CPU by moving the calling thread
3665 * to the expired array. If there are no other threads running on this
3666 * CPU then this function will return.
3667 */
3668asmlinkage long sys_sched_yield(void)
3669{
3670 runqueue_t *rq = this_rq_lock();
3671 prio_array_t *array = current->array;
3672 prio_array_t *target = rq->expired;
3673
3674 schedstat_inc(rq, yld_cnt);
3675 /*
3676 * We implement yielding by moving the task into the expired
3677 * queue.
3678 *
3679	 * (special rule: RT tasks will just round-robin in the active
3680 * array.)
3681 */
3682 if (rt_task(current))
3683 target = rq->active;
3684
3685 if (current->array->nr_active == 1) {
3686 schedstat_inc(rq, yld_act_empty);
3687 if (!rq->expired->nr_active)
3688 schedstat_inc(rq, yld_both_empty);
3689 } else if (!rq->expired->nr_active)
3690 schedstat_inc(rq, yld_exp_empty);
3691
3692 if (array != target) {
3693 dequeue_task(current, array);
3694 enqueue_task(current, target);
3695 } else
3696 /*
3697 * requeue_task is cheaper so perform that if possible.
3698 */
3699 requeue_task(current, array);
3700
3701 /*
3702 * Since we are going to call schedule() anyway, there's
3703 * no need to preempt or enable interrupts:
3704 */
3705 __release(rq->lock);
3706 _raw_spin_unlock(&rq->lock);
3707 preempt_enable_no_resched();
3708
3709 schedule();
3710
3711 return 0;
3712}
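Userspace reaches this through the sched_yield() wrapper. A trivial sketch; with no other runnable tasks on this CPU the call returns immediately, as the comment above notes:

/* Hypothetical sketch: the userspace entry point for the syscall above. */
#include <sched.h>
#include <stdio.h>

int main(void)
{
        if (sched_yield() != 0)
                perror("sched_yield");
        return 0;
}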
3713
3714static inline void __cond_resched(void)
3715{
3716 do {
3717 add_preempt_count(PREEMPT_ACTIVE);
3718 schedule();
3719 sub_preempt_count(PREEMPT_ACTIVE);
3720 } while (need_resched());
3721}
3722
3723int __sched cond_resched(void)
3724{
3725 if (need_resched()) {
3726 __cond_resched();
3727 return 1;
3728 }
3729 return 0;
3730}
3731
3732EXPORT_SYMBOL(cond_resched);
3733
3734/*
3735 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
3736 * call schedule, and on return reacquire the lock.
3737 *
3738 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
3739 * operations here to prevent schedule() from being called twice (once via
3740 * spin_unlock(), once by hand).
3741 */
3742int cond_resched_lock(spinlock_t * lock)
3743{
3744 if (need_lockbreak(lock)) {
3745 spin_unlock(lock);
3746 cpu_relax();
3747 spin_lock(lock);
3748 }
3749 if (need_resched()) {
3750 _raw_spin_unlock(lock);
3751 preempt_enable_no_resched();
3752 __cond_resched();
3753 spin_lock(lock);
3754 return 1;
3755 }
3756 return 0;
3757}
3758
3759EXPORT_SYMBOL(cond_resched_lock);
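The drop-reschedule-reacquire pattern that cond_resched_lock() implements has a rough userspace analogue. Purely as an illustrative analogy (this is not kernel code; the mutex, item loop, and batch size are invented for the sketch):

/* Hypothetical userspace analogy of the cond_resched_lock() pattern:
 * periodically drop a contended lock, let other threads run, reacquire. */
#include <pthread.h>
#include <sched.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void process_items(int *items, int n)
{
        pthread_mutex_lock(&lock);
        for (int i = 0; i < n; i++) {
                items[i] *= 2;                  /* some per-item work */
                if (i % 1024 == 1023) {         /* every so often, let others in */
                        pthread_mutex_unlock(&lock);
                        sched_yield();
                        pthread_mutex_lock(&lock);
                }
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        int data[4096] = { 0 };

        process_items(data, 4096);
        return 0;
}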
3760
3761int __sched cond_resched_softirq(void)
3762{
3763 BUG_ON(!in_softirq());
3764
3765 if (need_resched()) {
3766 __local_bh_enable();
3767 __cond_resched();
3768 local_bh_disable();
3769 return 1;
3770 }
3771 return 0;
3772}
3773
3774EXPORT_SYMBOL(cond_resched_softirq);
3775
3776
3777/**
3778 * yield - yield the current processor to other threads.
3779 *
3780 * this is a shortcut for kernel-space yielding - it marks the
3781 * thread runnable and calls sys_sched_yield().
3782 */
3783void __sched yield(void)
3784{
3785 set_current_state(TASK_RUNNING);
3786 sys_sched_yield();
3787}
3788
3789EXPORT_SYMBOL(yield);
3790
3791/*
3792 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
3793 * that process accounting knows that this is a task in IO wait state.
3794 *
3795 * But don't do that if it is a deliberate, throttling IO wait (this task
3796 * has set its backing_dev_info: the queue against which it should throttle)
3797 */
3798void __sched io_schedule(void)
3799{
3800 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
3801
3802 atomic_inc(&rq->nr_iowait);
3803 schedule();
3804 atomic_dec(&rq->nr_iowait);
3805}
3806
3807EXPORT_SYMBOL(io_schedule);
3808
3809long __sched io_schedule_timeout(long timeout)
3810{
3811 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
3812 long ret;
3813
3814 atomic_inc(&rq->nr_iowait);
3815 ret = schedule_timeout(timeout);
3816 atomic_dec(&rq->nr_iowait);
3817 return ret;
3818}
3819
3820/**
3821 * sys_sched_get_priority_max - return maximum RT priority.
3822 * @policy: scheduling class.
3823 *
3824 * this syscall returns the maximum rt_priority that can be used
3825 * by a given scheduling class.
3826 */
3827asmlinkage long sys_sched_get_priority_max(int policy)
3828{
3829 int ret = -EINVAL;
3830
3831 switch (policy) {
3832 case SCHED_FIFO:
3833 case SCHED_RR:
3834 ret = MAX_USER_RT_PRIO-1;
3835 break;
3836 case SCHED_NORMAL:
3837 ret = 0;
3838 break;
3839 }
3840 return ret;
3841}
3842
3843/**
3844 * sys_sched_get_priority_min - return minimum RT priority.
3845 * @policy: scheduling class.
3846 *
3847 * this syscall returns the minimum rt_priority that can be used
3848 * by a given scheduling class.
3849 */
3850asmlinkage long sys_sched_get_priority_min(int policy)
3851{
3852 int ret = -EINVAL;
3853
3854 switch (policy) {
3855 case SCHED_FIFO:
3856 case SCHED_RR:
3857 ret = 1;
3858 break;
3859 case SCHED_NORMAL:
3860 ret = 0;
3861 }
3862 return ret;
3863}
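Userspace typically queries this range before calling sched_setscheduler(). A minimal sketch using the glibc wrappers for the two syscalls above:

/* Hypothetical sketch: print the valid static priority range for SCHED_FIFO. */
#include <sched.h>
#include <stdio.h>

int main(void)
{
        int lo = sched_get_priority_min(SCHED_FIFO);
        int hi = sched_get_priority_max(SCHED_FIFO);

        if (lo == -1 || hi == -1) {
                perror("sched_get_priority_min/max");
                return 1;
        }
        printf("SCHED_FIFO priorities: %d..%d\n", lo, hi);
        return 0;
}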
3864
3865/**
3866 * sys_sched_rr_get_interval - return the default timeslice of a process.
3867 * @pid: pid of the process.
3868 * @interval: userspace pointer to the timeslice value.
3869 *
3870 * this syscall writes the default timeslice value of a given process
3871 * into the user-space timespec buffer. A value of '0' means infinity.
3872 */
3873asmlinkage
3874long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3875{
3876 int retval = -EINVAL;
3877 struct timespec t;
3878 task_t *p;
3879
3880 if (pid < 0)
3881 goto out_nounlock;
3882
3883 retval = -ESRCH;
3884 read_lock(&tasklist_lock);
3885 p = find_process_by_pid(pid);
3886 if (!p)
3887 goto out_unlock;
3888
3889 retval = security_task_getscheduler(p);
3890 if (retval)
3891 goto out_unlock;
3892
3893 jiffies_to_timespec(p->policy & SCHED_FIFO ?
3894 0 : task_timeslice(p), &t);
3895 read_unlock(&tasklist_lock);
3896 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3897out_nounlock:
3898 return retval;
3899out_unlock:
3900 read_unlock(&tasklist_lock);
3901 return retval;
3902}
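The matching glibc wrapper is sched_rr_get_interval(). A minimal sketch that prints the caller's timeslice (0 for SCHED_FIFO, per the code above); pid 0 again selects the calling process:

/* Hypothetical sketch: read the caller's round-robin timeslice. */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == -1) {
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}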
3903
3904static inline struct task_struct *eldest_child(struct task_struct *p)
3905{
3906 if (list_empty(&p->children)) return NULL;
3907 return list_entry(p->children.next,struct task_struct,sibling);
3908}
3909
3910static inline struct task_struct *older_sibling(struct task_struct *p)
3911{
3912 if (p->sibling.prev==&p->parent->children) return NULL;
3913 return list_entry(p->sibling.prev,struct task_struct,sibling);
3914}
3915
3916static inline struct task_struct *younger_sibling(struct task_struct *p)
3917{
3918 if (p->sibling.next==&p->parent->children) return NULL;
3919 return list_entry(p->sibling.next,struct task_struct,sibling);
3920}
3921
3922static void show_task(task_t * p)
3923{
3924 task_t *relative;
3925 unsigned state;
3926 unsigned long free = 0;
3927 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
3928
3929 printk("%-13.13s ", p->comm);
3930 state = p->state ? __ffs(p->state) + 1 : 0;
3931 if (state < ARRAY_SIZE(stat_nam))
3932 printk(stat_nam[state]);
3933 else
3934 printk("?");
3935#if (BITS_PER_LONG == 32)
3936 if (state == TASK_RUNNING)
3937 printk(" running ");
3938 else
3939 printk(" %08lX ", thread_saved_pc(p));
3940#else
3941 if (state == TASK_RUNNING)
3942 printk(" running task ");
3943 else
3944 printk(" %016lx ", thread_saved_pc(p));
3945#endif
3946#ifdef CONFIG_DEBUG_STACK_USAGE
3947 {
3948 unsigned long * n = (unsigned long *) (p->thread_info+1);
3949 while (!*n)
3950 n++;
3951 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3952 }
3953#endif
3954 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3955 if ((relative = eldest_child(p)))
3956 printk("%5d ", relative->pid);
3957 else
3958 printk(" ");
3959 if ((relative = younger_sibling(p)))
3960 printk("%7d", relative->pid);
3961 else
3962 printk(" ");
3963 if ((relative = older_sibling(p)))
3964 printk(" %5d", relative->pid);
3965 else
3966 printk(" ");
3967 if (!p->mm)
3968 printk(" (L-TLB)\n");
3969 else
3970 printk(" (NOTLB)\n");
3971
3972 if (state != TASK_RUNNING)
3973 show_stack(p, NULL);
3974}
3975
3976void show_state(void)
3977{
3978 task_t *g, *p;
3979
3980#if (BITS_PER_LONG == 32)
3981 printk("\n"
3982 " sibling\n");
3983 printk(" task PC pid father child younger older\n");
3984#else
3985 printk("\n"
3986 " sibling\n");
3987 printk(" task PC pid father child younger older\n");
3988#endif
3989 read_lock(&tasklist_lock);
3990 do_each_thread(g, p) {
3991 /*
3992		 * reset the NMI-timeout; listing all tasks on a slow
3993		 * console might take a lot of time:
3994 */
3995 touch_nmi_watchdog();
3996 show_task(p);
3997 } while_each_thread(g, p);
3998
3999 read_unlock(&tasklist_lock);
4000}
4001
4002void __devinit init_idle(task_t *idle, int cpu)
4003{
4004 runqueue_t *rq = cpu_rq(cpu);
4005 unsigned long flags;
4006
4007 idle->sleep_avg = 0;
4008 idle->array = NULL;
4009 idle->prio = MAX_PRIO;
4010 idle->state = TASK_RUNNING;
4011 idle->cpus_allowed = cpumask_of_cpu(cpu);
4012 set_task_cpu(idle, cpu);
4013
4014 spin_lock_irqsave(&rq->lock, flags);
4015 rq->curr = rq->idle = idle;
4016 set_tsk_need_resched(idle);
4017 spin_unlock_irqrestore(&rq->lock, flags);
4018
4019 /* Set the preempt count _outside_ the spinlocks! */
4020#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4021 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
4022#else
4023 idle->thread_info->preempt_count = 0;
4024#endif
4025}
4026
4027/*
4028 * In a system that switches off the HZ timer, nohz_cpu_mask
4029 * indicates which CPUs have entered this state. This is used
4030 * by the RCU update code to wait only for active CPUs. For systems
4031 * which do not switch off the HZ timer, nohz_cpu_mask should
4032 * always be CPU_MASK_NONE.
4033 */
4034cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4035
4036#ifdef CONFIG_SMP
4037/*
4038 * This is how migration works:
4039 *
4040 * 1) we queue a migration_req_t structure in the source CPU's
4041 * runqueue and wake up that CPU's migration thread.
4042 * 2) we wait for the request's completion => thread blocks.
4043 * 3) migration thread wakes up (implicitly it forces the migrated
4044 * thread off the CPU)
4045 * 4) it gets the migration request and checks whether the migrated
4046 * task is still in the wrong runqueue.
4047 * 5) if it's in the wrong runqueue then the migration thread removes
4048 * it and puts it into the right queue.
4049 * 6) the migration thread completes the request.
4050 * 7) we wake up and the migration is done.
4051 */
4052
4053/*
4054 * Change a given task's CPU affinity. Migrate the thread to a
4055 * proper CPU and schedule it away if the CPU it's executing on
4056 * is removed from the allowed bitmask.
4057 *
4058 * NOTE: the caller must have a valid reference to the task, the
4059 * task must not exit() & deallocate itself prematurely. The
4060 * call is not atomic; no spinlocks may be held.
4061 */
4062int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4063{
4064 unsigned long flags;
4065 int ret = 0;
4066 migration_req_t req;
4067 runqueue_t *rq;
4068
4069 rq = task_rq_lock(p, &flags);
4070 if (!cpus_intersects(new_mask, cpu_online_map)) {
4071 ret = -EINVAL;
4072 goto out;
4073 }
4074
4075 p->cpus_allowed = new_mask;
4076 /* Can the task run on the task's current CPU? If so, we're done */
4077 if (cpu_isset(task_cpu(p), new_mask))
4078 goto out;
4079
4080 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4081 /* Need help from migration thread: drop lock and wait. */
4082 task_rq_unlock(rq, &flags);
4083 wake_up_process(rq->migration_thread);
4084 wait_for_completion(&req.done);
4085 tlb_migrate_finish(p->mm);
4086 return 0;
4087 }
4088out:
4089 task_rq_unlock(rq, &flags);
4090 return ret;
4091}
4092
4093EXPORT_SYMBOL_GPL(set_cpus_allowed);
4094
4095/*
4096 * Move (not current) task off this cpu, onto dest cpu. We're doing
4097 * this because either it can't run here any more (set_cpus_allowed()
4098 * away from this CPU, or CPU going down), or because we're
4099 * attempting to rebalance this task on exec (sched_exec).
4100 *
4101 * So we race with normal scheduler movements, but that's OK, as long
4102 * as the task is no longer on this CPU.
4103 */
4104static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4105{
4106 runqueue_t *rq_dest, *rq_src;
4107
4108 if (unlikely(cpu_is_offline(dest_cpu)))
4109 return;
4110
4111 rq_src = cpu_rq(src_cpu);
4112 rq_dest = cpu_rq(dest_cpu);
4113
4114 double_rq_lock(rq_src, rq_dest);
4115 /* Already moved. */
4116 if (task_cpu(p) != src_cpu)
4117 goto out;
4118 /* Affinity changed (again). */
4119 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4120 goto out;
4121
4122 set_task_cpu(p, dest_cpu);
4123 if (p->array) {
4124 /*
4125 * Sync timestamp with rq_dest's before activating.
4126 * The same thing could be achieved by doing this step
4127 * afterwards, and pretending it was a local activate.
4128 * This way is cleaner and logically correct.
4129 */
4130 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4131 + rq_dest->timestamp_last_tick;
4132 deactivate_task(p, rq_src);
4133 activate_task(p, rq_dest, 0);
4134 if (TASK_PREEMPTS_CURR(p, rq_dest))
4135 resched_task(rq_dest->curr);
4136 }
4137
4138out:
4139 double_rq_unlock(rq_src, rq_dest);
4140}
4141
4142/*
4143 * migration_thread - this is a highprio system thread that performs
4144 * thread migration by bumping the thread off its CPU and then
4145 * 'pushing' it onto another runqueue.
4146 */
4147static int migration_thread(void * data)
4148{
4149 runqueue_t *rq;
4150 int cpu = (long)data;
4151
4152 rq = cpu_rq(cpu);
4153 BUG_ON(rq->migration_thread != current);
4154
4155 set_current_state(TASK_INTERRUPTIBLE);
4156 while (!kthread_should_stop()) {
4157 struct list_head *head;
4158 migration_req_t *req;
4159
4160 if (current->flags & PF_FREEZE)
4161 refrigerator(PF_FREEZE);
4162
4163 spin_lock_irq(&rq->lock);
4164
4165 if (cpu_is_offline(cpu)) {
4166 spin_unlock_irq(&rq->lock);
4167 goto wait_to_die;
4168 }
4169
4170 if (rq->active_balance) {
4171 active_load_balance(rq, cpu);
4172 rq->active_balance = 0;
4173 }
4174
4175 head = &rq->migration_queue;
4176
4177 if (list_empty(head)) {
4178 spin_unlock_irq(&rq->lock);
4179 schedule();
4180 set_current_state(TASK_INTERRUPTIBLE);
4181 continue;
4182 }
4183 req = list_entry(head->next, migration_req_t, list);
4184 list_del_init(head->next);
4185
4186 if (req->type == REQ_MOVE_TASK) {
4187 spin_unlock(&rq->lock);
4188 __migrate_task(req->task, cpu, req->dest_cpu);
4189 local_irq_enable();
4190 } else if (req->type == REQ_SET_DOMAIN) {
4191 rq->sd = req->sd;
4192 spin_unlock_irq(&rq->lock);
4193 } else {
4194 spin_unlock_irq(&rq->lock);
4195 WARN_ON(1);
4196 }
4197
4198 complete(&req->done);
4199 }
4200 __set_current_state(TASK_RUNNING);
4201 return 0;
4202
4203wait_to_die:
4204 /* Wait for kthread_stop */
4205 set_current_state(TASK_INTERRUPTIBLE);
4206 while (!kthread_should_stop()) {
4207 schedule();
4208 set_current_state(TASK_INTERRUPTIBLE);
4209 }
4210 __set_current_state(TASK_RUNNING);
4211 return 0;
4212}
4213
4214#ifdef CONFIG_HOTPLUG_CPU
4215/* Figure out where a task on the dead CPU should go; use force if necessary. */
4216static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4217{
4218 int dest_cpu;
4219 cpumask_t mask;
4220
4221 /* On same node? */
4222 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4223 cpus_and(mask, mask, tsk->cpus_allowed);
4224 dest_cpu = any_online_cpu(mask);
4225
4226 /* On any allowed CPU? */
4227 if (dest_cpu == NR_CPUS)
4228 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4229
4230 /* No more Mr. Nice Guy. */
4231 if (dest_cpu == NR_CPUS) {
4232 tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
4233 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4234
4235 /*
4236 * Don't tell them about moving exiting tasks or
4237 * kernel threads (both mm NULL), since they never
4238		 * leave the kernel.
4239 */
4240 if (tsk->mm && printk_ratelimit())
4241 printk(KERN_INFO "process %d (%s) no "
4242 "longer affine to cpu%d\n",
4243 tsk->pid, tsk->comm, dead_cpu);
4244 }
4245 __migrate_task(tsk, dead_cpu, dest_cpu);
4246}
4247
4248/*
4249 * While a dead CPU has no uninterruptible tasks queued at this point,
4250 * it might still have a nonzero ->nr_uninterruptible counter, because
4251 * for performance reasons the counter is not strictly tracking tasks to
4252 * their home CPUs. So we just add the counter to another CPU's counter,
4253 * to keep the global sum constant after CPU-down:
4254 */
4255static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4256{
4257 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4258 unsigned long flags;
4259
4260 local_irq_save(flags);
4261 double_rq_lock(rq_src, rq_dest);
4262 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4263 rq_src->nr_uninterruptible = 0;
4264 double_rq_unlock(rq_src, rq_dest);
4265 local_irq_restore(flags);
4266}
4267
4268/* Run through task list and migrate tasks from the dead cpu. */
4269static void migrate_live_tasks(int src_cpu)
4270{
4271 struct task_struct *tsk, *t;
4272
4273 write_lock_irq(&tasklist_lock);
4274
4275 do_each_thread(t, tsk) {
4276 if (tsk == current)
4277 continue;
4278
4279 if (task_cpu(tsk) == src_cpu)
4280 move_task_off_dead_cpu(src_cpu, tsk);
4281 } while_each_thread(t, tsk);
4282
4283 write_unlock_irq(&tasklist_lock);
4284}
4285
4286/* Schedules the idle task to be the next runnable task on the current CPU.
4287 * It does so by boosting its priority to the highest possible and adding it to
4288 * the _front_ of the runqueue. Used by the CPU offline code.
4289 */
4290void sched_idle_next(void)
4291{
4292 int cpu = smp_processor_id();
4293 runqueue_t *rq = this_rq();
4294 struct task_struct *p = rq->idle;
4295 unsigned long flags;
4296
4297 /* cpu has to be offline */
4298 BUG_ON(cpu_online(cpu));
4299
4300	/* Strictly not necessary, since the rest of the CPUs are stopped by now
4301	 * and interrupts are disabled on the current CPU.
4302 */
4303 spin_lock_irqsave(&rq->lock, flags);
4304
4305 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4306	/* Add the idle task to the _front_ of its priority queue */
4307 __activate_idle_task(p, rq);
4308
4309 spin_unlock_irqrestore(&rq->lock, flags);
4310}
4311
4312/* Ensures that the idle task is using init_mm right before its cpu goes
4313 * offline.
4314 */
4315void idle_task_exit(void)
4316{
4317 struct mm_struct *mm = current->active_mm;
4318
4319 BUG_ON(cpu_online(smp_processor_id()));
4320
4321 if (mm != &init_mm)
4322 switch_mm(mm, &init_mm, current);
4323 mmdrop(mm);
4324}
4325
4326static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4327{
4328 struct runqueue *rq = cpu_rq(dead_cpu);
4329
4330 /* Must be exiting, otherwise would be on tasklist. */
4331 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4332
4333 /* Cannot have done final schedule yet: would have vanished. */
4334 BUG_ON(tsk->flags & PF_DEAD);
4335
4336 get_task_struct(tsk);
4337
4338 /*
4339 * Drop lock around migration; if someone else moves it,
4340 * that's OK. No task can be added to this CPU, so iteration is
4341 * fine.
4342 */
4343 spin_unlock_irq(&rq->lock);
4344 move_task_off_dead_cpu(dead_cpu, tsk);
4345 spin_lock_irq(&rq->lock);
4346
4347 put_task_struct(tsk);
4348}
4349
4350/* release_task() removes task from tasklist, so we won't find dead tasks. */
4351static void migrate_dead_tasks(unsigned int dead_cpu)
4352{
4353 unsigned arr, i;
4354 struct runqueue *rq = cpu_rq(dead_cpu);
4355
4356 for (arr = 0; arr < 2; arr++) {
4357 for (i = 0; i < MAX_PRIO; i++) {
4358 struct list_head *list = &rq->arrays[arr].queue[i];
4359 while (!list_empty(list))
4360 migrate_dead(dead_cpu,
4361 list_entry(list->next, task_t,
4362 run_list));
4363 }
4364 }
4365}
4366#endif /* CONFIG_HOTPLUG_CPU */
4367
4368/*
4369 * migration_call - callback that gets triggered when a CPU is added.
4370 * Here we can start up the necessary migration thread for the new CPU.
4371 */
4372static int migration_call(struct notifier_block *nfb, unsigned long action,
4373 void *hcpu)
4374{
4375 int cpu = (long)hcpu;
4376 struct task_struct *p;
4377 struct runqueue *rq;
4378 unsigned long flags;
4379
4380 switch (action) {
4381 case CPU_UP_PREPARE:
4382 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4383 if (IS_ERR(p))
4384 return NOTIFY_BAD;
4385 p->flags |= PF_NOFREEZE;
4386 kthread_bind(p, cpu);
4387 /* Must be high prio: stop_machine expects to yield to it. */
4388 rq = task_rq_lock(p, &flags);
4389 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4390 task_rq_unlock(rq, &flags);
4391 cpu_rq(cpu)->migration_thread = p;
4392 break;
4393 case CPU_ONLINE:
4394		/* Strictly unnecessary, as the first user will wake it. */
4395 wake_up_process(cpu_rq(cpu)->migration_thread);
4396 break;
4397#ifdef CONFIG_HOTPLUG_CPU
4398 case CPU_UP_CANCELED:
4399 /* Unbind it from offline cpu so it can run. Fall thru. */
4400 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4401 kthread_stop(cpu_rq(cpu)->migration_thread);
4402 cpu_rq(cpu)->migration_thread = NULL;
4403 break;
4404 case CPU_DEAD:
4405 migrate_live_tasks(cpu);
4406 rq = cpu_rq(cpu);
4407 kthread_stop(rq->migration_thread);
4408 rq->migration_thread = NULL;
4409 /* Idle task back to normal (off runqueue, low prio) */
4410 rq = task_rq_lock(rq->idle, &flags);
4411 deactivate_task(rq->idle, rq);
4412 rq->idle->static_prio = MAX_PRIO;
4413 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4414 migrate_dead_tasks(cpu);
4415 task_rq_unlock(rq, &flags);
4416 migrate_nr_uninterruptible(rq);
4417 BUG_ON(rq->nr_running != 0);
4418
4419 /* No need to migrate the tasks: it was best-effort if
4420 * they didn't do lock_cpu_hotplug(). Just wake up
4421 * the requestors. */
4422 spin_lock_irq(&rq->lock);
4423 while (!list_empty(&rq->migration_queue)) {
4424 migration_req_t *req;
4425 req = list_entry(rq->migration_queue.next,
4426 migration_req_t, list);
4427 BUG_ON(req->type != REQ_MOVE_TASK);
4428 list_del_init(&req->list);
4429 complete(&req->done);
4430 }
4431 spin_unlock_irq(&rq->lock);
4432 break;
4433#endif
4434 }
4435 return NOTIFY_OK;
4436}
4437
4438/* Register at highest priority so that task migration (migrate_all_tasks)
4439 * happens before everything else.
4440 */
4441static struct notifier_block __devinitdata migration_notifier = {
4442 .notifier_call = migration_call,
4443 .priority = 10
4444};
4445
4446int __init migration_init(void)
4447{
4448 void *cpu = (void *)(long)smp_processor_id();
4449 /* Start one for boot CPU. */
4450 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4451 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4452 register_cpu_notifier(&migration_notifier);
4453 return 0;
4454}
4455#endif
4456
4457#ifdef CONFIG_SMP
4458#define SCHED_DOMAIN_DEBUG
4459#ifdef SCHED_DOMAIN_DEBUG
4460static void sched_domain_debug(struct sched_domain *sd, int cpu)
4461{
4462 int level = 0;
4463
4464 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4465
4466 do {
4467 int i;
4468 char str[NR_CPUS];
4469 struct sched_group *group = sd->groups;
4470 cpumask_t groupmask;
4471
4472 cpumask_scnprintf(str, NR_CPUS, sd->span);
4473 cpus_clear(groupmask);
4474
4475 printk(KERN_DEBUG);
4476 for (i = 0; i < level + 1; i++)
4477 printk(" ");
4478 printk("domain %d: ", level);
4479
4480 if (!(sd->flags & SD_LOAD_BALANCE)) {
4481 printk("does not load-balance\n");
4482 if (sd->parent)
4483 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4484 break;
4485 }
4486
4487 printk("span %s\n", str);
4488
4489 if (!cpu_isset(cpu, sd->span))
4490 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4491 if (!cpu_isset(cpu, group->cpumask))
4492 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4493
4494 printk(KERN_DEBUG);
4495 for (i = 0; i < level + 2; i++)
4496 printk(" ");
4497 printk("groups:");
4498 do {
4499 if (!group) {
4500 printk("\n");
4501 printk(KERN_ERR "ERROR: group is NULL\n");
4502 break;
4503 }
4504
4505 if (!group->cpu_power) {
4506 printk("\n");
4507 printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4508 }
4509
4510 if (!cpus_weight(group->cpumask)) {
4511 printk("\n");
4512 printk(KERN_ERR "ERROR: empty group\n");
4513 }
4514
4515 if (cpus_intersects(groupmask, group->cpumask)) {
4516 printk("\n");
4517 printk(KERN_ERR "ERROR: repeated CPUs\n");
4518 }
4519
4520 cpus_or(groupmask, groupmask, group->cpumask);
4521
4522 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4523 printk(" %s", str);
4524
4525 group = group->next;
4526 } while (group != sd->groups);
4527 printk("\n");
4528
4529 if (!cpus_equal(sd->span, groupmask))
4530 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4531
4532 level++;
4533 sd = sd->parent;
4534
4535 if (sd) {
4536 if (!cpus_subset(groupmask, sd->span))
4537 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4538 }
4539
4540 } while (sd);
4541}
4542#else
4543#define sched_domain_debug(sd, cpu) {}
4544#endif
4545
4546/*
4547 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4548 * hold the hotplug lock.
4549 */
4550void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
4551{
4552 migration_req_t req;
4553 unsigned long flags;
4554 runqueue_t *rq = cpu_rq(cpu);
4555 int local = 1;
4556
4557 sched_domain_debug(sd, cpu);
4558
4559 spin_lock_irqsave(&rq->lock, flags);
4560
4561 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4562 rq->sd = sd;
4563 } else {
4564 init_completion(&req.done);
4565 req.type = REQ_SET_DOMAIN;
4566 req.sd = sd;
4567 list_add(&req.list, &rq->migration_queue);
4568 local = 0;
4569 }
4570
4571 spin_unlock_irqrestore(&rq->lock, flags);
4572
4573 if (!local) {
4574 wake_up_process(rq->migration_thread);
4575 wait_for_completion(&req.done);
4576 }
4577}
4578
4579/* cpus with isolated domains */
4580cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4581
4582/* Setup the mask of cpus configured for isolated domains */
4583static int __init isolated_cpu_setup(char *str)
4584{
4585 int ints[NR_CPUS], i;
4586
4587 str = get_options(str, ARRAY_SIZE(ints), ints);
4588 cpus_clear(cpu_isolated_map);
4589 for (i = 1; i <= ints[0]; i++)
4590 if (ints[i] < NR_CPUS)
4591 cpu_set(ints[i], cpu_isolated_map);
4592 return 1;
4593}
4594
4595__setup ("isolcpus=", isolated_cpu_setup);
4596
4597/*
4598 * init_sched_build_groups takes an array of groups, the cpumask we wish
4599 * to span, and a pointer to a function which identifies what group a CPU
4600 * belongs to. The return value of group_fn must be a valid index into the
4601 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4602 * keep track of groups covered with a cpumask_t).
4603 *
4604 * init_sched_build_groups will build a circular linked list of the groups
4605 * covered by the given span, and will set each group's ->cpumask correctly,
4606 * and ->cpu_power to 0.
4607 */
4608void __devinit init_sched_build_groups(struct sched_group groups[],
4609 cpumask_t span, int (*group_fn)(int cpu))
4610{
4611 struct sched_group *first = NULL, *last = NULL;
4612 cpumask_t covered = CPU_MASK_NONE;
4613 int i;
4614
4615 for_each_cpu_mask(i, span) {
4616 int group = group_fn(i);
4617 struct sched_group *sg = &groups[group];
4618 int j;
4619
4620 if (cpu_isset(i, covered))
4621 continue;
4622
4623 sg->cpumask = CPU_MASK_NONE;
4624 sg->cpu_power = 0;
4625
4626 for_each_cpu_mask(j, span) {
4627 if (group_fn(j) != group)
4628 continue;
4629
4630 cpu_set(j, covered);
4631 cpu_set(j, sg->cpumask);
4632 }
4633 if (!first)
4634 first = sg;
4635 if (last)
4636 last->next = sg;
4637 last = sg;
4638 }
4639 last->next = first;
4640}
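Purely as an illustration of the grouping walk above, here is a toy userspace model with plain bitmasks instead of cpumask_t. Everything in it (NCPUS, the example group_fn(), the ring traversal in main()) is invented for the sketch and is not kernel code:

/* Hypothetical toy model of init_sched_build_groups(): partition CPUs
 * 0..NCPUS-1 into groups chosen by group_fn() and link them in a ring. */
#include <stdio.h>

#define NCPUS 8

struct group {
        unsigned int cpumask;
        struct group *next;
};

static int group_fn(int cpu)            /* e.g. two CPUs per "package" */
{
        return cpu / 2;
}

static void build_groups(struct group groups[], unsigned int span)
{
        struct group *first = NULL, *last = NULL;
        unsigned int covered = 0;

        for (int i = 0; i < NCPUS; i++) {
                struct group *sg = &groups[group_fn(i)];

                if (!(span & (1u << i)) || (covered & (1u << i)))
                        continue;

                sg->cpumask = 0;
                for (int j = 0; j < NCPUS; j++) {
                        if (!(span & (1u << j)) || group_fn(j) != group_fn(i))
                                continue;
                        covered |= 1u << j;
                        sg->cpumask |= 1u << j;
                }
                if (!first)
                        first = sg;
                if (last)
                        last->next = sg;
                last = sg;
        }
        if (last)
                last->next = first;     /* close the ring */
}

int main(void)
{
        struct group groups[NCPUS] = { { 0 } };

        build_groups(groups, 0xffu);    /* span all eight CPUs */
        for (struct group *g = groups[0].next; ; g = g->next) {
                printf("group mask 0x%02x\n", g->cpumask);
                if (g == &groups[0])
                        break;
        }
        return 0;
}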
4641
4642
4643#ifdef ARCH_HAS_SCHED_DOMAIN
4644extern void __devinit arch_init_sched_domains(void);
4645extern void __devinit arch_destroy_sched_domains(void);
4646#else
4647#ifdef CONFIG_SCHED_SMT
4648static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4649static struct sched_group sched_group_cpus[NR_CPUS];
4650static int __devinit cpu_to_cpu_group(int cpu)
4651{
4652 return cpu;
4653}
4654#endif
4655
4656static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4657static struct sched_group sched_group_phys[NR_CPUS];
4658static int __devinit cpu_to_phys_group(int cpu)
4659{
4660#ifdef CONFIG_SCHED_SMT
4661 return first_cpu(cpu_sibling_map[cpu]);
4662#else
4663 return cpu;
4664#endif
4665}
4666
4667#ifdef CONFIG_NUMA
4668
4669static DEFINE_PER_CPU(struct sched_domain, node_domains);
4670static struct sched_group sched_group_nodes[MAX_NUMNODES];
4671static int __devinit cpu_to_node_group(int cpu)
4672{
4673 return cpu_to_node(cpu);
4674}
4675#endif
4676
4677#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4678/*
4679 * The domains setup code relies on siblings not spanning
4680 * multiple nodes. Make sure the architecture has a proper
4681 * siblings map:
4682 */
4683static void check_sibling_maps(void)
4684{
4685 int i, j;
4686
4687 for_each_online_cpu(i) {
4688 for_each_cpu_mask(j, cpu_sibling_map[i]) {
4689 if (cpu_to_node(i) != cpu_to_node(j)) {
4690 printk(KERN_INFO "warning: CPU %d siblings map "
4691 "to different node - isolating "
4692 "them.\n", i);
4693 cpu_sibling_map[i] = cpumask_of_cpu(i);
4694 break;
4695 }
4696 }
4697 }
4698}
4699#endif
4700
4701/*
4702 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
4703 */
4704static void __devinit arch_init_sched_domains(void)
4705{
4706 int i;
4707 cpumask_t cpu_default_map;
4708
4709#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4710 check_sibling_maps();
4711#endif
4712 /*
4713 * Setup mask for cpus without special case scheduling requirements.
4714 * For now this just excludes isolated cpus, but could be used to
4715 * exclude other special cases in the future.
4716 */
4717 cpus_complement(cpu_default_map, cpu_isolated_map);
4718 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4719
4720 /*
4721 * Set up domains. Isolated domains just stay on the dummy domain.
4722 */
4723 for_each_cpu_mask(i, cpu_default_map) {
4724 int group;
4725 struct sched_domain *sd = NULL, *p;
4726 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4727
4728 cpus_and(nodemask, nodemask, cpu_default_map);
4729
4730#ifdef CONFIG_NUMA
4731 sd = &per_cpu(node_domains, i);
4732 group = cpu_to_node_group(i);
4733 *sd = SD_NODE_INIT;
4734 sd->span = cpu_default_map;
4735 sd->groups = &sched_group_nodes[group];
4736#endif
4737
4738 p = sd;
4739 sd = &per_cpu(phys_domains, i);
4740 group = cpu_to_phys_group(i);
4741 *sd = SD_CPU_INIT;
4742 sd->span = nodemask;
4743 sd->parent = p;
4744 sd->groups = &sched_group_phys[group];
4745
4746#ifdef CONFIG_SCHED_SMT
4747 p = sd;
4748 sd = &per_cpu(cpu_domains, i);
4749 group = cpu_to_cpu_group(i);
4750 *sd = SD_SIBLING_INIT;
4751 sd->span = cpu_sibling_map[i];
4752 cpus_and(sd->span, sd->span, cpu_default_map);
4753 sd->parent = p;
4754 sd->groups = &sched_group_cpus[group];
4755#endif
4756 }
4757
4758#ifdef CONFIG_SCHED_SMT
4759 /* Set up CPU (sibling) groups */
4760 for_each_online_cpu(i) {
4761 cpumask_t this_sibling_map = cpu_sibling_map[i];
4762 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4763 if (i != first_cpu(this_sibling_map))
4764 continue;
4765
4766 init_sched_build_groups(sched_group_cpus, this_sibling_map,
4767 &cpu_to_cpu_group);
4768 }
4769#endif
4770
4771 /* Set up physical groups */
4772 for (i = 0; i < MAX_NUMNODES; i++) {
4773 cpumask_t nodemask = node_to_cpumask(i);
4774
4775 cpus_and(nodemask, nodemask, cpu_default_map);
4776 if (cpus_empty(nodemask))
4777 continue;
4778
4779 init_sched_build_groups(sched_group_phys, nodemask,
4780 &cpu_to_phys_group);
4781 }
4782
4783#ifdef CONFIG_NUMA
4784 /* Set up node groups */
4785 init_sched_build_groups(sched_group_nodes, cpu_default_map,
4786 &cpu_to_node_group);
4787#endif
4788
4789 /* Calculate CPU power for physical packages and nodes */
4790 for_each_cpu_mask(i, cpu_default_map) {
4791 int power;
4792 struct sched_domain *sd;
4793#ifdef CONFIG_SCHED_SMT
4794 sd = &per_cpu(cpu_domains, i);
4795 power = SCHED_LOAD_SCALE;
4796 sd->groups->cpu_power = power;
4797#endif
4798
4799 sd = &per_cpu(phys_domains, i);
4800 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4801 (cpus_weight(sd->groups->cpumask)-1) / 10;
4802 sd->groups->cpu_power = power;
4803
4804#ifdef CONFIG_NUMA
4805 if (i == first_cpu(sd->groups->cpumask)) {
4806 /* Only add "power" once for each physical package. */
4807 sd = &per_cpu(node_domains, i);
4808 sd->groups->cpu_power += power;
4809 }
4810#endif
4811 }
4812
4813 /* Attach the domains */
4814 for_each_online_cpu(i) {
4815 struct sched_domain *sd;
4816#ifdef CONFIG_SCHED_SMT
4817 sd = &per_cpu(cpu_domains, i);
4818#else
4819 sd = &per_cpu(phys_domains, i);
4820#endif
4821 cpu_attach_domain(sd, i);
4822 }
4823}
4824
4825#ifdef CONFIG_HOTPLUG_CPU
4826static void __devinit arch_destroy_sched_domains(void)
4827{
4828 /* Do nothing: everything is statically allocated. */
4829}
4830#endif
4831
4832#endif /* ARCH_HAS_SCHED_DOMAIN */
4833
4834/*
4835 * Initial dummy domain for early boot and for hotplug cpu. Being static,
4836 * it is initialized to zero, so all balancing flags are cleared which is
4837 * what we want.
4838 */
4839static struct sched_domain sched_domain_dummy;
4840
4841#ifdef CONFIG_HOTPLUG_CPU
4842/*
4843 * Force a reinitialization of the sched domains hierarchy. The domains
4844 * and groups cannot be updated in place without racing with the balancing
4845 * code, so we temporarily attach all running cpus to a "dummy" domain
4846 * which will prevent rebalancing while the sched domains are recalculated.
4847 */
4848static int update_sched_domains(struct notifier_block *nfb,
4849 unsigned long action, void *hcpu)
4850{
4851 int i;
4852
4853 switch (action) {
4854 case CPU_UP_PREPARE:
4855 case CPU_DOWN_PREPARE:
4856 for_each_online_cpu(i)
4857 cpu_attach_domain(&sched_domain_dummy, i);
4858 arch_destroy_sched_domains();
4859 return NOTIFY_OK;
4860
4861 case CPU_UP_CANCELED:
4862 case CPU_DOWN_FAILED:
4863 case CPU_ONLINE:
4864 case CPU_DEAD:
4865 /*
4866 * Fall through and re-initialise the domains.
4867 */
4868 break;
4869 default:
4870 return NOTIFY_DONE;
4871 }
4872
4873 /* The hotplug lock is already held by cpu_up/cpu_down */
4874 arch_init_sched_domains();
4875
4876 return NOTIFY_OK;
4877}
4878#endif
4879
4880void __init sched_init_smp(void)
4881{
4882 lock_cpu_hotplug();
4883 arch_init_sched_domains();
4884 unlock_cpu_hotplug();
4885 /* XXX: Theoretical race here - CPU may be hotplugged now */
4886 hotcpu_notifier(update_sched_domains, 0);
4887}
4888#else
4889void __init sched_init_smp(void)
4890{
4891}
4892#endif /* CONFIG_SMP */
4893
4894int in_sched_functions(unsigned long addr)
4895{
4896 /* Linker adds these: start and end of __sched functions */
4897 extern char __sched_text_start[], __sched_text_end[];
4898 return in_lock_functions(addr) ||
4899 (addr >= (unsigned long)__sched_text_start
4900 && addr < (unsigned long)__sched_text_end);
4901}
4902
4903void __init sched_init(void)
4904{
4905 runqueue_t *rq;
4906 int i, j, k;
4907
4908 for (i = 0; i < NR_CPUS; i++) {
4909 prio_array_t *array;
4910
4911 rq = cpu_rq(i);
4912 spin_lock_init(&rq->lock);
4913 rq->active = rq->arrays;
4914 rq->expired = rq->arrays + 1;
4915 rq->best_expired_prio = MAX_PRIO;
4916
4917#ifdef CONFIG_SMP
4918 rq->sd = &sched_domain_dummy;
4919 rq->cpu_load = 0;
4920 rq->active_balance = 0;
4921 rq->push_cpu = 0;
4922 rq->migration_thread = NULL;
4923 INIT_LIST_HEAD(&rq->migration_queue);
4924#endif
4925 atomic_set(&rq->nr_iowait, 0);
4926
4927 for (j = 0; j < 2; j++) {
4928 array = rq->arrays + j;
4929 for (k = 0; k < MAX_PRIO; k++) {
4930 INIT_LIST_HEAD(array->queue + k);
4931 __clear_bit(k, array->bitmap);
4932 }
4933 // delimiter for bitsearch
4934 __set_bit(MAX_PRIO, array->bitmap);
4935 }
4936 }
4937
4938 /*
4939 * The boot idle thread does lazy MMU switching as well:
4940 */
4941 atomic_inc(&init_mm.mm_count);
4942 enter_lazy_tlb(&init_mm, current);
4943
4944 /*
4945 * Make us the idle thread. Technically, schedule() should not be
4946 * called from this thread, however somewhere below it might be,
4947 * but because we are the idle thread, we just pick up running again
4948 * when this runqueue becomes "idle".
4949 */
4950 init_idle(current, smp_processor_id());
4951}
4952
4953#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4954void __might_sleep(char *file, int line)
4955{
4956#if defined(in_atomic)
4957 static unsigned long prev_jiffy; /* ratelimiting */
4958
4959 if ((in_atomic() || irqs_disabled()) &&
4960 system_state == SYSTEM_RUNNING && !oops_in_progress) {
4961 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
4962 return;
4963 prev_jiffy = jiffies;
4964 printk(KERN_ERR "Debug: sleeping function called from invalid"
4965 " context at %s:%d\n", file, line);
4966 printk("in_atomic():%d, irqs_disabled():%d\n",
4967 in_atomic(), irqs_disabled());
4968 dump_stack();
4969 }
4970#endif
4971}
4972EXPORT_SYMBOL(__might_sleep);
4973#endif
4974
4975#ifdef CONFIG_MAGIC_SYSRQ
4976void normalize_rt_tasks(void)
4977{
4978 struct task_struct *p;
4979 prio_array_t *array;
4980 unsigned long flags;
4981 runqueue_t *rq;
4982
4983 read_lock_irq(&tasklist_lock);
4984 for_each_process (p) {
4985 if (!rt_task(p))
4986 continue;
4987
4988 rq = task_rq_lock(p, &flags);
4989
4990 array = p->array;
4991 if (array)
4992 deactivate_task(p, task_rq(p));
4993 __setscheduler(p, SCHED_NORMAL, 0);
4994 if (array) {
4995 __activate_task(p, task_rq(p));
4996 resched_task(rq->curr);
4997 }
4998
4999 task_rq_unlock(rq, &flags);
5000 }
5001 read_unlock_irq(&tasklist_lock);
5002}
5003
5004#endif /* CONFIG_MAGIC_SYSRQ */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
new file mode 100644
index 000000000000..c3391b6020e8
--- /dev/null
+++ b/kernel/seccomp.c
@@ -0,0 +1,56 @@
1/*
2 * linux/kernel/seccomp.c
3 *
4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
5 *
6 * This defines a simple but solid secure-computing mode.
7 */
8
9#include <linux/seccomp.h>
10#include <linux/sched.h>
11
12/* #define SECCOMP_DEBUG 1 */
13
14/*
15 * Secure computing mode 1 allows only read/write/exit/sigreturn.
16 * To be fully secure this must be combined with rlimit
17 * to limit the stack allocations too.
18 */
19static int mode1_syscalls[] = {
20 __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
21 0, /* null terminated */
22};
23
24#ifdef TIF_32BIT
25static int mode1_syscalls_32[] = {
26 __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
27 0, /* null terminated */
28};
29#endif
30
31void __secure_computing(int this_syscall)
32{
33 int mode = current->seccomp.mode;
34 int * syscall;
35
36 switch (mode) {
37 case 1:
38 syscall = mode1_syscalls;
39#ifdef TIF_32BIT
40 if (test_thread_flag(TIF_32BIT))
41 syscall = mode1_syscalls_32;
42#endif
43 do {
44 if (*syscall == this_syscall)
45 return;
46 } while (*++syscall);
47 break;
48 default:
49 BUG();
50 }
51
52#ifdef SECCOMP_DEBUG
53 dump_stack();
54#endif
55 do_exit(SIGKILL);
56}
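The mode-1 check above is a linear scan of a zero-terminated whitelist. As a standalone illustration of just that scan (the syscall numbers below are placeholders, not the real __NR_seccomp_* values):

/* Hypothetical standalone mirror of the mode-1 whitelist scan above. */
#include <stdio.h>

/* placeholder numbers standing in for read, write, exit, sigreturn; 0 terminates */
static int mode1_syscalls[] = { 3, 4, 1, 119, 0 };

static int mode1_allows(int nr)
{
        int *syscall = mode1_syscalls;

        do {
                if (*syscall == nr)
                        return 1;
        } while (*++syscall);
        return 0;               /* the kernel would do_exit(SIGKILL) here */
}

int main(void)
{
        printf("syscall 4 allowed: %d\n", mode1_allows(4));
        printf("syscall 5 allowed: %d\n", mode1_allows(5));
        return 0;
}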
diff --git a/kernel/signal.c b/kernel/signal.c
new file mode 100644
index 000000000000..f00a1d610f0b
--- /dev/null
+++ b/kernel/signal.c
@@ -0,0 +1,2662 @@
1/*
2 * linux/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson
7 *
8 * 2003-06-02 Jim Houston - Concurrent Computer Corp.
9 * Changes to use preallocated sigqueue structures
10 * to allow signals to be sent reliably.
11 */
12
13#include <linux/config.h>
14#include <linux/slab.h>
15#include <linux/module.h>
16#include <linux/smp_lock.h>
17#include <linux/init.h>
18#include <linux/sched.h>
19#include <linux/fs.h>
20#include <linux/tty.h>
21#include <linux/binfmts.h>
22#include <linux/security.h>
23#include <linux/syscalls.h>
24#include <linux/ptrace.h>
25#include <linux/posix-timers.h>
26#include <asm/param.h>
27#include <asm/uaccess.h>
28#include <asm/unistd.h>
29#include <asm/siginfo.h>
30
31/*
32 * SLAB caches for signal bits.
33 */
34
35static kmem_cache_t *sigqueue_cachep;
36
37/*
38 * In POSIX a signal is sent either to a specific thread (Linux task)
39 * or to the process as a whole (Linux thread group). How the signal
40 * is sent determines whether it's to one thread or the whole group,
41 * which determines which signal mask(s) are involved in blocking it
42 * from being delivered until later. When the signal is delivered,
43 * either it's caught or ignored by a user handler or it has a default
44 * effect that applies to the whole thread group (POSIX process).
45 *
46 * The possible effects an unblocked signal set to SIG_DFL can have are:
47 * ignore - Nothing Happens
48 * terminate - kill the process, i.e. all threads in the group,
49 * similar to exit_group. The group leader (only) reports
50 * WIFSIGNALED status to its parent.
51 * coredump - write a core dump file describing all threads using
52 * the same mm and then kill all those threads
53 * stop - stop all the threads in the group, i.e. TASK_STOPPED state
54 *
55 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
 56 * Other signals, when not blocked and set to SIG_DFL, behave as follows.
57 * The job control signals also have other special effects.
58 *
59 * +--------------------+------------------+
60 * | POSIX signal | default action |
61 * +--------------------+------------------+
62 * | SIGHUP | terminate |
63 * | SIGINT | terminate |
64 * | SIGQUIT | coredump |
65 * | SIGILL | coredump |
66 * | SIGTRAP | coredump |
67 * | SIGABRT/SIGIOT | coredump |
68 * | SIGBUS | coredump |
69 * | SIGFPE | coredump |
70 * | SIGKILL | terminate(+) |
71 * | SIGUSR1 | terminate |
72 * | SIGSEGV | coredump |
73 * | SIGUSR2 | terminate |
74 * | SIGPIPE | terminate |
75 * | SIGALRM | terminate |
76 * | SIGTERM | terminate |
77 * | SIGCHLD | ignore |
78 * | SIGCONT | ignore(*) |
79 * | SIGSTOP | stop(*)(+) |
80 * | SIGTSTP | stop(*) |
81 * | SIGTTIN | stop(*) |
82 * | SIGTTOU | stop(*) |
83 * | SIGURG | ignore |
84 * | SIGXCPU | coredump |
85 * | SIGXFSZ | coredump |
86 * | SIGVTALRM | terminate |
87 * | SIGPROF | terminate |
88 * | SIGPOLL/SIGIO | terminate |
89 * | SIGSYS/SIGUNUSED | coredump |
90 * | SIGSTKFLT | terminate |
91 * | SIGWINCH | ignore |
92 * | SIGPWR | terminate |
93 * | SIGRTMIN-SIGRTMAX | terminate |
94 * +--------------------+------------------+
95 * | non-POSIX signal | default action |
96 * +--------------------+------------------+
97 * | SIGEMT | coredump |
98 * +--------------------+------------------+
99 *
100 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
101 * (*) Special job control effects:
102 * When SIGCONT is sent, it resumes the process (all threads in the group)
103 * from TASK_STOPPED state and also clears any pending/queued stop signals
104 * (any of those marked with "stop(*)"). This happens regardless of blocking,
105 * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
106 * any pending/queued SIGCONT signals; this happens regardless of blocking,
107 * catching, or ignoring the stop signal, though (except for SIGSTOP) the
108 * default action of stopping the process may happen later or never.
109 */
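The blocking behaviour described above (a blocked signal stays pending and is delivered once it is unblocked) can be observed from userspace with nothing beyond the standard POSIX signal calls. A minimal sketch:

/* Hypothetical sketch: block SIGINT, raise it, see it pending, unblock it. */
#include <signal.h>
#include <stdio.h>
#include <string.h>

static void on_sigint(int sig)
{
        (void)sig;      /* catching it avoids the default terminate action */
}

int main(void)
{
        struct sigaction sa;
        sigset_t block, pending;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = on_sigint;
        sigaction(SIGINT, &sa, NULL);

        sigemptyset(&block);
        sigaddset(&block, SIGINT);
        sigprocmask(SIG_BLOCK, &block, NULL);

        raise(SIGINT);                          /* generated, but blocked for now */
        sigpending(&pending);
        printf("SIGINT pending while blocked: %d\n", sigismember(&pending, SIGINT));

        sigprocmask(SIG_UNBLOCK, &block, NULL); /* delivered to on_sigint() here */
        printf("delivered after unblocking\n");
        return 0;
}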
110
111#ifdef SIGEMT
112#define M_SIGEMT M(SIGEMT)
113#else
114#define M_SIGEMT 0
115#endif
116
117#if SIGRTMIN > BITS_PER_LONG
118#define M(sig) (1ULL << ((sig)-1))
119#else
120#define M(sig) (1UL << ((sig)-1))
121#endif
122#define T(sig, mask) (M(sig) & (mask))
123
124#define SIG_KERNEL_ONLY_MASK (\
125 M(SIGKILL) | M(SIGSTOP) )
126
127#define SIG_KERNEL_STOP_MASK (\
128 M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) )
129
130#define SIG_KERNEL_COREDUMP_MASK (\
131 M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \
132 M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \
133 M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT )
134
135#define SIG_KERNEL_IGNORE_MASK (\
136 M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) )
137
138#define sig_kernel_only(sig) \
139 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK))
140#define sig_kernel_coredump(sig) \
141 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK))
142#define sig_kernel_ignore(sig) \
143 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK))
144#define sig_kernel_stop(sig) \
145 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
146
147#define sig_user_defined(t, signr) \
148 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
149 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
150
151#define sig_fatal(t, signr) \
152 (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
153 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
154
155static int sig_ignored(struct task_struct *t, int sig)
156{
157 void __user * handler;
158
159 /*
160 * Tracers always want to know about signals..
161 */
162 if (t->ptrace & PT_PTRACED)
163 return 0;
164
165 /*
166 * Blocked signals are never ignored, since the
167 * signal handler may change by the time it is
168 * unblocked.
169 */
170 if (sigismember(&t->blocked, sig))
171 return 0;
172
173 /* Is it explicitly or implicitly ignored? */
174 handler = t->sighand->action[sig-1].sa.sa_handler;
175 return handler == SIG_IGN ||
176 (handler == SIG_DFL && sig_kernel_ignore(sig));
177}
178
179/*
180 * Re-calculate pending state from the set of locally pending
181 * signals, globally pending signals, and blocked signals.
182 */
183static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
184{
185 unsigned long ready;
186 long i;
187
188 switch (_NSIG_WORDS) {
189 default:
190 for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
191 ready |= signal->sig[i] &~ blocked->sig[i];
192 break;
193
194 case 4: ready = signal->sig[3] &~ blocked->sig[3];
195 ready |= signal->sig[2] &~ blocked->sig[2];
196 ready |= signal->sig[1] &~ blocked->sig[1];
197 ready |= signal->sig[0] &~ blocked->sig[0];
198 break;
199
200 case 2: ready = signal->sig[1] &~ blocked->sig[1];
201 ready |= signal->sig[0] &~ blocked->sig[0];
202 break;
203
204 case 1: ready = signal->sig[0] &~ blocked->sig[0];
205 }
206 return ready != 0;
207}
208
209#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
210
211fastcall void recalc_sigpending_tsk(struct task_struct *t)
212{
213 if (t->signal->group_stop_count > 0 ||
214 PENDING(&t->pending, &t->blocked) ||
215 PENDING(&t->signal->shared_pending, &t->blocked))
216 set_tsk_thread_flag(t, TIF_SIGPENDING);
217 else
218 clear_tsk_thread_flag(t, TIF_SIGPENDING);
219}
220
221void recalc_sigpending(void)
222{
223 recalc_sigpending_tsk(current);
224}
225
226/* Given the mask, find the first available signal that should be serviced. */
227
228static int
229next_signal(struct sigpending *pending, sigset_t *mask)
230{
231 unsigned long i, *s, *m, x;
232 int sig = 0;
233
234 s = pending->signal.sig;
235 m = mask->sig;
236 switch (_NSIG_WORDS) {
237 default:
238 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m)
239 if ((x = *s &~ *m) != 0) {
240 sig = ffz(~x) + i*_NSIG_BPW + 1;
241 break;
242 }
243 break;
244
245 case 2: if ((x = s[0] &~ m[0]) != 0)
246 sig = 1;
247 else if ((x = s[1] &~ m[1]) != 0)
248 sig = _NSIG_BPW + 1;
249 else
250 break;
251 sig += ffz(~x);
252 break;
253
254 case 1: if ((x = *s &~ *m) != 0)
255 sig = ffz(~x) + 1;
256 break;
257 }
258
259 return sig;
260}
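In the common single-word case the scan above reduces to taking the lowest set bit of pending & ~blocked; the kernel's ffz(~x) + 1 is just a find-first-set. A tiny standalone illustration (the signal numbers are arbitrary):

/* Hypothetical illustration of the single-word case of next_signal(). */
#include <stdio.h>
#include <strings.h>            /* ffs() */

int main(void)
{
        unsigned int pending = (1u << (10 - 1)) | (1u << (2 - 1)); /* signals 10 and 2 pending */
        unsigned int blocked = (1u << (2 - 1));                    /* signal 2 blocked */

        int sig = ffs(pending & ~blocked);      /* 0 means nothing deliverable */
        printf("next deliverable signal: %d\n", sig);   /* prints 10 */
        return 0;
}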
261
262static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags,
263 int override_rlimit)
264{
265 struct sigqueue *q = NULL;
266
267 atomic_inc(&t->user->sigpending);
268 if (override_rlimit ||
269 atomic_read(&t->user->sigpending) <=
270 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
271 q = kmem_cache_alloc(sigqueue_cachep, flags);
272 if (unlikely(q == NULL)) {
273 atomic_dec(&t->user->sigpending);
274 } else {
275 INIT_LIST_HEAD(&q->list);
276 q->flags = 0;
277 q->lock = NULL;
278 q->user = get_uid(t->user);
279 }
280 return(q);
281}
282
283static inline void __sigqueue_free(struct sigqueue *q)
284{
285 if (q->flags & SIGQUEUE_PREALLOC)
286 return;
287 atomic_dec(&q->user->sigpending);
288 free_uid(q->user);
289 kmem_cache_free(sigqueue_cachep, q);
290}
291
292static void flush_sigqueue(struct sigpending *queue)
293{
294 struct sigqueue *q;
295
296 sigemptyset(&queue->signal);
297 while (!list_empty(&queue->list)) {
298 q = list_entry(queue->list.next, struct sigqueue , list);
299 list_del_init(&q->list);
300 __sigqueue_free(q);
301 }
302}
303
304/*
305 * Flush all pending signals for a task.
306 */
307
308void
309flush_signals(struct task_struct *t)
310{
311 unsigned long flags;
312
313 spin_lock_irqsave(&t->sighand->siglock, flags);
314 clear_tsk_thread_flag(t,TIF_SIGPENDING);
315 flush_sigqueue(&t->pending);
316 flush_sigqueue(&t->signal->shared_pending);
317 spin_unlock_irqrestore(&t->sighand->siglock, flags);
318}
319
320/*
321 * This function expects the tasklist_lock write-locked.
322 */
323void __exit_sighand(struct task_struct *tsk)
324{
325 struct sighand_struct * sighand = tsk->sighand;
326
327 /* Ok, we're done with the signal handlers */
328 tsk->sighand = NULL;
329 if (atomic_dec_and_test(&sighand->count))
330 kmem_cache_free(sighand_cachep, sighand);
331}
332
333void exit_sighand(struct task_struct *tsk)
334{
335 write_lock_irq(&tasklist_lock);
336 __exit_sighand(tsk);
337 write_unlock_irq(&tasklist_lock);
338}
339
340/*
341 * This function expects the tasklist_lock write-locked.
342 */
343void __exit_signal(struct task_struct *tsk)
344{
345 struct signal_struct * sig = tsk->signal;
346 struct sighand_struct * sighand = tsk->sighand;
347
348 if (!sig)
349 BUG();
350 if (!atomic_read(&sig->count))
351 BUG();
352 spin_lock(&sighand->siglock);
353 posix_cpu_timers_exit(tsk);
354 if (atomic_dec_and_test(&sig->count)) {
355 posix_cpu_timers_exit_group(tsk);
356 if (tsk == sig->curr_target)
357 sig->curr_target = next_thread(tsk);
358 tsk->signal = NULL;
359 spin_unlock(&sighand->siglock);
360 flush_sigqueue(&sig->shared_pending);
361 } else {
362 /*
363 * If there is any task waiting for the group exit
364 * then notify it:
365 */
366 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
367 wake_up_process(sig->group_exit_task);
368 sig->group_exit_task = NULL;
369 }
370 if (tsk == sig->curr_target)
371 sig->curr_target = next_thread(tsk);
372 tsk->signal = NULL;
373 /*
374 * Accumulate here the counters for all threads but the
375 * group leader as they die, so they can be added into
376 * the process-wide totals when those are taken.
377 * The group leader stays around as a zombie as long
378 * as there are other threads. When it gets reaped,
379 * the exit.c code will add its counts into these totals.
380 * We won't ever get here for the group leader, since it
381 * will have been the last reference on the signal_struct.
382 */
383 sig->utime = cputime_add(sig->utime, tsk->utime);
384 sig->stime = cputime_add(sig->stime, tsk->stime);
385 sig->min_flt += tsk->min_flt;
386 sig->maj_flt += tsk->maj_flt;
387 sig->nvcsw += tsk->nvcsw;
388 sig->nivcsw += tsk->nivcsw;
389 sig->sched_time += tsk->sched_time;
390 spin_unlock(&sighand->siglock);
391 sig = NULL; /* Marker for below. */
392 }
393 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
394 flush_sigqueue(&tsk->pending);
395 if (sig) {
396 /*
397 * We are cleaning up the signal_struct here. We delayed
398 * calling exit_itimers until after flush_sigqueue, just in
399 * case our thread-local pending queue contained a queued
400 * timer signal that would have been cleared in
401 * exit_itimers. When that called sigqueue_free, it would
402 * attempt to re-take the tasklist_lock and deadlock. This
403 * can never happen if we ensure that all queues the
404 * timer's signal might be queued on have been flushed
405 * first. The shared_pending queue, and our own pending
406 * queue are the only queues the timer could be on, since
407 * there are no other threads left in the group and timer
408 * signals are constrained to threads inside the group.
409 */
410 exit_itimers(sig);
411 exit_thread_group_keys(sig);
412 kmem_cache_free(signal_cachep, sig);
413 }
414}
415
416void exit_signal(struct task_struct *tsk)
417{
418 write_lock_irq(&tasklist_lock);
419 __exit_signal(tsk);
420 write_unlock_irq(&tasklist_lock);
421}
422
423/*
424 * Flush all handlers for a task.
425 */
426
427void
428flush_signal_handlers(struct task_struct *t, int force_default)
429{
430 int i;
431 struct k_sigaction *ka = &t->sighand->action[0];
432 for (i = _NSIG ; i != 0 ; i--) {
433 if (force_default || ka->sa.sa_handler != SIG_IGN)
434 ka->sa.sa_handler = SIG_DFL;
435 ka->sa.sa_flags = 0;
436 sigemptyset(&ka->sa.sa_mask);
437 ka++;
438 }
439}
440
441
442/* Notify the system that a driver wants to block all signals for this
443 * process, and wants to be notified if any signals at all were to be
444 * sent/acted upon. If the notifier routine returns non-zero, then the
445 * signal will be acted upon after all. If the notifier routine returns 0,
446 * then the signal will be blocked. Only one block per process is
447 * allowed. priv is a pointer to private data that the notifier routine
448 * can use to determine if the signal should be blocked or not. */
449
450void
451block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
452{
453 unsigned long flags;
454
455 spin_lock_irqsave(&current->sighand->siglock, flags);
456 current->notifier_mask = mask;
457 current->notifier_data = priv;
458 current->notifier = notifier;
459 spin_unlock_irqrestore(&current->sighand->siglock, flags);
460}
461
462/* Notify the system that blocking has ended. */
463
464void
465unblock_all_signals(void)
466{
467 unsigned long flags;
468
469 spin_lock_irqsave(&current->sighand->siglock, flags);
470 current->notifier = NULL;
471 current->notifier_data = NULL;
472 recalc_sigpending();
473 spin_unlock_irqrestore(&current->sighand->siglock, flags);
474}
475
476static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
477{
478 struct sigqueue *q, *first = NULL;
479 int still_pending = 0;
480
481 if (unlikely(!sigismember(&list->signal, sig)))
482 return 0;
483
484 /*
485 * Collect the siginfo appropriate to this signal. Check if
486 * there is another siginfo for the same signal.
487 */
488 list_for_each_entry(q, &list->list, list) {
489 if (q->info.si_signo == sig) {
490 if (first) {
491 still_pending = 1;
492 break;
493 }
494 first = q;
495 }
496 }
497 if (first) {
498 list_del_init(&first->list);
499 copy_siginfo(info, &first->info);
500 __sigqueue_free(first);
501 if (!still_pending)
502 sigdelset(&list->signal, sig);
503 } else {
504
505 /* Ok, it wasn't in the queue. This must be
506 a fast-pathed signal or we must have been
507 out of queue space. So zero out the info.
508 */
509 sigdelset(&list->signal, sig);
510 info->si_signo = sig;
511 info->si_errno = 0;
512 info->si_code = 0;
513 info->si_pid = 0;
514 info->si_uid = 0;
515 }
516 return 1;
517}
518
519static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
520 siginfo_t *info)
521{
522 int sig = 0;
523
524 sig = next_signal(pending, mask);
525 if (sig) {
526 if (current->notifier) {
527 if (sigismember(current->notifier_mask, sig)) {
528 if (!(current->notifier)(current->notifier_data)) {
529 clear_thread_flag(TIF_SIGPENDING);
530 return 0;
531 }
532 }
533 }
534
535 if (!collect_signal(sig, pending, info))
536 sig = 0;
537
538 }
539 recalc_sigpending();
540
541 return sig;
542}
543
544/*
545 * Dequeue a signal and return the element to the caller, which is
546 * expected to free it.
547 *
548 * All callers have to hold the siglock.
549 */
550int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
551{
552 int signr = __dequeue_signal(&tsk->pending, mask, info);
553 if (!signr)
554 signr = __dequeue_signal(&tsk->signal->shared_pending,
555 mask, info);
556 if (signr && unlikely(sig_kernel_stop(signr))) {
557 /*
558 * Set a marker that we have dequeued a stop signal. Our
559 * caller might release the siglock and then the pending
560 * stop signal it is about to process is no longer in the
561 * pending bitmasks, but must still be cleared by a SIGCONT
562 * (and overruled by a SIGKILL). So those cases clear this
563 * shared flag after we've set it. Note that this flag may
564 * remain set after the signal we return is ignored or
565 * handled. That doesn't matter because its only purpose
566 * is to alert stop-signal processing code when another
567 * processor has come along and cleared the flag.
568 */
569 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
570 }
571 if ( signr &&
572 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
573 info->si_sys_private){
574 /*
575 * Release the siglock to ensure proper locking order
576 * of timer locks outside of siglocks. Note, we leave
577 * irqs disabled here, since the posix-timers code is
578 * about to disable them again anyway.
579 */
580 spin_unlock(&tsk->sighand->siglock);
581 do_schedule_next_timer(info);
582 spin_lock(&tsk->sighand->siglock);
583 }
584 return signr;
585}
586
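/*
 * Caller-side sketch for dequeue_signal() (hypothetical helper, not used
 * in this file): the siglock must be held around the call, exactly as
 * the real callers further down do.
 */
static int sketch_take_one_signal(siginfo_t *info)
{
	int signr;

	spin_lock_irq(&current->sighand->siglock);
	signr = dequeue_signal(current, &current->blocked, info);
	spin_unlock_irq(&current->sighand->siglock);

	return signr;	/* 0 if nothing deliverable was pending */
}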
587/*
588 * Tell a process that it has a new active signal..
589 *
590 * NOTE! we rely on the previous spin_lock to
591 * lock interrupts for us! We can only be called with
592 * "siglock" held, and the local interrupt must
593 * have been disabled when that got acquired!
594 *
595 * No need to set need_resched since signal event passing
596 * goes through ->blocked
597 */
598void signal_wake_up(struct task_struct *t, int resume)
599{
600 unsigned int mask;
601
602 set_tsk_thread_flag(t, TIF_SIGPENDING);
603
604 /*
605 * For SIGKILL, we want to wake it up in the stopped/traced case.
606 * We don't check t->state here because there is a race with it
607	 * executing on another processor and just now entering stopped state.
608 * By using wake_up_state, we ensure the process will wake up and
609 * handle its death signal.
610 */
611 mask = TASK_INTERRUPTIBLE;
612 if (resume)
613 mask |= TASK_STOPPED | TASK_TRACED;
614 if (!wake_up_state(t, mask))
615 kick_process(t);
616}
617
618/*
619 * Remove signals in mask from the pending set and queue.
620 * Returns 1 if any signals were found.
621 *
622 * All callers must be holding the siglock.
623 */
624static int rm_from_queue(unsigned long mask, struct sigpending *s)
625{
626 struct sigqueue *q, *n;
627
628 if (!sigtestsetmask(&s->signal, mask))
629 return 0;
630
631 sigdelsetmask(&s->signal, mask);
632 list_for_each_entry_safe(q, n, &s->list, list) {
633 if (q->info.si_signo < SIGRTMIN &&
634 (mask & sigmask(q->info.si_signo))) {
635 list_del_init(&q->list);
636 __sigqueue_free(q);
637 }
638 }
639 return 1;
640}
641
642/*
643 * Bad permissions for sending the signal
644 */
645static int check_kill_permission(int sig, struct siginfo *info,
646 struct task_struct *t)
647{
648 int error = -EINVAL;
649 if (sig < 0 || sig > _NSIG)
650 return error;
651 error = -EPERM;
652 if ((!info || ((unsigned long)info != 1 &&
653 (unsigned long)info != 2 && SI_FROMUSER(info)))
654 && ((sig != SIGCONT) ||
655 (current->signal->session != t->signal->session))
656 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
657 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
658 && !capable(CAP_KILL))
659 return error;
660 return security_task_kill(t, info, sig);
661}
662
663/* forward decl */
664static void do_notify_parent_cldstop(struct task_struct *tsk,
665 struct task_struct *parent,
666 int why);
667
668/*
669 * Handle magic process-wide effects of stop/continue signals.
670 * Unlike the signal actions, these happen immediately at signal-generation
671 * time regardless of blocking, ignoring, or handling. This does the
672 * actual continuing for SIGCONT, but not the actual stopping for stop
673 * signals. The process stop is done as a signal action for SIG_DFL.
674 */
675static void handle_stop_signal(int sig, struct task_struct *p)
676{
677 struct task_struct *t;
678
679 if (p->flags & SIGNAL_GROUP_EXIT)
680 /*
681 * The process is in the middle of dying already.
682 */
683 return;
684
685 if (sig_kernel_stop(sig)) {
686 /*
687 * This is a stop signal. Remove SIGCONT from all queues.
688 */
689 rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending);
690 t = p;
691 do {
692 rm_from_queue(sigmask(SIGCONT), &t->pending);
693 t = next_thread(t);
694 } while (t != p);
695 } else if (sig == SIGCONT) {
696 /*
697 * Remove all stop signals from all queues,
698 * and wake all threads.
699 */
700 if (unlikely(p->signal->group_stop_count > 0)) {
701 /*
702 * There was a group stop in progress. We'll
703 * pretend it finished before we got here. We are
704 * obliged to report it to the parent: if the
705 * SIGSTOP happened "after" this SIGCONT, then it
706 * would have cleared this pending SIGCONT. If it
707 * happened "before" this SIGCONT, then the parent
708 * got the SIGCHLD about the stop finishing before
709 * the continue happened. We do the notification
710 * now, and it's as if the stop had finished and
711 * the SIGCHLD was pending on entry to this kill.
712 */
713 p->signal->group_stop_count = 0;
714 p->signal->flags = SIGNAL_STOP_CONTINUED;
715 spin_unlock(&p->sighand->siglock);
716 if (p->ptrace & PT_PTRACED)
717 do_notify_parent_cldstop(p, p->parent,
718 CLD_STOPPED);
719 else
720 do_notify_parent_cldstop(
721 p->group_leader,
722 p->group_leader->real_parent,
723 CLD_STOPPED);
724 spin_lock(&p->sighand->siglock);
725 }
726 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
727 t = p;
728 do {
729 unsigned int state;
730 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
731
732 /*
733 * If there is a handler for SIGCONT, we must make
734 * sure that no thread returns to user mode before
735 * we post the signal, in case it was the only
736 * thread eligible to run the signal handler--then
737 * it must not do anything between resuming and
738 * running the handler. With the TIF_SIGPENDING
739 * flag set, the thread will pause and acquire the
740 * siglock that we hold now and until we've queued
741 * the pending signal.
742 *
743 * Wake up the stopped thread _after_ setting
744 * TIF_SIGPENDING
745 */
746 state = TASK_STOPPED;
747 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
748 set_tsk_thread_flag(t, TIF_SIGPENDING);
749 state |= TASK_INTERRUPTIBLE;
750 }
751 wake_up_state(t, state);
752
753 t = next_thread(t);
754 } while (t != p);
755
756 if (p->signal->flags & SIGNAL_STOP_STOPPED) {
757 /*
758 * We were in fact stopped, and are now continued.
759 * Notify the parent with CLD_CONTINUED.
760 */
761 p->signal->flags = SIGNAL_STOP_CONTINUED;
762 p->signal->group_exit_code = 0;
763 spin_unlock(&p->sighand->siglock);
764 if (p->ptrace & PT_PTRACED)
765 do_notify_parent_cldstop(p, p->parent,
766 CLD_CONTINUED);
767 else
768 do_notify_parent_cldstop(
769 p->group_leader,
770 p->group_leader->real_parent,
771 CLD_CONTINUED);
772 spin_lock(&p->sighand->siglock);
773 } else {
774 /*
775 * We are not stopped, but there could be a stop
776 * signal in the middle of being processed after
777 * being removed from the queue. Clear that too.
778 */
779 p->signal->flags = 0;
780 }
781 } else if (sig == SIGKILL) {
782 /*
783 * Make sure that any pending stop signal already dequeued
784 * is undone by the wakeup for SIGKILL.
785 */
786 p->signal->flags = 0;
787 }
788}
789
790static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
791 struct sigpending *signals)
792{
793 struct sigqueue * q = NULL;
794 int ret = 0;
795
796 /*
797 * fast-pathed signals for kernel-internal things like SIGSTOP
798 * or SIGKILL.
799 */
800 if ((unsigned long)info == 2)
801 goto out_set;
802
803 /* Real-time signals must be queued if sent by sigqueue, or
804 some other real-time mechanism. It is implementation
805 defined whether kill() does so. We attempt to do so, on
806 the principle of least surprise, but since kill is not
807 allowed to fail with EAGAIN when low on memory we just
808 make sure at least one signal gets delivered and don't
809 pass on the info struct. */
810
811 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
812 ((unsigned long) info < 2 ||
813 info->si_code >= 0)));
814 if (q) {
815 list_add_tail(&q->list, &signals->list);
816 switch ((unsigned long) info) {
817 case 0:
818 q->info.si_signo = sig;
819 q->info.si_errno = 0;
820 q->info.si_code = SI_USER;
821 q->info.si_pid = current->pid;
822 q->info.si_uid = current->uid;
823 break;
824 case 1:
825 q->info.si_signo = sig;
826 q->info.si_errno = 0;
827 q->info.si_code = SI_KERNEL;
828 q->info.si_pid = 0;
829 q->info.si_uid = 0;
830 break;
831 default:
832 copy_siginfo(&q->info, info);
833 break;
834 }
835 } else {
836 if (sig >= SIGRTMIN && info && (unsigned long)info != 1
837 && info->si_code != SI_USER)
838 /*
839 * Queue overflow, abort. We may abort if the signal was rt
840 * and sent by user using something other than kill().
841 */
842 return -EAGAIN;
843 if (((unsigned long)info > 1) && (info->si_code == SI_TIMER))
844 /*
845 * Set up a return to indicate that we dropped
846 * the signal.
847 */
848 ret = info->si_sys_private;
849 }
850
851out_set:
852 sigaddset(&signals->signal, sig);
853 return ret;
854}
855
856#define LEGACY_QUEUE(sigptr, sig) \
857 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
858
859
860static int
861specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
862{
863 int ret = 0;
864
865 if (!irqs_disabled())
866 BUG();
867 assert_spin_locked(&t->sighand->siglock);
868
869 if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
870 /*
871 * Set up a return to indicate that we dropped the signal.
872 */
873 ret = info->si_sys_private;
874
875 /* Short-circuit ignored signals. */
876 if (sig_ignored(t, sig))
877 goto out;
878
879 /* Support queueing exactly one non-rt signal, so that we
880 can get more detailed information about the cause of
881 the signal. */
882 if (LEGACY_QUEUE(&t->pending, sig))
883 goto out;
884
885 ret = send_signal(sig, info, t, &t->pending);
886 if (!ret && !sigismember(&t->blocked, sig))
887 signal_wake_up(t, sig == SIGKILL);
888out:
889 return ret;
890}
891
892/*
893 * Force a signal that the process can't ignore: if necessary
894 * we unblock the signal and change any SIG_IGN to SIG_DFL.
895 */
896
897int
898force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
899{
900 unsigned long int flags;
901 int ret;
902
903 spin_lock_irqsave(&t->sighand->siglock, flags);
904 if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
905 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
906 sigdelset(&t->blocked, sig);
907 recalc_sigpending_tsk(t);
908 }
909 ret = specific_send_sig_info(sig, info, t);
910 spin_unlock_irqrestore(&t->sighand->siglock, flags);
911
912 return ret;
913}
914
915void
916force_sig_specific(int sig, struct task_struct *t)
917{
918 unsigned long int flags;
919
920 spin_lock_irqsave(&t->sighand->siglock, flags);
921 if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN)
922 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
923 sigdelset(&t->blocked, sig);
924 recalc_sigpending_tsk(t);
925 specific_send_sig_info(sig, (void *)2, t);
926 spin_unlock_irqrestore(&t->sighand->siglock, flags);
927}
928
929/*
930 * Test if P wants to take SIG. After we've checked all threads with this,
931 * it's equivalent to finding no threads not blocking SIG. Any threads not
932 * blocking SIG were ruled out because they are not running and already
933 * have pending signals. Such threads will dequeue from the shared queue
934 * as soon as they're available, so putting the signal on the shared queue
935 * will be equivalent to sending it to one such thread.
936 */
937#define wants_signal(sig, p, mask) \
938 (!sigismember(&(p)->blocked, sig) \
939 && !((p)->state & mask) \
940 && !((p)->flags & PF_EXITING) \
941 && (task_curr(p) || !signal_pending(p)))
942
943
944static void
945__group_complete_signal(int sig, struct task_struct *p)
946{
947 unsigned int mask;
948 struct task_struct *t;
949
950 /*
951 * Don't bother traced and stopped tasks (but
952 * SIGKILL will punch through that).
953 */
954 mask = TASK_STOPPED | TASK_TRACED;
955 if (sig == SIGKILL)
956 mask = 0;
957
958 /*
959 * Now find a thread we can wake up to take the signal off the queue.
960 *
961 * If the main thread wants the signal, it gets first crack.
962 * Probably the least surprising to the average bear.
963 */
964 if (wants_signal(sig, p, mask))
965 t = p;
966 else if (thread_group_empty(p))
967 /*
968 * There is just one thread and it does not need to be woken.
969 * It will dequeue unblocked signals before it runs again.
970 */
971 return;
972 else {
973 /*
974 * Otherwise try to find a suitable thread.
975 */
976 t = p->signal->curr_target;
977 if (t == NULL)
978 /* restart balancing at this thread */
979 t = p->signal->curr_target = p;
980 BUG_ON(t->tgid != p->tgid);
981
982 while (!wants_signal(sig, t, mask)) {
983 t = next_thread(t);
984 if (t == p->signal->curr_target)
985 /*
986 * No thread needs to be woken.
987 * Any eligible threads will see
988 * the signal in the queue soon.
989 */
990 return;
991 }
992 p->signal->curr_target = t;
993 }
994
995 /*
996 * Found a killable thread. If the signal will be fatal,
997 * then start taking the whole group down immediately.
998 */
999 if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) &&
1000 !sigismember(&t->real_blocked, sig) &&
1001 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
1002 /*
1003 * This signal will be fatal to the whole group.
1004 */
1005 if (!sig_kernel_coredump(sig)) {
1006 /*
1007 * Start a group exit and wake everybody up.
1008 * This way we don't have other threads
1009 * running and doing things after a slower
1010 * thread has the fatal signal pending.
1011 */
1012 p->signal->flags = SIGNAL_GROUP_EXIT;
1013 p->signal->group_exit_code = sig;
1014 p->signal->group_stop_count = 0;
1015 t = p;
1016 do {
1017 sigaddset(&t->pending.signal, SIGKILL);
1018 signal_wake_up(t, 1);
1019 t = next_thread(t);
1020 } while (t != p);
1021 return;
1022 }
1023
1024 /*
1025 * There will be a core dump. We make all threads other
1026 * than the chosen one go into a group stop so that nothing
1027 * happens until it gets scheduled, takes the signal off
1028 * the shared queue, and does the core dump. This is a
1029 * little more complicated than strictly necessary, but it
1030 * keeps the signal state that winds up in the core dump
1031 * unchanged from the death state, e.g. which thread had
1032 * the core-dump signal unblocked.
1033 */
1034 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
1035 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
1036 p->signal->group_stop_count = 0;
1037 p->signal->group_exit_task = t;
1038 t = p;
1039 do {
1040 p->signal->group_stop_count++;
1041 signal_wake_up(t, 0);
1042 t = next_thread(t);
1043 } while (t != p);
1044 wake_up_process(p->signal->group_exit_task);
1045 return;
1046 }
1047
1048 /*
1049 * The signal is already in the shared-pending queue.
1050 * Tell the chosen thread to wake up and dequeue it.
1051 */
1052 signal_wake_up(t, sig == SIGKILL);
1053 return;
1054}
1055
1056int
1057__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1058{
1059 int ret = 0;
1060
1061 assert_spin_locked(&p->sighand->siglock);
1062 handle_stop_signal(sig, p);
1063
1064 if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
1065 /*
1066 * Set up a return to indicate that we dropped the signal.
1067 */
1068 ret = info->si_sys_private;
1069
1070 /* Short-circuit ignored signals. */
1071 if (sig_ignored(p, sig))
1072 return ret;
1073
1074 if (LEGACY_QUEUE(&p->signal->shared_pending, sig))
1075 /* This is a non-RT signal and we already have one queued. */
1076 return ret;
1077
1078 /*
1079 * Put this signal on the shared-pending queue, or fail with EAGAIN.
1080 * We always use the shared queue for process-wide signals,
1081 * to avoid several races.
1082 */
1083 ret = send_signal(sig, info, p, &p->signal->shared_pending);
1084 if (unlikely(ret))
1085 return ret;
1086
1087 __group_complete_signal(sig, p);
1088 return 0;
1089}
1090
1091/*
1092 * Nuke all other threads in the group.
1093 */
1094void zap_other_threads(struct task_struct *p)
1095{
1096 struct task_struct *t;
1097
1098 p->signal->flags = SIGNAL_GROUP_EXIT;
1099 p->signal->group_stop_count = 0;
1100
1101 if (thread_group_empty(p))
1102 return;
1103
1104 for (t = next_thread(p); t != p; t = next_thread(t)) {
1105 /*
1106 * Don't bother with already dead threads
1107 */
1108 if (t->exit_state)
1109 continue;
1110
1111 /*
1112 * We don't want to notify the parent, since we are
1113 * killed as part of a thread group due to another
1114 * thread doing an execve() or similar. So set the
1115 * exit signal to -1 to allow immediate reaping of
1116 * the process. But don't detach the thread group
1117 * leader.
1118 */
1119 if (t != p->group_leader)
1120 t->exit_signal = -1;
1121
1122 sigaddset(&t->pending.signal, SIGKILL);
1123 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
1124 signal_wake_up(t, 1);
1125 }
1126}
1127
1128/*
1129 * Must be called with the tasklist_lock held for reading!
1130 */
1131int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1132{
1133 unsigned long flags;
1134 int ret;
1135
1136 ret = check_kill_permission(sig, info, p);
1137 if (!ret && sig && p->sighand) {
1138 spin_lock_irqsave(&p->sighand->siglock, flags);
1139 ret = __group_send_sig_info(sig, info, p);
1140 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1141 }
1142
1143 return ret;
1144}
1145
1146/*
1147 * kill_pg_info() sends a signal to a process group: this is what the tty
1148 * control characters do (^C, ^Z etc)
1149 */
1150
1151int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
1152{
1153 struct task_struct *p = NULL;
1154 int retval, success;
1155
1156 if (pgrp <= 0)
1157 return -EINVAL;
1158
1159 success = 0;
1160 retval = -ESRCH;
1161 do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
1162 int err = group_send_sig_info(sig, info, p);
1163 success |= !err;
1164 retval = err;
1165 } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
1166 return success ? 0 : retval;
1167}
1168
1169int
1170kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
1171{
1172 int retval;
1173
1174 read_lock(&tasklist_lock);
1175 retval = __kill_pg_info(sig, info, pgrp);
1176 read_unlock(&tasklist_lock);
1177
1178 return retval;
1179}
1180
1181int
1182kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1183{
1184 int error;
1185 struct task_struct *p;
1186
1187 read_lock(&tasklist_lock);
1188 p = find_task_by_pid(pid);
1189 error = -ESRCH;
1190 if (p)
1191 error = group_send_sig_info(sig, info, p);
1192 read_unlock(&tasklist_lock);
1193 return error;
1194}
1195
1196
1197/*
1198 * kill_something_info() interprets pid in interesting ways just like kill(2).
1199 *
1200 * POSIX specifies that kill(-1,sig) is unspecified, but what we have
1201 * is probably wrong. Should make it like BSD or SYSV.
1202 */
1203
1204static int kill_something_info(int sig, struct siginfo *info, int pid)
1205{
1206 if (!pid) {
1207 return kill_pg_info(sig, info, process_group(current));
1208 } else if (pid == -1) {
1209 int retval = 0, count = 0;
1210 struct task_struct * p;
1211
1212 read_lock(&tasklist_lock);
1213 for_each_process(p) {
1214 if (p->pid > 1 && p->tgid != current->tgid) {
1215 int err = group_send_sig_info(sig, info, p);
1216 ++count;
1217 if (err != -EPERM)
1218 retval = err;
1219 }
1220 }
1221 read_unlock(&tasklist_lock);
1222 return count ? retval : -ESRCH;
1223 } else if (pid < 0) {
1224 return kill_pg_info(sig, info, -pid);
1225 } else {
1226 return kill_proc_info(sig, info, pid);
1227 }
1228}
1229
1230/*
1231 * These are for backward compatibility with the rest of the kernel source.
1232 */
1233
1234/*
1235 * These two are the most common entry points. They send a signal
1236 * just to the specific thread.
1237 */
1238int
1239send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1240{
1241 int ret;
1242 unsigned long flags;
1243
1244 /*
1245 * Make sure legacy kernel users don't send in bad values
1246 * (normal paths check this in check_kill_permission).
1247 */
1248 if (sig < 0 || sig > _NSIG)
1249 return -EINVAL;
1250
1251 /*
1252 * We need the tasklist lock even for the specific
1253 * thread case (when we don't need to follow the group
1254 * lists) in order to avoid races with "p->sighand"
1255 * going away or changing from under us.
1256 */
1257 read_lock(&tasklist_lock);
1258 spin_lock_irqsave(&p->sighand->siglock, flags);
1259 ret = specific_send_sig_info(sig, info, p);
1260 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1261 read_unlock(&tasklist_lock);
1262 return ret;
1263}
1264
1265int
1266send_sig(int sig, struct task_struct *p, int priv)
1267{
1268 return send_sig_info(sig, (void*)(long)(priv != 0), p);
1269}
1270
1271/*
1272 * This is the entry point for "process-wide" signals.
1273 * They will go to an appropriate thread in the thread group.
1274 */
1275int
1276send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1277{
1278 int ret;
1279 read_lock(&tasklist_lock);
1280 ret = group_send_sig_info(sig, info, p);
1281 read_unlock(&tasklist_lock);
1282 return ret;
1283}
1284
1285void
1286force_sig(int sig, struct task_struct *p)
1287{
1288 force_sig_info(sig, (void*)1L, p);
1289}
1290
1291/*
1292 * When things go south during signal handling, we
1293 * will force a SIGSEGV. And if the signal that caused
1294 * the problem was already a SIGSEGV, we'll want to
1295 * make sure we don't even try to deliver the signal..
1296 */
1297int
1298force_sigsegv(int sig, struct task_struct *p)
1299{
1300 if (sig == SIGSEGV) {
1301 unsigned long flags;
1302 spin_lock_irqsave(&p->sighand->siglock, flags);
1303 p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
1304 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1305 }
1306 force_sig(SIGSEGV, p);
1307 return 0;
1308}
1309
1310int
1311kill_pg(pid_t pgrp, int sig, int priv)
1312{
1313 return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp);
1314}
1315
1316int
1317kill_proc(pid_t pid, int sig, int priv)
1318{
1319 return kill_proc_info(sig, (void *)(long)(priv != 0), pid);
1320}
1321
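/*
 * Sketch of a typical in-kernel caller of the compatibility helpers
 * above (the function and pid are hypothetical). priv != 0 marks the
 * signal as kernel-generated (si_code = SI_KERNEL) and bypasses the
 * usual permission checks; priv == 0 behaves like a user kill().
 */
static void sketch_kick_daemon(pid_t daemon_pid)
{
	kill_proc(daemon_pid, SIGHUP, 1);
}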
1322/*
1323 * These functions support sending signals using preallocated sigqueue
1324 * structures. This is needed "because realtime applications cannot
1325 * afford to lose notifications of asynchronous events, like timer
1326 * expirations or I/O completions". In the case of POSIX timers
1327 * we allocate the sigqueue structure at timer_create time. If this
1328 * allocation fails we are able to report the failure to the application
1329 * with an EAGAIN error.
1330 */
1331
1332struct sigqueue *sigqueue_alloc(void)
1333{
1334 struct sigqueue *q;
1335
1336 if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0)))
1337 q->flags |= SIGQUEUE_PREALLOC;
1338 return(q);
1339}
1340
1341void sigqueue_free(struct sigqueue *q)
1342{
1343 unsigned long flags;
1344 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1345 /*
1346 * If the signal is still pending remove it from the
1347 * pending queue.
1348 */
1349 if (unlikely(!list_empty(&q->list))) {
1350 read_lock(&tasklist_lock);
1351 spin_lock_irqsave(q->lock, flags);
1352 if (!list_empty(&q->list))
1353 list_del_init(&q->list);
1354 spin_unlock_irqrestore(q->lock, flags);
1355 read_unlock(&tasklist_lock);
1356 }
1357 q->flags &= ~SIGQUEUE_PREALLOC;
1358 __sigqueue_free(q);
1359}
1360
1361int
1362send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1363{
1364 unsigned long flags;
1365 int ret = 0;
1366
1367 /*
1368 * We need the tasklist lock even for the specific
1369 * thread case (when we don't need to follow the group
1370 * lists) in order to avoid races with "p->sighand"
1371 * going away or changing from under us.
1372 */
1373 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1374 read_lock(&tasklist_lock);
1375 spin_lock_irqsave(&p->sighand->siglock, flags);
1376
1377 if (unlikely(!list_empty(&q->list))) {
1378 /*
1379		 * If an SI_TIMER entry is already queued, just increment
1380 * the overrun count.
1381 */
1382 if (q->info.si_code != SI_TIMER)
1383 BUG();
1384 q->info.si_overrun++;
1385 goto out;
1386 }
1387 /* Short-circuit ignored signals. */
1388 if (sig_ignored(p, sig)) {
1389 ret = 1;
1390 goto out;
1391 }
1392
1393 q->lock = &p->sighand->siglock;
1394 list_add_tail(&q->list, &p->pending.list);
1395 sigaddset(&p->pending.signal, sig);
1396 if (!sigismember(&p->blocked, sig))
1397 signal_wake_up(p, sig == SIGKILL);
1398
1399out:
1400 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1401 read_unlock(&tasklist_lock);
1402 return(ret);
1403}
1404
1405int
1406send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1407{
1408 unsigned long flags;
1409 int ret = 0;
1410
1411 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1412 read_lock(&tasklist_lock);
1413 spin_lock_irqsave(&p->sighand->siglock, flags);
1414 handle_stop_signal(sig, p);
1415
1416 /* Short-circuit ignored signals. */
1417 if (sig_ignored(p, sig)) {
1418 ret = 1;
1419 goto out;
1420 }
1421
1422 if (unlikely(!list_empty(&q->list))) {
1423 /*
1424		 * If an SI_TIMER entry is already queued, just increment
1425 * the overrun count. Other uses should not try to
1426 * send the signal multiple times.
1427 */
1428 if (q->info.si_code != SI_TIMER)
1429 BUG();
1430 q->info.si_overrun++;
1431 goto out;
1432 }
1433
1434 /*
1435 * Put this signal on the shared-pending queue.
1436 * We always use the shared queue for process-wide signals,
1437 * to avoid several races.
1438 */
1439 q->lock = &p->sighand->siglock;
1440 list_add_tail(&q->list, &p->signal->shared_pending.list);
1441 sigaddset(&p->signal->shared_pending.signal, sig);
1442
1443 __group_complete_signal(sig, p);
1444out:
1445 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1446 read_unlock(&tasklist_lock);
1447 return(ret);
1448}
1449
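/*
 * Sketch of the preallocation pattern described above, as a hypothetical
 * timer-like user (the real POSIX-timer code lives elsewhere and takes
 * more care with locking and overruns).
 */
struct sketch_timer {
	struct sigqueue *sigq;
	struct task_struct *target;
	int signo;
};

static int sketch_timer_create(struct sketch_timer *t)
{
	t->sigq = sigqueue_alloc();
	if (!t->sigq)
		return -EAGAIN;	/* report the failure at create time */
	return 0;
}

static void sketch_timer_expire(struct sketch_timer *t)
{
	/* real users fill in ->info when arming the timer */
	t->sigq->info.si_signo = t->signo;
	t->sigq->info.si_code = SI_TIMER;
	send_group_sigqueue(t->signo, t->sigq, t->target);
}

static void sketch_timer_delete(struct sketch_timer *t)
{
	sigqueue_free(t->sigq);	/* removes it from any pending queue too */
}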
1450/*
1451 * Wake up any threads in the parent blocked in wait* syscalls.
1452 */
1453static inline void __wake_up_parent(struct task_struct *p,
1454 struct task_struct *parent)
1455{
1456 wake_up_interruptible_sync(&parent->signal->wait_chldexit);
1457}
1458
1459/*
1460 * Let a parent know about the death of a child.
1461 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1462 */
1463
1464void do_notify_parent(struct task_struct *tsk, int sig)
1465{
1466 struct siginfo info;
1467 unsigned long flags;
1468 struct sighand_struct *psig;
1469
1470 BUG_ON(sig == -1);
1471
1472 /* do_notify_parent_cldstop should have been called instead. */
1473 BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED));
1474
1475 BUG_ON(!tsk->ptrace &&
1476 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1477
1478 info.si_signo = sig;
1479 info.si_errno = 0;
1480 info.si_pid = tsk->pid;
1481 info.si_uid = tsk->uid;
1482
1483 /* FIXME: find out whether or not this is supposed to be c*time. */
1484 info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
1485 tsk->signal->utime));
1486 info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
1487 tsk->signal->stime));
1488
1489 info.si_status = tsk->exit_code & 0x7f;
1490 if (tsk->exit_code & 0x80)
1491 info.si_code = CLD_DUMPED;
1492 else if (tsk->exit_code & 0x7f)
1493 info.si_code = CLD_KILLED;
1494 else {
1495 info.si_code = CLD_EXITED;
1496 info.si_status = tsk->exit_code >> 8;
1497 }
1498
1499 psig = tsk->parent->sighand;
1500 spin_lock_irqsave(&psig->siglock, flags);
1501 if (sig == SIGCHLD &&
1502 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1503 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1504 /*
1505 * We are exiting and our parent doesn't care. POSIX.1
1506 * defines special semantics for setting SIGCHLD to SIG_IGN
1507 * or setting the SA_NOCLDWAIT flag: we should be reaped
1508 * automatically and not left for our parent's wait4 call.
1509 * Rather than having the parent do it as a magic kind of
1510 * signal handler, we just set this to tell do_exit that we
1511 * can be cleaned up without becoming a zombie. Note that
1512 * we still call __wake_up_parent in this case, because a
1513 * blocked sys_wait4 might now return -ECHILD.
1514 *
1515 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
1516 * is implementation-defined: we do (if you don't want
1517 * it, just use SIG_IGN instead).
1518 */
1519 tsk->exit_signal = -1;
1520 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1521 sig = 0;
1522 }
1523 if (sig > 0 && sig <= _NSIG)
1524 __group_send_sig_info(sig, &info, tsk->parent);
1525 __wake_up_parent(tsk, tsk->parent);
1526 spin_unlock_irqrestore(&psig->siglock, flags);
1527}
1528
1529static void
1530do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
1531 int why)
1532{
1533 struct siginfo info;
1534 unsigned long flags;
1535 struct sighand_struct *sighand;
1536
1537 info.si_signo = SIGCHLD;
1538 info.si_errno = 0;
1539 info.si_pid = tsk->pid;
1540 info.si_uid = tsk->uid;
1541
1542 /* FIXME: find out whether or not this is supposed to be c*time. */
1543 info.si_utime = cputime_to_jiffies(tsk->utime);
1544 info.si_stime = cputime_to_jiffies(tsk->stime);
1545
1546 info.si_code = why;
1547 switch (why) {
1548 case CLD_CONTINUED:
1549 info.si_status = SIGCONT;
1550 break;
1551 case CLD_STOPPED:
1552 info.si_status = tsk->signal->group_exit_code & 0x7f;
1553 break;
1554 case CLD_TRAPPED:
1555 info.si_status = tsk->exit_code & 0x7f;
1556 break;
1557 default:
1558 BUG();
1559 }
1560
1561 sighand = parent->sighand;
1562 spin_lock_irqsave(&sighand->siglock, flags);
1563 if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
1564 !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
1565 __group_send_sig_info(SIGCHLD, &info, parent);
1566 /*
1567 * Even if SIGCHLD is not generated, we must wake up wait4 calls.
1568 */
1569 __wake_up_parent(tsk, parent);
1570 spin_unlock_irqrestore(&sighand->siglock, flags);
1571}
1572
1573/*
1574 * This must be called with current->sighand->siglock held.
1575 *
1576 * This should be the path for all ptrace stops.
1577 * We always set current->last_siginfo while stopped here.
1578 * That makes it a way to test a stopped process for
1579 * being ptrace-stopped vs being job-control-stopped.
1580 *
1581 * If we actually decide not to stop at all because the tracer is gone,
1582 * we leave nostop_code in current->exit_code.
1583 */
1584static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1585{
1586 /*
1587 * If there is a group stop in progress,
1588 * we must participate in the bookkeeping.
1589 */
1590 if (current->signal->group_stop_count > 0)
1591 --current->signal->group_stop_count;
1592
1593 current->last_siginfo = info;
1594 current->exit_code = exit_code;
1595
1596 /* Let the debugger run. */
1597 set_current_state(TASK_TRACED);
1598 spin_unlock_irq(&current->sighand->siglock);
1599 read_lock(&tasklist_lock);
1600 if (likely(current->ptrace & PT_PTRACED) &&
1601 likely(current->parent != current->real_parent ||
1602 !(current->ptrace & PT_ATTACHED)) &&
1603 (likely(current->parent->signal != current->signal) ||
1604 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1605 do_notify_parent_cldstop(current, current->parent,
1606 CLD_TRAPPED);
1607 read_unlock(&tasklist_lock);
1608 schedule();
1609 } else {
1610 /*
1611 * By the time we got the lock, our tracer went away.
1612 * Don't stop here.
1613 */
1614 read_unlock(&tasklist_lock);
1615 set_current_state(TASK_RUNNING);
1616 current->exit_code = nostop_code;
1617 }
1618
1619 /*
1620 * We are back. Now reacquire the siglock before touching
1621 * last_siginfo, so that we are sure to have synchronized with
1622 * any signal-sending on another CPU that wants to examine it.
1623 */
1624 spin_lock_irq(&current->sighand->siglock);
1625 current->last_siginfo = NULL;
1626
1627 /*
1628 * Queued signals ignored us while we were stopped for tracing.
1629 * So check for any that we should take before resuming user mode.
1630 */
1631 recalc_sigpending();
1632}
1633
1634void ptrace_notify(int exit_code)
1635{
1636 siginfo_t info;
1637
1638 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1639
1640 memset(&info, 0, sizeof info);
1641 info.si_signo = SIGTRAP;
1642 info.si_code = exit_code;
1643 info.si_pid = current->pid;
1644 info.si_uid = current->uid;
1645
1646 /* Let the debugger run. */
1647 spin_lock_irq(&current->sighand->siglock);
1648 ptrace_stop(exit_code, 0, &info);
1649 spin_unlock_irq(&current->sighand->siglock);
1650}
1651
1652#ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER
1653
1654static void
1655finish_stop(int stop_count)
1656{
1657 /*
1658 * If there are no other threads in the group, or if there is
1659 * a group stop in progress and we are the last to stop,
1660 * report to the parent. When ptraced, every thread reports itself.
1661 */
1662 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) {
1663 read_lock(&tasklist_lock);
1664 do_notify_parent_cldstop(current, current->parent,
1665 CLD_STOPPED);
1666 read_unlock(&tasklist_lock);
1667 }
1668 else if (stop_count == 0) {
1669 read_lock(&tasklist_lock);
1670 do_notify_parent_cldstop(current->group_leader,
1671 current->group_leader->real_parent,
1672 CLD_STOPPED);
1673 read_unlock(&tasklist_lock);
1674 }
1675
1676 schedule();
1677 /*
1678 * Now we don't run again until continued.
1679 */
1680 current->exit_code = 0;
1681}
1682
1683/*
1684 * This performs the stopping for SIGSTOP and other stop signals.
1685 * We have to stop all threads in the thread group.
1686 * Returns nonzero if we've actually stopped and released the siglock.
1687 * Returns zero if we didn't stop and still hold the siglock.
1688 */
1689static int
1690do_signal_stop(int signr)
1691{
1692 struct signal_struct *sig = current->signal;
1693 struct sighand_struct *sighand = current->sighand;
1694 int stop_count = -1;
1695
1696 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1697 return 0;
1698
1699 if (sig->group_stop_count > 0) {
1700 /*
1701 * There is a group stop in progress. We don't need to
1702 * start another one.
1703 */
1704 signr = sig->group_exit_code;
1705 stop_count = --sig->group_stop_count;
1706 current->exit_code = signr;
1707 set_current_state(TASK_STOPPED);
1708 if (stop_count == 0)
1709 sig->flags = SIGNAL_STOP_STOPPED;
1710 spin_unlock_irq(&sighand->siglock);
1711 }
1712 else if (thread_group_empty(current)) {
1713 /*
1714 * Lock must be held through transition to stopped state.
1715 */
1716 current->exit_code = current->signal->group_exit_code = signr;
1717 set_current_state(TASK_STOPPED);
1718 sig->flags = SIGNAL_STOP_STOPPED;
1719 spin_unlock_irq(&sighand->siglock);
1720 }
1721 else {
1722 /*
1723 * There is no group stop already in progress.
1724 * We must initiate one now, but that requires
1725 * dropping siglock to get both the tasklist lock
1726 * and siglock again in the proper order. Note that
1727 * this allows an intervening SIGCONT to be posted.
1728 * We need to check for that and bail out if necessary.
1729 */
1730 struct task_struct *t;
1731
1732 spin_unlock_irq(&sighand->siglock);
1733
1734 /* signals can be posted during this window */
1735
1736 read_lock(&tasklist_lock);
1737 spin_lock_irq(&sighand->siglock);
1738
1739 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
1740 /*
1741 * Another stop or continue happened while we
1742 * didn't have the lock. We can just swallow this
1743 * signal now. If we raced with a SIGCONT, that
1744 * should have just cleared it now. If we raced
1745 * with another processor delivering a stop signal,
1746 * then the SIGCONT that wakes us up should clear it.
1747 */
1748 read_unlock(&tasklist_lock);
1749 return 0;
1750 }
1751
1752 if (sig->group_stop_count == 0) {
1753 sig->group_exit_code = signr;
1754 stop_count = 0;
1755 for (t = next_thread(current); t != current;
1756 t = next_thread(t))
1757 /*
1758 * Setting state to TASK_STOPPED for a group
1759 * stop is always done with the siglock held,
1760 * so this check has no races.
1761 */
1762 if (t->state < TASK_STOPPED) {
1763 stop_count++;
1764 signal_wake_up(t, 0);
1765 }
1766 sig->group_stop_count = stop_count;
1767 }
1768 else {
1769 /* A race with another thread while unlocked. */
1770 signr = sig->group_exit_code;
1771 stop_count = --sig->group_stop_count;
1772 }
1773
1774 current->exit_code = signr;
1775 set_current_state(TASK_STOPPED);
1776 if (stop_count == 0)
1777 sig->flags = SIGNAL_STOP_STOPPED;
1778
1779 spin_unlock_irq(&sighand->siglock);
1780 read_unlock(&tasklist_lock);
1781 }
1782
1783 finish_stop(stop_count);
1784 return 1;
1785}
1786
1787/*
1788 * Do appropriate magic when group_stop_count > 0.
1789 * We return nonzero if we stopped, after releasing the siglock.
1790 * We return zero if we still hold the siglock and should look
1791 * for another signal without checking group_stop_count again.
1792 */
1793static inline int handle_group_stop(void)
1794{
1795 int stop_count;
1796
1797 if (current->signal->group_exit_task == current) {
1798 /*
1799		 * Group stop is so we can do a core dump.
1800		 * We are the initiating thread, so get on with it.
1801 */
1802 current->signal->group_exit_task = NULL;
1803 return 0;
1804 }
1805
1806 if (current->signal->flags & SIGNAL_GROUP_EXIT)
1807 /*
1808 * Group stop is so another thread can do a core dump,
1809 * or else we are racing against a death signal.
1810 * Just punt the stop so we can get the next signal.
1811 */
1812 return 0;
1813
1814 /*
1815 * There is a group stop in progress. We stop
1816 * without any associated signal being in our queue.
1817 */
1818 stop_count = --current->signal->group_stop_count;
1819 if (stop_count == 0)
1820 current->signal->flags = SIGNAL_STOP_STOPPED;
1821 current->exit_code = current->signal->group_exit_code;
1822 set_current_state(TASK_STOPPED);
1823 spin_unlock_irq(&current->sighand->siglock);
1824 finish_stop(stop_count);
1825 return 1;
1826}
1827
1828int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1829 struct pt_regs *regs, void *cookie)
1830{
1831 sigset_t *mask = &current->blocked;
1832 int signr = 0;
1833
1834relock:
1835 spin_lock_irq(&current->sighand->siglock);
1836 for (;;) {
1837 struct k_sigaction *ka;
1838
1839 if (unlikely(current->signal->group_stop_count > 0) &&
1840 handle_group_stop())
1841 goto relock;
1842
1843 signr = dequeue_signal(current, mask, info);
1844
1845 if (!signr)
1846 break; /* will return 0 */
1847
1848 if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) {
1849 ptrace_signal_deliver(regs, cookie);
1850
1851 /* Let the debugger run. */
1852 ptrace_stop(signr, signr, info);
1853
1854 /* We're back. Did the debugger cancel the sig? */
1855 signr = current->exit_code;
1856 if (signr == 0)
1857 continue;
1858
1859 current->exit_code = 0;
1860
1861 /* Update the siginfo structure if the signal has
1862 changed. If the debugger wanted something
1863 specific in the siginfo structure then it should
1864 have updated *info via PTRACE_SETSIGINFO. */
1865 if (signr != info->si_signo) {
1866 info->si_signo = signr;
1867 info->si_errno = 0;
1868 info->si_code = SI_USER;
1869 info->si_pid = current->parent->pid;
1870 info->si_uid = current->parent->uid;
1871 }
1872
1873 /* If the (new) signal is now blocked, requeue it. */
1874 if (sigismember(&current->blocked, signr)) {
1875 specific_send_sig_info(signr, info, current);
1876 continue;
1877 }
1878 }
1879
1880 ka = &current->sighand->action[signr-1];
1881 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1882 continue;
1883 if (ka->sa.sa_handler != SIG_DFL) {
1884 /* Run the handler. */
1885 *return_ka = *ka;
1886
1887 if (ka->sa.sa_flags & SA_ONESHOT)
1888 ka->sa.sa_handler = SIG_DFL;
1889
1890 break; /* will return non-zero "signr" value */
1891 }
1892
1893 /*
1894 * Now we are doing the default action for this signal.
1895 */
1896 if (sig_kernel_ignore(signr)) /* Default is nothing. */
1897 continue;
1898
1899 /* Init gets no signals it doesn't want. */
1900 if (current->pid == 1)
1901 continue;
1902
1903 if (sig_kernel_stop(signr)) {
1904 /*
1905 * The default action is to stop all threads in
1906 * the thread group. The job control signals
1907 * do nothing in an orphaned pgrp, but SIGSTOP
1908 * always works. Note that siglock needs to be
1909 * dropped during the call to is_orphaned_pgrp()
1910 * because of lock ordering with tasklist_lock.
1911 * This allows an intervening SIGCONT to be posted.
1912 * We need to check for that and bail out if necessary.
1913 */
1914 if (signr != SIGSTOP) {
1915 spin_unlock_irq(&current->sighand->siglock);
1916
1917 /* signals can be posted during this window */
1918
1919 if (is_orphaned_pgrp(process_group(current)))
1920 goto relock;
1921
1922 spin_lock_irq(&current->sighand->siglock);
1923 }
1924
1925 if (likely(do_signal_stop(signr))) {
1926 /* It released the siglock. */
1927 goto relock;
1928 }
1929
1930 /*
1931 * We didn't actually stop, due to a race
1932 * with SIGCONT or something like that.
1933 */
1934 continue;
1935 }
1936
1937 spin_unlock_irq(&current->sighand->siglock);
1938
1939 /*
1940 * Anything else is fatal, maybe with a core dump.
1941 */
1942 current->flags |= PF_SIGNALED;
1943 if (sig_kernel_coredump(signr)) {
1944 /*
1945 * If it was able to dump core, this kills all
1946 * other threads in the group and synchronizes with
1947 * their demise. If we lost the race with another
1948 * thread getting here, it set group_exit_code
1949 * first and our do_group_exit call below will use
1950 * that value and ignore the one we pass it.
1951 */
1952 do_coredump((long)signr, signr, regs);
1953 }
1954
1955 /*
1956 * Death signals, no core dump.
1957 */
1958 do_group_exit(signr);
1959 /* NOTREACHED */
1960 }
1961 spin_unlock_irq(&current->sighand->siglock);
1962 return signr;
1963}
1964
1965#endif
1966
1967EXPORT_SYMBOL(recalc_sigpending);
1968EXPORT_SYMBOL_GPL(dequeue_signal);
1969EXPORT_SYMBOL(flush_signals);
1970EXPORT_SYMBOL(force_sig);
1971EXPORT_SYMBOL(kill_pg);
1972EXPORT_SYMBOL(kill_proc);
1973EXPORT_SYMBOL(ptrace_notify);
1974EXPORT_SYMBOL(send_sig);
1975EXPORT_SYMBOL(send_sig_info);
1976EXPORT_SYMBOL(sigprocmask);
1977EXPORT_SYMBOL(block_all_signals);
1978EXPORT_SYMBOL(unblock_all_signals);
1979
1980
1981/*
1982 * System call entry points.
1983 */
1984
1985asmlinkage long sys_restart_syscall(void)
1986{
1987 struct restart_block *restart = &current_thread_info()->restart_block;
1988 return restart->fn(restart);
1989}
1990
1991long do_no_restart_syscall(struct restart_block *param)
1992{
1993 return -EINTR;
1994}
1995
1996/*
1997 * We don't need to get the kernel lock - this is all local to this
1998 * particular thread.. (and that's good, because this is _heavily_
1999 * used by various programs)
2000 */
2001
2002/*
2003 * This is also useful for kernel threads that want to temporarily
2004 * (or permanently) block certain signals.
2005 *
2006 * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
2007 * interface happily blocks "unblockable" signals like SIGKILL
2008 * and friends.
2009 */
2010int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2011{
2012 int error;
2013 sigset_t old_block;
2014
2015 spin_lock_irq(&current->sighand->siglock);
2016 old_block = current->blocked;
2017 error = 0;
2018 switch (how) {
2019 case SIG_BLOCK:
2020 sigorsets(&current->blocked, &current->blocked, set);
2021 break;
2022 case SIG_UNBLOCK:
2023 signandsets(&current->blocked, &current->blocked, set);
2024 break;
2025 case SIG_SETMASK:
2026 current->blocked = *set;
2027 break;
2028 default:
2029 error = -EINVAL;
2030 }
2031 recalc_sigpending();
2032 spin_unlock_irq(&current->sighand->siglock);
2033 if (oldset)
2034 *oldset = old_block;
2035 return error;
2036}
2037
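/*
 * Sketch (hypothetical kernel-thread helper): unlike the syscall below,
 * the in-kernel sigprocmask() will happily block SIGKILL and SIGSTOP.
 */
static void sketch_kthread_block_all(void)
{
	sigset_t all;

	sigfillset(&all);
	sigprocmask(SIG_BLOCK, &all, NULL);
}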
2038asmlinkage long
2039sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize)
2040{
2041 int error = -EINVAL;
2042 sigset_t old_set, new_set;
2043
2044 /* XXX: Don't preclude handling different sized sigset_t's. */
2045 if (sigsetsize != sizeof(sigset_t))
2046 goto out;
2047
2048 if (set) {
2049 error = -EFAULT;
2050 if (copy_from_user(&new_set, set, sizeof(*set)))
2051 goto out;
2052 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2053
2054 error = sigprocmask(how, &new_set, &old_set);
2055 if (error)
2056 goto out;
2057 if (oset)
2058 goto set_old;
2059 } else if (oset) {
2060 spin_lock_irq(&current->sighand->siglock);
2061 old_set = current->blocked;
2062 spin_unlock_irq(&current->sighand->siglock);
2063
2064 set_old:
2065 error = -EFAULT;
2066 if (copy_to_user(oset, &old_set, sizeof(*oset)))
2067 goto out;
2068 }
2069 error = 0;
2070out:
2071 return error;
2072}
2073
2074long do_sigpending(void __user *set, unsigned long sigsetsize)
2075{
2076 long error = -EINVAL;
2077 sigset_t pending;
2078
2079 if (sigsetsize > sizeof(sigset_t))
2080 goto out;
2081
2082 spin_lock_irq(&current->sighand->siglock);
2083 sigorsets(&pending, &current->pending.signal,
2084 &current->signal->shared_pending.signal);
2085 spin_unlock_irq(&current->sighand->siglock);
2086
2087 /* Outside the lock because only this thread touches it. */
2088 sigandsets(&pending, &current->blocked, &pending);
2089
2090 error = -EFAULT;
2091 if (!copy_to_user(set, &pending, sigsetsize))
2092 error = 0;
2093
2094out:
2095 return error;
2096}
2097
2098asmlinkage long
2099sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize)
2100{
2101 return do_sigpending(set, sigsetsize);
2102}
2103
2104#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2105
2106int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2107{
2108 int err;
2109
2110 if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
2111 return -EFAULT;
2112 if (from->si_code < 0)
2113 return __copy_to_user(to, from, sizeof(siginfo_t))
2114 ? -EFAULT : 0;
2115 /*
2116 * If you change siginfo_t structure, please be sure
2117 * this code is fixed accordingly.
2118 * It should never copy any pad contained in the structure
2119 * to avoid security leaks, but must copy the generic
2120 * 3 ints plus the relevant union member.
2121 */
2122 err = __put_user(from->si_signo, &to->si_signo);
2123 err |= __put_user(from->si_errno, &to->si_errno);
2124 err |= __put_user((short)from->si_code, &to->si_code);
2125 switch (from->si_code & __SI_MASK) {
2126 case __SI_KILL:
2127 err |= __put_user(from->si_pid, &to->si_pid);
2128 err |= __put_user(from->si_uid, &to->si_uid);
2129 break;
2130 case __SI_TIMER:
2131 err |= __put_user(from->si_tid, &to->si_tid);
2132 err |= __put_user(from->si_overrun, &to->si_overrun);
2133 err |= __put_user(from->si_ptr, &to->si_ptr);
2134 break;
2135 case __SI_POLL:
2136 err |= __put_user(from->si_band, &to->si_band);
2137 err |= __put_user(from->si_fd, &to->si_fd);
2138 break;
2139 case __SI_FAULT:
2140 err |= __put_user(from->si_addr, &to->si_addr);
2141#ifdef __ARCH_SI_TRAPNO
2142 err |= __put_user(from->si_trapno, &to->si_trapno);
2143#endif
2144 break;
2145 case __SI_CHLD:
2146 err |= __put_user(from->si_pid, &to->si_pid);
2147 err |= __put_user(from->si_uid, &to->si_uid);
2148 err |= __put_user(from->si_status, &to->si_status);
2149 err |= __put_user(from->si_utime, &to->si_utime);
2150 err |= __put_user(from->si_stime, &to->si_stime);
2151 break;
2152 case __SI_RT: /* This is not generated by the kernel as of now. */
2153 case __SI_MESGQ: /* But this is */
2154 err |= __put_user(from->si_pid, &to->si_pid);
2155 err |= __put_user(from->si_uid, &to->si_uid);
2156 err |= __put_user(from->si_ptr, &to->si_ptr);
2157 break;
2158 default: /* this is just in case for now ... */
2159 err |= __put_user(from->si_pid, &to->si_pid);
2160 err |= __put_user(from->si_uid, &to->si_uid);
2161 break;
2162 }
2163 return err;
2164}
2165
2166#endif
2167
2168asmlinkage long
2169sys_rt_sigtimedwait(const sigset_t __user *uthese,
2170 siginfo_t __user *uinfo,
2171 const struct timespec __user *uts,
2172 size_t sigsetsize)
2173{
2174 int ret, sig;
2175 sigset_t these;
2176 struct timespec ts;
2177 siginfo_t info;
2178 long timeout = 0;
2179
2180 /* XXX: Don't preclude handling different sized sigset_t's. */
2181 if (sigsetsize != sizeof(sigset_t))
2182 return -EINVAL;
2183
2184 if (copy_from_user(&these, uthese, sizeof(these)))
2185 return -EFAULT;
2186
2187 /*
2188 * Invert the set of allowed signals to get those we
2189 * want to block.
2190 */
2191 sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
2192 signotset(&these);
2193
2194 if (uts) {
2195 if (copy_from_user(&ts, uts, sizeof(ts)))
2196 return -EFAULT;
2197 if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
2198 || ts.tv_sec < 0)
2199 return -EINVAL;
2200 }
2201
2202 spin_lock_irq(&current->sighand->siglock);
2203 sig = dequeue_signal(current, &these, &info);
2204 if (!sig) {
2205 timeout = MAX_SCHEDULE_TIMEOUT;
2206 if (uts)
2207 timeout = (timespec_to_jiffies(&ts)
2208 + (ts.tv_sec || ts.tv_nsec));
2209
2210 if (timeout) {
2211			/* None ready -- temporarily unblock those we're
2212			 * interested in while we are sleeping, so that
2213			 * we'll be awakened when they arrive. */
2214 current->real_blocked = current->blocked;
2215 sigandsets(&current->blocked, &current->blocked, &these);
2216 recalc_sigpending();
2217 spin_unlock_irq(&current->sighand->siglock);
2218
2219 current->state = TASK_INTERRUPTIBLE;
2220 timeout = schedule_timeout(timeout);
2221
2222 if (current->flags & PF_FREEZE)
2223 refrigerator(PF_FREEZE);
2224 spin_lock_irq(&current->sighand->siglock);
2225 sig = dequeue_signal(current, &these, &info);
2226 current->blocked = current->real_blocked;
2227 siginitset(&current->real_blocked, 0);
2228 recalc_sigpending();
2229 }
2230 }
2231 spin_unlock_irq(&current->sighand->siglock);
2232
2233 if (sig) {
2234 ret = sig;
2235 if (uinfo) {
2236 if (copy_siginfo_to_user(uinfo, &info))
2237 ret = -EFAULT;
2238 }
2239 } else {
2240 ret = -EAGAIN;
2241 if (timeout)
2242 ret = -EINTR;
2243 }
2244
2245 return ret;
2246}
2247
2248asmlinkage long
2249sys_kill(int pid, int sig)
2250{
2251 struct siginfo info;
2252
2253 info.si_signo = sig;
2254 info.si_errno = 0;
2255 info.si_code = SI_USER;
2256 info.si_pid = current->tgid;
2257 info.si_uid = current->uid;
2258
2259 return kill_something_info(sig, &info, pid);
2260}
2261
2262/**
2263 * sys_tgkill - send signal to one specific thread
2264 * @tgid: the thread group ID of the thread
2265 * @pid: the PID of the thread
2266 * @sig: signal to be sent
2267 *
2268 * This syscall also checks the tgid and returns -ESRCH even if the PID
2269 * exists but no longer belongs to the target process. This
2270 * method solves the problem of threads exiting and PIDs getting reused.
2271 */
2272asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2273{
2274 struct siginfo info;
2275 int error;
2276 struct task_struct *p;
2277
2278 /* This is only valid for single tasks */
2279 if (pid <= 0 || tgid <= 0)
2280 return -EINVAL;
2281
2282 info.si_signo = sig;
2283 info.si_errno = 0;
2284 info.si_code = SI_TKILL;
2285 info.si_pid = current->tgid;
2286 info.si_uid = current->uid;
2287
2288 read_lock(&tasklist_lock);
2289 p = find_task_by_pid(pid);
2290 error = -ESRCH;
2291 if (p && (p->tgid == tgid)) {
2292 error = check_kill_permission(sig, &info, p);
2293 /*
2294 * The null signal is a permissions and process existence
2295 * probe. No signal is actually delivered.
2296 */
2297 if (!error && sig && p->sighand) {
2298 spin_lock_irq(&p->sighand->siglock);
2299 handle_stop_signal(sig, p);
2300 error = specific_send_sig_info(sig, &info, p);
2301 spin_unlock_irq(&p->sighand->siglock);
2302 }
2303 }
2304 read_unlock(&tasklist_lock);
2305 return error;
2306}
2307
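/*
 * Userspace-side sketch of how tgkill is typically reached (illustrative
 * only, hence compiled out here): there is no libc wrapper, so callers
 * go through syscall(2), passing getpid() as tgid and the thread id
 * obtained from gettid()/clone() as pid.
 */
#if 0
#include <sys/syscall.h>
#include <unistd.h>
#include <signal.h>

static int tgkill_sketch(pid_t tgid, pid_t tid, int sig)
{
	return syscall(__NR_tgkill, tgid, tid, sig);
}
#endif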
2308/*
2309 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2310 */
2311asmlinkage long
2312sys_tkill(int pid, int sig)
2313{
2314 struct siginfo info;
2315 int error;
2316 struct task_struct *p;
2317
2318 /* This is only valid for single tasks */
2319 if (pid <= 0)
2320 return -EINVAL;
2321
2322 info.si_signo = sig;
2323 info.si_errno = 0;
2324 info.si_code = SI_TKILL;
2325 info.si_pid = current->tgid;
2326 info.si_uid = current->uid;
2327
2328 read_lock(&tasklist_lock);
2329 p = find_task_by_pid(pid);
2330 error = -ESRCH;
2331 if (p) {
2332 error = check_kill_permission(sig, &info, p);
2333 /*
2334 * The null signal is a permissions and process existence
2335 * probe. No signal is actually delivered.
2336 */
2337 if (!error && sig && p->sighand) {
2338 spin_lock_irq(&p->sighand->siglock);
2339 handle_stop_signal(sig, p);
2340 error = specific_send_sig_info(sig, &info, p);
2341 spin_unlock_irq(&p->sighand->siglock);
2342 }
2343 }
2344 read_unlock(&tasklist_lock);
2345 return error;
2346}
2347
2348asmlinkage long
2349sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2350{
2351 siginfo_t info;
2352
2353 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2354 return -EFAULT;
2355
2356 /* Not even root can pretend to send signals from the kernel.
2357 Nor can they impersonate a kill(), which adds source info. */
2358 if (info.si_code >= 0)
2359 return -EPERM;
2360 info.si_signo = sig;
2361
2362 /* POSIX.1b doesn't mention process groups. */
2363 return kill_proc_info(sig, &info, pid);
2364}
2365
2366int
2367do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2368{
2369 struct k_sigaction *k;
2370
2371 if (sig < 1 || sig > _NSIG || (act && sig_kernel_only(sig)))
2372 return -EINVAL;
2373
2374 k = &current->sighand->action[sig-1];
2375
2376 spin_lock_irq(&current->sighand->siglock);
2377 if (signal_pending(current)) {
2378 /*
2379 * If there might be a fatal signal pending on multiple
2380 * threads, make sure we take it before changing the action.
2381 */
2382 spin_unlock_irq(&current->sighand->siglock);
2383 return -ERESTARTNOINTR;
2384 }
2385
2386 if (oact)
2387 *oact = *k;
2388
2389 if (act) {
2390 /*
2391 * POSIX 3.3.1.3:
2392 * "Setting a signal action to SIG_IGN for a signal that is
2393 * pending shall cause the pending signal to be discarded,
2394 * whether or not it is blocked."
2395 *
2396 * "Setting a signal action to SIG_DFL for a signal that is
2397 * pending and whose default action is to ignore the signal
2398 * (for example, SIGCHLD), shall cause the pending signal to
2399 * be discarded, whether or not it is blocked"
2400 */
2401 if (act->sa.sa_handler == SIG_IGN ||
2402 (act->sa.sa_handler == SIG_DFL &&
2403 sig_kernel_ignore(sig))) {
2404 /*
2405 * This is a fairly rare case, so we only take the
2406 * tasklist_lock once we're sure we'll need it.
2407 * Now we must do this little unlock and relock
2408 * dance to maintain the lock hierarchy.
2409 */
2410 struct task_struct *t = current;
2411 spin_unlock_irq(&t->sighand->siglock);
2412 read_lock(&tasklist_lock);
2413 spin_lock_irq(&t->sighand->siglock);
2414 *k = *act;
2415 sigdelsetmask(&k->sa.sa_mask,
2416 sigmask(SIGKILL) | sigmask(SIGSTOP));
2417 rm_from_queue(sigmask(sig), &t->signal->shared_pending);
2418 do {
2419 rm_from_queue(sigmask(sig), &t->pending);
2420 recalc_sigpending_tsk(t);
2421 t = next_thread(t);
2422 } while (t != current);
2423 spin_unlock_irq(&current->sighand->siglock);
2424 read_unlock(&tasklist_lock);
2425 return 0;
2426 }
2427
2428 *k = *act;
2429 sigdelsetmask(&k->sa.sa_mask,
2430 sigmask(SIGKILL) | sigmask(SIGSTOP));
2431 }
2432
2433 spin_unlock_irq(&current->sighand->siglock);
2434 return 0;
2435}
2436
2437int
2438do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
2439{
2440 stack_t oss;
2441 int error;
2442
2443 if (uoss) {
2444 oss.ss_sp = (void __user *) current->sas_ss_sp;
2445 oss.ss_size = current->sas_ss_size;
2446 oss.ss_flags = sas_ss_flags(sp);
2447 }
2448
2449 if (uss) {
2450 void __user *ss_sp;
2451 size_t ss_size;
2452 int ss_flags;
2453
2454 error = -EFAULT;
2455 if (!access_ok(VERIFY_READ, uss, sizeof(*uss))
2456 || __get_user(ss_sp, &uss->ss_sp)
2457 || __get_user(ss_flags, &uss->ss_flags)
2458 || __get_user(ss_size, &uss->ss_size))
2459 goto out;
2460
2461 error = -EPERM;
2462 if (on_sig_stack(sp))
2463 goto out;
2464
2465 error = -EINVAL;
2466 /*
2467 *
2468	 * Note: this code used to test ss_flags incorrectly.
2469	 * Old code may have been written using ss_flags==0
2470	 * to mean ss_flags==SS_ONSTACK (as this was the only
2471	 * way that worked), so this fix preserves that older
2472	 * mechanism.
2473 */
2474 if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
2475 goto out;
2476
2477 if (ss_flags == SS_DISABLE) {
2478 ss_size = 0;
2479 ss_sp = NULL;
2480 } else {
2481 error = -ENOMEM;
2482 if (ss_size < MINSIGSTKSZ)
2483 goto out;
2484 }
2485
2486 current->sas_ss_sp = (unsigned long) ss_sp;
2487 current->sas_ss_size = ss_size;
2488 }
2489
2490 if (uoss) {
2491 error = -EFAULT;
2492 if (copy_to_user(uoss, &oss, sizeof(oss)))
2493 goto out;
2494 }
2495
2496 error = 0;
2497out:
2498 return error;
2499}
2500
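/*
 * Sketch: arch code typically exposes this as sys_sigaltstack(), passing
 * the user stack pointer taken from its own pt_regs. The prototype and
 * the register field below are illustrative, not any particular arch's.
 */
#if 0
asmlinkage long sys_sigaltstack(const stack_t __user *uss,
				stack_t __user *uoss, struct pt_regs *regs)
{
	return do_sigaltstack(uss, uoss, regs->sp /* arch stack pointer */);
}
#endif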
2501#ifdef __ARCH_WANT_SYS_SIGPENDING
2502
2503asmlinkage long
2504sys_sigpending(old_sigset_t __user *set)
2505{
2506 return do_sigpending(set, sizeof(*set));
2507}
2508
2509#endif
2510
2511#ifdef __ARCH_WANT_SYS_SIGPROCMASK
2512/* Some platforms have their own version with special arguments;
2513   others support only sys_rt_sigprocmask. */
2514
2515asmlinkage long
2516sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset)
2517{
2518 int error;
2519 old_sigset_t old_set, new_set;
2520
2521 if (set) {
2522 error = -EFAULT;
2523 if (copy_from_user(&new_set, set, sizeof(*set)))
2524 goto out;
2525 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
2526
2527 spin_lock_irq(&current->sighand->siglock);
2528 old_set = current->blocked.sig[0];
2529
2530 error = 0;
2531 switch (how) {
2532 default:
2533 error = -EINVAL;
2534 break;
2535 case SIG_BLOCK:
2536 sigaddsetmask(&current->blocked, new_set);
2537 break;
2538 case SIG_UNBLOCK:
2539 sigdelsetmask(&current->blocked, new_set);
2540 break;
2541 case SIG_SETMASK:
2542 current->blocked.sig[0] = new_set;
2543 break;
2544 }
2545
2546 recalc_sigpending();
2547 spin_unlock_irq(&current->sighand->siglock);
2548 if (error)
2549 goto out;
2550 if (oset)
2551 goto set_old;
2552 } else if (oset) {
2553 old_set = current->blocked.sig[0];
2554 set_old:
2555 error = -EFAULT;
2556 if (copy_to_user(oset, &old_set, sizeof(*oset)))
2557 goto out;
2558 }
2559 error = 0;
2560out:
2561 return error;
2562}
2563#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2564
2565#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2566asmlinkage long
2567sys_rt_sigaction(int sig,
2568 const struct sigaction __user *act,
2569 struct sigaction __user *oact,
2570 size_t sigsetsize)
2571{
2572 struct k_sigaction new_sa, old_sa;
2573 int ret = -EINVAL;
2574
2575 /* XXX: Don't preclude handling different sized sigset_t's. */
2576 if (sigsetsize != sizeof(sigset_t))
2577 goto out;
2578
2579 if (act) {
2580 if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
2581 return -EFAULT;
2582 }
2583
2584 ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
2585
2586 if (!ret && oact) {
2587 if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
2588 return -EFAULT;
2589 }
2590out:
2591 return ret;
2592}
2593#endif /* __ARCH_WANT_SYS_RT_SIGACTION */
2594
2595#ifdef __ARCH_WANT_SYS_SGETMASK
2596
2597/*
2598 * For backwards compatibility. Functionality superseded by sigprocmask.
2599 */
2600asmlinkage long
2601sys_sgetmask(void)
2602{
2603 /* SMP safe */
2604 return current->blocked.sig[0];
2605}
2606
2607asmlinkage long
2608sys_ssetmask(int newmask)
2609{
2610 int old;
2611
2612 spin_lock_irq(&current->sighand->siglock);
2613 old = current->blocked.sig[0];
2614
2615 siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)|
2616 sigmask(SIGSTOP)));
2617 recalc_sigpending();
2618 spin_unlock_irq(&current->sighand->siglock);
2619
2620 return old;
2621}
2622#endif /* __ARCH_WANT_SYS_SGETMASK */
2623
2624#ifdef __ARCH_WANT_SYS_SIGNAL
2625/*
2626 * For backwards compatibility. Functionality superseded by sigaction.
2627 */
2628asmlinkage unsigned long
2629sys_signal(int sig, __sighandler_t handler)
2630{
2631 struct k_sigaction new_sa, old_sa;
2632 int ret;
2633
2634 new_sa.sa.sa_handler = handler;
2635 new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
2636
2637 ret = do_sigaction(sig, &new_sa, &old_sa);
2638
2639 return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
2640}
2641#endif /* __ARCH_WANT_SYS_SIGNAL */
2642
2643#ifdef __ARCH_WANT_SYS_PAUSE
2644
2645asmlinkage long
2646sys_pause(void)
2647{
2648 current->state = TASK_INTERRUPTIBLE;
2649 schedule();
2650 return -ERESTARTNOHAND;
2651}
2652
2653#endif
2654
2655void __init signals_init(void)
2656{
2657 sigqueue_cachep =
2658 kmem_cache_create("sigqueue",
2659 sizeof(struct sigqueue),
2660 __alignof__(struct sigqueue),
2661 SLAB_PANIC, NULL, NULL);
2662}
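For context on the ss_flags checks in do_sigaltstack() above, a minimal userspace sketch of the sigaltstack()/sigaction() pairing it serves; the on_segv handler and program are hypothetical, and only standard POSIX calls (sigaltstack, sigaction) and constants (SIGSTKSZ, SA_ONSTACK) are used:

#include <signal.h>
#include <stdlib.h>
#include <unistd.h>

static void on_segv(int sig)
{
	/* Hypothetical handler body; it runs on the alternate stack. */
	_exit(1);
}

int main(void)
{
	stack_t ss;
	struct sigaction sa;

	ss.ss_sp = malloc(SIGSTKSZ);
	ss.ss_size = SIGSTKSZ;	/* below MINSIGSTKSZ the kernel returns -ENOMEM */
	ss.ss_flags = 0;	/* 0 is accepted for compatibility, as noted above */
	if (ss.ss_sp == NULL || sigaltstack(&ss, NULL) == -1)
		return 1;

	sigemptyset(&sa.sa_mask);
	sa.sa_handler = on_segv;
	sa.sa_flags = SA_ONSTACK;	/* deliver SIGSEGV on the registered stack */
	return sigaction(SIGSEGV, &sa, NULL) == -1;
}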
diff --git a/kernel/softirq.c b/kernel/softirq.c
new file mode 100644
index 000000000000..b4ab6af1dea8
--- /dev/null
+++ b/kernel/softirq.c
@@ -0,0 +1,496 @@
1/*
2 * linux/kernel/softirq.c
3 *
4 * Copyright (C) 1992 Linus Torvalds
5 *
6 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
7 */
8
9#include <linux/module.h>
10#include <linux/kernel_stat.h>
11#include <linux/interrupt.h>
12#include <linux/init.h>
13#include <linux/mm.h>
14#include <linux/notifier.h>
15#include <linux/percpu.h>
16#include <linux/cpu.h>
17#include <linux/kthread.h>
18#include <linux/rcupdate.h>
19
20#include <asm/irq.h>
21/*
22 - No shared variables, all the data are CPU local.
23 - If a softirq needs serialization, let it serialize itself
24 by its own spinlocks.
25 - Even if a softirq is serialized, only the local CPU is marked for
26   execution. Hence we get a sort of weak CPU binding, though it is
27   still not clear whether this will result in better locality
28   or not.
29
30 Examples:
31 - NET RX softirq. It is multithreaded and does not require
32 any global serialization.
33 - NET TX softirq. It kicks software netdevice queues, hence
34 it is logically serialized per device, but this serialization
35 is invisible to common code.
36 - Tasklets: serialized with respect to themselves.
37 */
38
39#ifndef __ARCH_IRQ_STAT
40irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
41EXPORT_SYMBOL(irq_stat);
42#endif
43
44static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
45
46static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
47
48/*
49 * We cannot loop indefinitely here without risking userspace starvation,
50 * but we also don't want to introduce a worst-case 1/HZ latency
51 * for the pending events, so let the scheduler balance
52 * the softirq load for us.
53 */
54static inline void wakeup_softirqd(void)
55{
56 /* Interrupts are disabled: no need to stop preemption */
57 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
58
59 if (tsk && tsk->state != TASK_RUNNING)
60 wake_up_process(tsk);
61}
62
63/*
64 * We restart softirq processing MAX_SOFTIRQ_RESTART times,
65 * and we fall back to ksoftirqd after that.
66 *
67 * This number has been established via experimentation.
68 * The two things to balance are latency and fairness -
69 * we want to handle softirqs as soon as possible, but they
70 * should not be able to lock up the box.
71 */
72#define MAX_SOFTIRQ_RESTART 10
73
74asmlinkage void __do_softirq(void)
75{
76 struct softirq_action *h;
77 __u32 pending;
78 int max_restart = MAX_SOFTIRQ_RESTART;
79 int cpu;
80
81 pending = local_softirq_pending();
82
83 local_bh_disable();
84 cpu = smp_processor_id();
85restart:
86 /* Reset the pending bitmask before enabling irqs */
87 local_softirq_pending() = 0;
88
89 local_irq_enable();
90
91 h = softirq_vec;
92
93 do {
94 if (pending & 1) {
95 h->action(h);
96 rcu_bh_qsctr_inc(cpu);
97 }
98 h++;
99 pending >>= 1;
100 } while (pending);
101
102 local_irq_disable();
103
104 pending = local_softirq_pending();
105 if (pending && --max_restart)
106 goto restart;
107
108 if (pending)
109 wakeup_softirqd();
110
111 __local_bh_enable();
112}
113
114#ifndef __ARCH_HAS_DO_SOFTIRQ
115
116asmlinkage void do_softirq(void)
117{
118 __u32 pending;
119 unsigned long flags;
120
121 if (in_interrupt())
122 return;
123
124 local_irq_save(flags);
125
126 pending = local_softirq_pending();
127
128 if (pending)
129 __do_softirq();
130
131 local_irq_restore(flags);
132}
133
134EXPORT_SYMBOL(do_softirq);
135
136#endif
137
138void local_bh_enable(void)
139{
140 WARN_ON(irqs_disabled());
141 /*
142 * Keep preemption disabled until we are done with
143 * softirq processing:
144 */
145 sub_preempt_count(SOFTIRQ_OFFSET - 1);
146
147 if (unlikely(!in_interrupt() && local_softirq_pending()))
148 do_softirq();
149
150 dec_preempt_count();
151 preempt_check_resched();
152}
153EXPORT_SYMBOL(local_bh_enable);
154
155#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
156# define invoke_softirq() __do_softirq()
157#else
158# define invoke_softirq() do_softirq()
159#endif
160
161/*
162 * Exit an interrupt context. Process softirqs if needed and possible:
163 */
164void irq_exit(void)
165{
166 account_system_vtime(current);
167 sub_preempt_count(IRQ_EXIT_OFFSET);
168 if (!in_interrupt() && local_softirq_pending())
169 invoke_softirq();
170 preempt_enable_no_resched();
171}
172
173/*
174 * This function must run with irqs disabled!
175 */
176inline fastcall void raise_softirq_irqoff(unsigned int nr)
177{
178 __raise_softirq_irqoff(nr);
179
180 /*
181 * If we're in an interrupt or softirq, we're done
182 * (this also catches softirq-disabled code). We will
183 * actually run the softirq once we return from
184 * the irq or softirq.
185 *
186 * Otherwise we wake up ksoftirqd to make sure we
187 * schedule the softirq soon.
188 */
189 if (!in_interrupt())
190 wakeup_softirqd();
191}
192
193EXPORT_SYMBOL(raise_softirq_irqoff);
194
195void fastcall raise_softirq(unsigned int nr)
196{
197 unsigned long flags;
198
199 local_irq_save(flags);
200 raise_softirq_irqoff(nr);
201 local_irq_restore(flags);
202}
203
204void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
205{
206 softirq_vec[nr].data = data;
207 softirq_vec[nr].action = action;
208}
209
210EXPORT_SYMBOL(open_softirq);
211
212/* Tasklets */
213struct tasklet_head
214{
215 struct tasklet_struct *list;
216};
217
218/* Some compilers disobey the section attribute on statics when they are
219   not initialized -- RR */
220static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
221static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
222
223void fastcall __tasklet_schedule(struct tasklet_struct *t)
224{
225 unsigned long flags;
226
227 local_irq_save(flags);
228 t->next = __get_cpu_var(tasklet_vec).list;
229 __get_cpu_var(tasklet_vec).list = t;
230 raise_softirq_irqoff(TASKLET_SOFTIRQ);
231 local_irq_restore(flags);
232}
233
234EXPORT_SYMBOL(__tasklet_schedule);
235
236void fastcall __tasklet_hi_schedule(struct tasklet_struct *t)
237{
238 unsigned long flags;
239
240 local_irq_save(flags);
241 t->next = __get_cpu_var(tasklet_hi_vec).list;
242 __get_cpu_var(tasklet_hi_vec).list = t;
243 raise_softirq_irqoff(HI_SOFTIRQ);
244 local_irq_restore(flags);
245}
246
247EXPORT_SYMBOL(__tasklet_hi_schedule);
248
249static void tasklet_action(struct softirq_action *a)
250{
251 struct tasklet_struct *list;
252
253 local_irq_disable();
254 list = __get_cpu_var(tasklet_vec).list;
255 __get_cpu_var(tasklet_vec).list = NULL;
256 local_irq_enable();
257
258 while (list) {
259 struct tasklet_struct *t = list;
260
261 list = list->next;
262
263 if (tasklet_trylock(t)) {
264 if (!atomic_read(&t->count)) {
265 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
266 BUG();
267 t->func(t->data);
268 tasklet_unlock(t);
269 continue;
270 }
271 tasklet_unlock(t);
272 }
273
274 local_irq_disable();
275 t->next = __get_cpu_var(tasklet_vec).list;
276 __get_cpu_var(tasklet_vec).list = t;
277 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
278 local_irq_enable();
279 }
280}
281
282static void tasklet_hi_action(struct softirq_action *a)
283{
284 struct tasklet_struct *list;
285
286 local_irq_disable();
287 list = __get_cpu_var(tasklet_hi_vec).list;
288 __get_cpu_var(tasklet_hi_vec).list = NULL;
289 local_irq_enable();
290
291 while (list) {
292 struct tasklet_struct *t = list;
293
294 list = list->next;
295
296 if (tasklet_trylock(t)) {
297 if (!atomic_read(&t->count)) {
298 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
299 BUG();
300 t->func(t->data);
301 tasklet_unlock(t);
302 continue;
303 }
304 tasklet_unlock(t);
305 }
306
307 local_irq_disable();
308 t->next = __get_cpu_var(tasklet_hi_vec).list;
309 __get_cpu_var(tasklet_hi_vec).list = t;
310 __raise_softirq_irqoff(HI_SOFTIRQ);
311 local_irq_enable();
312 }
313}
314
315
316void tasklet_init(struct tasklet_struct *t,
317 void (*func)(unsigned long), unsigned long data)
318{
319 t->next = NULL;
320 t->state = 0;
321 atomic_set(&t->count, 0);
322 t->func = func;
323 t->data = data;
324}
325
326EXPORT_SYMBOL(tasklet_init);
327
328void tasklet_kill(struct tasklet_struct *t)
329{
330 if (in_interrupt())
331 printk("Attempt to kill tasklet from interrupt\n");
332
333 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
334 do
335 yield();
336 while (test_bit(TASKLET_STATE_SCHED, &t->state));
337 }
338 tasklet_unlock_wait(t);
339 clear_bit(TASKLET_STATE_SCHED, &t->state);
340}
341
342EXPORT_SYMBOL(tasklet_kill);
343
344void __init softirq_init(void)
345{
346 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
347 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
348}
349
350static int ksoftirqd(void * __bind_cpu)
351{
352 set_user_nice(current, 19);
353 current->flags |= PF_NOFREEZE;
354
355 set_current_state(TASK_INTERRUPTIBLE);
356
357 while (!kthread_should_stop()) {
358 preempt_disable();
359 if (!local_softirq_pending()) {
360 preempt_enable_no_resched();
361 schedule();
362 preempt_disable();
363 }
364
365 __set_current_state(TASK_RUNNING);
366
367 while (local_softirq_pending()) {
368 /* Preempt disable stops the CPU from going offline.
369    If it is already offline, we'll be on the wrong CPU:
370    don't process. */
371 if (cpu_is_offline((long)__bind_cpu))
372 goto wait_to_die;
373 do_softirq();
374 preempt_enable_no_resched();
375 cond_resched();
376 preempt_disable();
377 }
378 preempt_enable();
379 set_current_state(TASK_INTERRUPTIBLE);
380 }
381 __set_current_state(TASK_RUNNING);
382 return 0;
383
384wait_to_die:
385 preempt_enable();
386 /* Wait for kthread_stop */
387 set_current_state(TASK_INTERRUPTIBLE);
388 while (!kthread_should_stop()) {
389 schedule();
390 set_current_state(TASK_INTERRUPTIBLE);
391 }
392 __set_current_state(TASK_RUNNING);
393 return 0;
394}
395
396#ifdef CONFIG_HOTPLUG_CPU
397/*
398 * tasklet_kill_immediate is called to remove a tasklet which may already be
399 * scheduled for execution on @cpu.
400 *
401 * Unlike tasklet_kill, this function removes the tasklet
402 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
403 *
404 * When this function is called, @cpu must be in the CPU_DEAD state.
405 */
406void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
407{
408 struct tasklet_struct **i;
409
410 BUG_ON(cpu_online(cpu));
411 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
412
413 if (!test_bit(TASKLET_STATE_SCHED, &t->state))
414 return;
415
416 /* CPU is dead, so no lock needed. */
417 for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) {
418 if (*i == t) {
419 *i = t->next;
420 return;
421 }
422 }
423 BUG();
424}
425
426static void takeover_tasklets(unsigned int cpu)
427{
428 struct tasklet_struct **i;
429
430 /* CPU is dead, so no lock needed. */
431 local_irq_disable();
432
433 /* Find end, append list for that CPU. */
434 for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next);
435 *i = per_cpu(tasklet_vec, cpu).list;
436 per_cpu(tasklet_vec, cpu).list = NULL;
437 raise_softirq_irqoff(TASKLET_SOFTIRQ);
438
439 for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next);
440 *i = per_cpu(tasklet_hi_vec, cpu).list;
441 per_cpu(tasklet_hi_vec, cpu).list = NULL;
442 raise_softirq_irqoff(HI_SOFTIRQ);
443
444 local_irq_enable();
445}
446#endif /* CONFIG_HOTPLUG_CPU */
447
448static int __devinit cpu_callback(struct notifier_block *nfb,
449 unsigned long action,
450 void *hcpu)
451{
452 int hotcpu = (unsigned long)hcpu;
453 struct task_struct *p;
454
455 switch (action) {
456 case CPU_UP_PREPARE:
457 BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
458 BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
459 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
460 if (IS_ERR(p)) {
461 printk("ksoftirqd for %i failed\n", hotcpu);
462 return NOTIFY_BAD;
463 }
464 kthread_bind(p, hotcpu);
465 per_cpu(ksoftirqd, hotcpu) = p;
466 break;
467 case CPU_ONLINE:
468 wake_up_process(per_cpu(ksoftirqd, hotcpu));
469 break;
470#ifdef CONFIG_HOTPLUG_CPU
471 case CPU_UP_CANCELED:
472 /* Unbind so it can run. Fall thru. */
473 kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id());
474 case CPU_DEAD:
475 p = per_cpu(ksoftirqd, hotcpu);
476 per_cpu(ksoftirqd, hotcpu) = NULL;
477 kthread_stop(p);
478 takeover_tasklets(hotcpu);
479 break;
480#endif /* CONFIG_HOTPLUG_CPU */
481 }
482 return NOTIFY_OK;
483}
484
485static struct notifier_block __devinitdata cpu_nfb = {
486 .notifier_call = cpu_callback
487};
488
489__init int spawn_ksoftirqd(void)
490{
491 void *cpu = (void *)(long)smp_processor_id();
492 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
493 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
494 register_cpu_notifier(&cpu_nfb);
495 return 0;
496}
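As a usage sketch of the tasklet interface built on the two softirqs opened in softirq_init() above: a driver defers work from its interrupt handler and tears it down with tasklet_kill(). The mydev_* names are hypothetical, and the sketch assumes the DECLARE_TASKLET helper and the 2.6-era interrupt handler signature from <linux/interrupt.h>:

#include <linux/interrupt.h>

static void mydev_do_work(unsigned long data)
{
	/* Runs later in softirq context, serialized against itself. */
}

static DECLARE_TASKLET(mydev_tasklet, mydev_do_work, 0);

static irqreturn_t mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/* Acknowledge the hardware here, then defer the slow part. */
	tasklet_schedule(&mydev_tasklet);
	return IRQ_HANDLED;
}

static void mydev_shutdown(void)
{
	/* Not from interrupt context: tasklet_kill() may yield while waiting. */
	tasklet_kill(&mydev_tasklet);
}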
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
new file mode 100644
index 000000000000..e15ed17863f1
--- /dev/null
+++ b/kernel/spinlock.c
@@ -0,0 +1,371 @@
1/*
2 * Copyright (2004) Linus Torvalds
3 *
4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
5 *
6 * Copyright (2004) Ingo Molnar
7 */
8
9#include <linux/config.h>
10#include <linux/linkage.h>
11#include <linux/preempt.h>
12#include <linux/spinlock.h>
13#include <linux/interrupt.h>
14#include <linux/module.h>
15
16/*
17 * Generic declaration of the raw read_trylock() function,
18 * architectures are supposed to optimize this:
19 */
20int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
21{
22 _raw_read_lock(lock);
23 return 1;
24}
25EXPORT_SYMBOL(generic_raw_read_trylock);
26
27int __lockfunc _spin_trylock(spinlock_t *lock)
28{
29 preempt_disable();
30 if (_raw_spin_trylock(lock))
31 return 1;
32
33 preempt_enable();
34 return 0;
35}
36EXPORT_SYMBOL(_spin_trylock);
37
38int __lockfunc _read_trylock(rwlock_t *lock)
39{
40 preempt_disable();
41 if (_raw_read_trylock(lock))
42 return 1;
43
44 preempt_enable();
45 return 0;
46}
47EXPORT_SYMBOL(_read_trylock);
48
49int __lockfunc _write_trylock(rwlock_t *lock)
50{
51 preempt_disable();
52 if (_raw_write_trylock(lock))
53 return 1;
54
55 preempt_enable();
56 return 0;
57}
58EXPORT_SYMBOL(_write_trylock);
59
60#ifndef CONFIG_PREEMPT
61
62void __lockfunc _read_lock(rwlock_t *lock)
63{
64 preempt_disable();
65 _raw_read_lock(lock);
66}
67EXPORT_SYMBOL(_read_lock);
68
69unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
70{
71 unsigned long flags;
72
73 local_irq_save(flags);
74 preempt_disable();
75 _raw_spin_lock_flags(lock, flags);
76 return flags;
77}
78EXPORT_SYMBOL(_spin_lock_irqsave);
79
80void __lockfunc _spin_lock_irq(spinlock_t *lock)
81{
82 local_irq_disable();
83 preempt_disable();
84 _raw_spin_lock(lock);
85}
86EXPORT_SYMBOL(_spin_lock_irq);
87
88void __lockfunc _spin_lock_bh(spinlock_t *lock)
89{
90 local_bh_disable();
91 preempt_disable();
92 _raw_spin_lock(lock);
93}
94EXPORT_SYMBOL(_spin_lock_bh);
95
96unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
97{
98 unsigned long flags;
99
100 local_irq_save(flags);
101 preempt_disable();
102 _raw_read_lock(lock);
103 return flags;
104}
105EXPORT_SYMBOL(_read_lock_irqsave);
106
107void __lockfunc _read_lock_irq(rwlock_t *lock)
108{
109 local_irq_disable();
110 preempt_disable();
111 _raw_read_lock(lock);
112}
113EXPORT_SYMBOL(_read_lock_irq);
114
115void __lockfunc _read_lock_bh(rwlock_t *lock)
116{
117 local_bh_disable();
118 preempt_disable();
119 _raw_read_lock(lock);
120}
121EXPORT_SYMBOL(_read_lock_bh);
122
123unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
124{
125 unsigned long flags;
126
127 local_irq_save(flags);
128 preempt_disable();
129 _raw_write_lock(lock);
130 return flags;
131}
132EXPORT_SYMBOL(_write_lock_irqsave);
133
134void __lockfunc _write_lock_irq(rwlock_t *lock)
135{
136 local_irq_disable();
137 preempt_disable();
138 _raw_write_lock(lock);
139}
140EXPORT_SYMBOL(_write_lock_irq);
141
142void __lockfunc _write_lock_bh(rwlock_t *lock)
143{
144 local_bh_disable();
145 preempt_disable();
146 _raw_write_lock(lock);
147}
148EXPORT_SYMBOL(_write_lock_bh);
149
150void __lockfunc _spin_lock(spinlock_t *lock)
151{
152 preempt_disable();
153 _raw_spin_lock(lock);
154}
155
156EXPORT_SYMBOL(_spin_lock);
157
158void __lockfunc _write_lock(rwlock_t *lock)
159{
160 preempt_disable();
161 _raw_write_lock(lock);
162}
163
164EXPORT_SYMBOL(_write_lock);
165
166#else /* CONFIG_PREEMPT: */
167
168/*
169 * This could be a long-held lock. We both prepare to spin for a long
170 * time (making _this_ CPU preemptible if possible), and we also signal
171 * to the other CPU that it should break the lock ASAP.
172 *
173 * (We do this in a function because inlining it would be excessive.)
174 */
175
176#define BUILD_LOCK_OPS(op, locktype) \
177void __lockfunc _##op##_lock(locktype##_t *lock) \
178{ \
179 preempt_disable(); \
180 for (;;) { \
181 if (likely(_raw_##op##_trylock(lock))) \
182 break; \
183 preempt_enable(); \
184 if (!(lock)->break_lock) \
185 (lock)->break_lock = 1; \
186 while (!op##_can_lock(lock) && (lock)->break_lock) \
187 cpu_relax(); \
188 preempt_disable(); \
189 } \
190 (lock)->break_lock = 0; \
191} \
192 \
193EXPORT_SYMBOL(_##op##_lock); \
194 \
195unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
196{ \
197 unsigned long flags; \
198 \
199 preempt_disable(); \
200 for (;;) { \
201 local_irq_save(flags); \
202 if (likely(_raw_##op##_trylock(lock))) \
203 break; \
204 local_irq_restore(flags); \
205 \
206 preempt_enable(); \
207 if (!(lock)->break_lock) \
208 (lock)->break_lock = 1; \
209 while (!op##_can_lock(lock) && (lock)->break_lock) \
210 cpu_relax(); \
211 preempt_disable(); \
212 } \
213 (lock)->break_lock = 0; \
214 return flags; \
215} \
216 \
217EXPORT_SYMBOL(_##op##_lock_irqsave); \
218 \
219void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
220{ \
221 _##op##_lock_irqsave(lock); \
222} \
223 \
224EXPORT_SYMBOL(_##op##_lock_irq); \
225 \
226void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
227{ \
228 unsigned long flags; \
229 \
230	/*                                                     */ \
231	/* Careful: we must exclude softirqs too, hence the    */ \
232	/* irq-disabling. We use the generic preemption-aware  */ \
233	/* function:                                           */ \
234	/*                                                     */ \
235 flags = _##op##_lock_irqsave(lock); \
236 local_bh_disable(); \
237 local_irq_restore(flags); \
238} \
239 \
240EXPORT_SYMBOL(_##op##_lock_bh)
241
242/*
243 * Build preemption-friendly versions of the following
244 * lock-spinning functions:
245 *
246 * _[spin|read|write]_lock()
247 * _[spin|read|write]_lock_irq()
248 * _[spin|read|write]_lock_irqsave()
249 * _[spin|read|write]_lock_bh()
250 */
251BUILD_LOCK_OPS(spin, spinlock);
252BUILD_LOCK_OPS(read, rwlock);
253BUILD_LOCK_OPS(write, rwlock);
254
255#endif /* CONFIG_PREEMPT */
256
257void __lockfunc _spin_unlock(spinlock_t *lock)
258{
259 _raw_spin_unlock(lock);
260 preempt_enable();
261}
262EXPORT_SYMBOL(_spin_unlock);
263
264void __lockfunc _write_unlock(rwlock_t *lock)
265{
266 _raw_write_unlock(lock);
267 preempt_enable();
268}
269EXPORT_SYMBOL(_write_unlock);
270
271void __lockfunc _read_unlock(rwlock_t *lock)
272{
273 _raw_read_unlock(lock);
274 preempt_enable();
275}
276EXPORT_SYMBOL(_read_unlock);
277
278void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
279{
280 _raw_spin_unlock(lock);
281 local_irq_restore(flags);
282 preempt_enable();
283}
284EXPORT_SYMBOL(_spin_unlock_irqrestore);
285
286void __lockfunc _spin_unlock_irq(spinlock_t *lock)
287{
288 _raw_spin_unlock(lock);
289 local_irq_enable();
290 preempt_enable();
291}
292EXPORT_SYMBOL(_spin_unlock_irq);
293
294void __lockfunc _spin_unlock_bh(spinlock_t *lock)
295{
296 _raw_spin_unlock(lock);
297 preempt_enable();
298 local_bh_enable();
299}
300EXPORT_SYMBOL(_spin_unlock_bh);
301
302void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
303{
304 _raw_read_unlock(lock);
305 local_irq_restore(flags);
306 preempt_enable();
307}
308EXPORT_SYMBOL(_read_unlock_irqrestore);
309
310void __lockfunc _read_unlock_irq(rwlock_t *lock)
311{
312 _raw_read_unlock(lock);
313 local_irq_enable();
314 preempt_enable();
315}
316EXPORT_SYMBOL(_read_unlock_irq);
317
318void __lockfunc _read_unlock_bh(rwlock_t *lock)
319{
320 _raw_read_unlock(lock);
321 preempt_enable();
322 local_bh_enable();
323}
324EXPORT_SYMBOL(_read_unlock_bh);
325
326void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
327{
328 _raw_write_unlock(lock);
329 local_irq_restore(flags);
330 preempt_enable();
331}
332EXPORT_SYMBOL(_write_unlock_irqrestore);
333
334void __lockfunc _write_unlock_irq(rwlock_t *lock)
335{
336 _raw_write_unlock(lock);
337 local_irq_enable();
338 preempt_enable();
339}
340EXPORT_SYMBOL(_write_unlock_irq);
341
342void __lockfunc _write_unlock_bh(rwlock_t *lock)
343{
344 _raw_write_unlock(lock);
345 preempt_enable();
346 local_bh_enable();
347}
348EXPORT_SYMBOL(_write_unlock_bh);
349
350int __lockfunc _spin_trylock_bh(spinlock_t *lock)
351{
352 local_bh_disable();
353 preempt_disable();
354 if (_raw_spin_trylock(lock))
355 return 1;
356
357 preempt_enable();
358 local_bh_enable();
359 return 0;
360}
361EXPORT_SYMBOL(_spin_trylock_bh);
362
363int in_lock_functions(unsigned long addr)
364{
365 /* Linker adds these: start and end of __lockfunc functions */
366 extern char __lock_text_start[], __lock_text_end[];
367
368 return addr >= (unsigned long)__lock_text_start
369 && addr < (unsigned long)__lock_text_end;
370}
371EXPORT_SYMBOL(in_lock_functions);
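A short sketch of the calling convention these wrappers provide; the IRQ-saving variant is the safe default when a lock can be taken from both process and interrupt context. The mydev_* names are hypothetical, and the sketch assumes the 2.6-era SPIN_LOCK_UNLOCKED static initializer:

#include <linux/spinlock.h>

static spinlock_t mydev_lock = SPIN_LOCK_UNLOCKED;	/* protects mydev_events */
static unsigned long mydev_events;

static void mydev_note_event(void)
{
	unsigned long flags;

	/* Disables local interrupts, then preemption, then spins (see above). */
	spin_lock_irqsave(&mydev_lock, flags);
	mydev_events++;
	spin_unlock_irqrestore(&mydev_lock, flags);
}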
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
new file mode 100644
index 000000000000..c39ed70af174
--- /dev/null
+++ b/kernel/stop_machine.c
@@ -0,0 +1,212 @@
1#include <linux/stop_machine.h>
2#include <linux/kthread.h>
3#include <linux/sched.h>
4#include <linux/cpu.h>
5#include <linux/err.h>
6#include <linux/syscalls.h>
7#include <asm/atomic.h>
8#include <asm/semaphore.h>
9#include <asm/uaccess.h>
10
11/* Since we affect priority and affinity (both of which are visible
12 * to, and settable by, outside processes) we do the work indirectly
13 * via a kthread. */
14
15/* Thread to stop each CPU in user context. */
16enum stopmachine_state {
17 STOPMACHINE_WAIT,
18 STOPMACHINE_PREPARE,
19 STOPMACHINE_DISABLE_IRQ,
20 STOPMACHINE_EXIT,
21};
22
23static enum stopmachine_state stopmachine_state;
24static unsigned int stopmachine_num_threads;
25static atomic_t stopmachine_thread_ack;
26static DECLARE_MUTEX(stopmachine_mutex);
27
28static int stopmachine(void *cpu)
29{
30 int irqs_disabled = 0;
31 int prepared = 0;
32
33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
34
35 /* Ack: we are alive */
36 mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
37 atomic_inc(&stopmachine_thread_ack);
38
39 /* Simple state machine */
40 while (stopmachine_state != STOPMACHINE_EXIT) {
41 if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
42 && !irqs_disabled) {
43 local_irq_disable();
44 irqs_disabled = 1;
45 /* Ack: irqs disabled. */
46 mb(); /* Must read state first. */
47 atomic_inc(&stopmachine_thread_ack);
48 } else if (stopmachine_state == STOPMACHINE_PREPARE
49 && !prepared) {
50 /* Everyone is in place, hold CPU. */
51 preempt_disable();
52 prepared = 1;
53 mb(); /* Must read state first. */
54 atomic_inc(&stopmachine_thread_ack);
55 }
56 /* Yield in first stage: migration threads need to
57 * help our sisters onto their CPUs. */
58 if (!prepared && !irqs_disabled)
59 yield();
60 else
61 cpu_relax();
62 }
63
64 /* Ack: we are exiting. */
65 mb(); /* Must read state first. */
66 atomic_inc(&stopmachine_thread_ack);
67
68 if (irqs_disabled)
69 local_irq_enable();
70 if (prepared)
71 preempt_enable();
72
73 return 0;
74}
75
76/* Change the thread state */
77static void stopmachine_set_state(enum stopmachine_state state)
78{
79 atomic_set(&stopmachine_thread_ack, 0);
80 wmb();
81 stopmachine_state = state;
82 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
83 cpu_relax();
84}
85
86static int stop_machine(void)
87{
88 int i, ret = 0;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 mm_segment_t old_fs = get_fs();
91
92	/* One high-prio thread per CPU. The current thread handles this CPU. */
93 set_fs(KERNEL_DS);
94 sys_sched_setscheduler(current->pid, SCHED_FIFO,
95 (struct sched_param __user *)&param);
96 set_fs(old_fs);
97
98 atomic_set(&stopmachine_thread_ack, 0);
99 stopmachine_num_threads = 0;
100 stopmachine_state = STOPMACHINE_WAIT;
101
102 for_each_online_cpu(i) {
103 if (i == _smp_processor_id())
104 continue;
105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
106 if (ret < 0)
107 break;
108 stopmachine_num_threads++;
109 }
110
111 /* Wait for them all to come to life. */
112 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
113 yield();
114
115 /* If some failed, kill them all. */
116 if (ret < 0) {
117 stopmachine_set_state(STOPMACHINE_EXIT);
118 up(&stopmachine_mutex);
119 return ret;
120 }
121
122 /* Don't schedule us away at this point, please. */
123 local_irq_disable();
124
125 /* Now they are all started, make them hold the CPUs, ready. */
126 stopmachine_set_state(STOPMACHINE_PREPARE);
127
128 /* Make them disable irqs. */
129 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
130
131 return 0;
132}
133
134static void restart_machine(void)
135{
136 stopmachine_set_state(STOPMACHINE_EXIT);
137 local_irq_enable();
138}
139
140struct stop_machine_data
141{
142 int (*fn)(void *);
143 void *data;
144 struct completion done;
145};
146
147static int do_stop(void *_smdata)
148{
149 struct stop_machine_data *smdata = _smdata;
150 int ret;
151
152 ret = stop_machine();
153 if (ret == 0) {
154 ret = smdata->fn(smdata->data);
155 restart_machine();
156 }
157
158 /* We're done: you can kthread_stop us now */
159 complete(&smdata->done);
160
161 /* Wait for kthread_stop */
162 set_current_state(TASK_INTERRUPTIBLE);
163 while (!kthread_should_stop()) {
164 schedule();
165 set_current_state(TASK_INTERRUPTIBLE);
166 }
167 __set_current_state(TASK_RUNNING);
168 return ret;
169}
170
171struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
172 unsigned int cpu)
173{
174 struct stop_machine_data smdata;
175 struct task_struct *p;
176
177 smdata.fn = fn;
178 smdata.data = data;
179 init_completion(&smdata.done);
180
181 down(&stopmachine_mutex);
182
183	/* If the caller doesn't care which CPU fn runs on (cpu == NR_CPUS), bind to the CPU we are on. */
184 if (cpu == NR_CPUS)
185 cpu = _smp_processor_id();
186
187 p = kthread_create(do_stop, &smdata, "kstopmachine");
188 if (!IS_ERR(p)) {
189 kthread_bind(p, cpu);
190 wake_up_process(p);
191 wait_for_completion(&smdata.done);
192 }
193 up(&stopmachine_mutex);
194 return p;
195}
196
197int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
198{
199 struct task_struct *p;
200 int ret;
201
202 /* No CPUs can come up or down during this. */
203 lock_cpu_hotplug();
204 p = __stop_machine_run(fn, data, cpu);
205 if (!IS_ERR(p))
206 ret = kthread_stop(p);
207 else
208 ret = PTR_ERR(p);
209 unlock_cpu_hotplug();
210
211 return ret;
212}
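A sketch of the intended entry point, stop_machine_run(): the callback runs on one CPU while every other online CPU spins in stopmachine() with interrupts disabled, so it must not sleep or take locks another CPU could be holding. The mydev_* names are hypothetical:

#include <linux/stop_machine.h>
#include <linux/threads.h>

static int mydev_flip_state(void *data)
{
	/* All other CPUs are parked in the stopmachine() loop above. */
	return 0;
}

static int mydev_update(void)
{
	/* NR_CPUS means "run the callback on whichever CPU is convenient". */
	return stop_machine_run(mydev_flip_state, NULL, NR_CPUS);
}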
diff --git a/kernel/sys.c b/kernel/sys.c
new file mode 100644
index 000000000000..462d78d55895
--- /dev/null
+++ b/kernel/sys.c
@@ -0,0 +1,1725 @@
1/*
2 * linux/kernel/sys.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7#include <linux/config.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/utsname.h>
11#include <linux/mman.h>
12#include <linux/smp_lock.h>
13#include <linux/notifier.h>
14#include <linux/reboot.h>
15#include <linux/prctl.h>
16#include <linux/init.h>
17#include <linux/highuid.h>
18#include <linux/fs.h>
19#include <linux/workqueue.h>
20#include <linux/device.h>
21#include <linux/key.h>
22#include <linux/times.h>
23#include <linux/posix-timers.h>
24#include <linux/security.h>
25#include <linux/dcookies.h>
26#include <linux/suspend.h>
27#include <linux/tty.h>
28
29#include <linux/compat.h>
30#include <linux/syscalls.h>
31
32#include <asm/uaccess.h>
33#include <asm/io.h>
34#include <asm/unistd.h>
35
36#ifndef SET_UNALIGN_CTL
37# define SET_UNALIGN_CTL(a,b) (-EINVAL)
38#endif
39#ifndef GET_UNALIGN_CTL
40# define GET_UNALIGN_CTL(a,b) (-EINVAL)
41#endif
42#ifndef SET_FPEMU_CTL
43# define SET_FPEMU_CTL(a,b) (-EINVAL)
44#endif
45#ifndef GET_FPEMU_CTL
46# define GET_FPEMU_CTL(a,b) (-EINVAL)
47#endif
48#ifndef SET_FPEXC_CTL
49# define SET_FPEXC_CTL(a,b) (-EINVAL)
50#endif
51#ifndef GET_FPEXC_CTL
52# define GET_FPEXC_CTL(a,b) (-EINVAL)
53#endif
54
55/*
56 * this is where the system-wide overflow UID and GID are defined, for
57 * architectures that now have 32-bit UID/GID but didn't in the past
58 */
59
60int overflowuid = DEFAULT_OVERFLOWUID;
61int overflowgid = DEFAULT_OVERFLOWGID;
62
63#ifdef CONFIG_UID16
64EXPORT_SYMBOL(overflowuid);
65EXPORT_SYMBOL(overflowgid);
66#endif
67
68/*
69 * the same as above, but for filesystems which can only store a 16-bit
70 * UID and GID. As such, this is needed on all architectures
71 */
72
73int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
74int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
75
76EXPORT_SYMBOL(fs_overflowuid);
77EXPORT_SYMBOL(fs_overflowgid);
78
79/*
80 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
81 */
82
83int C_A_D = 1;
84int cad_pid = 1;
85
86/*
87 * Notifier list for kernel code which wants to be called
88 * at shutdown. This is used to stop any idling DMA operations
89 * and the like.
90 */
91
92static struct notifier_block *reboot_notifier_list;
93static DEFINE_RWLOCK(notifier_lock);
94
95/**
96 * notifier_chain_register - Add notifier to a notifier chain
97 * @list: Pointer to root list pointer
98 * @n: New entry in notifier chain
99 *
100 * Adds a notifier to a notifier chain.
101 *
102 * Currently always returns zero.
103 */
104
105int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
106{
107 write_lock(&notifier_lock);
108 while(*list)
109 {
110 if(n->priority > (*list)->priority)
111 break;
112 list= &((*list)->next);
113 }
114 n->next = *list;
115 *list=n;
116 write_unlock(&notifier_lock);
117 return 0;
118}
119
120EXPORT_SYMBOL(notifier_chain_register);
121
122/**
123 * notifier_chain_unregister - Remove notifier from a notifier chain
124 * @nl: Pointer to root list pointer
125 * @n: New entry in notifier chain
126 *
127 * Removes a notifier from a notifier chain.
128 *
129 * Returns zero on success, or %-ENOENT on failure.
130 */
131
132int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n)
133{
134 write_lock(&notifier_lock);
135 while((*nl)!=NULL)
136 {
137 if((*nl)==n)
138 {
139 *nl=n->next;
140 write_unlock(&notifier_lock);
141 return 0;
142 }
143 nl=&((*nl)->next);
144 }
145 write_unlock(&notifier_lock);
146 return -ENOENT;
147}
148
149EXPORT_SYMBOL(notifier_chain_unregister);
150
151/**
152 * notifier_call_chain - Call functions in a notifier chain
153 * @n: Pointer to root pointer of notifier chain
154 * @val: Value passed unmodified to notifier function
155 * @v: Pointer passed unmodified to notifier function
156 *
157 * Calls each function in a notifier chain in turn.
158 *
159 * If the return value of the notifier can be and'd
160 * with %NOTIFY_STOP_MASK, then notifier_call_chain
161 * will return immediately, with the return value of
162 * the notifier function which halted execution.
163 * Otherwise, the return value is the return value
164 * of the last notifier function called.
165 */
166
167int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
168{
169 int ret=NOTIFY_DONE;
170 struct notifier_block *nb = *n;
171
172 while(nb)
173 {
174 ret=nb->notifier_call(nb,val,v);
175 if(ret&NOTIFY_STOP_MASK)
176 {
177 return ret;
178 }
179 nb=nb->next;
180 }
181 return ret;
182}
183
184EXPORT_SYMBOL(notifier_call_chain);
185
186/**
187 * register_reboot_notifier - Register function to be called at reboot time
188 * @nb: Info about notifier function to be called
189 *
190 * Registers a function with the list of functions
191 * to be called at reboot time.
192 *
193 * Currently always returns zero, as notifier_chain_register
194 * always returns zero.
195 */
196
197int register_reboot_notifier(struct notifier_block * nb)
198{
199 return notifier_chain_register(&reboot_notifier_list, nb);
200}
201
202EXPORT_SYMBOL(register_reboot_notifier);
203
204/**
205 * unregister_reboot_notifier - Unregister previously registered reboot notifier
206 * @nb: Hook to be unregistered
207 *
208 * Unregisters a previously registered reboot
209 * notifier function.
210 *
211 * Returns zero on success, or %-ENOENT on failure.
212 */
213
214int unregister_reboot_notifier(struct notifier_block * nb)
215{
216 return notifier_chain_unregister(&reboot_notifier_list, nb);
217}
218
219EXPORT_SYMBOL(unregister_reboot_notifier);
220
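A sketch of how a driver hooks the reboot chain managed by the helpers above; the mydev_* names are hypothetical, and the event values are the SYS_* constants that sys_reboot() further down passes to notifier_call_chain():

#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/init.h>

static int mydev_reboot_event(struct notifier_block *nb,
			      unsigned long event, void *unused)
{
	/* event is SYS_RESTART, SYS_HALT or SYS_POWER_OFF. */
	return NOTIFY_DONE;
}

static struct notifier_block mydev_reboot_nb = {
	.notifier_call	= mydev_reboot_event,
	.priority	= 0,	/* the chain is kept sorted by descending priority */
};

static int __init mydev_init(void)
{
	return register_reboot_notifier(&mydev_reboot_nb);
}

static void __exit mydev_exit(void)
{
	unregister_reboot_notifier(&mydev_reboot_nb);
}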
221static int set_one_prio(struct task_struct *p, int niceval, int error)
222{
223 int no_nice;
224
225 if (p->uid != current->euid &&
226 p->euid != current->euid && !capable(CAP_SYS_NICE)) {
227 error = -EPERM;
228 goto out;
229 }
230 if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) {
231 error = -EACCES;
232 goto out;
233 }
234 no_nice = security_task_setnice(p, niceval);
235 if (no_nice) {
236 error = no_nice;
237 goto out;
238 }
239 if (error == -ESRCH)
240 error = 0;
241 set_user_nice(p, niceval);
242out:
243 return error;
244}
245
246asmlinkage long sys_setpriority(int which, int who, int niceval)
247{
248 struct task_struct *g, *p;
249 struct user_struct *user;
250 int error = -EINVAL;
251
252 if (which > 2 || which < 0)
253 goto out;
254
255 /* normalize: avoid signed division (rounding problems) */
256 error = -ESRCH;
257 if (niceval < -20)
258 niceval = -20;
259 if (niceval > 19)
260 niceval = 19;
261
262 read_lock(&tasklist_lock);
263 switch (which) {
264 case PRIO_PROCESS:
265 if (!who)
266 who = current->pid;
267 p = find_task_by_pid(who);
268 if (p)
269 error = set_one_prio(p, niceval, error);
270 break;
271 case PRIO_PGRP:
272 if (!who)
273 who = process_group(current);
274 do_each_task_pid(who, PIDTYPE_PGID, p) {
275 error = set_one_prio(p, niceval, error);
276 } while_each_task_pid(who, PIDTYPE_PGID, p);
277 break;
278 case PRIO_USER:
279 user = current->user;
280 if (!who)
281 who = current->uid;
282 else
283 if ((who != current->uid) && !(user = find_user(who)))
284 goto out_unlock; /* No processes for this user */
285
286 do_each_thread(g, p)
287 if (p->uid == who)
288 error = set_one_prio(p, niceval, error);
289 while_each_thread(g, p);
290 if (who != current->uid)
291 free_uid(user); /* For find_user() */
292 break;
293 }
294out_unlock:
295 read_unlock(&tasklist_lock);
296out:
297 return error;
298}
299
300/*
301 * Ugh. To avoid negative return values, "getpriority()" will
302 * not return the normal nice-value, but a negated value that
303 * has been offset by 20 (i.e. it returns 40..1 instead of -20..19)
304 * to stay compatible.
305 */
306asmlinkage long sys_getpriority(int which, int who)
307{
308 struct task_struct *g, *p;
309 struct user_struct *user;
310 long niceval, retval = -ESRCH;
311
312 if (which > 2 || which < 0)
313 return -EINVAL;
314
315 read_lock(&tasklist_lock);
316 switch (which) {
317 case PRIO_PROCESS:
318 if (!who)
319 who = current->pid;
320 p = find_task_by_pid(who);
321 if (p) {
322 niceval = 20 - task_nice(p);
323 if (niceval > retval)
324 retval = niceval;
325 }
326 break;
327 case PRIO_PGRP:
328 if (!who)
329 who = process_group(current);
330 do_each_task_pid(who, PIDTYPE_PGID, p) {
331 niceval = 20 - task_nice(p);
332 if (niceval > retval)
333 retval = niceval;
334 } while_each_task_pid(who, PIDTYPE_PGID, p);
335 break;
336 case PRIO_USER:
337 user = current->user;
338 if (!who)
339 who = current->uid;
340 else
341 if ((who != current->uid) && !(user = find_user(who)))
342 goto out_unlock; /* No processes for this user */
343
344 do_each_thread(g, p)
345 if (p->uid == who) {
346 niceval = 20 - task_nice(p);
347 if (niceval > retval)
348 retval = niceval;
349 }
350 while_each_thread(g, p);
351 if (who != current->uid)
352 free_uid(user); /* for find_user() */
353 break;
354 }
355out_unlock:
356 read_unlock(&tasklist_lock);
357
358 return retval;
359}
360
361
362/*
363 * Reboot system call: for obvious reasons only root may call it,
364 * and even root needs to set up some magic numbers in the registers
365 * so that some mistake won't make this reboot the whole machine.
366 * You can also set the meaning of the ctrl-alt-del-key here.
367 *
368 * reboot doesn't sync: do that yourself before calling this.
369 */
370asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg)
371{
372 char buffer[256];
373
374 /* We only trust the superuser with rebooting the system. */
375 if (!capable(CAP_SYS_BOOT))
376 return -EPERM;
377
378 /* For safety, we require "magic" arguments. */
379 if (magic1 != LINUX_REBOOT_MAGIC1 ||
380 (magic2 != LINUX_REBOOT_MAGIC2 &&
381 magic2 != LINUX_REBOOT_MAGIC2A &&
382 magic2 != LINUX_REBOOT_MAGIC2B &&
383 magic2 != LINUX_REBOOT_MAGIC2C))
384 return -EINVAL;
385
386 lock_kernel();
387 switch (cmd) {
388 case LINUX_REBOOT_CMD_RESTART:
389 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
390 system_state = SYSTEM_RESTART;
391 device_shutdown();
392 printk(KERN_EMERG "Restarting system.\n");
393 machine_restart(NULL);
394 break;
395
396 case LINUX_REBOOT_CMD_CAD_ON:
397 C_A_D = 1;
398 break;
399
400 case LINUX_REBOOT_CMD_CAD_OFF:
401 C_A_D = 0;
402 break;
403
404 case LINUX_REBOOT_CMD_HALT:
405 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
406 system_state = SYSTEM_HALT;
407 device_shutdown();
408 printk(KERN_EMERG "System halted.\n");
409 machine_halt();
410 unlock_kernel();
411 do_exit(0);
412 break;
413
414 case LINUX_REBOOT_CMD_POWER_OFF:
415 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
416 system_state = SYSTEM_POWER_OFF;
417 device_shutdown();
418 printk(KERN_EMERG "Power down.\n");
419 machine_power_off();
420 unlock_kernel();
421 do_exit(0);
422 break;
423
424 case LINUX_REBOOT_CMD_RESTART2:
425 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
426 unlock_kernel();
427 return -EFAULT;
428 }
429 buffer[sizeof(buffer) - 1] = '\0';
430
431 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
432 system_state = SYSTEM_RESTART;
433 device_shutdown();
434 printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
435 machine_restart(buffer);
436 break;
437
438#ifdef CONFIG_SOFTWARE_SUSPEND
439 case LINUX_REBOOT_CMD_SW_SUSPEND:
440 {
441 int ret = software_suspend();
442 unlock_kernel();
443 return ret;
444 }
445#endif
446
447 default:
448 unlock_kernel();
449 return -EINVAL;
450 }
451 unlock_kernel();
452 return 0;
453}
454
455static void deferred_cad(void *dummy)
456{
457 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
458 machine_restart(NULL);
459}
460
461/*
462 * This function gets called on ctrl-alt-del, i.e. from the keyboard interrupt.
463 * As it's called within an interrupt, it may NOT sync: the only choice
464 * is whether to reboot at once, or just ignore the ctrl-alt-del.
465 */
466void ctrl_alt_del(void)
467{
468 static DECLARE_WORK(cad_work, deferred_cad, NULL);
469
470 if (C_A_D)
471 schedule_work(&cad_work);
472 else
473 kill_proc(cad_pid, SIGINT, 1);
474}
475
476
477/*
478 * Unprivileged users may change the real gid to the effective gid
479 * or vice versa. (BSD-style)
480 *
481 * If you set the real gid at all, or set the effective gid to a value not
482 * equal to the real gid, then the saved gid is set to the new effective gid.
483 *
484 * This makes it possible for a setgid program to completely drop its
485 * privileges, which is often a useful assertion to make when you are doing
486 * a security audit of a program.
487 *
488 * The general idea is that a program which uses just setregid() will be
489 * 100% compatible with BSD. A program which uses just setgid() will be
490 * 100% compatible with POSIX with saved IDs.
491 *
492 * SMP: There are no races; the GIDs are checked only by filesystem
493 * operations (as far as semantic preservation is concerned).
494 */
495asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
496{
497 int old_rgid = current->gid;
498 int old_egid = current->egid;
499 int new_rgid = old_rgid;
500 int new_egid = old_egid;
501 int retval;
502
503 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
504 if (retval)
505 return retval;
506
507 if (rgid != (gid_t) -1) {
508 if ((old_rgid == rgid) ||
509 (current->egid==rgid) ||
510 capable(CAP_SETGID))
511 new_rgid = rgid;
512 else
513 return -EPERM;
514 }
515 if (egid != (gid_t) -1) {
516 if ((old_rgid == egid) ||
517 (current->egid == egid) ||
518 (current->sgid == egid) ||
519 capable(CAP_SETGID))
520 new_egid = egid;
521 else {
522 return -EPERM;
523 }
524 }
525 if (new_egid != old_egid)
526 {
527 current->mm->dumpable = 0;
528 wmb();
529 }
530 if (rgid != (gid_t) -1 ||
531 (egid != (gid_t) -1 && egid != old_rgid))
532 current->sgid = new_egid;
533 current->fsgid = new_egid;
534 current->egid = new_egid;
535 current->gid = new_rgid;
536 key_fsgid_changed(current);
537 return 0;
538}
539
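To illustrate the saved-gid rule spelled out in the comment above sys_setregid(), a minimal userspace sketch of a setgid helper dropping its group privilege irrevocably (the program itself is hypothetical; only standard libc calls are used):

#include <unistd.h>
#include <stdlib.h>

int main(void)
{
	gid_t rgid = getgid();

	/*
	 * Setting the real gid at all causes the saved gid to be set to the
	 * new effective gid, so the setgid group cannot be regained later.
	 */
	if (setregid(rgid, rgid) != 0)
		exit(1);

	/* ... continue running without the setgid group ... */
	return 0;
}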
540/*
541 * setgid() is implemented like SysV w/ SAVED_IDS
542 *
543 * SMP: Same implicit races as above.
544 */
545asmlinkage long sys_setgid(gid_t gid)
546{
547 int old_egid = current->egid;
548 int retval;
549
550 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
551 if (retval)
552 return retval;
553
554 if (capable(CAP_SETGID))
555 {
556 if(old_egid != gid)
557 {
558 current->mm->dumpable=0;
559 wmb();
560 }
561 current->gid = current->egid = current->sgid = current->fsgid = gid;
562 }
563 else if ((gid == current->gid) || (gid == current->sgid))
564 {
565 if(old_egid != gid)
566 {
567 current->mm->dumpable=0;
568 wmb();
569 }
570 current->egid = current->fsgid = gid;
571 }
572 else
573 return -EPERM;
574
575 key_fsgid_changed(current);
576 return 0;
577}
578
579static int set_user(uid_t new_ruid, int dumpclear)
580{
581 struct user_struct *new_user;
582
583 new_user = alloc_uid(new_ruid);
584 if (!new_user)
585 return -EAGAIN;
586
587 if (atomic_read(&new_user->processes) >=
588 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
589 new_user != &root_user) {
590 free_uid(new_user);
591 return -EAGAIN;
592 }
593
594 switch_uid(new_user);
595
596 if(dumpclear)
597 {
598 current->mm->dumpable = 0;
599 wmb();
600 }
601 current->uid = new_ruid;
602 return 0;
603}
604
605/*
606 * Unprivileged users may change the real uid to the effective uid
607 * or vice versa. (BSD-style)
608 *
609 * If you set the real uid at all, or set the effective uid to a value not
610 * equal to the real uid, then the saved uid is set to the new effective uid.
611 *
612 * This makes it possible for a setuid program to completely drop its
613 * privileges, which is often a useful assertion to make when you are doing
614 * a security audit of a program.
615 *
616 * The general idea is that a program which uses just setreuid() will be
617 * 100% compatible with BSD. A program which uses just setuid() will be
618 * 100% compatible with POSIX with saved IDs.
619 */
620asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
621{
622 int old_ruid, old_euid, old_suid, new_ruid, new_euid;
623 int retval;
624
625 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
626 if (retval)
627 return retval;
628
629 new_ruid = old_ruid = current->uid;
630 new_euid = old_euid = current->euid;
631 old_suid = current->suid;
632
633 if (ruid != (uid_t) -1) {
634 new_ruid = ruid;
635 if ((old_ruid != ruid) &&
636 (current->euid != ruid) &&
637 !capable(CAP_SETUID))
638 return -EPERM;
639 }
640
641 if (euid != (uid_t) -1) {
642 new_euid = euid;
643 if ((old_ruid != euid) &&
644 (current->euid != euid) &&
645 (current->suid != euid) &&
646 !capable(CAP_SETUID))
647 return -EPERM;
648 }
649
650 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
651 return -EAGAIN;
652
653 if (new_euid != old_euid)
654 {
655 current->mm->dumpable=0;
656 wmb();
657 }
658 current->fsuid = current->euid = new_euid;
659 if (ruid != (uid_t) -1 ||
660 (euid != (uid_t) -1 && euid != old_ruid))
661 current->suid = current->euid;
662 current->fsuid = current->euid;
663
664 key_fsuid_changed(current);
665
666 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
667}
668
669
670
671/*
672 * setuid() is implemented like SysV with SAVED_IDS
673 *
674 * Note that SAVED_IDS is deficient in that a setuid root program
675 * like sendmail, for example, cannot set its uid to be a normal
676 * user and then switch back, because if you're root, setuid() sets
677 * the saved uid too. If you don't like this, blame the bright people
678 * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
679 * will allow a root program to temporarily drop privileges and be able to
680 * regain them by swapping the real and effective uid.
681 */
682asmlinkage long sys_setuid(uid_t uid)
683{
684 int old_euid = current->euid;
685 int old_ruid, old_suid, new_ruid, new_suid;
686 int retval;
687
688 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
689 if (retval)
690 return retval;
691
692 old_ruid = new_ruid = current->uid;
693 old_suid = current->suid;
694 new_suid = old_suid;
695
696 if (capable(CAP_SETUID)) {
697 if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
698 return -EAGAIN;
699 new_suid = uid;
700 } else if ((uid != current->uid) && (uid != new_suid))
701 return -EPERM;
702
703 if (old_euid != uid)
704 {
705 current->mm->dumpable = 0;
706 wmb();
707 }
708 current->fsuid = current->euid = uid;
709 current->suid = new_suid;
710
711 key_fsuid_changed(current);
712
713 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
714}
715
716
717/*
718 * This function implements a generic ability to update ruid, euid,
719 * and suid. This allows you to implement the 4.4 compatible seteuid().
720 */
721asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
722{
723 int old_ruid = current->uid;
724 int old_euid = current->euid;
725 int old_suid = current->suid;
726 int retval;
727
728 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
729 if (retval)
730 return retval;
731
732 if (!capable(CAP_SETUID)) {
733 if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
734 (ruid != current->euid) && (ruid != current->suid))
735 return -EPERM;
736 if ((euid != (uid_t) -1) && (euid != current->uid) &&
737 (euid != current->euid) && (euid != current->suid))
738 return -EPERM;
739 if ((suid != (uid_t) -1) && (suid != current->uid) &&
740 (suid != current->euid) && (suid != current->suid))
741 return -EPERM;
742 }
743 if (ruid != (uid_t) -1) {
744 if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
745 return -EAGAIN;
746 }
747 if (euid != (uid_t) -1) {
748 if (euid != current->euid)
749 {
750 current->mm->dumpable = 0;
751 wmb();
752 }
753 current->euid = euid;
754 }
755 current->fsuid = current->euid;
756 if (suid != (uid_t) -1)
757 current->suid = suid;
758
759 key_fsuid_changed(current);
760
761 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
762}
763
764asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
765{
766 int retval;
767
768 if (!(retval = put_user(current->uid, ruid)) &&
769 !(retval = put_user(current->euid, euid)))
770 retval = put_user(current->suid, suid);
771
772 return retval;
773}
774
775/*
776 * Same as above, but for rgid, egid, sgid.
777 */
778asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
779{
780 int retval;
781
782 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
783 if (retval)
784 return retval;
785
786 if (!capable(CAP_SETGID)) {
787 if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
788 (rgid != current->egid) && (rgid != current->sgid))
789 return -EPERM;
790 if ((egid != (gid_t) -1) && (egid != current->gid) &&
791 (egid != current->egid) && (egid != current->sgid))
792 return -EPERM;
793 if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
794 (sgid != current->egid) && (sgid != current->sgid))
795 return -EPERM;
796 }
797 if (egid != (gid_t) -1) {
798 if (egid != current->egid)
799 {
800 current->mm->dumpable = 0;
801 wmb();
802 }
803 current->egid = egid;
804 }
805 current->fsgid = current->egid;
806 if (rgid != (gid_t) -1)
807 current->gid = rgid;
808 if (sgid != (gid_t) -1)
809 current->sgid = sgid;
810
811 key_fsgid_changed(current);
812 return 0;
813}
814
815asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
816{
817 int retval;
818
819 if (!(retval = put_user(current->gid, rgid)) &&
820 !(retval = put_user(current->egid, egid)))
821 retval = put_user(current->sgid, sgid);
822
823 return retval;
824}
825
826
827/*
828 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
829 * is used for "access()" and for the NFS daemon (letting nfsd stay at
830 * whatever uid it wants to). It normally shadows "euid", except when
831 * explicitly set by setfsuid() or for access..
832 */
833asmlinkage long sys_setfsuid(uid_t uid)
834{
835 int old_fsuid;
836
837 old_fsuid = current->fsuid;
838 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
839 return old_fsuid;
840
841 if (uid == current->uid || uid == current->euid ||
842 uid == current->suid || uid == current->fsuid ||
843 capable(CAP_SETUID))
844 {
845 if (uid != old_fsuid)
846 {
847 current->mm->dumpable = 0;
848 wmb();
849 }
850 current->fsuid = uid;
851 }
852
853 key_fsuid_changed(current);
854
855 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
856
857 return old_fsuid;
858}
859
860/*
861 * Same as setfsuid(), but for the filesystem gid.
862 */
863asmlinkage long sys_setfsgid(gid_t gid)
864{
865 int old_fsgid;
866
867 old_fsgid = current->fsgid;
868 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
869 return old_fsgid;
870
871 if (gid == current->gid || gid == current->egid ||
872 gid == current->sgid || gid == current->fsgid ||
873 capable(CAP_SETGID))
874 {
875 if (gid != old_fsgid)
876 {
877 current->mm->dumpable = 0;
878 wmb();
879 }
880 current->fsgid = gid;
881 key_fsgid_changed(current);
882 }
883 return old_fsgid;
884}
885
886asmlinkage long sys_times(struct tms __user * tbuf)
887{
888 /*
889 * In the SMP world we might just be unlucky and have one of
890 * the times increment as we use it. Since the value is an
891 * atomically safe type this is just fine. Conceptually it's
892 * as if the syscall took an instant longer to occur.
893 */
894 if (tbuf) {
895 struct tms tmp;
896 struct task_struct *tsk = current;
897 struct task_struct *t;
898 cputime_t utime, stime, cutime, cstime;
899
900 read_lock(&tasklist_lock);
901 utime = tsk->signal->utime;
902 stime = tsk->signal->stime;
903 t = tsk;
904 do {
905 utime = cputime_add(utime, t->utime);
906 stime = cputime_add(stime, t->stime);
907 t = next_thread(t);
908 } while (t != tsk);
909
910 /*
911 * While we have tasklist_lock read-locked, no dying thread
912 * can be updating current->signal->[us]time. Instead,
913 * we got their counts included in the live thread loop.
914 * However, another thread can come in right now and
915 * do a wait call that updates current->signal->c[us]time.
916 * To make sure we always see that pair updated atomically,
917 * we take the siglock around fetching them.
918 */
919 spin_lock_irq(&tsk->sighand->siglock);
920 cutime = tsk->signal->cutime;
921 cstime = tsk->signal->cstime;
922 spin_unlock_irq(&tsk->sighand->siglock);
923 read_unlock(&tasklist_lock);
924
925 tmp.tms_utime = cputime_to_clock_t(utime);
926 tmp.tms_stime = cputime_to_clock_t(stime);
927 tmp.tms_cutime = cputime_to_clock_t(cutime);
928 tmp.tms_cstime = cputime_to_clock_t(cstime);
929 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
930 return -EFAULT;
931 }
932 return (long) jiffies_64_to_clock_t(get_jiffies_64());
933}
934
935/*
936 * This needs some heavy checking ...
937 * I just haven't the stomach for it. I also don't fully
938 * understand sessions/pgrp etc. Let somebody who does explain it.
939 *
940 * OK, I think I have the protection semantics right.... this is really
941 * only important on a multi-user system anyway, to make sure one user
942 * can't send a signal to a process owned by another. -TYT, 12/12/91
943 *
944 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
945 * LBT 04.03.94
946 */
947
948asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
949{
950 struct task_struct *p;
951 int err = -EINVAL;
952
953 if (!pid)
954 pid = current->pid;
955 if (!pgid)
956 pgid = pid;
957 if (pgid < 0)
958 return -EINVAL;
959
960 /* From this point forward we keep holding onto the tasklist lock
961 * so that our parent does not change from under us. -DaveM
962 */
963 write_lock_irq(&tasklist_lock);
964
965 err = -ESRCH;
966 p = find_task_by_pid(pid);
967 if (!p)
968 goto out;
969
970 err = -EINVAL;
971 if (!thread_group_leader(p))
972 goto out;
973
974 if (p->parent == current || p->real_parent == current) {
975 err = -EPERM;
976 if (p->signal->session != current->signal->session)
977 goto out;
978 err = -EACCES;
979 if (p->did_exec)
980 goto out;
981 } else {
982 err = -ESRCH;
983 if (p != current)
984 goto out;
985 }
986
987 err = -EPERM;
988 if (p->signal->leader)
989 goto out;
990
991 if (pgid != pid) {
992 struct task_struct *p;
993
994 do_each_task_pid(pgid, PIDTYPE_PGID, p) {
995 if (p->signal->session == current->signal->session)
996 goto ok_pgid;
997 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
998 goto out;
999 }
1000
1001ok_pgid:
1002 err = security_task_setpgid(p, pgid);
1003 if (err)
1004 goto out;
1005
1006 if (process_group(p) != pgid) {
1007 detach_pid(p, PIDTYPE_PGID);
1008 p->signal->pgrp = pgid;
1009 attach_pid(p, PIDTYPE_PGID, pgid);
1010 }
1011
1012 err = 0;
1013out:
1014 /* All paths lead to here, thus we are safe. -DaveM */
1015 write_unlock_irq(&tasklist_lock);
1016 return err;
1017}
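The parent-or-self and same-session checks above are shaped by job-control shells, which call setpgid() from both the parent and the child so the new group exists no matter which side runs first. A hedged user-space sketch of that convention (spawn_in_new_group is a hypothetical helper):

#include <unistd.h>

/* Fork a child into its own process group, shell-style.  The parent's
 * setpgid() may fail with EACCES once the child has exec'd; that is the
 * expected outcome of the did_exec check above and is harmless here. */
static pid_t spawn_in_new_group(const char *prog)
{
	pid_t pid = fork();

	if (pid < 0)
		return -1;
	if (pid == 0) {
		setpgid(0, 0);			/* child: pgid := own pid */
		execlp(prog, prog, (char *)NULL);
		_exit(127);
	}
	setpgid(pid, pid);			/* parent side of the race */
	return pid;
}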
1018
1019asmlinkage long sys_getpgid(pid_t pid)
1020{
1021 if (!pid) {
1022 return process_group(current);
1023 } else {
1024 int retval;
1025 struct task_struct *p;
1026
1027 read_lock(&tasklist_lock);
1028 p = find_task_by_pid(pid);
1029
1030 retval = -ESRCH;
1031 if (p) {
1032 retval = security_task_getpgid(p);
1033 if (!retval)
1034 retval = process_group(p);
1035 }
1036 read_unlock(&tasklist_lock);
1037 return retval;
1038 }
1039}
1040
1041#ifdef __ARCH_WANT_SYS_GETPGRP
1042
1043asmlinkage long sys_getpgrp(void)
1044{
1045 /* SMP - assuming writes are word atomic this is fine */
1046 return process_group(current);
1047}
1048
1049#endif
1050
1051asmlinkage long sys_getsid(pid_t pid)
1052{
1053 if (!pid) {
1054 return current->signal->session;
1055 } else {
1056 int retval;
1057 struct task_struct *p;
1058
1059 read_lock(&tasklist_lock);
1060 p = find_task_by_pid(pid);
1061
1062 retval = -ESRCH;
1063 if(p) {
1064 retval = security_task_getsid(p);
1065 if (!retval)
1066 retval = p->signal->session;
1067 }
1068 read_unlock(&tasklist_lock);
1069 return retval;
1070 }
1071}
1072
1073asmlinkage long sys_setsid(void)
1074{
1075 struct pid *pid;
1076 int err = -EPERM;
1077
1078 if (!thread_group_leader(current))
1079 return -EINVAL;
1080
1081 down(&tty_sem);
1082 write_lock_irq(&tasklist_lock);
1083
1084 pid = find_pid(PIDTYPE_PGID, current->pid);
1085 if (pid)
1086 goto out;
1087
1088 current->signal->leader = 1;
1089 __set_special_pids(current->pid, current->pid);
1090 current->signal->tty = NULL;
1091 current->signal->tty_old_pgrp = 0;
1092 err = process_group(current);
1093out:
1094 write_unlock_irq(&tasklist_lock);
1095 up(&tty_sem);
1096 return err;
1097}
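Because the find_pid(PIDTYPE_PGID, ...) check above rejects a caller that is already a process-group leader, daemons conventionally fork first and let the child call setsid(). A minimal sketch of that pattern (become_daemon is a hypothetical helper; error handling trimmed):

#include <unistd.h>

static int become_daemon(void)
{
	pid_t pid = fork();

	if (pid < 0)
		return -1;
	if (pid > 0)
		_exit(0);	/* parent exits; child cannot be a pgrp leader */
	if (setsid() < 0)	/* new session, no controlling tty */
		return -1;
	return 0;
}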
1098
1099/*
1100 * Supplementary group IDs
1101 */
1102
1103/* init to 2 - one for init_task, one to ensure it is never freed */
1104struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1105
1106struct group_info *groups_alloc(int gidsetsize)
1107{
1108 struct group_info *group_info;
1109 int nblocks;
1110 int i;
1111
1112 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1113 /* Make sure we always allocate at least one indirect block pointer */
1114 nblocks = nblocks ? : 1;
1115 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1116 if (!group_info)
1117 return NULL;
1118 group_info->ngroups = gidsetsize;
1119 group_info->nblocks = nblocks;
1120 atomic_set(&group_info->usage, 1);
1121
1122 if (gidsetsize <= NGROUPS_SMALL) {
1123 group_info->blocks[0] = group_info->small_block;
1124 } else {
1125 for (i = 0; i < nblocks; i++) {
1126 gid_t *b;
1127 b = (void *)__get_free_page(GFP_USER);
1128 if (!b)
1129 goto out_undo_partial_alloc;
1130 group_info->blocks[i] = b;
1131 }
1132 }
1133 return group_info;
1134
1135out_undo_partial_alloc:
1136 while (--i >= 0) {
1137 free_page((unsigned long)group_info->blocks[i]);
1138 }
1139 kfree(group_info);
1140 return NULL;
1141}
1142
1143EXPORT_SYMBOL(groups_alloc);
1144
1145void groups_free(struct group_info *group_info)
1146{
1147 if (group_info->blocks[0] != group_info->small_block) {
1148 int i;
1149 for (i = 0; i < group_info->nblocks; i++)
1150 free_page((unsigned long)group_info->blocks[i]);
1151 }
1152 kfree(group_info);
1153}
1154
1155EXPORT_SYMBOL(groups_free);
1156
1157/* export the group_info to a user-space array */
1158static int groups_to_user(gid_t __user *grouplist,
1159 struct group_info *group_info)
1160{
1161 int i;
1162 int count = group_info->ngroups;
1163
1164 for (i = 0; i < group_info->nblocks; i++) {
1165 int cp_count = min(NGROUPS_PER_BLOCK, count);
1166 int off = i * NGROUPS_PER_BLOCK;
1167 int len = cp_count * sizeof(*grouplist);
1168
1169 if (copy_to_user(grouplist+off, group_info->blocks[i], len))
1170 return -EFAULT;
1171
1172 count -= cp_count;
1173 }
1174 return 0;
1175}
1176
1177/* fill a group_info from a user-space array - it must be allocated already */
1178static int groups_from_user(struct group_info *group_info,
1179 gid_t __user *grouplist)
1180{
1181 int i;
1182 int count = group_info->ngroups;
1183
1184 for (i = 0; i < group_info->nblocks; i++) {
1185 int cp_count = min(NGROUPS_PER_BLOCK, count);
1186 int off = i * NGROUPS_PER_BLOCK;
1187 int len = cp_count * sizeof(*grouplist);
1188
1189 if (copy_from_user(group_info->blocks[i], grouplist+off, len))
1190 return -EFAULT;
1191
1192 count -= cp_count;
1193 }
1194 return 0;
1195}
1196
1197/* a simple Shell-Metzner sort */
1198static void groups_sort(struct group_info *group_info)
1199{
1200 int base, max, stride;
1201 int gidsetsize = group_info->ngroups;
1202
1203 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1204 ; /* nothing */
1205 stride /= 3;
1206
1207 while (stride) {
1208 max = gidsetsize - stride;
1209 for (base = 0; base < max; base++) {
1210 int left = base;
1211 int right = left + stride;
1212 gid_t tmp = GROUP_AT(group_info, right);
1213
1214 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1215 GROUP_AT(group_info, right) =
1216 GROUP_AT(group_info, left);
1217 right = left;
1218 left -= stride;
1219 }
1220 GROUP_AT(group_info, right) = tmp;
1221 }
1222 stride /= 3;
1223 }
1224}
1225
1226/* a simple bsearch */
1227static int groups_search(struct group_info *group_info, gid_t grp)
1228{
1229 int left, right;
1230
1231 if (!group_info)
1232 return 0;
1233
1234 left = 0;
1235 right = group_info->ngroups;
1236 while (left < right) {
1237 int mid = (left+right)/2;
1238 int cmp = grp - GROUP_AT(group_info, mid);
1239 if (cmp > 0)
1240 left = mid + 1;
1241 else if (cmp < 0)
1242 right = mid;
1243 else
1244 return 1;
1245 }
1246 return 0;
1247}
1248
1249/* validate and set current->group_info */
1250int set_current_groups(struct group_info *group_info)
1251{
1252 int retval;
1253 struct group_info *old_info;
1254
1255 retval = security_task_setgroups(group_info);
1256 if (retval)
1257 return retval;
1258
1259 groups_sort(group_info);
1260 get_group_info(group_info);
1261
1262 task_lock(current);
1263 old_info = current->group_info;
1264 current->group_info = group_info;
1265 task_unlock(current);
1266
1267 put_group_info(old_info);
1268
1269 return 0;
1270}
1271
1272EXPORT_SYMBOL(set_current_groups);
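groups_alloc(), GROUP_AT() and set_current_groups() form the in-kernel path for replacing a task's supplementary groups, with set_current_groups() taking its own reference. A hedged sketch of a caller that keeps exactly one group (keep_only_group is hypothetical, shown only to illustrate the reference handling; it mirrors sys_setgroups() below):

static int keep_only_group(gid_t gid)
{
	struct group_info *gi = groups_alloc(1);
	int err;

	if (!gi)
		return -ENOMEM;
	GROUP_AT(gi, 0) = gid;
	err = set_current_groups(gi);	/* sorts and grabs its own reference */
	put_group_info(gi);		/* drop the allocation reference */
	return err;
}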
1273
1274asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1275{
1276 int i = 0;
1277
1278 /*
1279 * SMP: Nobody else can change our grouplist. Thus we are
1280 * safe.
1281 */
1282
1283 if (gidsetsize < 0)
1284 return -EINVAL;
1285
1286 /* no need to grab task_lock here; it cannot change */
1287 get_group_info(current->group_info);
1288 i = current->group_info->ngroups;
1289 if (gidsetsize) {
1290 if (i > gidsetsize) {
1291 i = -EINVAL;
1292 goto out;
1293 }
1294 if (groups_to_user(grouplist, current->group_info)) {
1295 i = -EFAULT;
1296 goto out;
1297 }
1298 }
1299out:
1300 put_group_info(current->group_info);
1301 return i;
1302}
1303
1304/*
1305 * SMP: Our groups are copy-on-write. We can set them safely
1306 * without another task interfering.
1307 */
1308
1309asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
1310{
1311 struct group_info *group_info;
1312 int retval;
1313
1314 if (!capable(CAP_SETGID))
1315 return -EPERM;
1316 if ((unsigned)gidsetsize > NGROUPS_MAX)
1317 return -EINVAL;
1318
1319 group_info = groups_alloc(gidsetsize);
1320 if (!group_info)
1321 return -ENOMEM;
1322 retval = groups_from_user(group_info, grouplist);
1323 if (retval) {
1324 put_group_info(group_info);
1325 return retval;
1326 }
1327
1328 retval = set_current_groups(group_info);
1329 put_group_info(group_info);
1330
1331 return retval;
1332}
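The user-space counterpart is the getgroups(2)/setgroups(2) pair; callers usually probe the count first by passing a zero-sized list. A sketch against the standard libc wrappers (not part of this patch):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	int i, n = getgroups(0, NULL);	/* probe: how many groups? */
	gid_t *list;

	if (n <= 0)
		return n < 0 ? 1 : 0;
	list = malloc(n * sizeof(*list));
	if (!list || getgroups(n, list) < 0)
		return 1;
	for (i = 0; i < n; i++)
		printf("%u\n", (unsigned)list[i]);
	free(list);
	return 0;
}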
1333
1334/*
1335 * Check whether we're fsgid/egid or in the supplementary group list.
1336 */
1337int in_group_p(gid_t grp)
1338{
1339 int retval = 1;
1340 if (grp != current->fsgid) {
1341 get_group_info(current->group_info);
1342 retval = groups_search(current->group_info, grp);
1343 put_group_info(current->group_info);
1344 }
1345 return retval;
1346}
1347
1348EXPORT_SYMBOL(in_group_p);
1349
1350int in_egroup_p(gid_t grp)
1351{
1352 int retval = 1;
1353 if (grp != current->egid) {
1354 get_group_info(current->group_info);
1355 retval = groups_search(current->group_info, grp);
1356 put_group_info(current->group_info);
1357 }
1358 return retval;
1359}
1360
1361EXPORT_SYMBOL(in_egroup_p);
1362
1363DECLARE_RWSEM(uts_sem);
1364
1365EXPORT_SYMBOL(uts_sem);
1366
1367asmlinkage long sys_newuname(struct new_utsname __user * name)
1368{
1369 int errno = 0;
1370
1371 down_read(&uts_sem);
1372 if (copy_to_user(name,&system_utsname,sizeof *name))
1373 errno = -EFAULT;
1374 up_read(&uts_sem);
1375 return errno;
1376}
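sys_newuname() is what backs the libc uname() wrapper; the same struct read from user space looks like this (standard interface, shown only for orientation):

#include <stdio.h>
#include <sys/utsname.h>

int main(void)
{
	struct utsname u;

	if (uname(&u) != 0)
		return 1;
	printf("%s %s %s %s\n", u.sysname, u.nodename, u.release, u.machine);
	return 0;
}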
1377
1378asmlinkage long sys_sethostname(char __user *name, int len)
1379{
1380 int errno;
1381 char tmp[__NEW_UTS_LEN];
1382
1383 if (!capable(CAP_SYS_ADMIN))
1384 return -EPERM;
1385 if (len < 0 || len > __NEW_UTS_LEN)
1386 return -EINVAL;
1387 down_write(&uts_sem);
1388 errno = -EFAULT;
1389 if (!copy_from_user(tmp, name, len)) {
1390 memcpy(system_utsname.nodename, tmp, len);
1391 system_utsname.nodename[len] = 0;
1392 errno = 0;
1393 }
1394 up_write(&uts_sem);
1395 return errno;
1396}
1397
1398#ifdef __ARCH_WANT_SYS_GETHOSTNAME
1399
1400asmlinkage long sys_gethostname(char __user *name, int len)
1401{
1402 int i, errno;
1403
1404 if (len < 0)
1405 return -EINVAL;
1406 down_read(&uts_sem);
1407 i = 1 + strlen(system_utsname.nodename);
1408 if (i > len)
1409 i = len;
1410 errno = 0;
1411 if (copy_to_user(name, system_utsname.nodename, i))
1412 errno = -EFAULT;
1413 up_read(&uts_sem);
1414 return errno;
1415}
1416
1417#endif
1418
1419/*
1420 * Only setdomainname; getdomainname can be implemented by calling
1421 * uname()
1422 */
1423asmlinkage long sys_setdomainname(char __user *name, int len)
1424{
1425 int errno;
1426 char tmp[__NEW_UTS_LEN];
1427
1428 if (!capable(CAP_SYS_ADMIN))
1429 return -EPERM;
1430 if (len < 0 || len > __NEW_UTS_LEN)
1431 return -EINVAL;
1432
1433 down_write(&uts_sem);
1434 errno = -EFAULT;
1435 if (!copy_from_user(tmp, name, len)) {
1436 memcpy(system_utsname.domainname, tmp, len);
1437 system_utsname.domainname[len] = 0;
1438 errno = 0;
1439 }
1440 up_write(&uts_sem);
1441 return errno;
1442}
1443
1444asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
1445{
1446 if (resource >= RLIM_NLIMITS)
1447 return -EINVAL;
1448 else {
1449 struct rlimit value;
1450 task_lock(current->group_leader);
1451 value = current->signal->rlim[resource];
1452 task_unlock(current->group_leader);
1453 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1454 }
1455}
1456
1457#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
1458
1459/*
1460 * Back compatibility for getrlimit. Needed for some apps.
1461 */
1462
1463asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim)
1464{
1465 struct rlimit x;
1466 if (resource >= RLIM_NLIMITS)
1467 return -EINVAL;
1468
1469 task_lock(current->group_leader);
1470 x = current->signal->rlim[resource];
1471 task_unlock(current->group_leader);
1472 if(x.rlim_cur > 0x7FFFFFFF)
1473 x.rlim_cur = 0x7FFFFFFF;
1474 if(x.rlim_max > 0x7FFFFFFF)
1475 x.rlim_max = 0x7FFFFFFF;
1476 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
1477}
1478
1479#endif
1480
1481asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1482{
1483 struct rlimit new_rlim, *old_rlim;
1484 int retval;
1485
1486 if (resource >= RLIM_NLIMITS)
1487 return -EINVAL;
1488 if(copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1489 return -EFAULT;
1490 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1491 return -EINVAL;
1492 old_rlim = current->signal->rlim + resource;
1493 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1494 !capable(CAP_SYS_RESOURCE))
1495 return -EPERM;
1496 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
1497 return -EPERM;
1498
1499 retval = security_task_setrlimit(resource, &new_rlim);
1500 if (retval)
1501 return retval;
1502
1503 task_lock(current->group_leader);
1504 *old_rlim = new_rlim;
1505 task_unlock(current->group_leader);
1506
1507 if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY &&
1508 (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
1509 new_rlim.rlim_cur <= cputime_to_secs(
1510 current->signal->it_prof_expires))) {
1511 cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur);
1512 read_lock(&tasklist_lock);
1513 spin_lock_irq(&current->sighand->siglock);
1514 set_process_cpu_timer(current, CPUCLOCK_PROF,
1515 &cputime, NULL);
1516 spin_unlock_irq(&current->sighand->siglock);
1517 read_unlock(&tasklist_lock);
1518 }
1519
1520 return 0;
1521}
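A common unprivileged use of this call is raising the soft file-descriptor limit up to the hard limit; since rlim_max is left untouched, the CAP_SYS_RESOURCE check above is never hit. A sketch using the standard getrlimit/setrlimit wrappers:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NOFILE, &rl) != 0)
		return 1;
	rl.rlim_cur = rl.rlim_max;	/* soft := hard, no privilege needed */
	if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
		return 1;
	printf("nofile soft limit now %lu\n", (unsigned long)rl.rlim_cur);
	return 0;
}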
1522
1523/*
1524 * It would make sense to put struct rusage in the task_struct,
1525 * except that would make the task_struct be *really big*. After
1526 * task_struct gets moved into malloc'ed memory, it would
1527 * make sense to do this. It will make moving the rest of the information
1528 * a lot simpler! (Which we're not doing right now because we're not
1529 * measuring them yet).
1530 *
1531 * This expects to be called with tasklist_lock read-locked or better,
1532 * and the siglock not locked. It may momentarily take the siglock.
1533 *
1534 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1535 * races with threads incrementing their own counters. But since word
1536 * reads are atomic, we either get new values or old values and we don't
1537 * care which for the sums. We always take the siglock to protect reading
1538 * the c* fields from p->signal from races with exit.c updating those
1539 * fields when reaping, so a sample either gets all the additions of a
1540 * given child after it's reaped, or none so this sample is before reaping.
1541 */
1542
1543static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1544{
1545 struct task_struct *t;
1546 unsigned long flags;
1547 cputime_t utime, stime;
1548
1549 memset((char *) r, 0, sizeof *r);
1550
1551 if (unlikely(!p->signal))
1552 return;
1553
1554 switch (who) {
1555 case RUSAGE_CHILDREN:
1556 spin_lock_irqsave(&p->sighand->siglock, flags);
1557 utime = p->signal->cutime;
1558 stime = p->signal->cstime;
1559 r->ru_nvcsw = p->signal->cnvcsw;
1560 r->ru_nivcsw = p->signal->cnivcsw;
1561 r->ru_minflt = p->signal->cmin_flt;
1562 r->ru_majflt = p->signal->cmaj_flt;
1563 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1564 cputime_to_timeval(utime, &r->ru_utime);
1565 cputime_to_timeval(stime, &r->ru_stime);
1566 break;
1567 case RUSAGE_SELF:
1568 spin_lock_irqsave(&p->sighand->siglock, flags);
1569 utime = stime = cputime_zero;
1570 goto sum_group;
1571 case RUSAGE_BOTH:
1572 spin_lock_irqsave(&p->sighand->siglock, flags);
1573 utime = p->signal->cutime;
1574 stime = p->signal->cstime;
1575 r->ru_nvcsw = p->signal->cnvcsw;
1576 r->ru_nivcsw = p->signal->cnivcsw;
1577 r->ru_minflt = p->signal->cmin_flt;
1578 r->ru_majflt = p->signal->cmaj_flt;
1579 sum_group:
1580 utime = cputime_add(utime, p->signal->utime);
1581 stime = cputime_add(stime, p->signal->stime);
1582 r->ru_nvcsw += p->signal->nvcsw;
1583 r->ru_nivcsw += p->signal->nivcsw;
1584 r->ru_minflt += p->signal->min_flt;
1585 r->ru_majflt += p->signal->maj_flt;
1586 t = p;
1587 do {
1588 utime = cputime_add(utime, t->utime);
1589 stime = cputime_add(stime, t->stime);
1590 r->ru_nvcsw += t->nvcsw;
1591 r->ru_nivcsw += t->nivcsw;
1592 r->ru_minflt += t->min_flt;
1593 r->ru_majflt += t->maj_flt;
1594 t = next_thread(t);
1595 } while (t != p);
1596 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1597 cputime_to_timeval(utime, &r->ru_utime);
1598 cputime_to_timeval(stime, &r->ru_stime);
1599 break;
1600 default:
1601 BUG();
1602 }
1603}
1604
1605int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1606{
1607 struct rusage r;
1608 read_lock(&tasklist_lock);
1609 k_getrusage(p, who, &r);
1610 read_unlock(&tasklist_lock);
1611 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1612}
1613
1614asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
1615{
1616 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
1617 return -EINVAL;
1618 return getrusage(current, who, ru);
1619}
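The fields k_getrusage() fills (CPU times, context switches, fault counts) are what getrusage(2) reports; a small self-measurement from user space (standard interface):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rusage ru;

	if (getrusage(RUSAGE_SELF, &ru) != 0)
		return 1;
	printf("user %ld.%06lds, minor faults %ld, major faults %ld\n",
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
	       ru.ru_minflt, ru.ru_majflt);
	return 0;
}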
1620
1621asmlinkage long sys_umask(int mask)
1622{
1623 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1624 return mask;
1625}
1626
1627asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1628 unsigned long arg4, unsigned long arg5)
1629{
1630 long error;
1631 int sig;
1632
1633 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1634 if (error)
1635 return error;
1636
1637 switch (option) {
1638 case PR_SET_PDEATHSIG:
1639 sig = arg2;
1640 if (sig < 0 || sig > _NSIG) {
1641 error = -EINVAL;
1642 break;
1643 }
1644 current->pdeath_signal = sig;
1645 break;
1646 case PR_GET_PDEATHSIG:
1647 error = put_user(current->pdeath_signal, (int __user *)arg2);
1648 break;
1649 case PR_GET_DUMPABLE:
1650 if (current->mm->dumpable)
1651 error = 1;
1652 break;
1653 case PR_SET_DUMPABLE:
1654 if (arg2 != 0 && arg2 != 1) {
1655 error = -EINVAL;
1656 break;
1657 }
1658 current->mm->dumpable = arg2;
1659 break;
1660
1661 case PR_SET_UNALIGN:
1662 error = SET_UNALIGN_CTL(current, arg2);
1663 break;
1664 case PR_GET_UNALIGN:
1665 error = GET_UNALIGN_CTL(current, arg2);
1666 break;
1667 case PR_SET_FPEMU:
1668 error = SET_FPEMU_CTL(current, arg2);
1669 break;
1670 case PR_GET_FPEMU:
1671 error = GET_FPEMU_CTL(current, arg2);
1672 break;
1673 case PR_SET_FPEXC:
1674 error = SET_FPEXC_CTL(current, arg2);
1675 break;
1676 case PR_GET_FPEXC:
1677 error = GET_FPEXC_CTL(current, arg2);
1678 break;
1679 case PR_GET_TIMING:
1680 error = PR_TIMING_STATISTICAL;
1681 break;
1682 case PR_SET_TIMING:
1683 if (arg2 == PR_TIMING_STATISTICAL)
1684 error = 0;
1685 else
1686 error = -EINVAL;
1687 break;
1688
1689 case PR_GET_KEEPCAPS:
1690 if (current->keep_capabilities)
1691 error = 1;
1692 break;
1693 case PR_SET_KEEPCAPS:
1694 if (arg2 != 0 && arg2 != 1) {
1695 error = -EINVAL;
1696 break;
1697 }
1698 current->keep_capabilities = arg2;
1699 break;
1700 case PR_SET_NAME: {
1701 struct task_struct *me = current;
1702 unsigned char ncomm[sizeof(me->comm)];
1703
1704 ncomm[sizeof(me->comm)-1] = 0;
1705 if (strncpy_from_user(ncomm, (char __user *)arg2,
1706 sizeof(me->comm)-1) < 0)
1707 return -EFAULT;
1708 set_task_comm(me, ncomm);
1709 return 0;
1710 }
1711 case PR_GET_NAME: {
1712 struct task_struct *me = current;
1713 unsigned char tcomm[sizeof(me->comm)];
1714
1715 get_task_comm(tcomm, me);
1716 if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
1717 return -EFAULT;
1718 return 0;
1719 }
1720 default:
1721 error = -EINVAL;
1722 break;
1723 }
1724 return error;
1725}
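Of the prctl options handled above, PR_SET_NAME/PR_GET_NAME are the ones most commonly seen in applications, typically to label worker processes or threads. A sketch against <sys/prctl.h> (the 16-byte buffer matches sizeof(current->comm) used above):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	char name[16];

	if (prctl(PR_SET_NAME, "worker-0", 0, 0, 0) != 0)
		return 1;
	if (prctl(PR_GET_NAME, name, 0, 0, 0) != 0)
		return 1;
	printf("comm is now \"%s\"\n", name);
	return 0;
}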
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
new file mode 100644
index 000000000000..1802a311dd3f
--- /dev/null
+++ b/kernel/sys_ni.c
@@ -0,0 +1,86 @@
1
2#include <linux/linkage.h>
3#include <linux/errno.h>
4
5#include <asm/unistd.h>
6
7/*
8 * Non-implemented system calls get redirected here.
9 */
10asmlinkage long sys_ni_syscall(void)
11{
12 return -ENOSYS;
13}
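From user space, every syscall stubbed out this way fails uniformly with ENOSYS, which is how programs probe for optional kernel features. For example acct(2), one of the cond_syscall() entries below (a sketch using the standard libc wrapper):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* With CONFIG_BSD_PROCESS_ACCT=n, sys_acct is aliased to
	 * sys_ni_syscall, so this reports ENOSYS rather than EPERM. */
	if (acct(NULL) == -1 && errno == ENOSYS)
		printf("process accounting not compiled into this kernel\n");
	return 0;
}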
14
15cond_syscall(sys_nfsservctl);
16cond_syscall(sys_quotactl);
17cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon);
20cond_syscall(sys_swapoff);
21cond_syscall(sys_init_module);
22cond_syscall(sys_delete_module);
23cond_syscall(sys_socketpair);
24cond_syscall(sys_bind);
25cond_syscall(sys_listen);
26cond_syscall(sys_accept);
27cond_syscall(sys_connect);
28cond_syscall(sys_getsockname);
29cond_syscall(sys_getpeername);
30cond_syscall(sys_sendto);
31cond_syscall(sys_send);
32cond_syscall(sys_recvfrom);
33cond_syscall(sys_recv);
34cond_syscall(sys_socket);
35cond_syscall(sys_setsockopt);
36cond_syscall(sys_getsockopt);
37cond_syscall(sys_shutdown);
38cond_syscall(sys_sendmsg);
39cond_syscall(sys_recvmsg);
40cond_syscall(sys_socketcall);
41cond_syscall(sys_futex);
42cond_syscall(compat_sys_futex);
43cond_syscall(sys_epoll_create);
44cond_syscall(sys_epoll_ctl);
45cond_syscall(sys_epoll_wait);
46cond_syscall(sys_semget);
47cond_syscall(sys_semop);
48cond_syscall(sys_semtimedop);
49cond_syscall(sys_semctl);
50cond_syscall(sys_msgget);
51cond_syscall(sys_msgsnd);
52cond_syscall(sys_msgrcv);
53cond_syscall(sys_msgctl);
54cond_syscall(sys_shmget);
55cond_syscall(sys_shmdt);
56cond_syscall(sys_shmctl);
57cond_syscall(sys_mq_open);
58cond_syscall(sys_mq_unlink);
59cond_syscall(sys_mq_timedsend);
60cond_syscall(sys_mq_timedreceive);
61cond_syscall(sys_mq_notify);
62cond_syscall(sys_mq_getsetattr);
63cond_syscall(compat_sys_mq_open);
64cond_syscall(compat_sys_mq_timedsend);
65cond_syscall(compat_sys_mq_timedreceive);
66cond_syscall(compat_sys_mq_notify);
67cond_syscall(compat_sys_mq_getsetattr);
68cond_syscall(sys_mbind);
69cond_syscall(sys_get_mempolicy);
70cond_syscall(sys_set_mempolicy);
71cond_syscall(compat_sys_mbind);
72cond_syscall(compat_sys_get_mempolicy);
73cond_syscall(compat_sys_set_mempolicy);
74cond_syscall(sys_add_key);
75cond_syscall(sys_request_key);
76cond_syscall(sys_keyctl);
77cond_syscall(compat_sys_keyctl);
78cond_syscall(compat_sys_socketcall);
79
80/* arch-specific weak syscall entries */
81cond_syscall(sys_pciconfig_read);
82cond_syscall(sys_pciconfig_write);
83cond_syscall(sys_pciconfig_iobase);
84cond_syscall(sys32_ipc);
85cond_syscall(sys32_sysctl);
86cond_syscall(ppc_rtas);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
new file mode 100644
index 000000000000..79dbd93bd697
--- /dev/null
+++ b/kernel/sysctl.c
@@ -0,0 +1,2337 @@
1/*
2 * sysctl.c: General linux system control interface
3 *
4 * Begun 24 March 1995, Stephen Tweedie
5 * Added /proc support, Dec 1995
6 * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
7 * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
8 * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
9 * Dynamic registration fixes, Stephen Tweedie.
10 * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
11 * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
12 * Horn.
13 * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
14 * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
15 * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
16 * Wendling.
17 * The list_for_each() macro wasn't appropriate for the sysctl loop.
18 * Removed it and replaced it with older style, 03/23/00, Bill Wendling
19 */
20
21#include <linux/config.h>
22#include <linux/module.h>
23#include <linux/mm.h>
24#include <linux/swap.h>
25#include <linux/slab.h>
26#include <linux/sysctl.h>
27#include <linux/proc_fs.h>
28#include <linux/ctype.h>
29#include <linux/utsname.h>
30#include <linux/capability.h>
31#include <linux/smp_lock.h>
32#include <linux/init.h>
33#include <linux/kernel.h>
34#include <linux/sysrq.h>
35#include <linux/highuid.h>
36#include <linux/writeback.h>
37#include <linux/hugetlb.h>
38#include <linux/security.h>
39#include <linux/initrd.h>
40#include <linux/times.h>
41#include <linux/limits.h>
42#include <linux/dcache.h>
43#include <linux/syscalls.h>
44
45#include <asm/uaccess.h>
46#include <asm/processor.h>
47
48#ifdef CONFIG_ROOT_NFS
49#include <linux/nfs_fs.h>
50#endif
51
52#if defined(CONFIG_SYSCTL)
53
54/* External variables not in a header file. */
55extern int C_A_D;
56extern int sysctl_overcommit_memory;
57extern int sysctl_overcommit_ratio;
58extern int max_threads;
59extern int sysrq_enabled;
60extern int core_uses_pid;
61extern char core_pattern[];
62extern int cad_pid;
63extern int pid_max;
64extern int min_free_kbytes;
65extern int printk_ratelimit_jiffies;
66extern int printk_ratelimit_burst;
67extern int pid_max_min, pid_max_max;
68
69#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
70int unknown_nmi_panic;
71extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
72 void __user *, size_t *, loff_t *);
73#endif
74
75/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
76static int maxolduid = 65535;
77static int minolduid;
78
79static int ngroups_max = NGROUPS_MAX;
80
81#ifdef CONFIG_KMOD
82extern char modprobe_path[];
83#endif
84#ifdef CONFIG_HOTPLUG
85extern char hotplug_path[];
86#endif
87#ifdef CONFIG_CHR_DEV_SG
88extern int sg_big_buff;
89#endif
90#ifdef CONFIG_SYSVIPC
91extern size_t shm_ctlmax;
92extern size_t shm_ctlall;
93extern int shm_ctlmni;
94extern int msg_ctlmax;
95extern int msg_ctlmnb;
96extern int msg_ctlmni;
97extern int sem_ctls[];
98#endif
99
100#ifdef __sparc__
101extern char reboot_command [];
102extern int stop_a_enabled;
103extern int scons_pwroff;
104#endif
105
106#ifdef __hppa__
107extern int pwrsw_enabled;
108extern int unaligned_enabled;
109#endif
110
111#ifdef CONFIG_ARCH_S390
112#ifdef CONFIG_MATHEMU
113extern int sysctl_ieee_emulation_warnings;
114#endif
115extern int sysctl_userprocess_debug;
116#endif
117
118extern int sysctl_hz_timer;
119
120#ifdef CONFIG_BSD_PROCESS_ACCT
121extern int acct_parm[];
122#endif
123
124int randomize_va_space = 1;
125
126static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
127 ctl_table *, void **);
128static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
129 void __user *buffer, size_t *lenp, loff_t *ppos);
130
131static ctl_table root_table[];
132static struct ctl_table_header root_table_header =
133 { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
134
135static ctl_table kern_table[];
136static ctl_table vm_table[];
137#ifdef CONFIG_NET
138extern ctl_table net_table[];
139#endif
140static ctl_table proc_table[];
141static ctl_table fs_table[];
142static ctl_table debug_table[];
143static ctl_table dev_table[];
144extern ctl_table random_table[];
145#ifdef CONFIG_UNIX98_PTYS
146extern ctl_table pty_table[];
147#endif
148
149#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
150int sysctl_legacy_va_layout;
151#endif
152
153/* /proc declarations: */
154
155#ifdef CONFIG_PROC_FS
156
157static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
158static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
159static int proc_opensys(struct inode *, struct file *);
160
161struct file_operations proc_sys_file_operations = {
162 .open = proc_opensys,
163 .read = proc_readsys,
164 .write = proc_writesys,
165};
166
167extern struct proc_dir_entry *proc_sys_root;
168
169static void register_proc_table(ctl_table *, struct proc_dir_entry *);
170static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
171#endif
172
173/* The default sysctl tables: */
174
175static ctl_table root_table[] = {
176 {
177 .ctl_name = CTL_KERN,
178 .procname = "kernel",
179 .mode = 0555,
180 .child = kern_table,
181 },
182 {
183 .ctl_name = CTL_VM,
184 .procname = "vm",
185 .mode = 0555,
186 .child = vm_table,
187 },
188#ifdef CONFIG_NET
189 {
190 .ctl_name = CTL_NET,
191 .procname = "net",
192 .mode = 0555,
193 .child = net_table,
194 },
195#endif
196 {
197 .ctl_name = CTL_PROC,
198 .procname = "proc",
199 .mode = 0555,
200 .child = proc_table,
201 },
202 {
203 .ctl_name = CTL_FS,
204 .procname = "fs",
205 .mode = 0555,
206 .child = fs_table,
207 },
208 {
209 .ctl_name = CTL_DEBUG,
210 .procname = "debug",
211 .mode = 0555,
212 .child = debug_table,
213 },
214 {
215 .ctl_name = CTL_DEV,
216 .procname = "dev",
217 .mode = 0555,
218 .child = dev_table,
219 },
220 { .ctl_name = 0 }
221};
222
223static ctl_table kern_table[] = {
224 {
225 .ctl_name = KERN_OSTYPE,
226 .procname = "ostype",
227 .data = system_utsname.sysname,
228 .maxlen = sizeof(system_utsname.sysname),
229 .mode = 0444,
230 .proc_handler = &proc_doutsstring,
231 .strategy = &sysctl_string,
232 },
233 {
234 .ctl_name = KERN_OSRELEASE,
235 .procname = "osrelease",
236 .data = system_utsname.release,
237 .maxlen = sizeof(system_utsname.release),
238 .mode = 0444,
239 .proc_handler = &proc_doutsstring,
240 .strategy = &sysctl_string,
241 },
242 {
243 .ctl_name = KERN_VERSION,
244 .procname = "version",
245 .data = system_utsname.version,
246 .maxlen = sizeof(system_utsname.version),
247 .mode = 0444,
248 .proc_handler = &proc_doutsstring,
249 .strategy = &sysctl_string,
250 },
251 {
252 .ctl_name = KERN_NODENAME,
253 .procname = "hostname",
254 .data = system_utsname.nodename,
255 .maxlen = sizeof(system_utsname.nodename),
256 .mode = 0644,
257 .proc_handler = &proc_doutsstring,
258 .strategy = &sysctl_string,
259 },
260 {
261 .ctl_name = KERN_DOMAINNAME,
262 .procname = "domainname",
263 .data = system_utsname.domainname,
264 .maxlen = sizeof(system_utsname.domainname),
265 .mode = 0644,
266 .proc_handler = &proc_doutsstring,
267 .strategy = &sysctl_string,
268 },
269 {
270 .ctl_name = KERN_PANIC,
271 .procname = "panic",
272 .data = &panic_timeout,
273 .maxlen = sizeof(int),
274 .mode = 0644,
275 .proc_handler = &proc_dointvec,
276 },
277 {
278 .ctl_name = KERN_CORE_USES_PID,
279 .procname = "core_uses_pid",
280 .data = &core_uses_pid,
281 .maxlen = sizeof(int),
282 .mode = 0644,
283 .proc_handler = &proc_dointvec,
284 },
285 {
286 .ctl_name = KERN_CORE_PATTERN,
287 .procname = "core_pattern",
288 .data = core_pattern,
289 .maxlen = 64,
290 .mode = 0644,
291 .proc_handler = &proc_dostring,
292 .strategy = &sysctl_string,
293 },
294 {
295 .ctl_name = KERN_TAINTED,
296 .procname = "tainted",
297 .data = &tainted,
298 .maxlen = sizeof(int),
299 .mode = 0444,
300 .proc_handler = &proc_dointvec,
301 },
302 {
303 .ctl_name = KERN_CAP_BSET,
304 .procname = "cap-bound",
305 .data = &cap_bset,
306 .maxlen = sizeof(kernel_cap_t),
307 .mode = 0600,
308 .proc_handler = &proc_dointvec_bset,
309 },
310#ifdef CONFIG_BLK_DEV_INITRD
311 {
312 .ctl_name = KERN_REALROOTDEV,
313 .procname = "real-root-dev",
314 .data = &real_root_dev,
315 .maxlen = sizeof(int),
316 .mode = 0644,
317 .proc_handler = &proc_dointvec,
318 },
319#endif
320#ifdef __sparc__
321 {
322 .ctl_name = KERN_SPARC_REBOOT,
323 .procname = "reboot-cmd",
324 .data = reboot_command,
325 .maxlen = 256,
326 .mode = 0644,
327 .proc_handler = &proc_dostring,
328 .strategy = &sysctl_string,
329 },
330 {
331 .ctl_name = KERN_SPARC_STOP_A,
332 .procname = "stop-a",
333 .data = &stop_a_enabled,
334 .maxlen = sizeof (int),
335 .mode = 0644,
336 .proc_handler = &proc_dointvec,
337 },
338 {
339 .ctl_name = KERN_SPARC_SCONS_PWROFF,
340 .procname = "scons-poweroff",
341 .data = &scons_pwroff,
342 .maxlen = sizeof (int),
343 .mode = 0644,
344 .proc_handler = &proc_dointvec,
345 },
346#endif
347#ifdef __hppa__
348 {
349 .ctl_name = KERN_HPPA_PWRSW,
350 .procname = "soft-power",
351 .data = &pwrsw_enabled,
352 .maxlen = sizeof (int),
353 .mode = 0644,
354 .proc_handler = &proc_dointvec,
355 },
356 {
357 .ctl_name = KERN_HPPA_UNALIGNED,
358 .procname = "unaligned-trap",
359 .data = &unaligned_enabled,
360 .maxlen = sizeof (int),
361 .mode = 0644,
362 .proc_handler = &proc_dointvec,
363 },
364#endif
365 {
366 .ctl_name = KERN_CTLALTDEL,
367 .procname = "ctrl-alt-del",
368 .data = &C_A_D,
369 .maxlen = sizeof(int),
370 .mode = 0644,
371 .proc_handler = &proc_dointvec,
372 },
373 {
374 .ctl_name = KERN_PRINTK,
375 .procname = "printk",
376 .data = &console_loglevel,
377 .maxlen = 4*sizeof(int),
378 .mode = 0644,
379 .proc_handler = &proc_dointvec,
380 },
381#ifdef CONFIG_KMOD
382 {
383 .ctl_name = KERN_MODPROBE,
384 .procname = "modprobe",
385 .data = &modprobe_path,
386 .maxlen = KMOD_PATH_LEN,
387 .mode = 0644,
388 .proc_handler = &proc_dostring,
389 .strategy = &sysctl_string,
390 },
391#endif
392#ifdef CONFIG_HOTPLUG
393 {
394 .ctl_name = KERN_HOTPLUG,
395 .procname = "hotplug",
396 .data = &hotplug_path,
397 .maxlen = HOTPLUG_PATH_LEN,
398 .mode = 0644,
399 .proc_handler = &proc_dostring,
400 .strategy = &sysctl_string,
401 },
402#endif
403#ifdef CONFIG_CHR_DEV_SG
404 {
405 .ctl_name = KERN_SG_BIG_BUFF,
406 .procname = "sg-big-buff",
407 .data = &sg_big_buff,
408 .maxlen = sizeof (int),
409 .mode = 0444,
410 .proc_handler = &proc_dointvec,
411 },
412#endif
413#ifdef CONFIG_BSD_PROCESS_ACCT
414 {
415 .ctl_name = KERN_ACCT,
416 .procname = "acct",
417 .data = &acct_parm,
418 .maxlen = 3*sizeof(int),
419 .mode = 0644,
420 .proc_handler = &proc_dointvec,
421 },
422#endif
423#ifdef CONFIG_SYSVIPC
424 {
425 .ctl_name = KERN_SHMMAX,
426 .procname = "shmmax",
427 .data = &shm_ctlmax,
428 .maxlen = sizeof (size_t),
429 .mode = 0644,
430 .proc_handler = &proc_doulongvec_minmax,
431 },
432 {
433 .ctl_name = KERN_SHMALL,
434 .procname = "shmall",
435 .data = &shm_ctlall,
436 .maxlen = sizeof (size_t),
437 .mode = 0644,
438 .proc_handler = &proc_doulongvec_minmax,
439 },
440 {
441 .ctl_name = KERN_SHMMNI,
442 .procname = "shmmni",
443 .data = &shm_ctlmni,
444 .maxlen = sizeof (int),
445 .mode = 0644,
446 .proc_handler = &proc_dointvec,
447 },
448 {
449 .ctl_name = KERN_MSGMAX,
450 .procname = "msgmax",
451 .data = &msg_ctlmax,
452 .maxlen = sizeof (int),
453 .mode = 0644,
454 .proc_handler = &proc_dointvec,
455 },
456 {
457 .ctl_name = KERN_MSGMNI,
458 .procname = "msgmni",
459 .data = &msg_ctlmni,
460 .maxlen = sizeof (int),
461 .mode = 0644,
462 .proc_handler = &proc_dointvec,
463 },
464 {
465 .ctl_name = KERN_MSGMNB,
466 .procname = "msgmnb",
467 .data = &msg_ctlmnb,
468 .maxlen = sizeof (int),
469 .mode = 0644,
470 .proc_handler = &proc_dointvec,
471 },
472 {
473 .ctl_name = KERN_SEM,
474 .procname = "sem",
475 .data = &sem_ctls,
476 .maxlen = 4*sizeof (int),
477 .mode = 0644,
478 .proc_handler = &proc_dointvec,
479 },
480#endif
481#ifdef CONFIG_MAGIC_SYSRQ
482 {
483 .ctl_name = KERN_SYSRQ,
484 .procname = "sysrq",
485 .data = &sysrq_enabled,
486 .maxlen = sizeof (int),
487 .mode = 0644,
488 .proc_handler = &proc_dointvec,
489 },
490#endif
491 {
492 .ctl_name = KERN_CADPID,
493 .procname = "cad_pid",
494 .data = &cad_pid,
495 .maxlen = sizeof (int),
496 .mode = 0600,
497 .proc_handler = &proc_dointvec,
498 },
499 {
500 .ctl_name = KERN_MAX_THREADS,
501 .procname = "threads-max",
502 .data = &max_threads,
503 .maxlen = sizeof(int),
504 .mode = 0644,
505 .proc_handler = &proc_dointvec,
506 },
507 {
508 .ctl_name = KERN_RANDOM,
509 .procname = "random",
510 .mode = 0555,
511 .child = random_table,
512 },
513#ifdef CONFIG_UNIX98_PTYS
514 {
515 .ctl_name = KERN_PTY,
516 .procname = "pty",
517 .mode = 0555,
518 .child = pty_table,
519 },
520#endif
521 {
522 .ctl_name = KERN_OVERFLOWUID,
523 .procname = "overflowuid",
524 .data = &overflowuid,
525 .maxlen = sizeof(int),
526 .mode = 0644,
527 .proc_handler = &proc_dointvec_minmax,
528 .strategy = &sysctl_intvec,
529 .extra1 = &minolduid,
530 .extra2 = &maxolduid,
531 },
532 {
533 .ctl_name = KERN_OVERFLOWGID,
534 .procname = "overflowgid",
535 .data = &overflowgid,
536 .maxlen = sizeof(int),
537 .mode = 0644,
538 .proc_handler = &proc_dointvec_minmax,
539 .strategy = &sysctl_intvec,
540 .extra1 = &minolduid,
541 .extra2 = &maxolduid,
542 },
543#ifdef CONFIG_ARCH_S390
544#ifdef CONFIG_MATHEMU
545 {
546 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
547 .procname = "ieee_emulation_warnings",
548 .data = &sysctl_ieee_emulation_warnings,
549 .maxlen = sizeof(int),
550 .mode = 0644,
551 .proc_handler = &proc_dointvec,
552 },
553#endif
554#ifdef CONFIG_NO_IDLE_HZ
555 {
556 .ctl_name = KERN_HZ_TIMER,
557 .procname = "hz_timer",
558 .data = &sysctl_hz_timer,
559 .maxlen = sizeof(int),
560 .mode = 0644,
561 .proc_handler = &proc_dointvec,
562 },
563#endif
564 {
565 .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
566 .procname = "userprocess_debug",
567 .data = &sysctl_userprocess_debug,
568 .maxlen = sizeof(int),
569 .mode = 0644,
570 .proc_handler = &proc_dointvec,
571 },
572#endif
573 {
574 .ctl_name = KERN_PIDMAX,
575 .procname = "pid_max",
576 .data = &pid_max,
577 .maxlen = sizeof (int),
578 .mode = 0644,
579 .proc_handler = &proc_dointvec_minmax,
580 .strategy = sysctl_intvec,
581 .extra1 = &pid_max_min,
582 .extra2 = &pid_max_max,
583 },
584 {
585 .ctl_name = KERN_PANIC_ON_OOPS,
586 .procname = "panic_on_oops",
587 .data = &panic_on_oops,
588 .maxlen = sizeof(int),
589 .mode = 0644,
590 .proc_handler = &proc_dointvec,
591 },
592 {
593 .ctl_name = KERN_PRINTK_RATELIMIT,
594 .procname = "printk_ratelimit",
595 .data = &printk_ratelimit_jiffies,
596 .maxlen = sizeof(int),
597 .mode = 0644,
598 .proc_handler = &proc_dointvec_jiffies,
599 .strategy = &sysctl_jiffies,
600 },
601 {
602 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
603 .procname = "printk_ratelimit_burst",
604 .data = &printk_ratelimit_burst,
605 .maxlen = sizeof(int),
606 .mode = 0644,
607 .proc_handler = &proc_dointvec,
608 },
609 {
610 .ctl_name = KERN_NGROUPS_MAX,
611 .procname = "ngroups_max",
612 .data = &ngroups_max,
613 .maxlen = sizeof (int),
614 .mode = 0444,
615 .proc_handler = &proc_dointvec,
616 },
617#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
618 {
619 .ctl_name = KERN_UNKNOWN_NMI_PANIC,
620 .procname = "unknown_nmi_panic",
621 .data = &unknown_nmi_panic,
622 .maxlen = sizeof (int),
623 .mode = 0644,
624 .proc_handler = &proc_unknown_nmi_panic,
625 },
626#endif
627#if defined(CONFIG_X86)
628 {
629 .ctl_name = KERN_BOOTLOADER_TYPE,
630 .procname = "bootloader_type",
631 .data = &bootloader_type,
632 .maxlen = sizeof (int),
633 .mode = 0444,
634 .proc_handler = &proc_dointvec,
635 },
636#endif
637 {
638 .ctl_name = KERN_RANDOMIZE,
639 .procname = "randomize_va_space",
640 .data = &randomize_va_space,
641 .maxlen = sizeof(int),
642 .mode = 0644,
643 .proc_handler = &proc_dointvec,
644 },
645
646 { .ctl_name = 0 }
647};
648
649/* Constants for minimum and maximum testing in vm_table.
650 We use these as one-element integer vectors. */
651static int zero;
652static int one_hundred = 100;
653
654
655static ctl_table vm_table[] = {
656 {
657 .ctl_name = VM_OVERCOMMIT_MEMORY,
658 .procname = "overcommit_memory",
659 .data = &sysctl_overcommit_memory,
660 .maxlen = sizeof(sysctl_overcommit_memory),
661 .mode = 0644,
662 .proc_handler = &proc_dointvec,
663 },
664 {
665 .ctl_name = VM_OVERCOMMIT_RATIO,
666 .procname = "overcommit_ratio",
667 .data = &sysctl_overcommit_ratio,
668 .maxlen = sizeof(sysctl_overcommit_ratio),
669 .mode = 0644,
670 .proc_handler = &proc_dointvec,
671 },
672 {
673 .ctl_name = VM_PAGE_CLUSTER,
674 .procname = "page-cluster",
675 .data = &page_cluster,
676 .maxlen = sizeof(int),
677 .mode = 0644,
678 .proc_handler = &proc_dointvec,
679 },
680 {
681 .ctl_name = VM_DIRTY_BACKGROUND,
682 .procname = "dirty_background_ratio",
683 .data = &dirty_background_ratio,
684 .maxlen = sizeof(dirty_background_ratio),
685 .mode = 0644,
686 .proc_handler = &proc_dointvec_minmax,
687 .strategy = &sysctl_intvec,
688 .extra1 = &zero,
689 .extra2 = &one_hundred,
690 },
691 {
692 .ctl_name = VM_DIRTY_RATIO,
693 .procname = "dirty_ratio",
694 .data = &vm_dirty_ratio,
695 .maxlen = sizeof(vm_dirty_ratio),
696 .mode = 0644,
697 .proc_handler = &proc_dointvec_minmax,
698 .strategy = &sysctl_intvec,
699 .extra1 = &zero,
700 .extra2 = &one_hundred,
701 },
702 {
703 .ctl_name = VM_DIRTY_WB_CS,
704 .procname = "dirty_writeback_centisecs",
705 .data = &dirty_writeback_centisecs,
706 .maxlen = sizeof(dirty_writeback_centisecs),
707 .mode = 0644,
708 .proc_handler = &dirty_writeback_centisecs_handler,
709 },
710 {
711 .ctl_name = VM_DIRTY_EXPIRE_CS,
712 .procname = "dirty_expire_centisecs",
713 .data = &dirty_expire_centisecs,
714 .maxlen = sizeof(dirty_expire_centisecs),
715 .mode = 0644,
716 .proc_handler = &proc_dointvec,
717 },
718 {
719 .ctl_name = VM_NR_PDFLUSH_THREADS,
720 .procname = "nr_pdflush_threads",
721 .data = &nr_pdflush_threads,
722 .maxlen = sizeof nr_pdflush_threads,
723 .mode = 0444 /* read-only*/,
724 .proc_handler = &proc_dointvec,
725 },
726 {
727 .ctl_name = VM_SWAPPINESS,
728 .procname = "swappiness",
729 .data = &vm_swappiness,
730 .maxlen = sizeof(vm_swappiness),
731 .mode = 0644,
732 .proc_handler = &proc_dointvec_minmax,
733 .strategy = &sysctl_intvec,
734 .extra1 = &zero,
735 .extra2 = &one_hundred,
736 },
737#ifdef CONFIG_HUGETLB_PAGE
738 {
739 .ctl_name = VM_HUGETLB_PAGES,
740 .procname = "nr_hugepages",
741 .data = &max_huge_pages,
742 .maxlen = sizeof(unsigned long),
743 .mode = 0644,
744 .proc_handler = &hugetlb_sysctl_handler,
745 .extra1 = (void *)&hugetlb_zero,
746 .extra2 = (void *)&hugetlb_infinity,
747 },
748 {
749 .ctl_name = VM_HUGETLB_GROUP,
750 .procname = "hugetlb_shm_group",
751 .data = &sysctl_hugetlb_shm_group,
752 .maxlen = sizeof(gid_t),
753 .mode = 0644,
754 .proc_handler = &proc_dointvec,
755 },
756#endif
757 {
758 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
759 .procname = "lowmem_reserve_ratio",
760 .data = &sysctl_lowmem_reserve_ratio,
761 .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
762 .mode = 0644,
763 .proc_handler = &lowmem_reserve_ratio_sysctl_handler,
764 .strategy = &sysctl_intvec,
765 },
766 {
767 .ctl_name = VM_MIN_FREE_KBYTES,
768 .procname = "min_free_kbytes",
769 .data = &min_free_kbytes,
770 .maxlen = sizeof(min_free_kbytes),
771 .mode = 0644,
772 .proc_handler = &min_free_kbytes_sysctl_handler,
773 .strategy = &sysctl_intvec,
774 .extra1 = &zero,
775 },
776#ifdef CONFIG_MMU
777 {
778 .ctl_name = VM_MAX_MAP_COUNT,
779 .procname = "max_map_count",
780 .data = &sysctl_max_map_count,
781 .maxlen = sizeof(sysctl_max_map_count),
782 .mode = 0644,
783 .proc_handler = &proc_dointvec
784 },
785#endif
786 {
787 .ctl_name = VM_LAPTOP_MODE,
788 .procname = "laptop_mode",
789 .data = &laptop_mode,
790 .maxlen = sizeof(laptop_mode),
791 .mode = 0644,
792 .proc_handler = &proc_dointvec,
793 .strategy = &sysctl_intvec,
794 .extra1 = &zero,
795 },
796 {
797 .ctl_name = VM_BLOCK_DUMP,
798 .procname = "block_dump",
799 .data = &block_dump,
800 .maxlen = sizeof(block_dump),
801 .mode = 0644,
802 .proc_handler = &proc_dointvec,
803 .strategy = &sysctl_intvec,
804 .extra1 = &zero,
805 },
806 {
807 .ctl_name = VM_VFS_CACHE_PRESSURE,
808 .procname = "vfs_cache_pressure",
809 .data = &sysctl_vfs_cache_pressure,
810 .maxlen = sizeof(sysctl_vfs_cache_pressure),
811 .mode = 0644,
812 .proc_handler = &proc_dointvec,
813 .strategy = &sysctl_intvec,
814 .extra1 = &zero,
815 },
816#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
817 {
818 .ctl_name = VM_LEGACY_VA_LAYOUT,
819 .procname = "legacy_va_layout",
820 .data = &sysctl_legacy_va_layout,
821 .maxlen = sizeof(sysctl_legacy_va_layout),
822 .mode = 0644,
823 .proc_handler = &proc_dointvec,
824 .strategy = &sysctl_intvec,
825 .extra1 = &zero,
826 },
827#endif
828#ifdef CONFIG_SWAP
829 {
830 .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
831 .procname = "swap_token_timeout",
832 .data = &swap_token_default_timeout,
833 .maxlen = sizeof(swap_token_default_timeout),
834 .mode = 0644,
835 .proc_handler = &proc_dointvec_jiffies,
836 .strategy = &sysctl_jiffies,
837 },
838#endif
839 { .ctl_name = 0 }
840};
841
842static ctl_table proc_table[] = {
843 { .ctl_name = 0 }
844};
845
846static ctl_table fs_table[] = {
847 {
848 .ctl_name = FS_NRINODE,
849 .procname = "inode-nr",
850 .data = &inodes_stat,
851 .maxlen = 2*sizeof(int),
852 .mode = 0444,
853 .proc_handler = &proc_dointvec,
854 },
855 {
856 .ctl_name = FS_STATINODE,
857 .procname = "inode-state",
858 .data = &inodes_stat,
859 .maxlen = 7*sizeof(int),
860 .mode = 0444,
861 .proc_handler = &proc_dointvec,
862 },
863 {
864 .ctl_name = FS_NRFILE,
865 .procname = "file-nr",
866 .data = &files_stat,
867 .maxlen = 3*sizeof(int),
868 .mode = 0444,
869 .proc_handler = &proc_dointvec,
870 },
871 {
872 .ctl_name = FS_MAXFILE,
873 .procname = "file-max",
874 .data = &files_stat.max_files,
875 .maxlen = sizeof(int),
876 .mode = 0644,
877 .proc_handler = &proc_dointvec,
878 },
879 {
880 .ctl_name = FS_DENTRY,
881 .procname = "dentry-state",
882 .data = &dentry_stat,
883 .maxlen = 6*sizeof(int),
884 .mode = 0444,
885 .proc_handler = &proc_dointvec,
886 },
887 {
888 .ctl_name = FS_OVERFLOWUID,
889 .procname = "overflowuid",
890 .data = &fs_overflowuid,
891 .maxlen = sizeof(int),
892 .mode = 0644,
893 .proc_handler = &proc_dointvec_minmax,
894 .strategy = &sysctl_intvec,
895 .extra1 = &minolduid,
896 .extra2 = &maxolduid,
897 },
898 {
899 .ctl_name = FS_OVERFLOWGID,
900 .procname = "overflowgid",
901 .data = &fs_overflowgid,
902 .maxlen = sizeof(int),
903 .mode = 0644,
904 .proc_handler = &proc_dointvec_minmax,
905 .strategy = &sysctl_intvec,
906 .extra1 = &minolduid,
907 .extra2 = &maxolduid,
908 },
909 {
910 .ctl_name = FS_LEASES,
911 .procname = "leases-enable",
912 .data = &leases_enable,
913 .maxlen = sizeof(int),
914 .mode = 0644,
915 .proc_handler = &proc_dointvec,
916 },
917#ifdef CONFIG_DNOTIFY
918 {
919 .ctl_name = FS_DIR_NOTIFY,
920 .procname = "dir-notify-enable",
921 .data = &dir_notify_enable,
922 .maxlen = sizeof(int),
923 .mode = 0644,
924 .proc_handler = &proc_dointvec,
925 },
926#endif
927#ifdef CONFIG_MMU
928 {
929 .ctl_name = FS_LEASE_TIME,
930 .procname = "lease-break-time",
931 .data = &lease_break_time,
932 .maxlen = sizeof(int),
933 .mode = 0644,
934 .proc_handler = &proc_dointvec,
935 },
936 {
937 .ctl_name = FS_AIO_NR,
938 .procname = "aio-nr",
939 .data = &aio_nr,
940 .maxlen = sizeof(aio_nr),
941 .mode = 0444,
942 .proc_handler = &proc_dointvec,
943 },
944 {
945 .ctl_name = FS_AIO_MAX_NR,
946 .procname = "aio-max-nr",
947 .data = &aio_max_nr,
948 .maxlen = sizeof(aio_max_nr),
949 .mode = 0644,
950 .proc_handler = &proc_dointvec,
951 },
952#endif
953 { .ctl_name = 0 }
954};
955
956static ctl_table debug_table[] = {
957 { .ctl_name = 0 }
958};
959
960static ctl_table dev_table[] = {
961 { .ctl_name = 0 }
962};
963
964extern void init_irq_proc (void);
965
966void __init sysctl_init(void)
967{
968#ifdef CONFIG_PROC_FS
969 register_proc_table(root_table, proc_sys_root);
970 init_irq_proc();
971#endif
972}
973
974int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
975 void __user *newval, size_t newlen)
976{
977 struct list_head *tmp;
978
979 if (nlen <= 0 || nlen >= CTL_MAXNAME)
980 return -ENOTDIR;
981 if (oldval) {
982 int old_len;
983 if (!oldlenp || get_user(old_len, oldlenp))
984 return -EFAULT;
985 }
986 tmp = &root_table_header.ctl_entry;
987 do {
988 struct ctl_table_header *head =
989 list_entry(tmp, struct ctl_table_header, ctl_entry);
990 void *context = NULL;
991 int error = parse_table(name, nlen, oldval, oldlenp,
992 newval, newlen, head->ctl_table,
993 &context);
994 if (context)
995 kfree(context);
996 if (error != -ENOTDIR)
997 return error;
998 tmp = tmp->next;
999 } while (tmp != &root_table_header.ctl_entry);
1000 return -ENOTDIR;
1001}
1002
1003asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
1004{
1005 struct __sysctl_args tmp;
1006 int error;
1007
1008 if (copy_from_user(&tmp, args, sizeof(tmp)))
1009 return -EFAULT;
1010
1011 lock_kernel();
1012 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
1013 tmp.newval, tmp.newlen);
1014 unlock_kernel();
1015 return error;
1016}
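This is the numeric-name, binary sysctl(2) interface; glibc of this era exposes it as sysctl() in <sys/sysctl.h>. A hedged sketch reading kernel.osrelease by number (most software reads /proc/sys instead, and the wrapper's exact prototype is assumed from contemporary glibc):

#include <stdio.h>
#include <sys/sysctl.h>	/* CTL_KERN, KERN_OSRELEASE, sysctl() wrapper */

int main(void)
{
	int name[] = { CTL_KERN, KERN_OSRELEASE };
	char buf[65];
	size_t len = sizeof(buf) - 1;

	if (sysctl(name, 2, buf, &len, NULL, 0) != 0)
		return 1;
	buf[len] = '\0';
	printf("osrelease: %s\n", buf);
	return 0;
}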
1017
1018/*
1019 * ctl_perm does NOT grant the superuser all rights automatically, because
1020 * some sysctl variables are readonly even to root.
1021 */
1022
1023static int test_perm(int mode, int op)
1024{
1025 if (!current->euid)
1026 mode >>= 6;
1027 else if (in_egroup_p(0))
1028 mode >>= 3;
1029 if ((mode & op & 0007) == op)
1030 return 0;
1031 return -EACCES;
1032}
1033
1034static inline int ctl_perm(ctl_table *table, int op)
1035{
1036 int error;
1037 error = security_sysctl(table, op);
1038 if (error)
1039 return error;
1040 return test_perm(table->mode, op);
1041}
1042
1043static int parse_table(int __user *name, int nlen,
1044 void __user *oldval, size_t __user *oldlenp,
1045 void __user *newval, size_t newlen,
1046 ctl_table *table, void **context)
1047{
1048 int n;
1049repeat:
1050 if (!nlen)
1051 return -ENOTDIR;
1052 if (get_user(n, name))
1053 return -EFAULT;
1054 for ( ; table->ctl_name; table++) {
1055 if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
1056 int error;
1057 if (table->child) {
1058 if (ctl_perm(table, 001))
1059 return -EPERM;
1060 if (table->strategy) {
1061 error = table->strategy(
1062 table, name, nlen,
1063 oldval, oldlenp,
1064 newval, newlen, context);
1065 if (error)
1066 return error;
1067 }
1068 name++;
1069 nlen--;
1070 table = table->child;
1071 goto repeat;
1072 }
1073 error = do_sysctl_strategy(table, name, nlen,
1074 oldval, oldlenp,
1075 newval, newlen, context);
1076 return error;
1077 }
1078 }
1079 return -ENOTDIR;
1080}
1081
1082/* Perform the actual read/write of a sysctl table entry. */
1083int do_sysctl_strategy (ctl_table *table,
1084 int __user *name, int nlen,
1085 void __user *oldval, size_t __user *oldlenp,
1086 void __user *newval, size_t newlen, void **context)
1087{
1088 int op = 0, rc;
1089 size_t len;
1090
1091 if (oldval)
1092 op |= 004;
1093 if (newval)
1094 op |= 002;
1095 if (ctl_perm(table, op))
1096 return -EPERM;
1097
1098 if (table->strategy) {
1099 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1100 newval, newlen, context);
1101 if (rc < 0)
1102 return rc;
1103 if (rc > 0)
1104 return 0;
1105 }
1106
1107 /* If there is no strategy routine, or if the strategy returns
1108 * zero, proceed with automatic r/w */
1109 if (table->data && table->maxlen) {
1110 if (oldval && oldlenp) {
1111 if (get_user(len, oldlenp))
1112 return -EFAULT;
1113 if (len) {
1114 if (len > table->maxlen)
1115 len = table->maxlen;
1116 if(copy_to_user(oldval, table->data, len))
1117 return -EFAULT;
1118 if(put_user(len, oldlenp))
1119 return -EFAULT;
1120 }
1121 }
1122 if (newval && newlen) {
1123 len = newlen;
1124 if (len > table->maxlen)
1125 len = table->maxlen;
1126 if(copy_from_user(table->data, newval, len))
1127 return -EFAULT;
1128 }
1129 }
1130 return 0;
1131}
1132
1133/**
1134 * register_sysctl_table - register a sysctl hierarchy
1135 * @table: the top-level table structure
1136 * @insert_at_head: whether the entry should be inserted in front or at the end
1137 *
1138 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1139 * array. An entry with a ctl_name of 0 terminates the table.
1140 *
1141 * The members of the &ctl_table structure are used as follows:
1142 *
1143 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
1144 * must be unique within that level of sysctl
1145 *
1146 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1147 * enter a sysctl file
1148 *
1149 * data - a pointer to data for use by proc_handler
1150 *
1151 * maxlen - the maximum size in bytes of the data
1152 *
1153 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1154 *
1155 * child - a pointer to the child sysctl table if this entry is a directory, or
1156 * %NULL.
1157 *
1158 * proc_handler - the text handler routine (described below)
1159 *
1160 * strategy - the strategy routine (described below)
1161 *
1162 * de - for internal use by the sysctl routines
1163 *
1164 * extra1, extra2 - extra pointers usable by the proc handler routines
1165 *
1166 * Leaf nodes in the sysctl tree will be represented by a single file
1167 * under /proc; non-leaf nodes will be represented by directories.
1168 *
1169 * sysctl(2) can automatically manage read and write requests through
1170 * the sysctl table. The data and maxlen fields of the ctl_table
1171 * struct enable minimal validation of the values being written to be
1172 * performed, and the mode field allows minimal authentication.
1173 *
1174 * More sophisticated management can be enabled by the provision of a
1175 * strategy routine with the table entry. This will be called before
1176 * any automatic read or write of the data is performed.
1177 *
1178 * The strategy routine may return
1179 *
1180 * < 0 - Error occurred (error is passed to user process)
1181 *
1182 * 0 - OK - proceed with automatic read or write.
1183 *
1184 * > 0 - OK - read or write has been done by the strategy routine, so
1185 * return immediately.
1186 *
1187 * There must be a proc_handler routine for any terminal nodes
1188 * mirrored under /proc/sys (non-terminals are handled by a built-in
1189 * directory handler). Several default handlers are available to
1190 * cover common cases -
1191 *
1192 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1193 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1194 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1195 *
1196 * It is the handler's job to read the input buffer from user memory
1197 * and process it. The handler should return 0 on success.
1198 *
1199 * This routine returns %NULL on a failure to register, and a pointer
1200 * to the table header on success.
1201 */
1202struct ctl_table_header *register_sysctl_table(ctl_table * table,
1203 int insert_at_head)
1204{
1205 struct ctl_table_header *tmp;
1206 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
1207 if (!tmp)
1208 return NULL;
1209 tmp->ctl_table = table;
1210 INIT_LIST_HEAD(&tmp->ctl_entry);
1211 if (insert_at_head)
1212 list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
1213 else
1214 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1215#ifdef CONFIG_PROC_FS
1216 register_proc_table(table, proc_sys_root);
1217#endif
1218 return tmp;
1219}
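Following the field descriptions in the comment above, a minimal registration could look like the sketch below; the names, the ctl_name value and the example_gain tunable are all illustrative, not part of this patch. Re-using CTL_DEV ("dev") as the parent simply hangs the new entry under the existing /proc/sys/dev directory.

static int example_gain = 1;		/* hypothetical tunable */

static ctl_table example_child[] = {
	{
		.ctl_name	= 1,	/* unique within this directory */
		.procname	= "example_gain",
		.data		= &example_gain,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table example_dir[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.mode		= 0555,
		.child		= example_child,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header *example_header;

/* example_header = register_sysctl_table(example_dir, 0);
 *	...
 * unregister_sysctl_table(example_header);
 */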
1220
1221/**
1222 * unregister_sysctl_table - unregister a sysctl table hierarchy
1223 * @header: the header returned from register_sysctl_table
1224 *
1225 * Unregisters the sysctl table and all children. proc entries may not
1226 * actually be removed until they are no longer used by anyone.
1227 */
1228void unregister_sysctl_table(struct ctl_table_header * header)
1229{
1230 list_del(&header->ctl_entry);
1231#ifdef CONFIG_PROC_FS
1232 unregister_proc_table(header->ctl_table, proc_sys_root);
1233#endif
1234 kfree(header);
1235}
1236
1237/*
1238 * /proc/sys support
1239 */
1240
1241#ifdef CONFIG_PROC_FS
1242
1243/* Scan the sysctl entries in table and add them all into /proc */
1244static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
1245{
1246 struct proc_dir_entry *de;
1247 int len;
1248 mode_t mode;
1249
1250 for (; table->ctl_name; table++) {
1251 /* Can't do anything without a proc name. */
1252 if (!table->procname)
1253 continue;
1254 /* Maybe we can't do anything with it... */
1255 if (!table->proc_handler && !table->child) {
1256 printk(KERN_WARNING "SYSCTL: Can't register %s\n",
1257 table->procname);
1258 continue;
1259 }
1260
1261 len = strlen(table->procname);
1262 mode = table->mode;
1263
1264 de = NULL;
1265 if (table->proc_handler)
1266 mode |= S_IFREG;
1267 else {
1268 mode |= S_IFDIR;
1269 for (de = root->subdir; de; de = de->next) {
1270 if (proc_match(len, table->procname, de))
1271 break;
1272 }
1273 /* If the subdir exists already, de is non-NULL */
1274 }
1275
1276 if (!de) {
1277 de = create_proc_entry(table->procname, mode, root);
1278 if (!de)
1279 continue;
1280 de->data = (void *) table;
1281 if (table->proc_handler)
1282 de->proc_fops = &proc_sys_file_operations;
1283 }
1284 table->de = de;
1285 if (de->mode & S_IFDIR)
1286 register_proc_table(table->child, de);
1287 }
1288}
1289
1290/*
1291 * Unregister a /proc sysctl table and any subdirectories.
1292 */
1293static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
1294{
1295 struct proc_dir_entry *de;
1296 for (; table->ctl_name; table++) {
1297 if (!(de = table->de))
1298 continue;
1299 if (de->mode & S_IFDIR) {
1300 if (!table->child) {
1301 printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
1302 continue;
1303 }
1304 unregister_proc_table(table->child, de);
1305
1306 /* Don't unregister directories which still have entries.. */
1307 if (de->subdir)
1308 continue;
1309 }
1310
1311 /* Don't unregister proc entries that are still being used.. */
1312 if (atomic_read(&de->count))
1313 continue;
1314
1315 table->de = NULL;
1316 remove_proc_entry(table->procname, root);
1317 }
1318}
1319
1320static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1321 size_t count, loff_t *ppos)
1322{
1323 int op;
1324 struct proc_dir_entry *de;
1325 struct ctl_table *table;
1326 size_t res;
1327 ssize_t error;
1328
1329 de = PDE(file->f_dentry->d_inode);
1330 if (!de || !de->data)
1331 return -ENOTDIR;
1332 table = (struct ctl_table *) de->data;
1333 if (!table || !table->proc_handler)
1334 return -ENOTDIR;
1335 op = (write ? 002 : 004);
1336 if (ctl_perm(table, op))
1337 return -EPERM;
1338
1339 res = count;
1340
1341 error = (*table->proc_handler) (table, write, file, buf, &res, ppos);
1342 if (error)
1343 return error;
1344 return res;
1345}
1346
1347static int proc_opensys(struct inode *inode, struct file *file)
1348{
1349 if (file->f_mode & FMODE_WRITE) {
1350 /*
1351 * sysctl entries that are not writable,
1352 * are _NOT_ writable, capabilities or not.
1353 */
1354 if (!(inode->i_mode & S_IWUSR))
1355 return -EPERM;
1356 }
1357
1358 return 0;
1359}
1360
1361static ssize_t proc_readsys(struct file * file, char __user * buf,
1362 size_t count, loff_t *ppos)
1363{
1364 return do_rw_proc(0, file, buf, count, ppos);
1365}
1366
1367static ssize_t proc_writesys(struct file * file, const char __user * buf,
1368 size_t count, loff_t *ppos)
1369{
1370 return do_rw_proc(1, file, (char __user *) buf, count, ppos);
1371}
1372
1373/**
1374 * proc_dostring - read a string sysctl
1375 * @table: the sysctl table
1376 * @write: %TRUE if this is a write to the sysctl file
1377 * @filp: the file structure
1378 * @buffer: the user buffer
1379 * @lenp: the size of the user buffer
1380 * @ppos: file position
1381 *
1382 * Reads/writes a string from/to the user buffer. If the kernel
1383 * buffer provided is not large enough to hold the string, the
1384 * string is truncated. The copied string is %NULL-terminated.
1385 * If the string is being read by the user process, it is copied
1386 * and a newline '\n' is added. It is truncated if the buffer is
1387 * not large enough.
1388 *
1389 * Returns 0 on success.
1390 */
1391int proc_dostring(ctl_table *table, int write, struct file *filp,
1392 void __user *buffer, size_t *lenp, loff_t *ppos)
1393{
1394 size_t len;
1395 char __user *p;
1396 char c;
1397
1398 if (!table->data || !table->maxlen || !*lenp ||
1399 (*ppos && !write)) {
1400 *lenp = 0;
1401 return 0;
1402 }
1403
1404 if (write) {
1405 len = 0;
1406 p = buffer;
1407 while (len < *lenp) {
1408 if (get_user(c, p++))
1409 return -EFAULT;
1410 if (c == 0 || c == '\n')
1411 break;
1412 len++;
1413 }
1414 if (len >= table->maxlen)
1415 len = table->maxlen-1;
1416 if(copy_from_user(table->data, buffer, len))
1417 return -EFAULT;
1418 ((char *) table->data)[len] = 0;
1419 *ppos += *lenp;
1420 } else {
1421 len = strlen(table->data);
1422 if (len > table->maxlen)
1423 len = table->maxlen;
1424 if (len > *lenp)
1425 len = *lenp;
1426 if (len)
1427 if(copy_to_user(buffer, table->data, len))
1428 return -EFAULT;
1429 if (len < *lenp) {
1430 if(put_user('\n', ((char __user *) buffer) + len))
1431 return -EFAULT;
1432 len++;
1433 }
1434 *lenp = len;
1435 *ppos += len;
1436 }
1437 return 0;
1438}
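/*
 * A sketch of a table entry served by proc_dostring(), assuming a fixed
 * kernel buffer; foo_name and foo_name_entry are hypothetical.  A read
 * returns the string with a trailing newline, a write is truncated to
 * maxlen-1 bytes and NUL-terminated, as described above.
 */
static char foo_name[64] = "default";

static ctl_table foo_name_entry[] = {
	{
		.ctl_name	= 2,			/* hypothetical */
		.procname	= "foo_name",
		.data		= foo_name,
		.maxlen		= sizeof(foo_name),	/* truncation limit */
		.mode		= 0644,
		.proc_handler	= &proc_dostring,
	},
	{ .ctl_name = 0 }
};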
1439
1440/*
1441 * Special case of dostring for the UTS structure. This has locks
1442 * to observe. Should this be in kernel/sys.c ????
1443 */
1444
1445static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
1446 void __user *buffer, size_t *lenp, loff_t *ppos)
1447{
1448 int r;
1449
1450 if (!write) {
1451 down_read(&uts_sem);
1452 r=proc_dostring(table,0,filp,buffer,lenp, ppos);
1453 up_read(&uts_sem);
1454 } else {
1455 down_write(&uts_sem);
1456 r=proc_dostring(table,1,filp,buffer,lenp, ppos);
1457 up_write(&uts_sem);
1458 }
1459 return r;
1460}
1461
1462static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1463 int *valp,
1464 int write, void *data)
1465{
1466 if (write) {
1467 *valp = *negp ? -*lvalp : *lvalp;
1468 } else {
1469 int val = *valp;
1470 if (val < 0) {
1471 *negp = -1;
1472 *lvalp = (unsigned long)-val;
1473 } else {
1474 *negp = 0;
1475 *lvalp = (unsigned long)val;
1476 }
1477 }
1478 return 0;
1479}
1480
1481static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
1482 void __user *buffer, size_t *lenp, loff_t *ppos,
1483 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
1484 int write, void *data),
1485 void *data)
1486{
1487#define TMPBUFLEN 21
1488 int *i, vleft, first=1, neg, val;
1489 unsigned long lval;
1490 size_t left, len;
1491
1492 char buf[TMPBUFLEN], *p;
1493 char __user *s = buffer;
1494
1495 if (!table->data || !table->maxlen || !*lenp ||
1496 (*ppos && !write)) {
1497 *lenp = 0;
1498 return 0;
1499 }
1500
1501 i = (int *) table->data;
1502 vleft = table->maxlen / sizeof(*i);
1503 left = *lenp;
1504
1505 if (!conv)
1506 conv = do_proc_dointvec_conv;
1507
1508 for (; left && vleft--; i++, first=0) {
1509 if (write) {
1510 while (left) {
1511 char c;
1512 if (get_user(c, s))
1513 return -EFAULT;
1514 if (!isspace(c))
1515 break;
1516 left--;
1517 s++;
1518 }
1519 if (!left)
1520 break;
1521 neg = 0;
1522 len = left;
1523 if (len > sizeof(buf) - 1)
1524 len = sizeof(buf) - 1;
1525 if (copy_from_user(buf, s, len))
1526 return -EFAULT;
1527 buf[len] = 0;
1528 p = buf;
1529 if (*p == '-' && left > 1) {
1530 neg = 1;
1531 left--, p++;
1532 }
1533 if (*p < '0' || *p > '9')
1534 break;
1535
1536 lval = simple_strtoul(p, &p, 0);
1537
1538 len = p-buf;
1539 if ((len < left) && *p && !isspace(*p))
1540 break;
1541 if (neg)
1542 val = -val;
1543 s += len;
1544 left -= len;
1545
1546 if (conv(&neg, &lval, i, 1, data))
1547 break;
1548 } else {
1549 p = buf;
1550 if (!first)
1551 *p++ = '\t';
1552
1553 if (conv(&neg, &lval, i, 0, data))
1554 break;
1555
1556 sprintf(p, "%s%lu", neg ? "-" : "", lval);
1557 len = strlen(buf);
1558 if (len > left)
1559 len = left;
1560 if(copy_to_user(s, buf, len))
1561 return -EFAULT;
1562 left -= len;
1563 s += len;
1564 }
1565 }
1566
1567 if (!write && !first && left) {
1568 if(put_user('\n', s))
1569 return -EFAULT;
1570 left--, s++;
1571 }
1572 if (write) {
1573 while (left) {
1574 char c;
1575 if (get_user(c, s++))
1576 return -EFAULT;
1577 if (!isspace(c))
1578 break;
1579 left--;
1580 }
1581 }
1582 if (write && first)
1583 return -EINVAL;
1584 *lenp -= left;
1585 *ppos += *lenp;
1586 return 0;
1587#undef TMPBUFLEN
1588}
1589
1590/**
1591 * proc_dointvec - read a vector of integers
1592 * @table: the sysctl table
1593 * @write: %TRUE if this is a write to the sysctl file
1594 * @filp: the file structure
1595 * @buffer: the user buffer
1596 * @lenp: the size of the user buffer
1597 * @ppos: file position
1598 *
1599 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
1600 * values from/to the user buffer, treated as an ASCII string.
1601 *
1602 * Returns 0 on success.
1603 */
1604int proc_dointvec(ctl_table *table, int write, struct file *filp,
1605 void __user *buffer, size_t *lenp, loff_t *ppos)
1606{
1607 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1608 NULL,NULL);
1609}
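/*
 * Because the handler walks table->maxlen / sizeof(int) elements, one entry
 * can expose a whole array: a read yields tab-separated decimals, a write
 * accepts whitespace-separated integers.  foo_ratio is hypothetical.
 */
static int foo_ratio[3] = { 10, 20, 70 };

static ctl_table foo_ratio_entry[] = {
	{
		.ctl_name	= 3,			/* hypothetical */
		.procname	= "foo_ratio",
		.data		= foo_ratio,
		.maxlen		= sizeof(foo_ratio),	/* three integers */
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};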
1610
1611#define OP_SET 0
1612#define OP_AND 1
1613#define OP_OR 2
1614#define OP_MAX 3
1615#define OP_MIN 4
1616
1617static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1618 int *valp,
1619 int write, void *data)
1620{
1621 int op = *(int *)data;
1622 if (write) {
1623 int val = *negp ? -*lvalp : *lvalp;
1624 switch(op) {
1625 case OP_SET: *valp = val; break;
1626 case OP_AND: *valp &= val; break;
1627 case OP_OR: *valp |= val; break;
1628 case OP_MAX: if(*valp < val)
1629 *valp = val;
1630 break;
1631 case OP_MIN: if(*valp > val)
1632 *valp = val;
1633 break;
1634 }
1635 } else {
1636 int val = *valp;
1637 if (val < 0) {
1638 *negp = -1;
1639 *lvalp = (unsigned long)-val;
1640 } else {
1641 *negp = 0;
1642 *lvalp = (unsigned long)val;
1643 }
1644 }
1645 return 0;
1646}
1647
1648/*
1649 * Only init (pid 1) may raise the capability bounding set; other callers
 * can only AND bits away.
1650 */
1651
1652int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
1653 void __user *buffer, size_t *lenp, loff_t *ppos)
1654{
1655 int op;
1656
1657 if (!capable(CAP_SYS_MODULE)) {
1658 return -EPERM;
1659 }
1660
1661 op = (current->pid == 1) ? OP_SET : OP_AND;
1662 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1663 do_proc_dointvec_bset_conv,&op);
1664}
1665
1666struct do_proc_dointvec_minmax_conv_param {
1667 int *min;
1668 int *max;
1669};
1670
1671static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
1672 int *valp,
1673 int write, void *data)
1674{
1675 struct do_proc_dointvec_minmax_conv_param *param = data;
1676 if (write) {
1677 int val = *negp ? -*lvalp : *lvalp;
1678 if ((param->min && *param->min > val) ||
1679 (param->max && *param->max < val))
1680 return -EINVAL;
1681 *valp = val;
1682 } else {
1683 int val = *valp;
1684 if (val < 0) {
1685 *negp = -1;
1686 *lvalp = (unsigned long)-val;
1687 } else {
1688 *negp = 0;
1689 *lvalp = (unsigned long)val;
1690 }
1691 }
1692 return 0;
1693}
1694
1695/**
1696 * proc_dointvec_minmax - read a vector of integers with min/max values
1697 * @table: the sysctl table
1698 * @write: %TRUE if this is a write to the sysctl file
1699 * @filp: the file structure
1700 * @buffer: the user buffer
1701 * @lenp: the size of the user buffer
1702 * @ppos: file position
1703 *
1704 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
1705 * values from/to the user buffer, treated as an ASCII string.
1706 *
1707 * This routine will ensure the values are within the range specified by
1708 * table->extra1 (min) and table->extra2 (max).
1709 *
1710 * Returns 0 on success.
1711 */
1712int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
1713 void __user *buffer, size_t *lenp, loff_t *ppos)
1714{
1715 struct do_proc_dointvec_minmax_conv_param param = {
1716 .min = (int *) table->extra1,
1717 .max = (int *) table->extra2,
1718 };
1719 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
1720 do_proc_dointvec_minmax_conv, &param);
1721}
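/*
 * A sketch of a clamped entry: extra1/extra2 point at static bounds, and a
 * write outside [1, 100] is rejected with -EINVAL by the conversion helper
 * above.  All names are hypothetical.
 */
static int foo_pct = 50;
static int foo_pct_min = 1;
static int foo_pct_max = 100;

static ctl_table foo_pct_entry[] = {
	{
		.ctl_name	= 4,			/* hypothetical */
		.procname	= "foo_pct",
		.data		= &foo_pct,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.extra1		= &foo_pct_min,		/* lower bound */
		.extra2		= &foo_pct_max,		/* upper bound */
	},
	{ .ctl_name = 0 }
};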
1722
1723static int do_proc_doulongvec_minmax(ctl_table *table, int write,
1724 struct file *filp,
1725 void __user *buffer,
1726 size_t *lenp, loff_t *ppos,
1727 unsigned long convmul,
1728 unsigned long convdiv)
1729{
1730#define TMPBUFLEN 21
1731 unsigned long *i, *min, *max, val;
1732 int vleft, first=1, neg;
1733 size_t len, left;
1734 char buf[TMPBUFLEN], *p;
1735 char __user *s = buffer;
1736
1737 if (!table->data || !table->maxlen || !*lenp ||
1738 (*ppos && !write)) {
1739 *lenp = 0;
1740 return 0;
1741 }
1742
1743 i = (unsigned long *) table->data;
1744 min = (unsigned long *) table->extra1;
1745 max = (unsigned long *) table->extra2;
1746 vleft = table->maxlen / sizeof(unsigned long);
1747 left = *lenp;
1748
1749 for (; left && vleft--; i++, min++, max++, first=0) {
1750 if (write) {
1751 while (left) {
1752 char c;
1753 if (get_user(c, s))
1754 return -EFAULT;
1755 if (!isspace(c))
1756 break;
1757 left--;
1758 s++;
1759 }
1760 if (!left)
1761 break;
1762 neg = 0;
1763 len = left;
1764 if (len > TMPBUFLEN-1)
1765 len = TMPBUFLEN-1;
1766 if (copy_from_user(buf, s, len))
1767 return -EFAULT;
1768 buf[len] = 0;
1769 p = buf;
1770 if (*p == '-' && left > 1) {
1771 neg = 1;
1772 left--, p++;
1773 }
1774 if (*p < '0' || *p > '9')
1775 break;
1776 val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
1777 len = p-buf;
1778 if ((len < left) && *p && !isspace(*p))
1779 break;
1780 if (neg)
1781 val = -val;
1782 s += len;
1783 left -= len;
1784
1785 if(neg)
1786 continue;
1787 if ((min && val < *min) || (max && val > *max))
1788 continue;
1789 *i = val;
1790 } else {
1791 p = buf;
1792 if (!first)
1793 *p++ = '\t';
1794 sprintf(p, "%lu", convdiv * (*i) / convmul);
1795 len = strlen(buf);
1796 if (len > left)
1797 len = left;
1798 if(copy_to_user(s, buf, len))
1799 return -EFAULT;
1800 left -= len;
1801 s += len;
1802 }
1803 }
1804
1805 if (!write && !first && left) {
1806 if(put_user('\n', s))
1807 return -EFAULT;
1808 left--, s++;
1809 }
1810 if (write) {
1811 while (left) {
1812 char c;
1813 if (get_user(c, s++))
1814 return -EFAULT;
1815 if (!isspace(c))
1816 break;
1817 left--;
1818 }
1819 }
1820 if (write && first)
1821 return -EINVAL;
1822 *lenp -= left;
1823 *ppos += *lenp;
1824 return 0;
1825#undef TMPBUFLEN
1826}
1827
1828/**
1829 * proc_doulongvec_minmax - read a vector of long integers with min/max values
1830 * @table: the sysctl table
1831 * @write: %TRUE if this is a write to the sysctl file
1832 * @filp: the file structure
1833 * @buffer: the user buffer
1834 * @lenp: the size of the user buffer
1835 * @ppos: file position
1836 *
1837 * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
1838 * values from/to the user buffer, treated as an ASCII string.
1839 *
1840 * This routine will ensure the values are within the range specified by
1841 * table->extra1 (min) and table->extra2 (max).
1842 *
1843 * Returns 0 on success.
1844 */
1845int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
1846 void __user *buffer, size_t *lenp, loff_t *ppos)
1847{
1848 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
1849}
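/*
 * The same idea for unsigned long values, sketched with hypothetical names.
 * Note that extra1/extra2 are arrays of unsigned long here (one bound per
 * element) and that negative or out-of-range writes are silently skipped
 * rather than rejected.
 */
static unsigned long foo_bytes = 4096;
static unsigned long foo_bytes_min = 512;
static unsigned long foo_bytes_max = 1024 * 1024;

static ctl_table foo_bytes_entry[] = {
	{
		.ctl_name	= 5,			/* hypothetical */
		.procname	= "foo_bytes",
		.data		= &foo_bytes,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= &proc_doulongvec_minmax,
		.extra1		= &foo_bytes_min,	/* per-element lower bound */
		.extra2		= &foo_bytes_max,	/* per-element upper bound */
	},
	{ .ctl_name = 0 }
};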
1850
1851/**
1852 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
1853 * @table: the sysctl table
1854 * @write: %TRUE if this is a write to the sysctl file
1855 * @filp: the file structure
1856 * @buffer: the user buffer
1857 * @lenp: the size of the user buffer
1858 * @ppos: file position
1859 *
1860 * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
1861 * values from/to the user buffer, treated as an ASCII string. The values
1862 * are treated as milliseconds, and converted to jiffies when they are stored.
1863 *
1864 * This routine will ensure the values are within the range specified by
1865 * table->extra1 (min) and table->extra2 (max).
1866 *
1867 * Returns 0 on success.
1868 */
1869int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
1870 struct file *filp,
1871 void __user *buffer,
1872 size_t *lenp, loff_t *ppos)
1873{
1874 return do_proc_doulongvec_minmax(table, write, filp, buffer,
1875 lenp, ppos, HZ, 1000l);
1876}
1877
1878
1879static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
1880 int *valp,
1881 int write, void *data)
1882{
1883 if (write) {
1884 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
1885 } else {
1886 int val = *valp;
1887 unsigned long lval;
1888 if (val < 0) {
1889 *negp = -1;
1890 lval = (unsigned long)-val;
1891 } else {
1892 *negp = 0;
1893 lval = (unsigned long)val;
1894 }
1895 *lvalp = lval / HZ;
1896 }
1897 return 0;
1898}
1899
1900static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
1901 int *valp,
1902 int write, void *data)
1903{
1904 if (write) {
1905 *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
1906 } else {
1907 int val = *valp;
1908 unsigned long lval;
1909 if (val < 0) {
1910 *negp = -1;
1911 lval = (unsigned long)-val;
1912 } else {
1913 *negp = 0;
1914 lval = (unsigned long)val;
1915 }
1916 *lvalp = jiffies_to_clock_t(lval);
1917 }
1918 return 0;
1919}
1920
1921static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
1922 int *valp,
1923 int write, void *data)
1924{
1925 if (write) {
1926 *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
1927 } else {
1928 int val = *valp;
1929 unsigned long lval;
1930 if (val < 0) {
1931 *negp = -1;
1932 lval = (unsigned long)-val;
1933 } else {
1934 *negp = 0;
1935 lval = (unsigned long)val;
1936 }
1937 *lvalp = jiffies_to_msecs(lval);
1938 }
1939 return 0;
1940}
1941
1942/**
1943 * proc_dointvec_jiffies - read a vector of integers as seconds
1944 * @table: the sysctl table
1945 * @write: %TRUE if this is a write to the sysctl file
1946 * @filp: the file structure
1947 * @buffer: the user buffer
1948 * @lenp: the size of the user buffer
1949 * @ppos: file position
1950 *
1951 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
1952 * values from/to the user buffer, treated as an ASCII string.
1953 * The values read are assumed to be in seconds, and are converted into
1954 * jiffies.
1955 *
1956 * Returns 0 on success.
1957 */
1958int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
1959 void __user *buffer, size_t *lenp, loff_t *ppos)
1960{
1961 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1962 do_proc_dointvec_jiffies_conv,NULL);
1963}
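/*
 * A sketch of a timeout that the kernel keeps in jiffies but exposes in
 * seconds: writing 30 stores 30*HZ, reading divides by HZ again.
 * foo_timeout is hypothetical and would be used directly as a jiffies
 * count by kernel code.
 */
static int foo_timeout = 30 * HZ;			/* stored in jiffies */

static ctl_table foo_timeout_entry[] = {
	{
		.ctl_name	= 6,			/* hypothetical */
		.procname	= "foo_timeout",	/* user-visible unit: seconds */
		.data		= &foo_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{ .ctl_name = 0 }
};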
1964
1965/**
1966 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
1967 * @table: the sysctl table
1968 * @write: %TRUE if this is a write to the sysctl file
1969 * @filp: the file structure
1970 * @buffer: the user buffer
1971 * @lenp: the size of the user buffer
 * @ppos: file position
1972 *
1973 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
1974 * values from/to the user buffer, treated as an ASCII string.
1975 * The values read are assumed to be in 1/USER_HZ seconds, and
1976 * are converted into jiffies.
1977 *
1978 * Returns 0 on success.
1979 */
1980int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
1981 void __user *buffer, size_t *lenp, loff_t *ppos)
1982{
1983 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1984 do_proc_dointvec_userhz_jiffies_conv,NULL);
1985}
1986
1987/**
1988 * proc_dointvec_ms_jiffies - read a vector of integers as milliseconds
1989 * @table: the sysctl table
1990 * @write: %TRUE if this is a write to the sysctl file
1991 * @filp: the file structure
1992 * @buffer: the user buffer
1993 * @lenp: the size of the user buffer
 * @ppos: file position
1994 *
1995 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
1996 * values from/to the user buffer, treated as an ASCII string.
1997 * The values read are assumed to be in 1/1000 seconds, and
1998 * are converted into jiffies.
1999 *
2000 * Returns 0 on success.
2001 */
2002int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2003 void __user *buffer, size_t *lenp, loff_t *ppos)
2004{
2005 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
2006 do_proc_dointvec_ms_jiffies_conv, NULL);
2007}
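/*
 * The millisecond variant follows the same pattern; only the user-visible
 * unit changes, so a write of 250 stores the jiffies equivalent of 250 ms.
 * Names are hypothetical; the initializer uses plain arithmetic because
 * msecs_to_jiffies() is not a compile-time constant.
 */
static int foo_delay = 250 * HZ / 1000;			/* roughly 250 ms, in jiffies */

static ctl_table foo_delay_entry[] = {
	{
		.ctl_name	= 7,			/* hypothetical */
		.procname	= "foo_delay_ms",	/* user-visible unit: milliseconds */
		.data		= &foo_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
	},
	{ .ctl_name = 0 }
};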
2008
2009#else /* CONFIG_PROC_FS */
2010
2011int proc_dostring(ctl_table *table, int write, struct file *filp,
2012 void __user *buffer, size_t *lenp, loff_t *ppos)
2013{
2014 return -ENOSYS;
2015}
2016
2017static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
2018 void __user *buffer, size_t *lenp, loff_t *ppos)
2019{
2020 return -ENOSYS;
2021}
2022
2023int proc_dointvec(ctl_table *table, int write, struct file *filp,
2024 void __user *buffer, size_t *lenp, loff_t *ppos)
2025{
2026 return -ENOSYS;
2027}
2028
2029int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
2030 void __user *buffer, size_t *lenp, loff_t *ppos)
2031{
2032 return -ENOSYS;
2033}
2034
2035int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
2036 void __user *buffer, size_t *lenp, loff_t *ppos)
2037{
2038 return -ENOSYS;
2039}
2040
2041int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
2042 void __user *buffer, size_t *lenp, loff_t *ppos)
2043{
2044 return -ENOSYS;
2045}
2046
2047int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
2048 void __user *buffer, size_t *lenp, loff_t *ppos)
2049{
2050 return -ENOSYS;
2051}
2052
2053int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2054 void __user *buffer, size_t *lenp, loff_t *ppos)
2055{
2056 return -ENOSYS;
2057}
2058
2059int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
2060 void __user *buffer, size_t *lenp, loff_t *ppos)
2061{
2062 return -ENOSYS;
2063}
2064
2065int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2066 struct file *filp,
2067 void __user *buffer,
2068 size_t *lenp, loff_t *ppos)
2069{
2070 return -ENOSYS;
2071}
2072
2073
2074#endif /* CONFIG_PROC_FS */
2075
2076
2077/*
2078 * General sysctl support routines
2079 */
2080
2081/* The generic string strategy routine: */
2082int sysctl_string(ctl_table *table, int __user *name, int nlen,
2083 void __user *oldval, size_t __user *oldlenp,
2084 void __user *newval, size_t newlen, void **context)
2085{
2086 size_t l, len;
2087
2088 if (!table->data || !table->maxlen)
2089 return -ENOTDIR;
2090
2091 if (oldval && oldlenp) {
2092 if (get_user(len, oldlenp))
2093 return -EFAULT;
2094 if (len) {
2095 l = strlen(table->data);
2096 if (len > l) len = l;
2097 if (len >= table->maxlen)
2098 len = table->maxlen;
2099 if(copy_to_user(oldval, table->data, len))
2100 return -EFAULT;
2101 if(put_user(0, ((char __user *) oldval) + len))
2102 return -EFAULT;
2103 if(put_user(len, oldlenp))
2104 return -EFAULT;
2105 }
2106 }
2107 if (newval && newlen) {
2108 len = newlen;
2109 if (len > table->maxlen)
2110 len = table->maxlen;
2111 if(copy_from_user(table->data, newval, len))
2112 return -EFAULT;
2113 if (len == table->maxlen)
2114 len--;
2115 ((char *) table->data)[len] = 0;
2116 }
2117 return 0;
2118}
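/*
 * The strategy routines serve the binary sys_sysctl() interface, while
 * proc_handler serves /proc/sys; an entry that wants both typically sets
 * the pair, sketched here with hypothetical names.
 */
static char foo_tag[32] = "none";

static ctl_table foo_tag_entry[] = {
	{
		.ctl_name	= 8,			/* hypothetical binary name */
		.procname	= "foo_tag",
		.data		= foo_tag,
		.maxlen		= sizeof(foo_tag),
		.mode		= 0644,
		.proc_handler	= &proc_dostring,	/* /proc/sys path */
		.strategy	= &sysctl_string,	/* sys_sysctl() path */
	},
	{ .ctl_name = 0 }
};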
2119
2120/*
2121 * This function makes sure that all of the integers in the vector
2122 * are between the minimum and maximum values given in the arrays
2123 * table->extra1 and table->extra2, respectively.
2124 */
2125int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2126 void __user *oldval, size_t __user *oldlenp,
2127 void __user *newval, size_t newlen, void **context)
2128{
2129
2130 if (newval && newlen) {
2131 int __user *vec = (int __user *) newval;
2132 int *min = (int *) table->extra1;
2133 int *max = (int *) table->extra2;
2134 size_t length;
2135 int i;
2136
2137 if (newlen % sizeof(int) != 0)
2138 return -EINVAL;
2139
2140 if (!table->extra1 && !table->extra2)
2141 return 0;
2142
2143 if (newlen > table->maxlen)
2144 newlen = table->maxlen;
2145 length = newlen / sizeof(int);
2146
2147 for (i = 0; i < length; i++) {
2148 int value;
2149 if (get_user(value, vec + i))
2150 return -EFAULT;
2151 if (min && value < min[i])
2152 return -EINVAL;
2153 if (max && value > max[i])
2154 return -EINVAL;
2155 }
2156 }
2157 return 0;
2158}
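/*
 * sysctl_intvec() is normally paired with proc_dointvec_minmax() so that
 * the text and binary interfaces enforce the same extra1/extra2 bounds.
 * A sketch with hypothetical names:
 */
static int foo_level = 3;
static int foo_level_min = 0;
static int foo_level_max = 7;

static ctl_table foo_level_entry[] = {
	{
		.ctl_name	= 9,			/* hypothetical */
		.procname	= "foo_level",
		.data		= &foo_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,	/* bounds-check sys_sysctl() writes */
		.extra1		= &foo_level_min,
		.extra2		= &foo_level_max,
	},
	{ .ctl_name = 0 }
};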
2159
2160/* Strategy function to convert jiffies to seconds */
2161int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2162 void __user *oldval, size_t __user *oldlenp,
2163 void __user *newval, size_t newlen, void **context)
2164{
2165 if (oldval) {
2166 size_t olen;
2167 if (oldlenp) {
2168 if (get_user(olen, oldlenp))
2169 return -EFAULT;
2170 if (olen!=sizeof(int))
2171 return -EINVAL;
2172 }
2173 if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) ||
2174 (oldlenp && put_user(sizeof(int),oldlenp)))
2175 return -EFAULT;
2176 }
2177 if (newval && newlen) {
2178 int new;
2179 if (newlen != sizeof(int))
2180 return -EINVAL;
2181 if (get_user(new, (int __user *)newval))
2182 return -EFAULT;
2183 *(int *)(table->data) = new*HZ;
2184 }
2185 return 1;
2186}
2187
2188/* Strategy function to convert jiffies to seconds */
2189int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2190 void __user *oldval, size_t __user *oldlenp,
2191 void __user *newval, size_t newlen, void **context)
2192{
2193 if (oldval) {
2194 size_t olen;
2195 if (oldlenp) {
2196 if (get_user(olen, oldlenp))
2197 return -EFAULT;
2198 if (olen!=sizeof(int))
2199 return -EINVAL;
2200 }
2201 if (put_user(jiffies_to_msecs(*(int *)(table->data)), (int __user *)oldval) ||
2202 (oldlenp && put_user(sizeof(int),oldlenp)))
2203 return -EFAULT;
2204 }
2205 if (newval && newlen) {
2206 int new;
2207 if (newlen != sizeof(int))
2208 return -EINVAL;
2209 if (get_user(new, (int __user *)newval))
2210 return -EFAULT;
2211 *(int *)(table->data) = msecs_to_jiffies(new);
2212 }
2213 return 1;
2214}
2215
2216#else /* CONFIG_SYSCTL */
2217
2218
2219asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2220{
2221 return -ENOSYS;
2222}
2223
2224int sysctl_string(ctl_table *table, int __user *name, int nlen,
2225 void __user *oldval, size_t __user *oldlenp,
2226 void __user *newval, size_t newlen, void **context)
2227{
2228 return -ENOSYS;
2229}
2230
2231int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2232 void __user *oldval, size_t __user *oldlenp,
2233 void __user *newval, size_t newlen, void **context)
2234{
2235 return -ENOSYS;
2236}
2237
2238int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2239 void __user *oldval, size_t __user *oldlenp,
2240 void __user *newval, size_t newlen, void **context)
2241{
2242 return -ENOSYS;
2243}
2244
2245int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2246 void __user *oldval, size_t __user *oldlenp,
2247 void __user *newval, size_t newlen, void **context)
2248{
2249 return -ENOSYS;
2250}
2251
2252int proc_dostring(ctl_table *table, int write, struct file *filp,
2253 void __user *buffer, size_t *lenp, loff_t *ppos)
2254{
2255 return -ENOSYS;
2256}
2257
2258int proc_dointvec(ctl_table *table, int write, struct file *filp,
2259 void __user *buffer, size_t *lenp, loff_t *ppos)
2260{
2261 return -ENOSYS;
2262}
2263
2264int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
2265 void __user *buffer, size_t *lenp, loff_t *ppos)
2266{
2267 return -ENOSYS;
2268}
2269
2270int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
2271 void __user *buffer, size_t *lenp, loff_t *ppos)
2272{
2273 return -ENOSYS;
2274}
2275
2276int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
2277 void __user *buffer, size_t *lenp, loff_t *ppos)
2278{
2279 return -ENOSYS;
2280}
2281
2282int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
2283 void __user *buffer, size_t *lenp, loff_t *ppos)
2284{
2285 return -ENOSYS;
2286}
2287
2288int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2289 void __user *buffer, size_t *lenp, loff_t *ppos)
2290{
2291 return -ENOSYS;
2292}
2293
2294int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
2295 void __user *buffer, size_t *lenp, loff_t *ppos)
2296{
2297 return -ENOSYS;
2298}
2299
2300int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2301 struct file *filp,
2302 void __user *buffer,
2303 size_t *lenp, loff_t *ppos)
2304{
2305 return -ENOSYS;
2306}
2307
2308struct ctl_table_header * register_sysctl_table(ctl_table * table,
2309 int insert_at_head)
2310{
2311 return NULL;
2312}
2313
2314void unregister_sysctl_table(struct ctl_table_header * table)
2315{
2316}
2317
2318#endif /* CONFIG_SYSCTL */
2319
2320/*
2321 * No sense putting this after each symbol definition, twice,
2322 * exception granted :-)
2323 */
2324EXPORT_SYMBOL(proc_dointvec);
2325EXPORT_SYMBOL(proc_dointvec_jiffies);
2326EXPORT_SYMBOL(proc_dointvec_minmax);
2327EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
2328EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
2329EXPORT_SYMBOL(proc_dostring);
2330EXPORT_SYMBOL(proc_doulongvec_minmax);
2331EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
2332EXPORT_SYMBOL(register_sysctl_table);
2333EXPORT_SYMBOL(sysctl_intvec);
2334EXPORT_SYMBOL(sysctl_jiffies);
2335EXPORT_SYMBOL(sysctl_ms_jiffies);
2336EXPORT_SYMBOL(sysctl_string);
2337EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/time.c b/kernel/time.c
new file mode 100644
index 000000000000..96fd0f499631
--- /dev/null
+++ b/kernel/time.c
@@ -0,0 +1,599 @@
1/*
2 * linux/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * This file contains the interface functions for the various
7 * time related system calls: time, stime, gettimeofday, settimeofday,
8 * adjtime
9 */
10/*
11 * Modification history kernel/time.c
12 *
13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched.c and adjtimex()
15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe
18 * kernel PLL updated to 1994-12-13 specs (rfc-1589)
19 * 1999-01-16 Ulrich Windl
20 * Introduced error checking for many cases in adjtimex().
21 * Updated NTP code according to technical memorandum Jan '96
22 * "A Kernel Model for Precision Timekeeping" by Dave Mills
23 * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
24 * (Even though the technical memorandum forbids it)
25 * 2004-07-14 Christoph Lameter
26 * Added getnstimeofday to allow the posix timer functions to return
27 * with nanosecond accuracy
28 */
29
30#include <linux/module.h>
31#include <linux/timex.h>
32#include <linux/errno.h>
33#include <linux/smp_lock.h>
34#include <linux/syscalls.h>
35#include <linux/security.h>
36#include <linux/fs.h>
37#include <linux/module.h>
38
39#include <asm/uaccess.h>
40#include <asm/unistd.h>
41
42/*
43 * The timezone where the local system is located. Used as a default by some
44 * programs that obtain this value by using gettimeofday.
45 */
46struct timezone sys_tz;
47
48EXPORT_SYMBOL(sys_tz);
49
50#ifdef __ARCH_WANT_SYS_TIME
51
52/*
53 * sys_time() can be implemented in user-level using
54 * sys_gettimeofday(). Is this for backwards compatibility? If so,
55 * why not move it into the appropriate arch directory (for those
56 * architectures that need it).
57 */
58asmlinkage long sys_time(time_t __user * tloc)
59{
60 time_t i;
61 struct timeval tv;
62
63 do_gettimeofday(&tv);
64 i = tv.tv_sec;
65
66 if (tloc) {
67 if (put_user(i,tloc))
68 i = -EFAULT;
69 }
70 return i;
71}
72
73/*
74 * sys_stime() can be implemented in user-level using
75 * sys_settimeofday(). Is this for backwards compatibility? If so,
76 * why not move it into the appropriate arch directory (for those
77 * architectures that need it).
78 */
79
80asmlinkage long sys_stime(time_t __user *tptr)
81{
82 struct timespec tv;
83 int err;
84
85 if (get_user(tv.tv_sec, tptr))
86 return -EFAULT;
87
88 tv.tv_nsec = 0;
89
90 err = security_settime(&tv, NULL);
91 if (err)
92 return err;
93
94 do_settimeofday(&tv);
95 return 0;
96}
97
98#endif /* __ARCH_WANT_SYS_TIME */
99
100asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz)
101{
102 if (likely(tv != NULL)) {
103 struct timeval ktv;
104 do_gettimeofday(&ktv);
105 if (copy_to_user(tv, &ktv, sizeof(ktv)))
106 return -EFAULT;
107 }
108 if (unlikely(tz != NULL)) {
109 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
110 return -EFAULT;
111 }
112 return 0;
113}
114
115/*
116 * Adjust the time obtained from the CMOS to be UTC time instead of
117 * local time.
118 *
119 * This is ugly, but preferable to the alternatives. Otherwise we
120 * would either need to write a program to do it in /etc/rc (and risk
121 * confusion if the program gets run more than once; it would also be
122 * hard to make the program warp the clock precisely n hours) or
123 * compile in the timezone information into the kernel. Bad, bad....
124 *
125 * - TYT, 1992-01-01
126 *
127 * The best thing to do is to keep the CMOS clock in universal time (UTC)
128 * as real UNIX machines always do it. This avoids all headaches about
129 * daylight saving times and warping kernel clocks.
130 */
131static inline void warp_clock(void)
132{
133 write_seqlock_irq(&xtime_lock);
134 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
135 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
136 time_interpolator_reset();
137 write_sequnlock_irq(&xtime_lock);
138 clock_was_set();
139}
140
141/*
142 * In case for some reason the CMOS clock has not already been running
143 * in UTC, but in some local time: The first time we set the timezone,
144 * we will warp the clock so that it is ticking UTC time instead of
145 * local time. Presumably, if someone is setting the timezone then we
146 * are running in an environment where the programs understand about
147 * timezones. This should be done at boot time in the /etc/rc script,
148 * as soon as possible, so that the clock can be set right. Otherwise,
149 * various programs will get confused when the clock gets warped.
150 */
151
152int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
153{
154 static int firsttime = 1;
155 int error = 0;
156
157 error = security_settime(tv, tz);
158 if (error)
159 return error;
160
161 if (tz) {
162 /* SMP safe, global irq locking makes it work. */
163 sys_tz = *tz;
164 if (firsttime) {
165 firsttime = 0;
166 if (!tv)
167 warp_clock();
168 }
169 }
170 if (tv)
171 {
172 /* SMP safe, again the code in arch/foo/time.c should
173 * globally block out interrupts when it runs.
174 */
175 return do_settimeofday(tv);
176 }
177 return 0;
178}
179
180asmlinkage long sys_settimeofday(struct timeval __user *tv,
181 struct timezone __user *tz)
182{
183 struct timeval user_tv;
184 struct timespec new_ts;
185 struct timezone new_tz;
186
187 if (tv) {
188 if (copy_from_user(&user_tv, tv, sizeof(*tv)))
189 return -EFAULT;
190 new_ts.tv_sec = user_tv.tv_sec;
191 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
192 }
193 if (tz) {
194 if (copy_from_user(&new_tz, tz, sizeof(*tz)))
195 return -EFAULT;
196 }
197
198 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
199}
200
201long pps_offset; /* pps time offset (us) */
202long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */
203
204long pps_freq; /* frequency offset (scaled ppm) */
205long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
206
207long pps_valid = PPS_VALID; /* pps signal watchdog counter */
208
209int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
210
211long pps_jitcnt; /* jitter limit exceeded */
212long pps_calcnt; /* calibration intervals */
213long pps_errcnt; /* calibration errors */
214long pps_stbcnt; /* stability limit exceeded */
215
216/* hook for a loadable hardpps kernel module */
217void (*hardpps_ptr)(struct timeval *);
218
219/* we call this to notify the arch when the clock is being
220 * controlled. If no such arch routine, do nothing.
221 */
222void __attribute__ ((weak)) notify_arch_cmos_timer(void)
223{
224 return;
225}
226
227/* adjtimex mainly allows reading (and writing, if superuser) of
228 * kernel time-keeping variables. Used by xntpd.
229 */
230int do_adjtimex(struct timex *txc)
231{
232 long ltemp, mtemp, save_adjust;
233 int result;
234
235 /* In order to modify anything, you gotta be super-user! */
236 if (txc->modes && !capable(CAP_SYS_TIME))
237 return -EPERM;
238
239 /* Now we validate the data before disabling interrupts */
240
241 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
242 /* singleshot must not be used with any other mode bits */
243 if (txc->modes != ADJ_OFFSET_SINGLESHOT)
244 return -EINVAL;
245
246 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
247 /* adjustment Offset limited to +- .512 seconds */
248 if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
249 return -EINVAL;
250
251 /* if the quartz is off by more than 10% something is VERY wrong ! */
252 if (txc->modes & ADJ_TICK)
253 if (txc->tick < 900000/USER_HZ ||
254 txc->tick > 1100000/USER_HZ)
255 return -EINVAL;
256
257 write_seqlock_irq(&xtime_lock);
258 result = time_state; /* mostly `TIME_OK' */
259
260 /* Save for later - semantics of adjtime is to return old value */
261 save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
262
263#if 0 /* STA_CLOCKERR is never set yet */
264 time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
265#endif
266 /* If there are input parameters, then process them */
267 if (txc->modes)
268 {
269 if (txc->modes & ADJ_STATUS) /* only set allowed bits */
270 time_status = (txc->status & ~STA_RONLY) |
271 (time_status & STA_RONLY);
272
273 if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
274 if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
275 result = -EINVAL;
276 goto leave;
277 }
278 time_freq = txc->freq - pps_freq;
279 }
280
281 if (txc->modes & ADJ_MAXERROR) {
282 if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
283 result = -EINVAL;
284 goto leave;
285 }
286 time_maxerror = txc->maxerror;
287 }
288
289 if (txc->modes & ADJ_ESTERROR) {
290 if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
291 result = -EINVAL;
292 goto leave;
293 }
294 time_esterror = txc->esterror;
295 }
296
297 if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
298 if (txc->constant < 0) { /* NTP v4 uses values > 6 */
299 result = -EINVAL;
300 goto leave;
301 }
302 time_constant = txc->constant;
303 }
304
305 if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
306 if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
307 /* adjtime() is independent from ntp_adjtime() */
308 if ((time_next_adjust = txc->offset) == 0)
309 time_adjust = 0;
310 }
311 else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
312 ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
313 (STA_PPSTIME | STA_PPSSIGNAL) ?
314 pps_offset : txc->offset;
315
316 /*
317 * Scale the phase adjustment and
318 * clamp to the operating range.
319 */
320 if (ltemp > MAXPHASE)
321 time_offset = MAXPHASE << SHIFT_UPDATE;
322 else if (ltemp < -MAXPHASE)
323 time_offset = -(MAXPHASE << SHIFT_UPDATE);
324 else
325 time_offset = ltemp << SHIFT_UPDATE;
326
327 /*
328 * Select whether the frequency is to be controlled
329 * and in which mode (PLL or FLL). Clamp to the operating
330 * range. Ugly multiply/divide should be replaced someday.
331 */
332
333 if (time_status & STA_FREQHOLD || time_reftime == 0)
334 time_reftime = xtime.tv_sec;
335 mtemp = xtime.tv_sec - time_reftime;
336 time_reftime = xtime.tv_sec;
337 if (time_status & STA_FLL) {
338 if (mtemp >= MINSEC) {
339 ltemp = (time_offset / mtemp) << (SHIFT_USEC -
340 SHIFT_UPDATE);
341 if (ltemp < 0)
342 time_freq -= -ltemp >> SHIFT_KH;
343 else
344 time_freq += ltemp >> SHIFT_KH;
345 } else /* calibration interval too short (p. 12) */
346 result = TIME_ERROR;
347 } else { /* PLL mode */
348 if (mtemp < MAXSEC) {
349 ltemp *= mtemp;
350 if (ltemp < 0)
351 time_freq -= -ltemp >> (time_constant +
352 time_constant +
353 SHIFT_KF - SHIFT_USEC);
354 else
355 time_freq += ltemp >> (time_constant +
356 time_constant +
357 SHIFT_KF - SHIFT_USEC);
358 } else /* calibration interval too long (p. 12) */
359 result = TIME_ERROR;
360 }
361 if (time_freq > time_tolerance)
362 time_freq = time_tolerance;
363 else if (time_freq < -time_tolerance)
364 time_freq = -time_tolerance;
365 } /* STA_PLL || STA_PPSTIME */
366 } /* txc->modes & ADJ_OFFSET */
367 if (txc->modes & ADJ_TICK) {
368 tick_usec = txc->tick;
369 tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
370 }
371 } /* txc->modes */
372leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
373 || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
374 && (time_status & STA_PPSSIGNAL) == 0)
375 /* p. 24, (b) */
376 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
377 == (STA_PPSTIME|STA_PPSJITTER))
378 /* p. 24, (c) */
379 || ((time_status & STA_PPSFREQ) != 0
380 && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
381 /* p. 24, (d) */
382 result = TIME_ERROR;
383
384 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
385 txc->offset = save_adjust;
386 else {
387 if (time_offset < 0)
388 txc->offset = -(-time_offset >> SHIFT_UPDATE);
389 else
390 txc->offset = time_offset >> SHIFT_UPDATE;
391 }
392 txc->freq = time_freq + pps_freq;
393 txc->maxerror = time_maxerror;
394 txc->esterror = time_esterror;
395 txc->status = time_status;
396 txc->constant = time_constant;
397 txc->precision = time_precision;
398 txc->tolerance = time_tolerance;
399 txc->tick = tick_usec;
400 txc->ppsfreq = pps_freq;
401 txc->jitter = pps_jitter >> PPS_AVG;
402 txc->shift = pps_shift;
403 txc->stabil = pps_stabil;
404 txc->jitcnt = pps_jitcnt;
405 txc->calcnt = pps_calcnt;
406 txc->errcnt = pps_errcnt;
407 txc->stbcnt = pps_stbcnt;
408 write_sequnlock_irq(&xtime_lock);
409 do_gettimeofday(&txc->time);
410 notify_arch_cmos_timer();
411 return(result);
412}
413
414asmlinkage long sys_adjtimex(struct timex __user *txc_p)
415{
416 struct timex txc; /* Local copy of parameter */
417 int ret;
418
419 /* Copy the user data space into the kernel copy
420 * structure. But bear in mind that the structures
421 * may change
422 */
423 if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
424 return -EFAULT;
425 ret = do_adjtimex(&txc);
426 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
427}
428
429inline struct timespec current_kernel_time(void)
430{
431 struct timespec now;
432 unsigned long seq;
433
434 do {
435 seq = read_seqbegin(&xtime_lock);
436
437 now = xtime;
438 } while (read_seqretry(&xtime_lock, seq));
439
440 return now;
441}
442
443EXPORT_SYMBOL(current_kernel_time);
444
445/**
446 * current_fs_time - Return FS time
447 * @sb: Superblock.
448 *
449 * Return the current time truncated to the time granularity supported by
450 * the fs.
451 */
452struct timespec current_fs_time(struct super_block *sb)
453{
454 struct timespec now = current_kernel_time();
455 return timespec_trunc(now, sb->s_time_gran);
456}
457EXPORT_SYMBOL(current_fs_time);
458
459/**
460 * timespec_trunc - Truncate timespec to a granularity
461 * @t: Timespec
462 * @gran: Granularity in ns.
463 *
464 * Truncate a timespec to a granularity. gran must be smaller than a second.
465 * Always rounds down.
466 *
467 * This function should only be used for timestamps returned by
468 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
469 * it doesn't handle the better resolution of the latter.
470 */
471struct timespec timespec_trunc(struct timespec t, unsigned gran)
472{
473 /*
474 * Division is pretty slow so avoid it for common cases.
475 * Currently current_kernel_time() never returns better than
476 * jiffies resolution. Exploit that.
477 */
478 if (gran <= jiffies_to_usecs(1) * 1000) {
479 /* nothing */
480 } else if (gran == 1000000000) {
481 t.tv_nsec = 0;
482 } else {
483 t.tv_nsec -= t.tv_nsec % gran;
484 }
485 return t;
486}
487EXPORT_SYMBOL(timespec_trunc);
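/*
 * A sketch of how a filesystem with coarse timestamp resolution might use
 * the helpers above when stamping an inode.  foo_update_mtime and the
 * granularity comment are illustrative, not taken from a real filesystem;
 * linux/fs.h and linux/time.h are assumed to be included.
 */
static void foo_update_mtime(struct inode *inode)
{
	/* current_fs_time() is current_kernel_time() truncated to
	 * inode->i_sb->s_time_gran (e.g. NSEC_PER_SEC for 1 s resolution) */
	inode->i_mtime = current_fs_time(inode->i_sb);
}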
488
489#ifdef CONFIG_TIME_INTERPOLATION
490void getnstimeofday (struct timespec *tv)
491{
492 unsigned long seq,sec,nsec;
493
494 do {
495 seq = read_seqbegin(&xtime_lock);
496 sec = xtime.tv_sec;
497 nsec = xtime.tv_nsec+time_interpolator_get_offset();
498 } while (unlikely(read_seqretry(&xtime_lock, seq)));
499
500 while (unlikely(nsec >= NSEC_PER_SEC)) {
501 nsec -= NSEC_PER_SEC;
502 ++sec;
503 }
504 tv->tv_sec = sec;
505 tv->tv_nsec = nsec;
506}
507EXPORT_SYMBOL_GPL(getnstimeofday);
508
509int do_settimeofday (struct timespec *tv)
510{
511 time_t wtm_sec, sec = tv->tv_sec;
512 long wtm_nsec, nsec = tv->tv_nsec;
513
514 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
515 return -EINVAL;
516
517 write_seqlock_irq(&xtime_lock);
518 {
519 /*
520 * This is revolting. We need to set "xtime" correctly. However, the value
521 * in this location is the value at the most recent update of wall time.
522 * Discover what correction gettimeofday would have done, and then undo
523 * it!
524 */
525 nsec -= time_interpolator_get_offset();
526
527 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
528 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
529
530 set_normalized_timespec(&xtime, sec, nsec);
531 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
532
533 time_adjust = 0; /* stop active adjtime() */
534 time_status |= STA_UNSYNC;
535 time_maxerror = NTP_PHASE_LIMIT;
536 time_esterror = NTP_PHASE_LIMIT;
537 time_interpolator_reset();
538 }
539 write_sequnlock_irq(&xtime_lock);
540 clock_was_set();
541 return 0;
542}
543
544void do_gettimeofday (struct timeval *tv)
545{
546 unsigned long seq, nsec, usec, sec, offset;
547 do {
548 seq = read_seqbegin(&xtime_lock);
549 offset = time_interpolator_get_offset();
550 sec = xtime.tv_sec;
551 nsec = xtime.tv_nsec;
552 } while (unlikely(read_seqretry(&xtime_lock, seq)));
553
554 usec = (nsec + offset) / 1000;
555
556 while (unlikely(usec >= USEC_PER_SEC)) {
557 usec -= USEC_PER_SEC;
558 ++sec;
559 }
560
561 tv->tv_sec = sec;
562 tv->tv_usec = usec;
563}
564
565EXPORT_SYMBOL(do_gettimeofday);
566
567
568#else
569/*
570 * Simulate gettimeofday using do_gettimeofday, which only allows a timeval
571 * and therefore only yields usec accuracy.
572 */
573void getnstimeofday(struct timespec *tv)
574{
575 struct timeval x;
576
577 do_gettimeofday(&x);
578 tv->tv_sec = x.tv_sec;
579 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
580}
581#endif
582
583#if (BITS_PER_LONG < 64)
584u64 get_jiffies_64(void)
585{
586 unsigned long seq;
587 u64 ret;
588
589 do {
590 seq = read_seqbegin(&xtime_lock);
591 ret = jiffies_64;
592 } while (read_seqretry(&xtime_lock, seq));
593 return ret;
594}
595
596EXPORT_SYMBOL(get_jiffies_64);
597#endif
598
599EXPORT_SYMBOL(jiffies);
diff --git a/kernel/timer.c b/kernel/timer.c
new file mode 100644
index 000000000000..ecb3d67c0e14
--- /dev/null
+++ b/kernel/timer.c
@@ -0,0 +1,1611 @@
1/*
2 * linux/kernel/timer.c
3 *
4 * Kernel internal timers, kernel timekeeping, basic process system calls
5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds
7 *
8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
9 *
10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11 * "A Kernel Model for Precision Timekeeping" by Dave Mills
12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13 * serialize accesses to xtime/lost_ticks).
14 * Copyright (C) 1998 Andrea Arcangeli
15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
20 */
21
22#include <linux/kernel_stat.h>
23#include <linux/module.h>
24#include <linux/interrupt.h>
25#include <linux/percpu.h>
26#include <linux/init.h>
27#include <linux/mm.h>
28#include <linux/swap.h>
29#include <linux/notifier.h>
30#include <linux/thread_info.h>
31#include <linux/time.h>
32#include <linux/jiffies.h>
33#include <linux/posix-timers.h>
34#include <linux/cpu.h>
35#include <linux/syscalls.h>
36
37#include <asm/uaccess.h>
38#include <asm/unistd.h>
39#include <asm/div64.h>
40#include <asm/timex.h>
41#include <asm/io.h>
42
43#ifdef CONFIG_TIME_INTERPOLATION
44static void time_interpolator_update(long delta_nsec);
45#else
46#define time_interpolator_update(x)
47#endif
48
49/*
50 * per-CPU timer vector definitions:
51 */
52
53#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
54#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
55#define TVN_SIZE (1 << TVN_BITS)
56#define TVR_SIZE (1 << TVR_BITS)
57#define TVN_MASK (TVN_SIZE - 1)
58#define TVR_MASK (TVR_SIZE - 1)
59
60typedef struct tvec_s {
61 struct list_head vec[TVN_SIZE];
62} tvec_t;
63
64typedef struct tvec_root_s {
65 struct list_head vec[TVR_SIZE];
66} tvec_root_t;
67
68struct tvec_t_base_s {
69 spinlock_t lock;
70 unsigned long timer_jiffies;
71 struct timer_list *running_timer;
72 tvec_root_t tv1;
73 tvec_t tv2;
74 tvec_t tv3;
75 tvec_t tv4;
76 tvec_t tv5;
77} ____cacheline_aligned_in_smp;
78
79typedef struct tvec_t_base_s tvec_base_t;
80
81static inline void set_running_timer(tvec_base_t *base,
82 struct timer_list *timer)
83{
84#ifdef CONFIG_SMP
85 base->running_timer = timer;
86#endif
87}
88
89/* Fake initialization */
90static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
91
92static void check_timer_failed(struct timer_list *timer)
93{
94 static int whine_count;
95 if (whine_count < 16) {
96 whine_count++;
97 printk("Uninitialised timer!\n");
98 printk("This is just a warning. Your computer is OK\n");
99 printk("function=0x%p, data=0x%lx\n",
100 timer->function, timer->data);
101 dump_stack();
102 }
103 /*
104 * Now fix it up
105 */
106 spin_lock_init(&timer->lock);
107 timer->magic = TIMER_MAGIC;
108}
109
110static inline void check_timer(struct timer_list *timer)
111{
112 if (timer->magic != TIMER_MAGIC)
113 check_timer_failed(timer);
114}
115
116
117static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
118{
119 unsigned long expires = timer->expires;
120 unsigned long idx = expires - base->timer_jiffies;
121 struct list_head *vec;
122
123 if (idx < TVR_SIZE) {
124 int i = expires & TVR_MASK;
125 vec = base->tv1.vec + i;
126 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
127 int i = (expires >> TVR_BITS) & TVN_MASK;
128 vec = base->tv2.vec + i;
129 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
130 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
131 vec = base->tv3.vec + i;
132 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
133 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
134 vec = base->tv4.vec + i;
135 } else if ((signed long) idx < 0) {
136 /*
137 * Can happen if you add a timer with expires == jiffies,
138 * or you set a timer to go off in the past
139 */
140 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
141 } else {
142 int i;
143 /* If the timeout is larger than 0xffffffff on 64-bit
144 * architectures then we use the maximum timeout:
145 */
146 if (idx > 0xffffffffUL) {
147 idx = 0xffffffffUL;
148 expires = idx + base->timer_jiffies;
149 }
150 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
151 vec = base->tv5.vec + i;
152 }
153 /*
154 * Timers are FIFO:
155 */
156 list_add_tail(&timer->entry, vec);
157}
158
159int __mod_timer(struct timer_list *timer, unsigned long expires)
160{
161 tvec_base_t *old_base, *new_base;
162 unsigned long flags;
163 int ret = 0;
164
165 BUG_ON(!timer->function);
166
167 check_timer(timer);
168
169 spin_lock_irqsave(&timer->lock, flags);
170 new_base = &__get_cpu_var(tvec_bases);
171repeat:
172 old_base = timer->base;
173
174 /*
175 * Prevent deadlocks via ordering by old_base < new_base.
176 */
177 if (old_base && (new_base != old_base)) {
178 if (old_base < new_base) {
179 spin_lock(&new_base->lock);
180 spin_lock(&old_base->lock);
181 } else {
182 spin_lock(&old_base->lock);
183 spin_lock(&new_base->lock);
184 }
185 /*
186 * The timer base might have been cancelled while we were
187 * trying to take the lock(s):
188 */
189 if (timer->base != old_base) {
190 spin_unlock(&new_base->lock);
191 spin_unlock(&old_base->lock);
192 goto repeat;
193 }
194 } else {
195 spin_lock(&new_base->lock);
196 if (timer->base != old_base) {
197 spin_unlock(&new_base->lock);
198 goto repeat;
199 }
200 }
201
202 /*
203 * Delete the previous timeout (if there was any), and install
204 * the new one:
205 */
206 if (old_base) {
207 list_del(&timer->entry);
208 ret = 1;
209 }
210 timer->expires = expires;
211 internal_add_timer(new_base, timer);
212 timer->base = new_base;
213
214 if (old_base && (new_base != old_base))
215 spin_unlock(&old_base->lock);
216 spin_unlock(&new_base->lock);
217 spin_unlock_irqrestore(&timer->lock, flags);
218
219 return ret;
220}
221
222EXPORT_SYMBOL(__mod_timer);
223
224/***
225 * add_timer_on - start a timer on a particular CPU
226 * @timer: the timer to be added
227 * @cpu: the CPU to start it on
228 *
229 * This is not very scalable on SMP. Double adds are not possible.
230 */
231void add_timer_on(struct timer_list *timer, int cpu)
232{
233 tvec_base_t *base = &per_cpu(tvec_bases, cpu);
234 unsigned long flags;
235
236 BUG_ON(timer_pending(timer) || !timer->function);
237
238 check_timer(timer);
239
240 spin_lock_irqsave(&base->lock, flags);
241 internal_add_timer(base, timer);
242 timer->base = base;
243 spin_unlock_irqrestore(&base->lock, flags);
244}
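/*
 * A sketch of pinning a not-yet-pending timer to a given CPU; unlike
 * mod_timer(), add_timer_on() never migrates the timer to the current
 * CPU's base, and it BUGs if the timer is already pending.  The names
 * below are hypothetical; linux/timer.h and linux/jiffies.h are assumed.
 */
static struct timer_list foo_cpu_timer;

static void foo_cpu_handler(unsigned long data)
{
	/* expires on the CPU the timer was added on */
}

static void foo_arm_on(int cpu)
{
	init_timer(&foo_cpu_timer);
	foo_cpu_timer.function = foo_cpu_handler;
	foo_cpu_timer.data = 0;
	foo_cpu_timer.expires = jiffies + HZ;		/* must be set before adding */
	add_timer_on(&foo_cpu_timer, cpu);
}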
245
246
247/***
248 * mod_timer - modify a timer's timeout
249 * @timer: the timer to be modified
250 *
251 * mod_timer is a more efficient way to update the expire field of an
252 * active timer (if the timer is inactive it will be activated)
253 *
254 * mod_timer(timer, expires) is equivalent to:
255 *
256 * del_timer(timer); timer->expires = expires; add_timer(timer);
257 *
258 * Note that if there are multiple unserialized concurrent users of the
259 * same timer, then mod_timer() is the only safe way to modify the timeout,
260 * since add_timer() cannot modify an already running timer.
261 *
262 * The function returns whether it has modified a pending timer or not.
263 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
264 * active timer returns 1.)
265 */
266int mod_timer(struct timer_list *timer, unsigned long expires)
267{
268 BUG_ON(!timer->function);
269
270 check_timer(timer);
271
272 /*
273 * This is a common optimization triggered by the
274 * networking code - if the timer is re-modified
275 * to be the same thing then just return:
276 */
277 if (timer->expires == expires && timer_pending(timer))
278 return 1;
279
280 return __mod_timer(timer, expires);
281}
282
283EXPORT_SYMBOL(mod_timer);
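/*
 * A sketch of the usual dynamic-timer pattern built on the functions above:
 * initialise once, (re)arm with mod_timer(), and note that the handler runs
 * with the timer already detached, so it may re-arm itself.  foo_timer,
 * foo_ticks, foo_handler and foo_start are hypothetical; linux/timer.h and
 * linux/jiffies.h are assumed.
 */
static struct timer_list foo_timer;
static unsigned long foo_ticks;				/* hypothetical counter */

static void foo_handler(unsigned long data)
{
	/* softirq context; the timer is no longer pending here */
	foo_ticks++;
	mod_timer(&foo_timer, jiffies + HZ);		/* re-arm: periodic tick */
}

static void foo_start(void)
{
	init_timer(&foo_timer);
	foo_timer.function = foo_handler;
	foo_timer.data = 0;				/* unused by this handler */
	mod_timer(&foo_timer, jiffies + HZ);		/* first expiry in ~1 s */
}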
284
285/***
286 * del_timer - deactivate a timer.
287 * @timer: the timer to be deactivated
288 *
289 * del_timer() deactivates a timer - this works on both active and inactive
290 * timers.
291 *
292 * The function returns whether it has deactivated a pending timer or not.
293 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
294 * active timer returns 1.)
295 */
296int del_timer(struct timer_list *timer)
297{
298 unsigned long flags;
299 tvec_base_t *base;
300
301 check_timer(timer);
302
303repeat:
304 base = timer->base;
305 if (!base)
306 return 0;
307 spin_lock_irqsave(&base->lock, flags);
308 if (base != timer->base) {
309 spin_unlock_irqrestore(&base->lock, flags);
310 goto repeat;
311 }
312 list_del(&timer->entry);
313 /* Need to make sure that anybody who sees a NULL base also sees the list ops */
314 smp_wmb();
315 timer->base = NULL;
316 spin_unlock_irqrestore(&base->lock, flags);
317
318 return 1;
319}
320
321EXPORT_SYMBOL(del_timer);
322
323#ifdef CONFIG_SMP
324/***
325 * del_timer_sync - deactivate a timer and wait for the handler to finish.
326 * @timer: the timer to be deactivated
327 *
328 * This function only differs from del_timer() on SMP: besides deactivating
329 * the timer it also makes sure the handler has finished executing on other
330 * CPUs.
331 *
332 * Synchronization rules: callers must prevent restarting of the timer,
333 * otherwise this function is meaningless. It must not be called from
334 * interrupt contexts. The caller must not hold locks which would prevent
335 * completion of the timer's handler. Upon exit the timer is not queued and
336 * the handler is not running on any CPU.
337 *
338 * The function returns whether it has deactivated a pending timer or not.
339 *
340 * del_timer_sync() is slow and complicated because it copes with timer
341 * handlers which re-arm the timer (periodic timers). If the timer handler
342 * is known to not do this (a single shot timer) then use
343 * del_singleshot_timer_sync() instead.
344 */
345int del_timer_sync(struct timer_list *timer)
346{
347 tvec_base_t *base;
348 int i, ret = 0;
349
350 check_timer(timer);
351
352del_again:
353 ret += del_timer(timer);
354
355 for_each_online_cpu(i) {
356 base = &per_cpu(tvec_bases, i);
357 if (base->running_timer == timer) {
358 while (base->running_timer == timer) {
359 cpu_relax();
360 preempt_check_resched();
361 }
362 break;
363 }
364 }
365 smp_rmb();
366 if (timer_pending(timer))
367 goto del_again;
368
369 return ret;
370}
371EXPORT_SYMBOL(del_timer_sync);
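/*
 * A sketch of the teardown side that matches the synchronization rules
 * above: first stop the handler from re-arming, then del_timer_sync()
 * guarantees the timer is neither pending nor running anywhere.  The
 * del_again loop above copes with a handler that re-armed before it saw
 * the flag.  foo_poll_timer, foo_shutting_down and friends are hypothetical.
 */
static struct timer_list foo_poll_timer;
static int foo_shutting_down;

static void foo_poll(unsigned long data)
{
	if (!foo_shutting_down)
		mod_timer(&foo_poll_timer, jiffies + HZ);	/* periodic re-arm */
}

static void foo_stop(void)
{
	foo_shutting_down = 1;			/* forbid further re-arming */
	del_timer_sync(&foo_poll_timer);	/* waits out a running handler */
}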
372
373/***
374 * del_singleshot_timer_sync - deactivate a non-recursive timer
375 * @timer: the timer to be deactivated
376 *
377 * This function is an optimization of del_timer_sync for the case where the
378 * caller can guarantee the timer does not reschedule itself in its timer
379 * function.
380 *
381 * Synchronization rules: callers must prevent restarting of the timer,
382 * otherwise this function is meaningless. It must not be called from
383 * interrupt contexts. The caller must not hold locks which would prevent
384 * completion of the timer's handler. Upon exit the timer is not queued and
385 * the handler is not running on any CPU.
386 *
387 * The function returns whether it has deactivated a pending timer or not.
388 */
389int del_singleshot_timer_sync(struct timer_list *timer)
390{
391 int ret = del_timer(timer);
392
393 if (!ret) {
394 ret = del_timer_sync(timer);
395 BUG_ON(ret);
396 }
397
398 return ret;
399}
400EXPORT_SYMBOL(del_singleshot_timer_sync);
401#endif
402
403static int cascade(tvec_base_t *base, tvec_t *tv, int index)
404{
405 /* cascade all the timers from tv up one level */
406 struct list_head *head, *curr;
407
408 head = tv->vec + index;
409 curr = head->next;
410 /*
411 * We are removing _all_ timers from the list, so we don't have to
412 * detach them individually, just clear the list afterwards.
413 */
414 while (curr != head) {
415 struct timer_list *tmp;
416
417 tmp = list_entry(curr, struct timer_list, entry);
418 BUG_ON(tmp->base != base);
419 curr = curr->next;
420 internal_add_timer(base, tmp);
421 }
422 INIT_LIST_HEAD(head);
423
424 return index;
425}
426
427/***
428 * __run_timers - run all expired timers (if any) on this CPU.
429 * @base: the timer vector to be processed.
430 *
431 * This function cascades all vectors and executes all expired timer
432 * vectors.
433 */
434#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
435
436static inline void __run_timers(tvec_base_t *base)
437{
438 struct timer_list *timer;
439
440 spin_lock_irq(&base->lock);
441 while (time_after_eq(jiffies, base->timer_jiffies)) {
442 struct list_head work_list = LIST_HEAD_INIT(work_list);
443 struct list_head *head = &work_list;
444 int index = base->timer_jiffies & TVR_MASK;
445
446 /*
447 * Cascade timers:
448 */
449 if (!index &&
450 (!cascade(base, &base->tv2, INDEX(0))) &&
451 (!cascade(base, &base->tv3, INDEX(1))) &&
452 !cascade(base, &base->tv4, INDEX(2)))
453 cascade(base, &base->tv5, INDEX(3));
454 ++base->timer_jiffies;
455 list_splice_init(base->tv1.vec + index, &work_list);
456repeat:
457 if (!list_empty(head)) {
458 void (*fn)(unsigned long);
459 unsigned long data;
460
461 timer = list_entry(head->next,struct timer_list,entry);
462 fn = timer->function;
463 data = timer->data;
464
465 list_del(&timer->entry);
466 set_running_timer(base, timer);
467 smp_wmb();
468 timer->base = NULL;
469 spin_unlock_irq(&base->lock);
470 {
471 u32 preempt_count = preempt_count();
472 fn(data);
473 if (preempt_count != preempt_count()) {
474 printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
475 BUG();
476 }
477 }
478 spin_lock_irq(&base->lock);
479 goto repeat;
480 }
481 }
482 set_running_timer(base, NULL);
483 spin_unlock_irq(&base->lock);
484}
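To make the cascading above concrete, here is a small illustrative helper (not in the file) showing which vector a timer with a given expiry would land in, assuming the usual TVR_BITS of 8 and TVN_BITS of 6: tv1 resolves the next 256 jiffies exactly, each tv2 slot covers 256 jiffies, each tv3 slot 16384, and so on, which is why a slot is cascaded down one level whenever the lower-level index wraps to zero.

static int wheel_level(unsigned long expires, unsigned long timer_jiffies)
{
	unsigned long delta = expires - timer_jiffies;

	if (delta < (1UL << 8))		/* tv1: one slot per jiffy      */
		return 1;
	if (delta < (1UL << 14))	/* tv2: 64 slots of 256 jiffies */
		return 2;
	if (delta < (1UL << 20))	/* tv3: 64 slots of 16384       */
		return 3;
	if (delta < (1UL << 26))	/* tv4                          */
		return 4;
	return 5;			/* tv5: everything further out  */
}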
485
486#ifdef CONFIG_NO_IDLE_HZ
487/*
488 * Find out when the next timer event is due to happen. This
489 * is used on S/390 to stop all activity when a cpu is idle.
490 * This function needs to be called with interrupts disabled.
491 */
492unsigned long next_timer_interrupt(void)
493{
494 tvec_base_t *base;
495 struct list_head *list;
496 struct timer_list *nte;
497 unsigned long expires;
498 tvec_t *varray[4];
499 int i, j;
500
501 base = &__get_cpu_var(tvec_bases);
502 spin_lock(&base->lock);
503 expires = base->timer_jiffies + (LONG_MAX >> 1);
504 list = 0;
505
506 /* Look for timer events in tv1. */
507 j = base->timer_jiffies & TVR_MASK;
508 do {
509 list_for_each_entry(nte, base->tv1.vec + j, entry) {
510 expires = nte->expires;
511 if (j < (base->timer_jiffies & TVR_MASK))
512 list = base->tv2.vec + (INDEX(0));
513 goto found;
514 }
515 j = (j + 1) & TVR_MASK;
516 } while (j != (base->timer_jiffies & TVR_MASK));
517
518 /* Check tv2-tv5. */
519 varray[0] = &base->tv2;
520 varray[1] = &base->tv3;
521 varray[2] = &base->tv4;
522 varray[3] = &base->tv5;
523 for (i = 0; i < 4; i++) {
524 j = INDEX(i);
525 do {
526 if (list_empty(varray[i]->vec + j)) {
527 j = (j + 1) & TVN_MASK;
528 continue;
529 }
530 list_for_each_entry(nte, varray[i]->vec + j, entry)
531 if (time_before(nte->expires, expires))
532 expires = nte->expires;
533 if (j < (INDEX(i)) && i < 3)
534 list = varray[i + 1]->vec + (INDEX(i + 1));
535 goto found;
536 } while (j != (INDEX(i)));
537 }
538found:
539 if (list) {
540 /*
541 * The search wrapped. We need to look at the next list
542 * from the next tv element that would cascade into the tv element
543 * where we found the timer element.
544 */
545 list_for_each_entry(nte, list, entry) {
546 if (time_before(nte->expires, expires))
547 expires = nte->expires;
548 }
549 }
550 spin_unlock(&base->lock);
551 return expires;
552}
553#endif
554
555/******************************************************************/
556
557/*
558 * Timekeeping variables
559 */
560unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
561unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */
562
563/*
564 * The current time
565 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
566 * for sub-jiffy times) to get to monotonic time. Monotonic is pegged
567 * at zero at system boot time, so wall_to_monotonic will be negative;
568 * however, we will ALWAYS keep the tv_nsec part positive so we can use
569 * the usual normalization.
570 */
571struct timespec xtime __attribute__ ((aligned (16)));
572struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
573
574EXPORT_SYMBOL(xtime);
575
576/* Don't completely fail for HZ > 500. */
577int tickadj = 500/HZ ? : 1; /* microsecs */
578
579
580/*
581 * phase-lock loop variables
582 */
583/* TIME_ERROR prevents overwriting the CMOS clock */
584int time_state = TIME_OK; /* clock synchronization status */
585int time_status = STA_UNSYNC; /* clock status bits */
586long time_offset; /* time adjustment (us) */
587long time_constant = 2; /* pll time constant */
588long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
589long time_precision = 1; /* clock precision (us) */
590long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
591long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
592static long time_phase; /* phase offset (scaled us) */
593long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
594 /* frequency offset (scaled ppm)*/
595static long time_adj; /* tick adjust (scaled 1 / HZ) */
596long time_reftime; /* time at last adjustment (s) */
597long time_adjust;
598long time_next_adjust;
599
600/*
601 * this routine handles the overflow of the microsecond field
602 *
603 * The tricky bits of code to handle the accurate clock support
604 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
605 * They were originally developed for SUN and DEC kernels.
606 * All the kudos should go to Dave for this stuff.
607 *
608 */
609static void second_overflow(void)
610{
611 long ltemp;
612
613 /* Bump the maxerror field */
614 time_maxerror += time_tolerance >> SHIFT_USEC;
615 if ( time_maxerror > NTP_PHASE_LIMIT ) {
616 time_maxerror = NTP_PHASE_LIMIT;
617 time_status |= STA_UNSYNC;
618 }
619
620 /*
621 * Leap second processing. If in leap-insert state at
622 * the end of the day, the system clock is set back one
623 * second; if in leap-delete state, the system clock is
624 * set ahead one second. The microtime() routine or
625 * external clock driver will ensure that reported time
626 * is always monotonic. The ugly divides should be
627 * replaced.
628 */
629 switch (time_state) {
630
631 case TIME_OK:
632 if (time_status & STA_INS)
633 time_state = TIME_INS;
634 else if (time_status & STA_DEL)
635 time_state = TIME_DEL;
636 break;
637
638 case TIME_INS:
639 if (xtime.tv_sec % 86400 == 0) {
640 xtime.tv_sec--;
641 wall_to_monotonic.tv_sec++;
642 /* The timer interpolator will make time change gradually instead
643 * of an immediate jump by one second.
644 */
645 time_interpolator_update(-NSEC_PER_SEC);
646 time_state = TIME_OOP;
647 clock_was_set();
648 printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
649 }
650 break;
651
652 case TIME_DEL:
653 if ((xtime.tv_sec + 1) % 86400 == 0) {
654 xtime.tv_sec++;
655 wall_to_monotonic.tv_sec--;
656 /* Use of time interpolator for a gradual change of time */
657 time_interpolator_update(NSEC_PER_SEC);
658 time_state = TIME_WAIT;
659 clock_was_set();
660 printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
661 }
662 break;
663
664 case TIME_OOP:
665 time_state = TIME_WAIT;
666 break;
667
668 case TIME_WAIT:
669 if (!(time_status & (STA_INS | STA_DEL)))
670 time_state = TIME_OK;
671 }
672
673 /*
674 * Compute the phase adjustment for the next second. In
675 * PLL mode, the offset is reduced by a fixed factor
676 * times the time constant. In FLL mode the offset is
677 * used directly. In either mode, the maximum phase
678 * adjustment for each second is clamped so as to spread
679 * the adjustment over not more than the number of
680 * seconds between updates.
681 */
682 if (time_offset < 0) {
683 ltemp = -time_offset;
684 if (!(time_status & STA_FLL))
685 ltemp >>= SHIFT_KG + time_constant;
686 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
687 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
688 time_offset += ltemp;
689 time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
690 } else {
691 ltemp = time_offset;
692 if (!(time_status & STA_FLL))
693 ltemp >>= SHIFT_KG + time_constant;
694 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
695 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
696 time_offset -= ltemp;
697 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
698 }
699
700 /*
701 * Compute the frequency estimate and additional phase
702 * adjustment due to frequency error for the next
703 * second. When the PPS signal is engaged, gnaw on the
704 * watchdog counter and update the frequency computed by
705 * the pll and the PPS signal.
706 */
707 pps_valid++;
708 if (pps_valid == PPS_VALID) { /* PPS signal lost */
709 pps_jitter = MAXTIME;
710 pps_stabil = MAXFREQ;
711 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
712 STA_PPSWANDER | STA_PPSERROR);
713 }
714 ltemp = time_freq + pps_freq;
715 if (ltemp < 0)
716 time_adj -= -ltemp >>
717 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
718 else
719 time_adj += ltemp >>
720 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
721
722#if HZ == 100
723 /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
724 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
725 */
726 if (time_adj < 0)
727 time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
728 else
729 time_adj += (time_adj >> 2) + (time_adj >> 5);
730#endif
731#if HZ == 1000
732 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
733 * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
734 */
735 if (time_adj < 0)
736 time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
737 else
738 time_adj += (time_adj >> 6) + (time_adj >> 7);
739#endif
740}
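A worked version of the HZ compensation above (illustrative; it assumes SHIFT_HZ is 7 for HZ == 100 and 10 for HZ == 1000, as in the NTP headers): the PLL arithmetic pretends there are 1 << SHIFT_HZ ticks per second, so time_adj has to be scaled by 128/100 = 1.28, approximated by 1 + 1/4 + 1/32 = 1.28125, or by 1024/1000 = 1.024, approximated by 1 + 1/64 + 1/128 = 1.0234375. That is exactly what the shift-and-add pairs compute for positive values:

static long scale_time_adj_hz100(long adj)	/* adj >= 0 */
{
	return adj + (adj >> 2) + (adj >> 5);	/* ~ adj * 1.28125 */
}

static long scale_time_adj_hz1000(long adj)	/* adj >= 0 */
{
	return adj + (adj >> 6) + (adj >> 7);	/* ~ adj * 1.0234375 */
}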
741
742/* in the NTP reference this is called "hardclock()" */
743static void update_wall_time_one_tick(void)
744{
745 long time_adjust_step, delta_nsec;
746
747 if ( (time_adjust_step = time_adjust) != 0 ) {
748 /* We are doing an adjtime thing.
749 *
750 * Prepare time_adjust_step to be within bounds.
751 * Note that a positive time_adjust means we want the clock
752 * to run faster.
753 *
754 * Limit the amount of the step to be in the range
755 * -tickadj .. +tickadj
756 */
757 if (time_adjust > tickadj)
758 time_adjust_step = tickadj;
759 else if (time_adjust < -tickadj)
760 time_adjust_step = -tickadj;
761
762 /* Reduce by this step the amount of time left */
763 time_adjust -= time_adjust_step;
764 }
765 delta_nsec = tick_nsec + time_adjust_step * 1000;
766 /*
767 * Advance the phase; once it accumulates to one microsecond,
768 * fold it into this tick's nanosecond delta.
769 */
770 time_phase += time_adj;
771 if (time_phase <= -FINENSEC) {
772 long ltemp = -time_phase >> (SHIFT_SCALE - 10);
773 time_phase += ltemp << (SHIFT_SCALE - 10);
774 delta_nsec -= ltemp;
775 }
776 else if (time_phase >= FINENSEC) {
777 long ltemp = time_phase >> (SHIFT_SCALE - 10);
778 time_phase -= ltemp << (SHIFT_SCALE - 10);
779 delta_nsec += ltemp;
780 }
781 xtime.tv_nsec += delta_nsec;
782 time_interpolator_update(delta_nsec);
783
784 /* Changes by adjtime() do not take effect till next tick. */
785 if (time_next_adjust != 0) {
786 time_adjust = time_next_adjust;
787 time_next_adjust = 0;
788 }
789}
790
791/*
792 * Using a loop looks inefficient, but "ticks" is
793 * usually just one (we shouldn't be losing ticks,
794 * we're doing it this way mainly for interrupt
795 * latency reasons, not because we think we'll
796 * have lots of lost timer ticks).
797 */
798static void update_wall_time(unsigned long ticks)
799{
800 do {
801 ticks--;
802 update_wall_time_one_tick();
803 if (xtime.tv_nsec >= 1000000000) {
804 xtime.tv_nsec -= 1000000000;
805 xtime.tv_sec++;
806 second_overflow();
807 }
808 } while (ticks);
809}
810
811/*
812 * Called from the timer interrupt handler to charge one tick to the current
813 * process. user_tick is 1 if the tick is user time, 0 for system.
814 */
815void update_process_times(int user_tick)
816{
817 struct task_struct *p = current;
818 int cpu = smp_processor_id();
819
820 /* Note: this timer irq context must be accounted for as well. */
821 if (user_tick)
822 account_user_time(p, jiffies_to_cputime(1));
823 else
824 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
825 run_local_timers();
826 if (rcu_pending(cpu))
827 rcu_check_callbacks(cpu, user_tick);
828 scheduler_tick();
829 run_posix_cpu_timers(p);
830}
831
832/*
833 * Nr of active tasks - counted in fixed-point numbers
834 */
835static unsigned long count_active_tasks(void)
836{
837 return (nr_running() + nr_uninterruptible()) * FIXED_1;
838}
839
840/*
841 * Hmm.. Changed this, as the GNU make sources (load.c) seem to
842 * imply that avenrun[] is the standard name for this kind of thing.
843 * Nothing else seems to be standardized: the fractional size etc
844 * all seem to differ on different machines.
845 *
846 * Requires xtime_lock to access.
847 */
848unsigned long avenrun[3];
849
850EXPORT_SYMBOL(avenrun);
851
852/*
853 * calc_load - given tick count, update the avenrun load estimates.
854 * This is called while holding a write_lock on xtime_lock.
855 */
856static inline void calc_load(unsigned long ticks)
857{
858 unsigned long active_tasks; /* fixed-point */
859 static int count = LOAD_FREQ;
860
861 count -= ticks;
862 if (count < 0) {
863 count += LOAD_FREQ;
864 active_tasks = count_active_tasks();
865 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
866 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
867 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
868 }
869}
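For reference, a sketch of what each CALC_LOAD() step computes (illustrative; it assumes the usual FSHIFT == 11, FIXED_1 == 2048 and EXP_1 == 1884 from <linux/sched.h>):

static unsigned long calc_load_step(unsigned long load, unsigned long n)
{
	/* load and n are fixed-point values; FIXED_1 == 2048 represents 1.0 */
	return (load * 1884 + n * (2048 - 1884)) >> 11;
}

For example, a 1-minute average of 1.00 (2048) with two active tasks (n == 4096) becomes (2048*1884 + 4096*164) >> 11 == 2212, i.e. roughly 1.08.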
870
871/* jiffies at the most recent update of wall time */
872unsigned long wall_jiffies = INITIAL_JIFFIES;
873
874/*
875 * This read-write spinlock protects us from races in SMP while
876 * playing with xtime and avenrun.
877 */
878#ifndef ARCH_HAVE_XTIME_LOCK
879seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
880
881EXPORT_SYMBOL(xtime_lock);
882#endif
883
884/*
885 * This function runs timers and the timer-tq in bottom half context.
886 */
887static void run_timer_softirq(struct softirq_action *h)
888{
889 tvec_base_t *base = &__get_cpu_var(tvec_bases);
890
891 if (time_after_eq(jiffies, base->timer_jiffies))
892 __run_timers(base);
893}
894
895/*
896 * Called by the local, per-CPU timer interrupt on SMP.
897 */
898void run_local_timers(void)
899{
900 raise_softirq(TIMER_SOFTIRQ);
901}
902
903/*
904 * Called by the timer interrupt. xtime_lock must already be taken
905 * by the timer IRQ!
906 */
907static inline void update_times(void)
908{
909 unsigned long ticks;
910
911 ticks = jiffies - wall_jiffies;
912 if (ticks) {
913 wall_jiffies += ticks;
914 update_wall_time(ticks);
915 }
916 calc_load(ticks);
917}
918
919/*
920 * The 64-bit jiffies value is not atomic - you MUST NOT read it
921 * without sampling the sequence number in xtime_lock.
922 * jiffies is defined in the linker script...
923 */
924
925void do_timer(struct pt_regs *regs)
926{
927 jiffies_64++;
928 update_times();
929}
930
931#ifdef __ARCH_WANT_SYS_ALARM
932
933/*
934 * For backwards compatibility? This can be done in libc so Alpha
935 * and all newer ports shouldn't need it.
936 */
937asmlinkage unsigned long sys_alarm(unsigned int seconds)
938{
939 struct itimerval it_new, it_old;
940 unsigned int oldalarm;
941
942 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
943 it_new.it_value.tv_sec = seconds;
944 it_new.it_value.tv_usec = 0;
945 do_setitimer(ITIMER_REAL, &it_new, &it_old);
946 oldalarm = it_old.it_value.tv_sec;
947 /* ehhh.. We can't return 0 if we have an alarm pending.. */
948 /* And we'd better return too much than too little anyway */
949 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
950 oldalarm++;
951 return oldalarm;
952}
953
954#endif
955
956#ifndef __alpha__
957
958/*
959 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
960 * should be moved into arch/i386 instead?
961 */
962
963/**
964 * sys_getpid - return the thread group id of the current process
965 *
966 * Note, despite the name, this returns the tgid not the pid. The tgid and
967 * the pid are identical unless CLONE_THREAD was specified on clone() in
968 * which case the tgid is the same in all threads of the same group.
969 *
970 * This is SMP safe as current->tgid does not change.
971 */
972asmlinkage long sys_getpid(void)
973{
974 return current->tgid;
975}
976
977/*
978 * Accessing ->group_leader->real_parent is not SMP-safe, it could
979 * change from under us. However, rather than getting any lock
980 * we can use an optimistic algorithm: get the parent
981 * pid, and go back and check that the parent is still
982 * the same. If it has changed (which is extremely unlikely
983 * indeed), we just try again..
984 *
985 * NOTE! This depends on the fact that even if we _do_
986 * get an old value of "parent", we can happily dereference
987 * the pointer (it was and remains a dereferenceable kernel pointer
988 * no matter what): we just can't necessarily trust the result
989 * until we know that the parent pointer is valid.
990 *
991 * NOTE2: ->group_leader never changes from under us.
992 */
993asmlinkage long sys_getppid(void)
994{
995 int pid;
996 struct task_struct *me = current;
997 struct task_struct *parent;
998
999 parent = me->group_leader->real_parent;
1000 for (;;) {
1001 pid = parent->tgid;
1002#ifdef CONFIG_SMP
1003{
1004 struct task_struct *old = parent;
1005
1006 /*
1007 * Make sure we read the pid before re-reading the
1008 * parent pointer:
1009 */
1010 rmb();
1011 parent = me->group_leader->real_parent;
1012 if (old != parent)
1013 continue;
1014}
1015#endif
1016 break;
1017 }
1018 return pid;
1019}
1020
1021asmlinkage long sys_getuid(void)
1022{
1023 /* Only we change this so SMP safe */
1024 return current->uid;
1025}
1026
1027asmlinkage long sys_geteuid(void)
1028{
1029 /* Only we change this so SMP safe */
1030 return current->euid;
1031}
1032
1033asmlinkage long sys_getgid(void)
1034{
1035 /* Only we change this so SMP safe */
1036 return current->gid;
1037}
1038
1039asmlinkage long sys_getegid(void)
1040{
1041 /* Only we change this so SMP safe */
1042 return current->egid;
1043}
1044
1045#endif
1046
1047static void process_timeout(unsigned long __data)
1048{
1049 wake_up_process((task_t *)__data);
1050}
1051
1052/**
1053 * schedule_timeout - sleep until timeout
1054 * @timeout: timeout value in jiffies
1055 *
1056 * Make the current task sleep until @timeout jiffies have
1057 * elapsed. The routine will return immediately unless
1058 * the current task state has been set (see set_current_state()).
1059 *
1060 * You can set the task state as follows -
1061 *
1062 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1063 * pass before the routine returns. The routine will return 0.
1064 *
1065 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1066 * delivered to the current task. In this case the remaining time
1067 * in jiffies will be returned, or 0 if the timer expired in time
1068 *
1069 * The current task state is guaranteed to be TASK_RUNNING when this
1070 * routine returns.
1071 *
1072 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1073 * the CPU away without a bound on the timeout. In this case the return
1074 * value will be %MAX_SCHEDULE_TIMEOUT.
1075 *
1076 * In all cases the return value is guaranteed to be non-negative.
1077 */
1078fastcall signed long __sched schedule_timeout(signed long timeout)
1079{
1080 struct timer_list timer;
1081 unsigned long expire;
1082
1083 switch (timeout)
1084 {
1085 case MAX_SCHEDULE_TIMEOUT:
1086 /*
1087 * These two special cases are useful to keep the caller
1088 * comfortable. Nothing more. We could take MAX_SCHEDULE_TIMEOUT
1089 * from one of the negative values, but I'd like to return a
1090 * valid offset (>= 0) to allow the caller to do everything it
1091 * wants with the retval.
1092 */
1093 schedule();
1094 goto out;
1095 default:
1096 /*
1097 * Another bit of paranoia. Note that the retval will be
1098 * 0, since no piece of the kernel is supposed to check
1099 * for a negative retval of schedule_timeout() (it should
1100 * never happen anyway). You just have the printk() that
1101 * will tell you if something has gone wrong, and where.
1102 */
1103 if (timeout < 0)
1104 {
1105 printk(KERN_ERR "schedule_timeout: wrong timeout "
1106 "value %lx from %p\n", timeout,
1107 __builtin_return_address(0));
1108 current->state = TASK_RUNNING;
1109 goto out;
1110 }
1111 }
1112
1113 expire = timeout + jiffies;
1114
1115 init_timer(&timer);
1116 timer.expires = expire;
1117 timer.data = (unsigned long) current;
1118 timer.function = process_timeout;
1119
1120 add_timer(&timer);
1121 schedule();
1122 del_singleshot_timer_sync(&timer);
1123
1124 timeout = expire - jiffies;
1125
1126 out:
1127 return timeout < 0 ? 0 : timeout;
1128}
1129
1130EXPORT_SYMBOL(schedule_timeout);
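A minimal usage sketch (not part of this file) of the pattern the kernel-doc above describes: set the task state first, then call schedule_timeout():

static void sleep_one_second_example(void)
{
	signed long remaining;

	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(HZ);	/* sleep for up to one second */
	/* remaining == 0: the full second elapsed;
	 * remaining  > 0: woken early (e.g. by a signal), that many jiffies were left. */
}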
1131
1132/* Thread ID - the internal kernel "pid" */
1133asmlinkage long sys_gettid(void)
1134{
1135 return current->pid;
1136}
1137
1138static long __sched nanosleep_restart(struct restart_block *restart)
1139{
1140 unsigned long expire = restart->arg0, now = jiffies;
1141 struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
1142 long ret;
1143
1144 /* Did it expire while we handled signals? */
1145 if (!time_after(expire, now))
1146 return 0;
1147
1148 current->state = TASK_INTERRUPTIBLE;
1149 expire = schedule_timeout(expire - now);
1150
1151 ret = 0;
1152 if (expire) {
1153 struct timespec t;
1154 jiffies_to_timespec(expire, &t);
1155
1156 ret = -ERESTART_RESTARTBLOCK;
1157 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1158 ret = -EFAULT;
1159 /* The 'restart' block is already filled in */
1160 }
1161 return ret;
1162}
1163
1164asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1165{
1166 struct timespec t;
1167 unsigned long expire;
1168 long ret;
1169
1170 if (copy_from_user(&t, rqtp, sizeof(t)))
1171 return -EFAULT;
1172
1173 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
1174 return -EINVAL;
1175
1176 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1177 current->state = TASK_INTERRUPTIBLE;
1178 expire = schedule_timeout(expire);
1179
1180 ret = 0;
1181 if (expire) {
1182 struct restart_block *restart;
1183 jiffies_to_timespec(expire, &t);
1184 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1185 return -EFAULT;
1186
1187 restart = &current_thread_info()->restart_block;
1188 restart->fn = nanosleep_restart;
1189 restart->arg0 = jiffies + expire;
1190 restart->arg1 = (unsigned long) rmtp;
1191 ret = -ERESTART_RESTARTBLOCK;
1192 }
1193 return ret;
1194}
1195
1196/*
1197 * sys_sysinfo - fill in sysinfo struct
1198 */
1199asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1200{
1201 struct sysinfo val;
1202 unsigned long mem_total, sav_total;
1203 unsigned int mem_unit, bitcount;
1204 unsigned long seq;
1205
1206 memset((char *)&val, 0, sizeof(struct sysinfo));
1207
1208 do {
1209 struct timespec tp;
1210 seq = read_seqbegin(&xtime_lock);
1211
1212 /*
1213 * This is annoying. The below is the same thing
1214 * posix_get_clock_monotonic() does, but that wants to
1215 * take the lock, which we want to cover the load-average
1216 * reads too.
1217 */
1218
1219 getnstimeofday(&tp);
1220 tp.tv_sec += wall_to_monotonic.tv_sec;
1221 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1222 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1223 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1224 tp.tv_sec++;
1225 }
1226 val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1227
1228 val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
1229 val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1230 val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1231
1232 val.procs = nr_threads;
1233 } while (read_seqretry(&xtime_lock, seq));
1234
1235 si_meminfo(&val);
1236 si_swapinfo(&val);
1237
1238 /*
1239 * If the sum of all the available memory (i.e. ram + swap)
1240 * is less than can be stored in a 32 bit unsigned long then
1241 * we can be binary compatible with 2.2.x kernels. If not,
1242 * well, in that case 2.2.x was broken anyway...
1243 *
1244 * -Erik Andersen <andersee@debian.org>
1245 */
1246
1247 mem_total = val.totalram + val.totalswap;
1248 if (mem_total < val.totalram || mem_total < val.totalswap)
1249 goto out;
1250 bitcount = 0;
1251 mem_unit = val.mem_unit;
1252 while (mem_unit > 1) {
1253 bitcount++;
1254 mem_unit >>= 1;
1255 sav_total = mem_total;
1256 mem_total <<= 1;
1257 if (mem_total < sav_total)
1258 goto out;
1259 }
1260
1261 /*
1262 * If mem_total did not overflow, multiply all memory values by
1263 * val.mem_unit and set it to 1. This leaves things compatible
1264 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1265 * kernels...
1266 */
1267
1268 val.mem_unit = 1;
1269 val.totalram <<= bitcount;
1270 val.freeram <<= bitcount;
1271 val.sharedram <<= bitcount;
1272 val.bufferram <<= bitcount;
1273 val.totalswap <<= bitcount;
1274 val.freeswap <<= bitcount;
1275 val.totalhigh <<= bitcount;
1276 val.freehigh <<= bitcount;
1277
1278 out:
1279 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1280 return -EFAULT;
1281
1282 return 0;
1283}
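A worked example of the 2.2.x-compatibility scaling above (illustrative numbers only):

/*
 * With mem_unit == 4096 (one page) and totalram + totalswap == 393216
 * pages (1.5 GB), the loop runs 12 times and mem_total doubles up to
 * 393216 << 12 == 1610612736, which still fits in a 32-bit unsigned
 * long, so every field is shifted left by 12 and mem_unit is reported
 * as 1 (i.e. bytes).  If any doubling had overflowed, the values would
 * instead be left in page-sized units with mem_unit == 4096.
 */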
1284
1285static void __devinit init_timers_cpu(int cpu)
1286{
1287 int j;
1288 tvec_base_t *base;
1289
1290 base = &per_cpu(tvec_bases, cpu);
1291 spin_lock_init(&base->lock);
1292 for (j = 0; j < TVN_SIZE; j++) {
1293 INIT_LIST_HEAD(base->tv5.vec + j);
1294 INIT_LIST_HEAD(base->tv4.vec + j);
1295 INIT_LIST_HEAD(base->tv3.vec + j);
1296 INIT_LIST_HEAD(base->tv2.vec + j);
1297 }
1298 for (j = 0; j < TVR_SIZE; j++)
1299 INIT_LIST_HEAD(base->tv1.vec + j);
1300
1301 base->timer_jiffies = jiffies;
1302}
1303
1304#ifdef CONFIG_HOTPLUG_CPU
1305static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1306{
1307 struct timer_list *timer;
1308
1309 while (!list_empty(head)) {
1310 timer = list_entry(head->next, struct timer_list, entry);
1311 /* We're locking backwards from __mod_timer order here,
1312 beware deadlock. */
1313 if (!spin_trylock(&timer->lock))
1314 return 0;
1315 list_del(&timer->entry);
1316 internal_add_timer(new_base, timer);
1317 timer->base = new_base;
1318 spin_unlock(&timer->lock);
1319 }
1320 return 1;
1321}
1322
1323static void __devinit migrate_timers(int cpu)
1324{
1325 tvec_base_t *old_base;
1326 tvec_base_t *new_base;
1327 int i;
1328
1329 BUG_ON(cpu_online(cpu));
1330 old_base = &per_cpu(tvec_bases, cpu);
1331 new_base = &get_cpu_var(tvec_bases);
1332
1333 local_irq_disable();
1334again:
1335 /* Prevent deadlocks via ordering by old_base < new_base. */
1336 if (old_base < new_base) {
1337 spin_lock(&new_base->lock);
1338 spin_lock(&old_base->lock);
1339 } else {
1340 spin_lock(&old_base->lock);
1341 spin_lock(&new_base->lock);
1342 }
1343
1344 if (old_base->running_timer)
1345 BUG();
1346 for (i = 0; i < TVR_SIZE; i++)
1347 if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
1348 goto unlock_again;
1349 for (i = 0; i < TVN_SIZE; i++)
1350 if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
1351 || !migrate_timer_list(new_base, old_base->tv3.vec + i)
1352 || !migrate_timer_list(new_base, old_base->tv4.vec + i)
1353 || !migrate_timer_list(new_base, old_base->tv5.vec + i))
1354 goto unlock_again;
1355 spin_unlock(&old_base->lock);
1356 spin_unlock(&new_base->lock);
1357 local_irq_enable();
1358 put_cpu_var(tvec_bases);
1359 return;
1360
1361unlock_again:
1362 /* Avoid deadlock with __mod_timer, by backing off. */
1363 spin_unlock(&old_base->lock);
1364 spin_unlock(&new_base->lock);
1365 cpu_relax();
1366 goto again;
1367}
1368#endif /* CONFIG_HOTPLUG_CPU */
1369
1370static int __devinit timer_cpu_notify(struct notifier_block *self,
1371 unsigned long action, void *hcpu)
1372{
1373 long cpu = (long)hcpu;
1374 switch(action) {
1375 case CPU_UP_PREPARE:
1376 init_timers_cpu(cpu);
1377 break;
1378#ifdef CONFIG_HOTPLUG_CPU
1379 case CPU_DEAD:
1380 migrate_timers(cpu);
1381 break;
1382#endif
1383 default:
1384 break;
1385 }
1386 return NOTIFY_OK;
1387}
1388
1389static struct notifier_block __devinitdata timers_nb = {
1390 .notifier_call = timer_cpu_notify,
1391};
1392
1393
1394void __init init_timers(void)
1395{
1396 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1397 (void *)(long)smp_processor_id());
1398 register_cpu_notifier(&timers_nb);
1399 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1400}
1401
1402#ifdef CONFIG_TIME_INTERPOLATION
1403
1404struct time_interpolator *time_interpolator;
1405static struct time_interpolator *time_interpolator_list;
1406static DEFINE_SPINLOCK(time_interpolator_lock);
1407
1408static inline u64 time_interpolator_get_cycles(unsigned int src)
1409{
1410 unsigned long (*x)(void);
1411
1412 switch (src)
1413 {
1414 case TIME_SOURCE_FUNCTION:
1415 x = time_interpolator->addr;
1416 return x();
1417
1418 case TIME_SOURCE_MMIO64 :
1419 return readq((void __iomem *) time_interpolator->addr);
1420
1421 case TIME_SOURCE_MMIO32 :
1422 return readl((void __iomem *) time_interpolator->addr);
1423
1424 default: return get_cycles();
1425 }
1426}
1427
1428static inline u64 time_interpolator_get_counter(void)
1429{
1430 unsigned int src = time_interpolator->source;
1431
1432 if (time_interpolator->jitter)
1433 {
1434 u64 lcycle;
1435 u64 now;
1436
1437 do {
1438 lcycle = time_interpolator->last_cycle;
1439 now = time_interpolator_get_cycles(src);
1440 if (lcycle && time_after(lcycle, now))
1441 return lcycle;
1442 /* Keep track of the last timer value returned. The use of cmpxchg here
1443 * will cause contention in an SMP environment.
1444 */
1445 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
1446 return now;
1447 }
1448 else
1449 return time_interpolator_get_cycles(src);
1450}
1451
1452void time_interpolator_reset(void)
1453{
1454 time_interpolator->offset = 0;
1455 time_interpolator->last_counter = time_interpolator_get_counter();
1456}
1457
1458#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
1459
1460unsigned long time_interpolator_get_offset(void)
1461{
1462 /* If we do not have a time interpolator set up then just return zero */
1463 if (!time_interpolator)
1464 return 0;
1465
1466 return time_interpolator->offset +
1467 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
1468}
1469
1470#define INTERPOLATOR_ADJUST 65536
1471#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
1472
1473static void time_interpolator_update(long delta_nsec)
1474{
1475 u64 counter;
1476 unsigned long offset;
1477
1478 /* If there is no time interpolator set up then do nothing */
1479 if (!time_interpolator)
1480 return;
1481
1482 /* The interpolator compensates for late ticks by accumulating
1483 * the late time in time_interpolator->offset. A tick earlier than
1484 * expected will lead to a reset of the offset and a corresponding
1485 * jump of the clock forward. Again this only works if the
1486 * interpolator clock is running slightly slower than the regular clock
1487 * and the tuning logic ensures that.
1488 */
1489
1490 counter = time_interpolator_get_counter();
1491 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1492
1493 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1494 time_interpolator->offset = offset - delta_nsec;
1495 else {
1496 time_interpolator->skips++;
1497 time_interpolator->ns_skipped += delta_nsec - offset;
1498 time_interpolator->offset = 0;
1499 }
1500 time_interpolator->last_counter = counter;
1501
1502 /* Tuning logic for time interpolator invoked every minute or so.
1503 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
1504 * Increase interpolator clock speed if we skip too much time.
1505 */
1506 if (jiffies % INTERPOLATOR_ADJUST == 0)
1507 {
1508 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
1509 time_interpolator->nsec_per_cyc--;
1510 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1511 time_interpolator->nsec_per_cyc++;
1512 time_interpolator->skips = 0;
1513 time_interpolator->ns_skipped = 0;
1514 }
1515}
1516
1517static inline int
1518is_better_time_interpolator(struct time_interpolator *new)
1519{
1520 if (!time_interpolator)
1521 return 1;
1522 return new->frequency > 2*time_interpolator->frequency ||
1523 (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
1524}
1525
1526void
1527register_time_interpolator(struct time_interpolator *ti)
1528{
1529 unsigned long flags;
1530
1531 /* Sanity check */
1532 if (ti->frequency == 0 || ti->mask == 0)
1533 BUG();
1534
1535 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1536 spin_lock(&time_interpolator_lock);
1537 write_seqlock_irqsave(&xtime_lock, flags);
1538 if (is_better_time_interpolator(ti)) {
1539 time_interpolator = ti;
1540 time_interpolator_reset();
1541 }
1542 write_sequnlock_irqrestore(&xtime_lock, flags);
1543
1544 ti->next = time_interpolator_list;
1545 time_interpolator_list = ti;
1546 spin_unlock(&time_interpolator_lock);
1547}
1548
1549void
1550unregister_time_interpolator(struct time_interpolator *ti)
1551{
1552 struct time_interpolator *curr, **prev;
1553 unsigned long flags;
1554
1555 spin_lock(&time_interpolator_lock);
1556 prev = &time_interpolator_list;
1557 for (curr = *prev; curr; curr = curr->next) {
1558 if (curr == ti) {
1559 *prev = curr->next;
1560 break;
1561 }
1562 prev = &curr->next;
1563 }
1564
1565 write_seqlock_irqsave(&xtime_lock, flags);
1566 if (ti == time_interpolator) {
1567 /* we lost the best time-interpolator: */
1568 time_interpolator = NULL;
1569 /* find the next-best interpolator */
1570 for (curr = time_interpolator_list; curr; curr = curr->next)
1571 if (is_better_time_interpolator(curr))
1572 time_interpolator = curr;
1573 time_interpolator_reset();
1574 }
1575 write_sequnlock_irqrestore(&xtime_lock, flags);
1576 spin_unlock(&time_interpolator_lock);
1577}
1578#endif /* CONFIG_TIME_INTERPOLATION */
1579
1580/**
1581 * msleep - sleep safely even with waitqueue interruptions
1582 * @msecs: Time in milliseconds to sleep for
1583 */
1584void msleep(unsigned int msecs)
1585{
1586 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1587
1588 while (timeout) {
1589 set_current_state(TASK_UNINTERRUPTIBLE);
1590 timeout = schedule_timeout(timeout);
1591 }
1592}
1593
1594EXPORT_SYMBOL(msleep);
1595
1596/**
1597 * msleep_interruptible - sleep waiting for waitqueue interruptions
1598 * @msecs: Time in milliseconds to sleep for
1599 */
1600unsigned long msleep_interruptible(unsigned int msecs)
1601{
1602 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1603
1604 while (timeout && !signal_pending(current)) {
1605 set_current_state(TASK_INTERRUPTIBLE);
1606 timeout = schedule_timeout(timeout);
1607 }
1608 return jiffies_to_msecs(timeout);
1609}
1610
1611EXPORT_SYMBOL(msleep_interruptible);
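An illustrative arithmetic note on the "+ 1" in both helpers above:

/*
 * With HZ == 100 a jiffy is 10 ms, so msleep(1) computes
 * msecs_to_jiffies(1) + 1 == 2 ticks and sleeps for 10-20 ms depending
 * on where in the current tick the call lands: possibly longer than
 * asked for, but never shorter, which is the guarantee the extra jiffy buys.
 */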
diff --git a/kernel/uid16.c b/kernel/uid16.c
new file mode 100644
index 000000000000..f669941e8b26
--- /dev/null
+++ b/kernel/uid16.c
@@ -0,0 +1,196 @@
1/*
2 * Wrapper functions for 16-bit uid backwards compatibility. All nicely tied
3 * together in the faint hope we can take them out in five years' time.
4 */
5
6#include <linux/mm.h>
7#include <linux/utsname.h>
8#include <linux/mman.h>
9#include <linux/smp_lock.h>
10#include <linux/notifier.h>
11#include <linux/reboot.h>
12#include <linux/prctl.h>
13#include <linux/init.h>
14#include <linux/highuid.h>
15#include <linux/security.h>
16#include <linux/syscalls.h>
17
18#include <asm/uaccess.h>
19
20asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
21{
22 return sys_chown(filename, low2highuid(user), low2highgid(group));
23}
24
25asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
26{
27 return sys_lchown(filename, low2highuid(user), low2highgid(group));
28}
29
30asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
31{
32 return sys_fchown(fd, low2highuid(user), low2highgid(group));
33}
34
35asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
36{
37 return sys_setregid(low2highgid(rgid), low2highgid(egid));
38}
39
40asmlinkage long sys_setgid16(old_gid_t gid)
41{
42 return sys_setgid(low2highgid(gid));
43}
44
45asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
46{
47 return sys_setreuid(low2highuid(ruid), low2highuid(euid));
48}
49
50asmlinkage long sys_setuid16(old_uid_t uid)
51{
52 return sys_setuid(low2highuid(uid));
53}
54
55asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
56{
57 return sys_setresuid(low2highuid(ruid), low2highuid(euid),
58 low2highuid(suid));
59}
60
61asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
62{
63 int retval;
64
65 if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
66 !(retval = put_user(high2lowuid(current->euid), euid)))
67 retval = put_user(high2lowuid(current->suid), suid);
68
69 return retval;
70}
71
72asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
73{
74 return sys_setresgid(low2highgid(rgid), low2highgid(egid),
75 low2highgid(sgid));
76}
77
78asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
79{
80 int retval;
81
82 if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
83 !(retval = put_user(high2lowgid(current->egid), egid)))
84 retval = put_user(high2lowgid(current->sgid), sgid);
85
86 return retval;
87}
88
89asmlinkage long sys_setfsuid16(old_uid_t uid)
90{
91 return sys_setfsuid(low2highuid(uid));
92}
93
94asmlinkage long sys_setfsgid16(old_gid_t gid)
95{
96 return sys_setfsgid(low2highgid(gid));
97}
98
99static int groups16_to_user(old_gid_t __user *grouplist,
100 struct group_info *group_info)
101{
102 int i;
103 old_gid_t group;
104
105 for (i = 0; i < group_info->ngroups; i++) {
106 group = high2lowgid(GROUP_AT(group_info, i));
107 if (put_user(group, grouplist+i))
108 return -EFAULT;
109 }
110
111 return 0;
112}
113
114static int groups16_from_user(struct group_info *group_info,
115 old_gid_t __user *grouplist)
116{
117 int i;
118 old_gid_t group;
119
120 for (i = 0; i < group_info->ngroups; i++) {
121 if (get_user(group, grouplist+i))
122 return -EFAULT;
123 GROUP_AT(group_info, i) = low2highgid(group);
124 }
125
126 return 0;
127}
128
129asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
130{
131 int i = 0;
132
133 if (gidsetsize < 0)
134 return -EINVAL;
135
136 get_group_info(current->group_info);
137 i = current->group_info->ngroups;
138 if (gidsetsize) {
139 if (i > gidsetsize) {
140 i = -EINVAL;
141 goto out;
142 }
143 if (groups16_to_user(grouplist, current->group_info)) {
144 i = -EFAULT;
145 goto out;
146 }
147 }
148out:
149 put_group_info(current->group_info);
150 return i;
151}
152
153asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
154{
155 struct group_info *group_info;
156 int retval;
157
158 if (!capable(CAP_SETGID))
159 return -EPERM;
160 if ((unsigned)gidsetsize > NGROUPS_MAX)
161 return -EINVAL;
162
163 group_info = groups_alloc(gidsetsize);
164 if (!group_info)
165 return -ENOMEM;
166 retval = groups16_from_user(group_info, grouplist);
167 if (retval) {
168 put_group_info(group_info);
169 return retval;
170 }
171
172 retval = set_current_groups(group_info);
173 put_group_info(group_info);
174
175 return retval;
176}
177
178asmlinkage long sys_getuid16(void)
179{
180 return high2lowuid(current->uid);
181}
182
183asmlinkage long sys_geteuid16(void)
184{
185 return high2lowuid(current->euid);
186}
187
188asmlinkage long sys_getgid16(void)
189{
190 return high2lowgid(current->gid);
191}
192
193asmlinkage long sys_getegid16(void)
194{
195 return high2lowgid(current->egid);
196}
diff --git a/kernel/user.c b/kernel/user.c
new file mode 100644
index 000000000000..734575d55769
--- /dev/null
+++ b/kernel/user.c
@@ -0,0 +1,189 @@
1/*
2 * The "user cache".
3 *
4 * (C) Copyright 1991-2000 Linus Torvalds
5 *
6 * We have a per-user structure to keep track of how many
7 * processes, files etc the user has claimed, in order to be
8 * able to have per-user limits for system resources.
9 */
10
11#include <linux/init.h>
12#include <linux/sched.h>
13#include <linux/slab.h>
14#include <linux/bitops.h>
15#include <linux/key.h>
16
17/*
18 * UID task count cache, to get fast user lookup in "alloc_uid"
19 * when changing user IDs (i.e. setuid() and friends).
20 */
21
22#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
23#define UIDHASH_SZ (1 << UIDHASH_BITS)
24#define UIDHASH_MASK (UIDHASH_SZ - 1)
25#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
26#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
27
28static kmem_cache_t *uid_cachep;
29static struct list_head uidhash_table[UIDHASH_SZ];
30static DEFINE_SPINLOCK(uidhash_lock);
31
32struct user_struct root_user = {
33 .__count = ATOMIC_INIT(1),
34 .processes = ATOMIC_INIT(1),
35 .files = ATOMIC_INIT(0),
36 .sigpending = ATOMIC_INIT(0),
37 .mq_bytes = 0,
38 .locked_shm = 0,
39#ifdef CONFIG_KEYS
40 .uid_keyring = &root_user_keyring,
41 .session_keyring = &root_session_keyring,
42#endif
43};
44
45/*
46 * These routines must be called with the uidhash spinlock held!
47 */
48static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
49{
50 list_add(&up->uidhash_list, hashent);
51}
52
53static inline void uid_hash_remove(struct user_struct *up)
54{
55 list_del(&up->uidhash_list);
56}
57
58static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent)
59{
60 struct list_head *up;
61
62 list_for_each(up, hashent) {
63 struct user_struct *user;
64
65 user = list_entry(up, struct user_struct, uidhash_list);
66
67 if(user->uid == uid) {
68 atomic_inc(&user->__count);
69 return user;
70 }
71 }
72
73 return NULL;
74}
75
76/*
77 * Locate the user_struct for the passed UID. If found, take a ref on it. The
78 * caller must undo that ref with free_uid().
79 *
80 * If the user_struct could not be found, return NULL.
81 */
82struct user_struct *find_user(uid_t uid)
83{
84 struct user_struct *ret;
85
86 spin_lock(&uidhash_lock);
87 ret = uid_hash_find(uid, uidhashentry(uid));
88 spin_unlock(&uidhash_lock);
89 return ret;
90}
91
92void free_uid(struct user_struct *up)
93{
94 if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
95 uid_hash_remove(up);
96 key_put(up->uid_keyring);
97 key_put(up->session_keyring);
98 kmem_cache_free(uid_cachep, up);
99 spin_unlock(&uidhash_lock);
100 }
101}
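A minimal sketch of the reference-counting contract described above find_user() (illustrative; the function name and error handling are invented):

static int example_lookup(uid_t uid)
{
	struct user_struct *user = find_user(uid);

	if (!user)
		return -ENOENT;
	/* ... inspect or use 'user' here ... */
	free_uid(user);		/* drop the reference find_user() took */
	return 0;
}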
102
103struct user_struct * alloc_uid(uid_t uid)
104{
105 struct list_head *hashent = uidhashentry(uid);
106 struct user_struct *up;
107
108 spin_lock(&uidhash_lock);
109 up = uid_hash_find(uid, hashent);
110 spin_unlock(&uidhash_lock);
111
112 if (!up) {
113 struct user_struct *new;
114
115 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
116 if (!new)
117 return NULL;
118 new->uid = uid;
119 atomic_set(&new->__count, 1);
120 atomic_set(&new->processes, 0);
121 atomic_set(&new->files, 0);
122 atomic_set(&new->sigpending, 0);
123
124 new->mq_bytes = 0;
125 new->locked_shm = 0;
126
127 if (alloc_uid_keyring(new) < 0) {
128 kmem_cache_free(uid_cachep, new);
129 return NULL;
130 }
131
132 /*
133 * Before adding this, check whether we raced
134 * on adding the same user already..
135 */
136 spin_lock(&uidhash_lock);
137 up = uid_hash_find(uid, hashent);
138 if (up) {
139 key_put(new->uid_keyring);
140 key_put(new->session_keyring);
141 kmem_cache_free(uid_cachep, new);
142 } else {
143 uid_hash_insert(new, hashent);
144 up = new;
145 }
146 spin_unlock(&uidhash_lock);
147
148 }
149 return up;
150}
151
152void switch_uid(struct user_struct *new_user)
153{
154 struct user_struct *old_user;
155
156 /* What if a process setreuid()'s and this brings the
157 * new uid over his NPROC rlimit? We can check this now
158 * cheaply with the new uid cache, so if it matters
159 * we should be checking for it. -DaveM
160 */
161 old_user = current->user;
162 atomic_inc(&new_user->processes);
163 atomic_dec(&old_user->processes);
164 switch_uid_keyring(new_user);
165 current->user = new_user;
166 free_uid(old_user);
167 suid_keys(current);
168}
169
170
171static int __init uid_cache_init(void)
172{
173 int n;
174
175 uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
176 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
177
178 for(n = 0; n < UIDHASH_SZ; ++n)
179 INIT_LIST_HEAD(uidhash_table + n);
180
181 /* Insert the root user immediately (init already runs as root) */
182 spin_lock(&uidhash_lock);
183 uid_hash_insert(&root_user, uidhashentry(0));
184 spin_unlock(&uidhash_lock);
185
186 return 0;
187}
188
189module_init(uid_cache_init);
diff --git a/kernel/wait.c b/kernel/wait.c
new file mode 100644
index 000000000000..791681cfea98
--- /dev/null
+++ b/kernel/wait.c
@@ -0,0 +1,246 @@
1/*
2 * Generic waiting primitives.
3 *
4 * (C) 2004 William Irwin, Oracle
5 */
6#include <linux/config.h>
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/sched.h>
10#include <linux/mm.h>
11#include <linux/wait.h>
12#include <linux/hash.h>
13
14void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
15{
16 unsigned long flags;
17
18 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
19 spin_lock_irqsave(&q->lock, flags);
20 __add_wait_queue(q, wait);
21 spin_unlock_irqrestore(&q->lock, flags);
22}
23EXPORT_SYMBOL(add_wait_queue);
24
25void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
26{
27 unsigned long flags;
28
29 wait->flags |= WQ_FLAG_EXCLUSIVE;
30 spin_lock_irqsave(&q->lock, flags);
31 __add_wait_queue_tail(q, wait);
32 spin_unlock_irqrestore(&q->lock, flags);
33}
34EXPORT_SYMBOL(add_wait_queue_exclusive);
35
36void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
37{
38 unsigned long flags;
39
40 spin_lock_irqsave(&q->lock, flags);
41 __remove_wait_queue(q, wait);
42 spin_unlock_irqrestore(&q->lock, flags);
43}
44EXPORT_SYMBOL(remove_wait_queue);
45
46
47/*
48 * Note: we use "set_current_state()" _after_ the wait-queue add,
49 * because we need a memory barrier there on SMP, so that any
50 * wake-function that tests for the wait-queue being active
51 * will be guaranteed to see waitqueue addition _or_ subsequent
52 * tests in this thread will see the wakeup having taken place.
53 *
54 * The spin_unlock() itself is semi-permeable and only protects
55 * one way (it only protects stuff inside the critical region and
56 * stops them from bleeding out - it would still allow subsequent
57 * loads to move into the critical region).
58 */
59void fastcall
60prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
61{
62 unsigned long flags;
63
64 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
65 spin_lock_irqsave(&q->lock, flags);
66 if (list_empty(&wait->task_list))
67 __add_wait_queue(q, wait);
68 /*
69 * don't alter the task state if this is just going to
70 * queue an async wait queue callback
71 */
72 if (is_sync_wait(wait))
73 set_current_state(state);
74 spin_unlock_irqrestore(&q->lock, flags);
75}
76EXPORT_SYMBOL(prepare_to_wait);
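The canonical loop these helpers are designed for looks like the sketch below (illustrative; 'q' and 'condition' stand in for caller state):

static void example_wait_for_condition(wait_queue_head_t *q, volatile int *condition)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(q, &wait, TASK_UNINTERRUPTIBLE);
		if (*condition)
			break;
		schedule();
	}
	finish_wait(q, &wait);
}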
77
78void fastcall
79prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
80{
81 unsigned long flags;
82
83 wait->flags |= WQ_FLAG_EXCLUSIVE;
84 spin_lock_irqsave(&q->lock, flags);
85 if (list_empty(&wait->task_list))
86 __add_wait_queue_tail(q, wait);
87 /*
88 * don't alter the task state if this is just going to
89 * queue an async wait queue callback
90 */
91 if (is_sync_wait(wait))
92 set_current_state(state);
93 spin_unlock_irqrestore(&q->lock, flags);
94}
95EXPORT_SYMBOL(prepare_to_wait_exclusive);
96
97void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
98{
99 unsigned long flags;
100
101 __set_current_state(TASK_RUNNING);
102 /*
103 * We can check for list emptiness outside the lock
104 * IFF:
105 * - we use the "careful" check that verifies both
106 * the next and prev pointers, so that there cannot
107 * be any half-pending updates in progress on other
108 * CPUs that we haven't seen yet (and that might
109 * still change the stack area),
110 * and
111 * - all other users take the lock (ie we can only
112 * have _one_ other CPU that looks at or modifies
113 * the list).
114 */
115 if (!list_empty_careful(&wait->task_list)) {
116 spin_lock_irqsave(&q->lock, flags);
117 list_del_init(&wait->task_list);
118 spin_unlock_irqrestore(&q->lock, flags);
119 }
120}
121EXPORT_SYMBOL(finish_wait);
122
123int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
124{
125 int ret = default_wake_function(wait, mode, sync, key);
126
127 if (ret)
128 list_del_init(&wait->task_list);
129 return ret;
130}
131EXPORT_SYMBOL(autoremove_wake_function);
132
133int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
134{
135 struct wait_bit_key *key = arg;
136 struct wait_bit_queue *wait_bit
137 = container_of(wait, struct wait_bit_queue, wait);
138
139 if (wait_bit->key.flags != key->flags ||
140 wait_bit->key.bit_nr != key->bit_nr ||
141 test_bit(key->bit_nr, key->flags))
142 return 0;
143 else
144 return autoremove_wake_function(wait, mode, sync, key);
145}
146EXPORT_SYMBOL(wake_bit_function);
147
148/*
149 * To allow interruptible waiting and asynchronous (i.e. nonblocking)
150 * waiting, the action routines passed to __wait_on_bit() and
151 * __wait_on_bit_lock() may return nonzero codes, which halt waiting and are returned.
152 */
153int __sched fastcall
154__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
155 int (*action)(void *), unsigned mode)
156{
157 int ret = 0;
158
159 do {
160 prepare_to_wait(wq, &q->wait, mode);
161 if (test_bit(q->key.bit_nr, q->key.flags))
162 ret = (*action)(q->key.flags);
163 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
164 finish_wait(wq, &q->wait);
165 return ret;
166}
167EXPORT_SYMBOL(__wait_on_bit);
168
169int __sched fastcall out_of_line_wait_on_bit(void *word, int bit,
170 int (*action)(void *), unsigned mode)
171{
172 wait_queue_head_t *wq = bit_waitqueue(word, bit);
173 DEFINE_WAIT_BIT(wait, word, bit);
174
175 return __wait_on_bit(wq, &wait, action, mode);
176}
177EXPORT_SYMBOL(out_of_line_wait_on_bit);
178
179int __sched fastcall
180__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
181 int (*action)(void *), unsigned mode)
182{
183 int ret = 0;
184
185 do {
186 prepare_to_wait_exclusive(wq, &q->wait, mode);
187 if (test_bit(q->key.bit_nr, q->key.flags)) {
188 if ((ret = (*action)(q->key.flags)))
189 break;
190 }
191 } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
192 finish_wait(wq, &q->wait);
193 return ret;
194}
195EXPORT_SYMBOL(__wait_on_bit_lock);
196
197int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit,
198 int (*action)(void *), unsigned mode)
199{
200 wait_queue_head_t *wq = bit_waitqueue(word, bit);
201 DEFINE_WAIT_BIT(wait, word, bit);
202
203 return __wait_on_bit_lock(wq, &wait, action, mode);
204}
205EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
206
207void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
208{
209 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
210 if (waitqueue_active(wq))
211 __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key);
212}
213EXPORT_SYMBOL(__wake_up_bit);
214
215/**
216 * wake_up_bit - wake up a waiter on a bit
217 * @word: the word being waited on, a kernel virtual address
218 * @bit: the bit of the word being waited on
219 *
220 * There is a standard hashed waitqueue table for generic use. This
221 * is the part of the hashtable's accessor API that wakes up waiters
222 * on a bit. For instance, if one were to have waiters on a bitflag,
223 * one would call wake_up_bit() after clearing the bit.
224 *
225 * In order for this to function properly, as it uses waitqueue_active()
226 * internally, some kind of memory barrier must be done prior to calling
227 * this. Typically, this will be smp_mb__after_clear_bit(), but in some
228 * cases where bitflags are manipulated non-atomically under a lock, one
229 * may need to use a less regular barrier, such as fs/inode.c's smp_mb(),
230 * because spin_unlock() does not guarantee a memory barrier.
231 */
232void fastcall wake_up_bit(void *word, int bit)
233{
234 __wake_up_bit(bit_waitqueue(word, bit), word, bit);
235}
236EXPORT_SYMBOL(wake_up_bit);
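A usage sketch matching the barrier requirement in the comment above (illustrative; 'word' is some bitmask the caller owns and bit 0 is an arbitrary choice):

static void example_release_bit(unsigned long *word)
{
	clear_bit(0, word);
	smp_mb__after_clear_bit();	/* order the clear before waiters re-test it */
	wake_up_bit(word, 0);
}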
237
238fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit)
239{
240 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
241 const struct zone *zone = page_zone(virt_to_page(word));
242 unsigned long val = (unsigned long)word << shift | bit;
243
244 return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
245}
246EXPORT_SYMBOL(bit_waitqueue);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
new file mode 100644
index 000000000000..52ef419d2747
--- /dev/null
+++ b/kernel/workqueue.c
@@ -0,0 +1,555 @@
1/*
2 * linux/kernel/workqueue.c
3 *
4 * Generic mechanism for defining kernel helper threads for running
5 * arbitrary tasks in process context.
6 *
7 * Started by Ingo Molnar, Copyright (C) 2002
8 *
9 * Derived from the taskqueue/keventd code by:
10 *
11 * David Woodhouse <dwmw2@infradead.org>
12 * Andrew Morton <andrewm@uow.edu.au>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu>
15 */
16
17#include <linux/module.h>
18#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/init.h>
21#include <linux/signal.h>
22#include <linux/completion.h>
23#include <linux/workqueue.h>
24#include <linux/slab.h>
25#include <linux/cpu.h>
26#include <linux/notifier.h>
27#include <linux/kthread.h>
28
29/*
30 * The per-CPU workqueue (if single thread, we always use cpu 0's).
31 *
32 * The sequence counters are for flush_scheduled_work(). It wants to wait
33 * until all currently-scheduled works are completed, but it doesn't
34 * want to be livelocked by new, incoming ones. So it waits until
35 * remove_sequence is >= the insert_sequence which pertained when
36 * flush_scheduled_work() was called.
37 */
38struct cpu_workqueue_struct {
39
40 spinlock_t lock;
41
42 long remove_sequence; /* Least-recently added (next to run) */
43 long insert_sequence; /* Next to add */
44
45 struct list_head worklist;
46 wait_queue_head_t more_work;
47 wait_queue_head_t work_done;
48
49 struct workqueue_struct *wq;
50 task_t *thread;
51
52 int run_depth; /* Detect run_workqueue() recursion depth */
53} ____cacheline_aligned;
54
55/*
56 * The externally visible workqueue abstraction is an array of
57 * per-CPU workqueues:
58 */
59struct workqueue_struct {
60 struct cpu_workqueue_struct cpu_wq[NR_CPUS];
61 const char *name;
62 struct list_head list; /* Empty if single thread */
63};
64
65/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
66 threads to each one as cpus come/go. */
67static DEFINE_SPINLOCK(workqueue_lock);
68static LIST_HEAD(workqueues);
69
70/* If it's single threaded, it isn't in the list of workqueues. */
71static inline int is_single_threaded(struct workqueue_struct *wq)
72{
73 return list_empty(&wq->list);
74}
75
76/* Preempt must be disabled. */
77static void __queue_work(struct cpu_workqueue_struct *cwq,
78 struct work_struct *work)
79{
80 unsigned long flags;
81
82 spin_lock_irqsave(&cwq->lock, flags);
83 work->wq_data = cwq;
84 list_add_tail(&work->entry, &cwq->worklist);
85 cwq->insert_sequence++;
86 wake_up(&cwq->more_work);
87 spin_unlock_irqrestore(&cwq->lock, flags);
88}
89
90/*
91 * Queue work on a workqueue. Return non-zero if it was successfully
92 * added.
93 *
94 * We queue the work to the CPU on which it was submitted, but there is no
95 * guarantee that it will be processed by that CPU.
96 */
97int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
98{
99 int ret = 0, cpu = get_cpu();
100
101 if (!test_and_set_bit(0, &work->pending)) {
102 if (unlikely(is_single_threaded(wq)))
103 cpu = 0;
104 BUG_ON(!list_empty(&work->entry));
105 __queue_work(wq->cpu_wq + cpu, work);
106 ret = 1;
107 }
108 put_cpu();
109 return ret;
110}
111
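A minimal usage sketch for the API above (illustrative; my_work_fn and my_wq are invented names, following the void-pointer work function convention this file uses):

static void my_work_fn(void *data)
{
	/* runs later, in process context, in a worker thread */
}

static DECLARE_WORK(my_work, my_work_fn, NULL);

static void example_queue_and_flush(struct workqueue_struct *my_wq)
{
	queue_work(my_wq, &my_work);	/* returns 0 if already pending */
	flush_workqueue(my_wq);		/* wait until it has run */
}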
112static void delayed_work_timer_fn(unsigned long __data)
113{
114 struct work_struct *work = (struct work_struct *)__data;
115 struct workqueue_struct *wq = work->wq_data;
116 int cpu = smp_processor_id();
117
118 if (unlikely(is_single_threaded(wq)))
119 cpu = 0;
120
121 __queue_work(wq->cpu_wq + cpu, work);
122}
123
124int fastcall queue_delayed_work(struct workqueue_struct *wq,
125 struct work_struct *work, unsigned long delay)
126{
127 int ret = 0;
128 struct timer_list *timer = &work->timer;
129
130 if (!test_and_set_bit(0, &work->pending)) {
131 BUG_ON(timer_pending(timer));
132 BUG_ON(!list_empty(&work->entry));
133
134 /* This stores wq for the moment, for the timer_fn */
135 work->wq_data = wq;
136 timer->expires = jiffies + delay;
137 timer->data = (unsigned long)work;
138 timer->function = delayed_work_timer_fn;
139 add_timer(timer);
140 ret = 1;
141 }
142 return ret;
143}
144
145static inline void run_workqueue(struct cpu_workqueue_struct *cwq)
146{
147 unsigned long flags;
148
149 /*
150 * Keep taking off work from the queue until
151 * done.
152 */
153 spin_lock_irqsave(&cwq->lock, flags);
154 cwq->run_depth++;
155 if (cwq->run_depth > 3) {
156 /* morton gets to eat his hat */
157 printk("%s: recursion depth exceeded: %d\n",
158 __FUNCTION__, cwq->run_depth);
159 dump_stack();
160 }
161 while (!list_empty(&cwq->worklist)) {
162 struct work_struct *work = list_entry(cwq->worklist.next,
163 struct work_struct, entry);
164 void (*f) (void *) = work->func;
165 void *data = work->data;
166
167 list_del_init(cwq->worklist.next);
168 spin_unlock_irqrestore(&cwq->lock, flags);
169
170 BUG_ON(work->wq_data != cwq);
171 clear_bit(0, &work->pending);
172 f(data);
173
174 spin_lock_irqsave(&cwq->lock, flags);
175 cwq->remove_sequence++;
176 wake_up(&cwq->work_done);
177 }
178 cwq->run_depth--;
179 spin_unlock_irqrestore(&cwq->lock, flags);
180}
181
182static int worker_thread(void *__cwq)
183{
184 struct cpu_workqueue_struct *cwq = __cwq;
185 DECLARE_WAITQUEUE(wait, current);
186 struct k_sigaction sa;
187 sigset_t blocked;
188
189 current->flags |= PF_NOFREEZE;
190
191 set_user_nice(current, -5);
192
193 /* Block and flush all signals */
194 sigfillset(&blocked);
195 sigprocmask(SIG_BLOCK, &blocked, NULL);
196 flush_signals(current);
197
198 /* SIG_IGN makes children autoreap: see do_notify_parent(). */
199 sa.sa.sa_handler = SIG_IGN;
200 sa.sa.sa_flags = 0;
201 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
202 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
203
204 set_current_state(TASK_INTERRUPTIBLE);
205 while (!kthread_should_stop()) {
206 add_wait_queue(&cwq->more_work, &wait);
207 if (list_empty(&cwq->worklist))
208 schedule();
209 else
210 __set_current_state(TASK_RUNNING);
211 remove_wait_queue(&cwq->more_work, &wait);
212
213 if (!list_empty(&cwq->worklist))
214 run_workqueue(cwq);
215 set_current_state(TASK_INTERRUPTIBLE);
216 }
217 __set_current_state(TASK_RUNNING);
218 return 0;
219}
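
/*
 * Lifecycle sketch (illustrative): a thread written in the style of
 * worker_thread() is driven from the outside with the same kthread calls
 * used in this file.  example_threadfn and example_task are assumptions.
 */
static struct task_struct *example_task;

static int example_threadfn(void *unused)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();			/* sleep until woken */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int example_start(void)
{
	example_task = kthread_create(example_threadfn, NULL, "example");
	if (IS_ERR(example_task))
		return PTR_ERR(example_task);
	wake_up_process(example_task);
	return 0;
}

static void example_stop(void)
{
	kthread_stop(example_task);	/* wakes the thread, waits for it to exit */
}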
220
221static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
222{
223 if (cwq->thread == current) {
224 /*
225 * Probably keventd trying to flush its own queue. So simply run
226 * it by hand rather than deadlocking.
227 */
228 run_workqueue(cwq);
229 } else {
230 DEFINE_WAIT(wait);
231 long sequence_needed;
232
233 spin_lock_irq(&cwq->lock);
234 sequence_needed = cwq->insert_sequence;
235
236 while (sequence_needed - cwq->remove_sequence > 0) {
237 prepare_to_wait(&cwq->work_done, &wait,
238 TASK_UNINTERRUPTIBLE);
239 spin_unlock_irq(&cwq->lock);
240 schedule();
241 spin_lock_irq(&cwq->lock);
242 }
243 finish_wait(&cwq->work_done, &wait);
244 spin_unlock_irq(&cwq->lock);
245 }
246}
247
248/*
249 * flush_workqueue - ensure that any scheduled work has run to completion.
250 *
251 * Blocks until all work queued on the workqueue at the time of the call
252 * has completed.  This is typically used in driver shutdown handlers.
253 *
254 * This function samples each per-CPU workqueue's insert_sequence number and
255 * sleeps until its remove_sequence has caught up with that value.  This
256 * means that we sleep until all work which was queued on entry has been
257 * handled, but we are not livelocked by work queued afterwards.
258 *
259 * This function used to run the workqueues itself. Now we just wait for the
260 * helper threads to do it.
261 */
262void fastcall flush_workqueue(struct workqueue_struct *wq)
263{
264 might_sleep();
265
266 if (is_single_threaded(wq)) {
267 /* Always use cpu 0's area. */
268 flush_cpu_workqueue(wq->cpu_wq + 0);
269 } else {
270 int cpu;
271
272 lock_cpu_hotplug();
273 for_each_online_cpu(cpu)
274 flush_cpu_workqueue(wq->cpu_wq + cpu);
275 unlock_cpu_hotplug();
276 }
277}
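
/*
 * Shutdown sketch (illustrative): a driver's remove path stops submitting
 * new work and then waits for everything already queued to finish before
 * freeing the data the handler touches.  Continues the example_* names.
 */
static void example_quiesce(struct workqueue_struct *example_wq)
{
	/* ...disable whatever source (irq, timer) queues example_work... */
	flush_workqueue(example_wq);
	/* example_handler() is no longer running and will not run again. */
}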
278
279static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
280 int cpu)
281{
282 struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
283 struct task_struct *p;
284
285 spin_lock_init(&cwq->lock);
286 cwq->wq = wq;
287 cwq->thread = NULL;
288 cwq->insert_sequence = 0;
289 cwq->remove_sequence = 0;
290 INIT_LIST_HEAD(&cwq->worklist);
291 init_waitqueue_head(&cwq->more_work);
292 init_waitqueue_head(&cwq->work_done);
293
294 if (is_single_threaded(wq))
295 p = kthread_create(worker_thread, cwq, "%s", wq->name);
296 else
297 p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
298 if (IS_ERR(p))
299 return NULL;
300 cwq->thread = p;
301 return p;
302}
303
304struct workqueue_struct *__create_workqueue(const char *name,
305 int singlethread)
306{
307 int cpu, destroy = 0;
308 struct workqueue_struct *wq;
309 struct task_struct *p;
310
311 BUG_ON(strlen(name) > 10);
312
313 wq = kmalloc(sizeof(*wq), GFP_KERNEL);
314 if (!wq)
315 return NULL;
316 memset(wq, 0, sizeof(*wq));
317
318 wq->name = name;
319 /* We don't need the distraction of CPUs appearing and vanishing. */
320 lock_cpu_hotplug();
321 if (singlethread) {
322 INIT_LIST_HEAD(&wq->list);
323 p = create_workqueue_thread(wq, 0);
324 if (!p)
325 destroy = 1;
326 else
327 wake_up_process(p);
328 } else {
329 spin_lock(&workqueue_lock);
330 list_add(&wq->list, &workqueues);
331 spin_unlock(&workqueue_lock);
332 for_each_online_cpu(cpu) {
333 p = create_workqueue_thread(wq, cpu);
334 if (p) {
335 kthread_bind(p, cpu);
336 wake_up_process(p);
337 } else
338 destroy = 1;
339 }
340 }
341 unlock_cpu_hotplug();
342
343 /*
344	 * Did any worker thread fail to start?  If so, clean everything up:
345 */
346 if (destroy) {
347 destroy_workqueue(wq);
348 wq = NULL;
349 }
350 return wq;
351}
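
/*
 * Creation sketch (illustrative): callers normally reach __create_workqueue()
 * through the create_workqueue() wrapper, as init_workqueues() does below;
 * passing singlethread=1 directly gives one worker for all CPUs.  Checking
 * for NULL is the caller's job.  example_wq/example_init are assumptions.
 */
static struct workqueue_struct *example_wq;

static int example_init(void)
{
	example_wq = create_workqueue("example");	/* one worker per online CPU */
	if (!example_wq)
		return -ENOMEM;
	return 0;
}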
352
353static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
354{
355 struct cpu_workqueue_struct *cwq;
356 unsigned long flags;
357 struct task_struct *p;
358
359 cwq = wq->cpu_wq + cpu;
360 spin_lock_irqsave(&cwq->lock, flags);
361 p = cwq->thread;
362 cwq->thread = NULL;
363 spin_unlock_irqrestore(&cwq->lock, flags);
364 if (p)
365 kthread_stop(p);
366}
367
368void destroy_workqueue(struct workqueue_struct *wq)
369{
370 int cpu;
371
372 flush_workqueue(wq);
373
374 /* We don't need the distraction of CPUs appearing and vanishing. */
375 lock_cpu_hotplug();
376 if (is_single_threaded(wq))
377 cleanup_workqueue_thread(wq, 0);
378 else {
379 for_each_online_cpu(cpu)
380 cleanup_workqueue_thread(wq, cpu);
381 spin_lock(&workqueue_lock);
382 list_del(&wq->list);
383 spin_unlock(&workqueue_lock);
384 }
385 unlock_cpu_hotplug();
386 kfree(wq);
387}
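
/*
 * Matching teardown sketch (illustrative): destroy_workqueue() flushes and
 * stops the worker threads itself, so the caller only has to guarantee that
 * nothing queues new work from this point on.
 */
static void example_fini(void)
{
	/* ...make sure no irq handler or timer still calls example_submit()... */
	destroy_workqueue(example_wq);
	example_wq = NULL;
}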
388
389static struct workqueue_struct *keventd_wq;
390
391int fastcall schedule_work(struct work_struct *work)
392{
393 return queue_work(keventd_wq, work);
394}
395
396int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
397{
398 return queue_delayed_work(keventd_wq, work, delay);
399}
400
401int schedule_delayed_work_on(int cpu,
402 struct work_struct *work, unsigned long delay)
403{
404 int ret = 0;
405 struct timer_list *timer = &work->timer;
406
407 if (!test_and_set_bit(0, &work->pending)) {
408 BUG_ON(timer_pending(timer));
409 BUG_ON(!list_empty(&work->entry));
410		/* Stash keventd_wq in wq_data until delayed_work_timer_fn picks it up */
411 work->wq_data = keventd_wq;
412 timer->expires = jiffies + delay;
413 timer->data = (unsigned long)work;
414 timer->function = delayed_work_timer_fn;
415 add_timer_on(timer, cpu);
416 ret = 1;
417 }
418 return ret;
419}
420
421void flush_scheduled_work(void)
422{
423 flush_workqueue(keventd_wq);
424}
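
/*
 * Shared-queue sketch (illustrative): code without a private workqueue can
 * use the system-wide keventd ("events") threads instead.  example_work is
 * the item from the earlier sketch; only one of these calls may be made
 * while it is still pending.
 */
static void example_run_soon(void)
{
	schedule_work(&example_work);			/* this CPU's events thread */
}

static void example_run_later(void)
{
	schedule_delayed_work(&example_work, 5 * HZ);	/* about five seconds */
}

static void example_wait_for_keventd(void)
{
	flush_scheduled_work();		/* e.g. before module unload */
}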
425
426/**
427 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
428 * work whose handler rearms the delayed work.
429 * @wq: the controlling workqueue structure
430 * @work: the delayed work struct
431 */
432static void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
433 struct work_struct *work)
434{
435 while (!cancel_delayed_work(work))
436 flush_workqueue(wq);
437}
438
439/**
440 * cancel_rearming_delayed_work - reliably kill off a delayed keventd
441 * work whose handler rearms the delayed work.
442 * @work: the delayed work struct
443 */
444void cancel_rearming_delayed_work(struct work_struct *work)
445{
446 cancel_rearming_delayed_workqueue(keventd_wq, work);
447}
448EXPORT_SYMBOL(cancel_rearming_delayed_work);
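
/*
 * Rearming sketch (illustrative): a handler that requeues itself cannot be
 * stopped reliably with cancel_delayed_work() alone, since it may rearm
 * between the cancel and a flush; cancel_rearming_delayed_work() loops until
 * the cancel wins.  The example_poll* names are assumptions.
 */
static void example_poll(void *data);
static DECLARE_WORK(example_poll_work, example_poll, NULL);

static void example_poll(void *data)
{
	/* ...sample some device state... */
	schedule_delayed_work(&example_poll_work, HZ);	/* rearm: again in ~1s */
}

static void example_poll_stop(void)
{
	cancel_rearming_delayed_work(&example_poll_work);
	/* The work is no longer pending and will not rearm itself again. */
}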
449
450int keventd_up(void)
451{
452 return keventd_wq != NULL;
453}
454
455int current_is_keventd(void)
456{
457 struct cpu_workqueue_struct *cwq;
458 int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */
459 int ret = 0;
460
461 BUG_ON(!keventd_wq);
462
463 cwq = keventd_wq->cpu_wq + cpu;
464 if (current == cwq->thread)
465 ret = 1;
466
467 return ret;
468
469}
470
471#ifdef CONFIG_HOTPLUG_CPU
472/* Take the work from this (downed) CPU. */
473static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
474{
475 struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
476 LIST_HEAD(list);
477 struct work_struct *work;
478
479 spin_lock_irq(&cwq->lock);
480 list_splice_init(&cwq->worklist, &list);
481
482 while (!list_empty(&list)) {
483 printk("Taking work for %s\n", wq->name);
484		work = list_entry(list.next, struct work_struct, entry);
485 list_del(&work->entry);
486 __queue_work(wq->cpu_wq + smp_processor_id(), work);
487 }
488 spin_unlock_irq(&cwq->lock);
489}
490
491/* We're holding the cpucontrol mutex here */
492static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
493 unsigned long action,
494 void *hcpu)
495{
496 unsigned int hotcpu = (unsigned long)hcpu;
497 struct workqueue_struct *wq;
498
499 switch (action) {
500 case CPU_UP_PREPARE:
501 /* Create a new workqueue thread for it. */
502 list_for_each_entry(wq, &workqueues, list) {
503			if (!create_workqueue_thread(wq, hotcpu)) {
504 printk("workqueue for %i failed\n", hotcpu);
505 return NOTIFY_BAD;
506 }
507 }
508 break;
509
510 case CPU_ONLINE:
511 /* Kick off worker threads. */
512 list_for_each_entry(wq, &workqueues, list) {
513 kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu);
514 wake_up_process(wq->cpu_wq[hotcpu].thread);
515 }
516 break;
517
518 case CPU_UP_CANCELED:
519 list_for_each_entry(wq, &workqueues, list) {
520			/* Rebind to an online CPU so the thread can run and be stopped. */
521 kthread_bind(wq->cpu_wq[hotcpu].thread,
522 smp_processor_id());
523 cleanup_workqueue_thread(wq, hotcpu);
524 }
525 break;
526
527 case CPU_DEAD:
528 list_for_each_entry(wq, &workqueues, list)
529 cleanup_workqueue_thread(wq, hotcpu);
530 list_for_each_entry(wq, &workqueues, list)
531 take_over_work(wq, hotcpu);
532 break;
533 }
534
535 return NOTIFY_OK;
536}
537#endif
538
539void init_workqueues(void)
540{
541 hotcpu_notifier(workqueue_cpu_callback, 0);
542 keventd_wq = create_workqueue("events");
543 BUG_ON(!keventd_wq);
544}
545
546EXPORT_SYMBOL_GPL(__create_workqueue);
547EXPORT_SYMBOL_GPL(queue_work);
548EXPORT_SYMBOL_GPL(queue_delayed_work);
549EXPORT_SYMBOL_GPL(flush_workqueue);
550EXPORT_SYMBOL_GPL(destroy_workqueue);
551
552EXPORT_SYMBOL(schedule_work);
553EXPORT_SYMBOL(schedule_delayed_work);
554EXPORT_SYMBOL(schedule_delayed_work_on);
555EXPORT_SYMBOL(flush_scheduled_work);