author     Linus Torvalds <torvalds@ppc970.osdl.org>   2005-04-16 18:20:36 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>   2005-04-16 18:20:36 -0400
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d /kernel
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'kernel')
67 files changed, 40718 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 000000000000..eb88b446c2cc
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,53 @@
1 | # | ||
2 | # Makefile for the linux kernel. | ||
3 | # | ||
4 | |||
5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | ||
6 | exit.o itimer.o time.o softirq.o resource.o \ | ||
7 | sysctl.o capability.o ptrace.o timer.o user.o \ | ||
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | ||
9 | rcupdate.o intermodule.o extable.o params.o posix-timers.o \ | ||
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o | ||
11 | |||
12 | obj-$(CONFIG_FUTEX) += futex.o | ||
13 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | ||
14 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | ||
15 | obj-$(CONFIG_UID16) += uid16.o | ||
16 | obj-$(CONFIG_MODULES) += module.o | ||
17 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | ||
18 | obj-$(CONFIG_PM) += power/ | ||
19 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | ||
20 | obj-$(CONFIG_COMPAT) += compat.o | ||
21 | obj-$(CONFIG_CPUSETS) += cpuset.o | ||
22 | obj-$(CONFIG_IKCONFIG) += configs.o | ||
23 | obj-$(CONFIG_IKCONFIG_PROC) += configs.o | ||
24 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | ||
25 | obj-$(CONFIG_AUDIT) += audit.o | ||
26 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | ||
27 | obj-$(CONFIG_KPROBES) += kprobes.o | ||
28 | obj-$(CONFIG_SYSFS) += ksysfs.o | ||
29 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | ||
30 | obj-$(CONFIG_SECCOMP) += seccomp.o | ||
31 | |||
32 | ifneq ($(CONFIG_IA64),y) | ||
33 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
34 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
35 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
36 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
37 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
38 | CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer | ||
39 | endif | ||
40 | |||
41 | $(obj)/configs.o: $(obj)/config_data.h | ||
42 | |||
43 | # config_data.h contains the same information as ikconfig.h but gzipped. | ||
44 | # Info from config_data can be extracted from /proc/config* | ||
45 | targets += config_data.gz | ||
46 | $(obj)/config_data.gz: .config FORCE | ||
47 | $(call if_changed,gzip) | ||
48 | |||
49 | quiet_cmd_ikconfiggz = IKCFG $@ | ||
50 | cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ | ||
51 | targets += config_data.h | ||
52 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE | ||
53 | $(call if_changed,ikconfiggz) | ||
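The last rule above generates $(obj)/config_data.h from the gzipped .config by piping it through scripts/bin2c and wrapping the bytes between MAGIC_START and MAGIC_END. A rough sketch of what the generated header ends up looking like (the byte values are placeholders, and the marker macros are assumed to be supplied by whichever file includes the header, which is outside this diff):

	/*
	 * Sketch of the generated config_data.h -- placeholder bytes only.
	 * MAGIC_START and MAGIC_END are expected to be string-literal macros
	 * defined by the includer (not part of this diff).
	 */
	static const char kernel_config_data[] = MAGIC_START
	"\x1f\x8b\x08\x00"	/* gzip magic and deflate method byte of the compressed .config */
	/* ... rest of the gzip stream, emitted as string escapes by scripts/bin2c ... */
	MAGIC_END;

With CONFIG_IKCONFIG_PROC enabled, this same data is what backs /proc/config*, as the comment in the Makefile notes.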
diff --git a/kernel/acct.c b/kernel/acct.c
new file mode 100644
index 000000000000..4168f631868e
--- /dev/null
+++ b/kernel/acct.c
@@ -0,0 +1,561 @@
1 | /* | ||
2 | * linux/kernel/acct.c | ||
3 | * | ||
4 | * BSD Process Accounting for Linux | ||
5 | * | ||
6 | * Author: Marco van Wieringen <mvw@planets.elm.net> | ||
7 | * | ||
8 | * Some code based on ideas and code from: | ||
9 | * Thomas K. Dyas <tdyas@eden.rutgers.edu> | ||
10 | * | ||
11 | * This file implements BSD-style process accounting. Whenever any | ||
12 | * process exits, an accounting record of type "struct acct" is | ||
13 | * written to the file specified with the acct() system call. It is | ||
14 | * up to user-level programs to do useful things with the accounting | ||
15 | * log. The kernel just provides the raw accounting information. | ||
16 | * | ||
17 | * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V. | ||
18 | * | ||
19 | * Plugged two leaks. 1) It didn't return acct_file into the free_filps if | ||
20 | * the file happened to be read-only. 2) If the accounting was suspended | ||
21 | * due to the lack of space it happily allowed reopening it and completely | ||
22 | * lost the old acct_file. 3/10/98, Al Viro. | ||
23 | * | ||
24 | * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). | ||
25 | * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. | ||
26 | * | ||
27 | * Fixed a nasty interaction with sys_umount(). If the accounting | ||
28 | * was suspended we failed to stop it on umount(). Messy. | ||
29 | * Another one: remount to readonly didn't stop accounting. | ||
30 | * Question: what should we do if we have CAP_SYS_ADMIN but not | ||
31 | * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY | ||
32 | * unless we are messing with the root. In that case we are getting a | ||
33 | * real mess with do_remount_sb(). 9/11/98, AV. | ||
34 | * | ||
35 | * Fixed a bunch of races (and pair of leaks). Probably not the best way, | ||
36 | * but this one obviously doesn't introduce deadlocks. Later. BTW, found | ||
37 | * one race (and leak) in BSD implementation. | ||
38 | * OK, that's better. ANOTHER race and leak in BSD variant. There always | ||
39 | * is one more bug... 10/11/98, AV. | ||
40 | * | ||
41 | * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold | ||
42 | * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks | ||
43 | * a struct file opened for write. Fixed. 2/6/2000, AV. | ||
44 | */ | ||
45 | |||
46 | #include <linux/config.h> | ||
47 | #include <linux/mm.h> | ||
48 | #include <linux/slab.h> | ||
49 | #include <linux/acct.h> | ||
50 | #include <linux/file.h> | ||
51 | #include <linux/tty.h> | ||
52 | #include <linux/security.h> | ||
53 | #include <linux/vfs.h> | ||
54 | #include <linux/jiffies.h> | ||
55 | #include <linux/times.h> | ||
56 | #include <linux/syscalls.h> | ||
57 | #include <asm/uaccess.h> | ||
58 | #include <asm/div64.h> | ||
59 | #include <linux/blkdev.h> /* sector_div */ | ||
60 | |||
61 | /* | ||
62 | * These constants control the free-space thresholds that suspend and | ||
63 | * resume the process accounting system, and the time delay between | ||
64 | * each check. | ||
65 | * Turned into sysctl-controllable parameters. AV, 12/11/98 | ||
66 | */ | ||
67 | |||
68 | int acct_parm[3] = {4, 2, 30}; | ||
69 | #define RESUME (acct_parm[0]) /* >foo% free space - resume */ | ||
70 | #define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */ | ||
71 | #define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ | ||
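	/*
	 * Worked example with the defaults above (acct_parm = {4, 2, 30}):
	 * check_free_space() below suspends accounting once the filesystem
	 * holding the log has 2% or less of its blocks available, resumes it
	 * once 4% or more are available again, and re-arms its timer so the
	 * check is repeated every 30 seconds (ACCT_TIMEOUT * HZ jiffies).
	 */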
72 | |||
73 | /* | ||
74 | * External references and all of the globals. | ||
75 | */ | ||
76 | static void do_acct_process(long, struct file *); | ||
77 | |||
78 | /* | ||
79 | * This structure is used so that all the data protected by lock | ||
80 | * can be placed in the same cache line as the lock. This primes | ||
81 | * the cache line to have the data after getting the lock. | ||
82 | */ | ||
83 | struct acct_glbs { | ||
84 | spinlock_t lock; | ||
85 | volatile int active; | ||
86 | volatile int needcheck; | ||
87 | struct file *file; | ||
88 | struct timer_list timer; | ||
89 | }; | ||
90 | |||
91 | static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; | ||
92 | |||
93 | /* | ||
94 | * Called whenever the timer says to check the free space. | ||
95 | */ | ||
96 | static void acct_timeout(unsigned long unused) | ||
97 | { | ||
98 | acct_globals.needcheck = 1; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Check the amount of free space and suspend/resume accordingly. | ||
103 | */ | ||
104 | static int check_free_space(struct file *file) | ||
105 | { | ||
106 | struct kstatfs sbuf; | ||
107 | int res; | ||
108 | int act; | ||
109 | sector_t resume; | ||
110 | sector_t suspend; | ||
111 | |||
112 | spin_lock(&acct_globals.lock); | ||
113 | res = acct_globals.active; | ||
114 | if (!file || !acct_globals.needcheck) | ||
115 | goto out; | ||
116 | spin_unlock(&acct_globals.lock); | ||
117 | |||
118 | /* May block */ | ||
119 | if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) | ||
120 | return res; | ||
121 | suspend = sbuf.f_blocks * SUSPEND; | ||
122 | resume = sbuf.f_blocks * RESUME; | ||
123 | |||
124 | sector_div(suspend, 100); | ||
125 | sector_div(resume, 100); | ||
126 | |||
127 | if (sbuf.f_bavail <= suspend) | ||
128 | act = -1; | ||
129 | else if (sbuf.f_bavail >= resume) | ||
130 | act = 1; | ||
131 | else | ||
132 | act = 0; | ||
133 | |||
134 | /* | ||
135 | * If some joker switched acct_globals.file under us we'd better be | ||
136 | * silent and _not_ touch anything. | ||
137 | */ | ||
138 | spin_lock(&acct_globals.lock); | ||
139 | if (file != acct_globals.file) { | ||
140 | if (act) | ||
141 | res = act>0; | ||
142 | goto out; | ||
143 | } | ||
144 | |||
145 | if (acct_globals.active) { | ||
146 | if (act < 0) { | ||
147 | acct_globals.active = 0; | ||
148 | printk(KERN_INFO "Process accounting paused\n"); | ||
149 | } | ||
150 | } else { | ||
151 | if (act > 0) { | ||
152 | acct_globals.active = 1; | ||
153 | printk(KERN_INFO "Process accounting resumed\n"); | ||
154 | } | ||
155 | } | ||
156 | |||
157 | del_timer(&acct_globals.timer); | ||
158 | acct_globals.needcheck = 0; | ||
159 | acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; | ||
160 | add_timer(&acct_globals.timer); | ||
161 | res = acct_globals.active; | ||
162 | out: | ||
163 | spin_unlock(&acct_globals.lock); | ||
164 | return res; | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Close the old accounting file (if currently open) and then replace | ||
169 | * it with file (if non-NULL). | ||
170 | * | ||
171 | * NOTE: acct_globals.lock MUST be held on entry and exit. | ||
172 | */ | ||
173 | static void acct_file_reopen(struct file *file) | ||
174 | { | ||
175 | struct file *old_acct = NULL; | ||
176 | |||
177 | if (acct_globals.file) { | ||
178 | old_acct = acct_globals.file; | ||
179 | del_timer(&acct_globals.timer); | ||
180 | acct_globals.active = 0; | ||
181 | acct_globals.needcheck = 0; | ||
182 | acct_globals.file = NULL; | ||
183 | } | ||
184 | if (file) { | ||
185 | acct_globals.file = file; | ||
186 | acct_globals.needcheck = 0; | ||
187 | acct_globals.active = 1; | ||
188 | /* It's been deleted if it was used before so this is safe */ | ||
189 | init_timer(&acct_globals.timer); | ||
190 | acct_globals.timer.function = acct_timeout; | ||
191 | acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; | ||
192 | add_timer(&acct_globals.timer); | ||
193 | } | ||
194 | if (old_acct) { | ||
195 | spin_unlock(&acct_globals.lock); | ||
196 | do_acct_process(0, old_acct); | ||
197 | filp_close(old_acct, NULL); | ||
198 | spin_lock(&acct_globals.lock); | ||
199 | } | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * sys_acct() is the only system call needed to implement process | ||
204 | * accounting. It takes the name of the file where accounting records | ||
205 | * should be written. If the filename is NULL, accounting will be | ||
206 | * shut down. | ||
207 | */ | ||
208 | asmlinkage long sys_acct(const char __user *name) | ||
209 | { | ||
210 | struct file *file = NULL; | ||
211 | char *tmp; | ||
212 | int error; | ||
213 | |||
214 | if (!capable(CAP_SYS_PACCT)) | ||
215 | return -EPERM; | ||
216 | |||
217 | if (name) { | ||
218 | tmp = getname(name); | ||
219 | if (IS_ERR(tmp)) { | ||
220 | return (PTR_ERR(tmp)); | ||
221 | } | ||
222 | /* Difference from BSD - they don't do O_APPEND */ | ||
223 | file = filp_open(tmp, O_WRONLY|O_APPEND, 0); | ||
224 | putname(tmp); | ||
225 | if (IS_ERR(file)) { | ||
226 | return (PTR_ERR(file)); | ||
227 | } | ||
228 | if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { | ||
229 | filp_close(file, NULL); | ||
230 | return (-EACCES); | ||
231 | } | ||
232 | |||
233 | if (!file->f_op->write) { | ||
234 | filp_close(file, NULL); | ||
235 | return (-EIO); | ||
236 | } | ||
237 | } | ||
238 | |||
239 | error = security_acct(file); | ||
240 | if (error) { | ||
241 | if (file) | ||
242 | filp_close(file, NULL); | ||
243 | return error; | ||
244 | } | ||
245 | |||
246 | spin_lock(&acct_globals.lock); | ||
247 | acct_file_reopen(file); | ||
248 | spin_unlock(&acct_globals.lock); | ||
249 | |||
250 | return (0); | ||
251 | } | ||
252 | |||
253 | /* | ||
254 | * If the accounting is turned on for a file in the filesystem pointed | ||
255 | * to by sb, turn accounting off. | ||
256 | */ | ||
257 | void acct_auto_close(struct super_block *sb) | ||
258 | { | ||
259 | spin_lock(&acct_globals.lock); | ||
260 | if (acct_globals.file && | ||
261 | acct_globals.file->f_dentry->d_inode->i_sb == sb) { | ||
262 | acct_file_reopen((struct file *)NULL); | ||
263 | } | ||
264 | spin_unlock(&acct_globals.lock); | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * encode an unsigned long into a comp_t | ||
269 | * | ||
270 | * This routine has been adopted from the encode_comp_t() function in | ||
271 | * the kern_acct.c file of the FreeBSD operating system. The encoding | ||
272 | * is a 13-bit fraction with a 3-bit (base 8) exponent. | ||
273 | */ | ||
274 | |||
275 | #define MANTSIZE 13 /* 13 bit mantissa. */ | ||
276 | #define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ | ||
277 | #define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ | ||
278 | |||
279 | static comp_t encode_comp_t(unsigned long value) | ||
280 | { | ||
281 | int exp, rnd; | ||
282 | |||
283 | exp = rnd = 0; | ||
284 | while (value > MAXFRACT) { | ||
285 | rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */ | ||
286 | value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ | ||
287 | exp++; | ||
288 | } | ||
289 | |||
290 | /* | ||
291 | * If we need to round up, do it (and handle overflow correctly). | ||
292 | */ | ||
293 | if (rnd && (++value > MAXFRACT)) { | ||
294 | value >>= EXPSIZE; | ||
295 | exp++; | ||
296 | } | ||
297 | |||
298 | /* | ||
299 | * Clean it up and polish it off. | ||
300 | */ | ||
301 | exp <<= MANTSIZE; /* Shift the exponent into place */ | ||
302 | exp += value; /* and add on the mantissa. */ | ||
303 | return exp; | ||
304 | } | ||
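	/*
	 * A worked example of the encoding above: encode_comp_t(100000) shifts
	 * right by 3 twice (exp = 2) and rounds the mantissa up to 1563, so it
	 * returns (2 << MANTSIZE) + 1563 = 0x461b, which stands for
	 * 1563 * 8^2 = 100032.  A hypothetical decoder (not part of this file)
	 * simply reverses the shift:
	 *
	 *	static unsigned long decode_comp_t(comp_t c)
	 *	{
	 *		return (unsigned long)(c & MAXFRACT) << (3 * (c >> MANTSIZE));
	 *	}
	 */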
305 | |||
306 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | ||
307 | /* | ||
308 | * encode a u64 into a comp2_t (24 bits) | ||
309 | * | ||
310 | * Format: 5 bit base 2 exponent, 20 bits mantissa. | ||
311 | * The leading bit of the mantissa is not stored, but implied for | ||
312 | * non-zero exponents. | ||
313 | * Largest encodable value is 50 bits. | ||
314 | */ | ||
315 | |||
316 | #define MANTSIZE2 20 /* 20 bit mantissa. */ | ||
317 | #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ | ||
318 | #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ | ||
319 | #define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ | ||
320 | |||
321 | static comp2_t encode_comp2_t(u64 value) | ||
322 | { | ||
323 | int exp, rnd; | ||
324 | |||
325 | exp = (value > (MAXFRACT2>>1)); | ||
326 | rnd = 0; | ||
327 | while (value > MAXFRACT2) { | ||
328 | rnd = value & 1; | ||
329 | value >>= 1; | ||
330 | exp++; | ||
331 | } | ||
332 | |||
333 | /* | ||
334 | * If we need to round up, do it (and handle overflow correctly). | ||
335 | */ | ||
336 | if (rnd && (++value > MAXFRACT2)) { | ||
337 | value >>= 1; | ||
338 | exp++; | ||
339 | } | ||
340 | |||
341 | if (exp > MAXEXP2) { | ||
342 | /* Overflow. Return largest representable number instead. */ | ||
343 | return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; | ||
344 | } else { | ||
345 | return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); | ||
346 | } | ||
347 | } | ||
348 | #endif | ||
349 | |||
350 | #if ACCT_VERSION==3 | ||
351 | /* | ||
352 | * encode a u64 into a 32-bit IEEE float | ||
353 | */ | ||
354 | static u32 encode_float(u64 value) | ||
355 | { | ||
356 | unsigned exp = 190; | ||
357 | unsigned u; | ||
358 | |||
359 | if (value==0) return 0; | ||
360 | while ((s64)value > 0){ | ||
361 | value <<= 1; | ||
362 | exp--; | ||
363 | } | ||
364 | u = (u32)(value >> 40) & 0x7fffffu; | ||
365 | return u | (exp << 23); | ||
366 | } | ||
367 | #endif | ||
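	/*
	 * For reference, the routine above produces ordinary IEEE-754 single
	 * precision bit patterns.  Worked example: encode_float(1000) shifts
	 * left 54 times until bit 63 is set, so the exponent field becomes
	 * 190 - 54 = 136 (= 127 + 9) and the stored mantissa is 0x7a0000;
	 * the result 0x447a0000 is exactly the bit pattern of 1000.0f.
	 */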
368 | |||
369 | /* | ||
370 | * Write an accounting entry for an exiting process | ||
371 | * | ||
372 | * The acct_process() call is the workhorse of the process | ||
373 | * accounting system. The struct acct is built here and then written | ||
374 | * into the accounting file. This function should only be called from | ||
375 | * do_exit(). | ||
376 | */ | ||
377 | |||
378 | /* | ||
379 | * do_acct_process does all actual work. Caller holds the reference to file. | ||
380 | */ | ||
381 | static void do_acct_process(long exitcode, struct file *file) | ||
382 | { | ||
383 | acct_t ac; | ||
384 | mm_segment_t fs; | ||
385 | unsigned long vsize; | ||
386 | unsigned long flim; | ||
387 | u64 elapsed; | ||
388 | u64 run_time; | ||
389 | struct timespec uptime; | ||
390 | |||
391 | /* | ||
392 | * First check to see if there is enough free space to continue | ||
393 | * the process accounting system. | ||
394 | */ | ||
395 | if (!check_free_space(file)) | ||
396 | return; | ||
397 | |||
398 | /* | ||
399 | * Fill the accounting struct with the needed info as recorded | ||
400 | * by the different kernel functions. | ||
401 | */ | ||
402 | memset((caddr_t)&ac, 0, sizeof(acct_t)); | ||
403 | |||
404 | ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; | ||
405 | strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); | ||
406 | |||
407 | /* calculate run_time in nsec*/ | ||
408 | do_posix_clock_monotonic_gettime(&uptime); | ||
409 | run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; | ||
410 | run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC | ||
411 | + current->start_time.tv_nsec; | ||
412 | /* convert nsec -> AHZ */ | ||
413 | elapsed = nsec_to_AHZ(run_time); | ||
414 | #if ACCT_VERSION==3 | ||
415 | ac.ac_etime = encode_float(elapsed); | ||
416 | #else | ||
417 | ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? | ||
418 | (unsigned long) elapsed : (unsigned long) -1l); | ||
419 | #endif | ||
420 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | ||
421 | { | ||
422 | /* new enlarged etime field */ | ||
423 | comp2_t etime = encode_comp2_t(elapsed); | ||
424 | ac.ac_etime_hi = etime >> 16; | ||
425 | ac.ac_etime_lo = (u16) etime; | ||
426 | } | ||
427 | #endif | ||
428 | do_div(elapsed, AHZ); | ||
429 | ac.ac_btime = xtime.tv_sec - elapsed; | ||
430 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ( | ||
431 | current->signal->utime + | ||
432 | current->group_leader->utime)); | ||
433 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ( | ||
434 | current->signal->stime + | ||
435 | current->group_leader->stime)); | ||
436 | /* we really need to bite the bullet and change layout */ | ||
437 | ac.ac_uid = current->uid; | ||
438 | ac.ac_gid = current->gid; | ||
439 | #if ACCT_VERSION==2 | ||
440 | ac.ac_ahz = AHZ; | ||
441 | #endif | ||
442 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | ||
443 | /* backward-compatible 16 bit fields */ | ||
444 | ac.ac_uid16 = current->uid; | ||
445 | ac.ac_gid16 = current->gid; | ||
446 | #endif | ||
447 | #if ACCT_VERSION==3 | ||
448 | ac.ac_pid = current->tgid; | ||
449 | ac.ac_ppid = current->parent->tgid; | ||
450 | #endif | ||
451 | |||
452 | read_lock(&tasklist_lock); /* pin current->signal */ | ||
453 | ac.ac_tty = current->signal->tty ? | ||
454 | old_encode_dev(tty_devnum(current->signal->tty)) : 0; | ||
455 | read_unlock(&tasklist_lock); | ||
456 | |||
457 | ac.ac_flag = 0; | ||
458 | if (current->flags & PF_FORKNOEXEC) | ||
459 | ac.ac_flag |= AFORK; | ||
460 | if (current->flags & PF_SUPERPRIV) | ||
461 | ac.ac_flag |= ASU; | ||
462 | if (current->flags & PF_DUMPCORE) | ||
463 | ac.ac_flag |= ACORE; | ||
464 | if (current->flags & PF_SIGNALED) | ||
465 | ac.ac_flag |= AXSIG; | ||
466 | |||
467 | vsize = 0; | ||
468 | if (current->mm) { | ||
469 | struct vm_area_struct *vma; | ||
470 | down_read(¤t->mm->mmap_sem); | ||
471 | vma = current->mm->mmap; | ||
472 | while (vma) { | ||
473 | vsize += vma->vm_end - vma->vm_start; | ||
474 | vma = vma->vm_next; | ||
475 | } | ||
476 | up_read(¤t->mm->mmap_sem); | ||
477 | } | ||
478 | vsize = vsize / 1024; | ||
479 | ac.ac_mem = encode_comp_t(vsize); | ||
480 | ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ | ||
481 | ac.ac_rw = encode_comp_t(ac.ac_io / 1024); | ||
482 | ac.ac_minflt = encode_comp_t(current->signal->min_flt + | ||
483 | current->group_leader->min_flt); | ||
484 | ac.ac_majflt = encode_comp_t(current->signal->maj_flt + | ||
485 | current->group_leader->maj_flt); | ||
486 | ac.ac_swaps = encode_comp_t(0); | ||
487 | ac.ac_exitcode = exitcode; | ||
488 | |||
489 | /* | ||
490 | * Kernel segment override to data segment and write it | ||
491 | * to the accounting file. | ||
492 | */ | ||
493 | fs = get_fs(); | ||
494 | set_fs(KERNEL_DS); | ||
495 | /* | ||
496 | * Accounting records are not subject to resource limits. | ||
497 | */ | ||
498 | flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
499 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; | ||
500 | file->f_op->write(file, (char *)&ac, | ||
501 | sizeof(acct_t), &file->f_pos); | ||
502 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; | ||
503 | set_fs(fs); | ||
504 | } | ||
505 | |||
506 | /* | ||
507 | * acct_process - now just a wrapper around do_acct_process | ||
508 | */ | ||
509 | void acct_process(long exitcode) | ||
510 | { | ||
511 | struct file *file = NULL; | ||
512 | |||
513 | /* | ||
514 | * accelerate the common fastpath: | ||
515 | */ | ||
516 | if (!acct_globals.file) | ||
517 | return; | ||
518 | |||
519 | spin_lock(&acct_globals.lock); | ||
520 | file = acct_globals.file; | ||
521 | if (unlikely(!file)) { | ||
522 | spin_unlock(&acct_globals.lock); | ||
523 | return; | ||
524 | } | ||
525 | get_file(file); | ||
526 | spin_unlock(&acct_globals.lock); | ||
527 | |||
528 | do_acct_process(exitcode, file); | ||
529 | fput(file); | ||
530 | } | ||
531 | |||
532 | |||
533 | /* | ||
534 | * acct_update_integrals | ||
535 | * - update mm integral fields in task_struct | ||
536 | */ | ||
537 | void acct_update_integrals(struct task_struct *tsk) | ||
538 | { | ||
539 | if (likely(tsk->mm)) { | ||
540 | long delta = tsk->stime - tsk->acct_stimexpd; | ||
541 | |||
542 | if (delta == 0) | ||
543 | return; | ||
544 | tsk->acct_stimexpd = tsk->stime; | ||
545 | tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); | ||
546 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; | ||
547 | } | ||
548 | } | ||
549 | |||
550 | /* | ||
551 | * acct_clear_integrals | ||
552 | * - clear the mm integral fields in task_struct | ||
553 | */ | ||
554 | void acct_clear_integrals(struct task_struct *tsk) | ||
555 | { | ||
556 | if (tsk) { | ||
557 | tsk->acct_stimexpd = 0; | ||
558 | tsk->acct_rss_mem1 = 0; | ||
559 | tsk->acct_vm_mem1 = 0; | ||
560 | } | ||
561 | } | ||
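The sys_acct() entry point above is normally reached through the acct(2) library wrapper. A minimal user-space sketch of turning accounting on and off (the log path is only an example, and the caller needs CAP_SYS_PACCT, as checked above):

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Start writing struct acct records to the named file (example path). */
		if (acct("/var/log/account/pacct") == -1) {
			perror("acct");
			return 1;
		}
		/* ... run workloads; each process that exits appends one record ... */

		/* Passing NULL shuts accounting down, as sys_acct() documents. */
		acct(NULL);
		return 0;
	}

Each record written while accounting is on is the struct acct that do_acct_process() above fills in.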
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..0f84dd7af2c8
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,839 @@
1 | /* audit.c -- Auditing support -*- linux-c -*- | ||
2 | * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. | ||
3 | * System-call specific features have moved to auditsc.c | ||
4 | * | ||
5 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | ||
6 | * All Rights Reserved. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 | * | ||
22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> | ||
23 | * | ||
24 | * Goals: 1) Integrate fully with SELinux. | ||
25 | * 2) Minimal run-time overhead: | ||
26 | * a) Minimal when syscall auditing is disabled (audit_enable=0). | ||
27 | * b) Small when syscall auditing is enabled and no audit record | ||
28 | * is generated (defer as much work as possible to record | ||
29 | * generation time): | ||
30 | * i) context is allocated, | ||
31 | * ii) names from getname are stored without a copy, and | ||
32 | * iii) inode information stored from path_lookup. | ||
33 | * 3) Ability to disable syscall auditing at boot time (audit=0). | ||
34 | * 4) Usable by other parts of the kernel (if audit_log* is called, | ||
35 | * then a syscall record will be generated automatically for the | ||
36 | * current syscall). | ||
37 | * 5) Netlink interface to user-space. | ||
38 | * 6) Support low-overhead kernel-based filtering to minimize the | ||
39 | * information that must be passed to user-space. | ||
40 | * | ||
41 | * Example user-space utilities: http://people.redhat.com/faith/audit/ | ||
42 | */ | ||
43 | |||
44 | #include <linux/init.h> | ||
45 | #include <asm/atomic.h> | ||
46 | #include <asm/types.h> | ||
47 | #include <linux/mm.h> | ||
48 | #include <linux/module.h> | ||
49 | |||
50 | #include <linux/audit.h> | ||
51 | |||
52 | #include <net/sock.h> | ||
53 | #include <linux/skbuff.h> | ||
54 | #include <linux/netlink.h> | ||
55 | |||
56 | /* No auditing will take place until audit_initialized != 0. | ||
57 | * (Initialization happens after skb_init is called.) */ | ||
58 | static int audit_initialized; | ||
59 | |||
60 | /* No syscall auditing will take place unless audit_enabled != 0. */ | ||
61 | int audit_enabled; | ||
62 | |||
63 | /* Default state when kernel boots without any parameters. */ | ||
64 | static int audit_default; | ||
65 | |||
66 | /* If auditing cannot proceed, audit_failure selects what happens. */ | ||
67 | static int audit_failure = AUDIT_FAIL_PRINTK; | ||
68 | |||
69 | /* If audit records are to be written to the netlink socket, audit_pid | ||
70 | * contains the (non-zero) pid. */ | ||
71 | static int audit_pid; | ||
72 | |||
73 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records | ||
74 | * to that number per second. This prevents DoS attacks, but results in | ||
75 | * audit records being dropped. */ | ||
76 | static int audit_rate_limit; | ||
77 | |||
78 | /* Number of outstanding audit_buffers allowed. */ | ||
79 | static int audit_backlog_limit = 64; | ||
80 | static atomic_t audit_backlog = ATOMIC_INIT(0); | ||
81 | |||
82 | /* Records can be lost in several ways: | ||
83 | 0) [suppressed in audit_alloc] | ||
84 | 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] | ||
85 | 2) out of memory in audit_log_move [alloc_skb] | ||
86 | 3) suppressed due to audit_rate_limit | ||
87 | 4) suppressed due to audit_backlog_limit | ||
88 | */ | ||
89 | static atomic_t audit_lost = ATOMIC_INIT(0); | ||
90 | |||
91 | /* The netlink socket. */ | ||
92 | static struct sock *audit_sock; | ||
93 | |||
94 | /* There are two lists of audit buffers. The txlist contains audit | ||
95 | * buffers that cannot be sent immediately to the netlink device because | ||
96 | * we are in an irq context (these are sent later in a tasklet). | ||
97 | * | ||
98 | * The second list is a list of pre-allocated audit buffers (if more | ||
99 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of | ||
100 | * being placed on the freelist). */ | ||
101 | static DEFINE_SPINLOCK(audit_txlist_lock); | ||
102 | static DEFINE_SPINLOCK(audit_freelist_lock); | ||
103 | static int audit_freelist_count = 0; | ||
104 | static LIST_HEAD(audit_txlist); | ||
105 | static LIST_HEAD(audit_freelist); | ||
106 | |||
107 | /* There are three lists of rules -- one to search at task creation | ||
108 | * time, one to search at syscall entry time, and another to search at | ||
109 | * syscall exit time. */ | ||
110 | static LIST_HEAD(audit_tsklist); | ||
111 | static LIST_HEAD(audit_entlist); | ||
112 | static LIST_HEAD(audit_extlist); | ||
113 | |||
114 | /* The netlink socket is only to be read by 1 CPU, which lets us assume | ||
115 | * that list additions and deletions never happen simultaneously in | ||
116 | * auditsc.c */ | ||
117 | static DECLARE_MUTEX(audit_netlink_sem); | ||
118 | |||
119 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting | ||
120 | * audit records. Since printk uses a 1024 byte buffer, this buffer | ||
121 | * should be at least that large. */ | ||
122 | #define AUDIT_BUFSIZ 1024 | ||
123 | |||
124 | /* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the | ||
125 | * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ | ||
126 | #define AUDIT_MAXFREE (2*NR_CPUS) | ||
127 | |||
128 | /* The audit_buffer is used when formatting an audit record. The caller | ||
129 | * locks briefly to get the record off the freelist or to allocate the | ||
130 | * buffer, and locks briefly to send the buffer to the netlink layer or | ||
131 | * to place it on a transmit queue. Multiple audit_buffers can be in | ||
132 | * use simultaneously. */ | ||
133 | struct audit_buffer { | ||
134 | struct list_head list; | ||
135 | struct sk_buff_head sklist; /* formatted skbs ready to send */ | ||
136 | struct audit_context *ctx; /* NULL or associated context */ | ||
137 | int len; /* used area of tmp */ | ||
138 | char tmp[AUDIT_BUFSIZ]; | ||
139 | |||
140 | /* Pointer to header and contents */ | ||
141 | struct nlmsghdr *nlh; | ||
142 | int total; | ||
143 | int type; | ||
144 | int pid; | ||
145 | int count; /* Times requeued */ | ||
146 | }; | ||
147 | |||
148 | void audit_set_type(struct audit_buffer *ab, int type) | ||
149 | { | ||
150 | ab->type = type; | ||
151 | } | ||
152 | |||
153 | struct audit_entry { | ||
154 | struct list_head list; | ||
155 | struct audit_rule rule; | ||
156 | }; | ||
157 | |||
158 | static void audit_log_end_irq(struct audit_buffer *ab); | ||
159 | static void audit_log_end_fast(struct audit_buffer *ab); | ||
160 | |||
161 | static void audit_panic(const char *message) | ||
162 | { | ||
163 | switch (audit_failure) | ||
164 | { | ||
165 | case AUDIT_FAIL_SILENT: | ||
166 | break; | ||
167 | case AUDIT_FAIL_PRINTK: | ||
168 | printk(KERN_ERR "audit: %s\n", message); | ||
169 | break; | ||
170 | case AUDIT_FAIL_PANIC: | ||
171 | panic("audit: %s\n", message); | ||
172 | break; | ||
173 | } | ||
174 | } | ||
175 | |||
176 | static inline int audit_rate_check(void) | ||
177 | { | ||
178 | static unsigned long last_check = 0; | ||
179 | static int messages = 0; | ||
180 | static DEFINE_SPINLOCK(lock); | ||
181 | unsigned long flags; | ||
182 | unsigned long now; | ||
183 | unsigned long elapsed; | ||
184 | int retval = 0; | ||
185 | |||
186 | if (!audit_rate_limit) return 1; | ||
187 | |||
188 | spin_lock_irqsave(&lock, flags); | ||
189 | if (++messages < audit_rate_limit) { | ||
190 | retval = 1; | ||
191 | } else { | ||
192 | now = jiffies; | ||
193 | elapsed = now - last_check; | ||
194 | if (elapsed > HZ) { | ||
195 | last_check = now; | ||
196 | messages = 0; | ||
197 | retval = 1; | ||
198 | } | ||
199 | } | ||
200 | spin_unlock_irqrestore(&lock, flags); | ||
201 | |||
202 | return retval; | ||
203 | } | ||
204 | |||
205 | /* Emit at least 1 message per second, even if audit_rate_check is | ||
206 | * throttling. */ | ||
207 | void audit_log_lost(const char *message) | ||
208 | { | ||
209 | static unsigned long last_msg = 0; | ||
210 | static DEFINE_SPINLOCK(lock); | ||
211 | unsigned long flags; | ||
212 | unsigned long now; | ||
213 | int print; | ||
214 | |||
215 | atomic_inc(&audit_lost); | ||
216 | |||
217 | print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); | ||
218 | |||
219 | if (!print) { | ||
220 | spin_lock_irqsave(&lock, flags); | ||
221 | now = jiffies; | ||
222 | if (now - last_msg > HZ) { | ||
223 | print = 1; | ||
224 | last_msg = now; | ||
225 | } | ||
226 | spin_unlock_irqrestore(&lock, flags); | ||
227 | } | ||
228 | |||
229 | if (print) { | ||
230 | printk(KERN_WARNING | ||
231 | "audit: audit_lost=%d audit_backlog=%d" | ||
232 | " audit_rate_limit=%d audit_backlog_limit=%d\n", | ||
233 | atomic_read(&audit_lost), | ||
234 | atomic_read(&audit_backlog), | ||
235 | audit_rate_limit, | ||
236 | audit_backlog_limit); | ||
237 | audit_panic(message); | ||
238 | } | ||
239 | |||
240 | } | ||
241 | |||
242 | static int audit_set_rate_limit(int limit) | ||
243 | { | ||
244 | int old = audit_rate_limit; | ||
245 | audit_rate_limit = limit; | ||
246 | audit_log(current->audit_context, "audit_rate_limit=%d old=%d", | ||
247 | audit_rate_limit, old); | ||
248 | return old; | ||
249 | } | ||
250 | |||
251 | static int audit_set_backlog_limit(int limit) | ||
252 | { | ||
253 | int old = audit_backlog_limit; | ||
254 | audit_backlog_limit = limit; | ||
255 | audit_log(current->audit_context, "audit_backlog_limit=%d old=%d", | ||
256 | audit_backlog_limit, old); | ||
257 | return old; | ||
258 | } | ||
259 | |||
260 | static int audit_set_enabled(int state) | ||
261 | { | ||
262 | int old = audit_enabled; | ||
263 | if (state != 0 && state != 1) | ||
264 | return -EINVAL; | ||
265 | audit_enabled = state; | ||
266 | audit_log(current->audit_context, "audit_enabled=%d old=%d", | ||
267 | audit_enabled, old); | ||
268 | return old; | ||
269 | } | ||
270 | |||
271 | static int audit_set_failure(int state) | ||
272 | { | ||
273 | int old = audit_failure; | ||
274 | if (state != AUDIT_FAIL_SILENT | ||
275 | && state != AUDIT_FAIL_PRINTK | ||
276 | && state != AUDIT_FAIL_PANIC) | ||
277 | return -EINVAL; | ||
278 | audit_failure = state; | ||
279 | audit_log(current->audit_context, "audit_failure=%d old=%d", | ||
280 | audit_failure, old); | ||
281 | return old; | ||
282 | } | ||
283 | |||
284 | #ifdef CONFIG_NET | ||
285 | void audit_send_reply(int pid, int seq, int type, int done, int multi, | ||
286 | void *payload, int size) | ||
287 | { | ||
288 | struct sk_buff *skb; | ||
289 | struct nlmsghdr *nlh; | ||
290 | int len = NLMSG_SPACE(size); | ||
291 | void *data; | ||
292 | int flags = multi ? NLM_F_MULTI : 0; | ||
293 | int t = done ? NLMSG_DONE : type; | ||
294 | |||
295 | skb = alloc_skb(len, GFP_KERNEL); | ||
296 | if (!skb) | ||
297 | goto nlmsg_failure; | ||
298 | |||
299 | nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh)); | ||
300 | nlh->nlmsg_flags = flags; | ||
301 | data = NLMSG_DATA(nlh); | ||
302 | memcpy(data, payload, size); | ||
303 | netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT); | ||
304 | return; | ||
305 | |||
306 | nlmsg_failure: /* Used by NLMSG_PUT */ | ||
307 | if (skb) | ||
308 | kfree_skb(skb); | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit | ||
313 | * control messages. | ||
314 | */ | ||
315 | static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | ||
316 | { | ||
317 | int err = 0; | ||
318 | |||
319 | switch (msg_type) { | ||
320 | case AUDIT_GET: | ||
321 | case AUDIT_LIST: | ||
322 | case AUDIT_SET: | ||
323 | case AUDIT_ADD: | ||
324 | case AUDIT_DEL: | ||
325 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) | ||
326 | err = -EPERM; | ||
327 | break; | ||
328 | case AUDIT_USER: | ||
329 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) | ||
330 | err = -EPERM; | ||
331 | break; | ||
332 | default: /* bad msg */ | ||
333 | err = -EINVAL; | ||
334 | } | ||
335 | |||
336 | return err; | ||
337 | } | ||
338 | |||
339 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | ||
340 | { | ||
341 | u32 uid, pid, seq; | ||
342 | void *data; | ||
343 | struct audit_status *status_get, status_set; | ||
344 | int err; | ||
345 | struct audit_buffer *ab; | ||
346 | u16 msg_type = nlh->nlmsg_type; | ||
347 | |||
348 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); | ||
349 | if (err) | ||
350 | return err; | ||
351 | |||
352 | pid = NETLINK_CREDS(skb)->pid; | ||
353 | uid = NETLINK_CREDS(skb)->uid; | ||
354 | seq = nlh->nlmsg_seq; | ||
355 | data = NLMSG_DATA(nlh); | ||
356 | |||
357 | switch (msg_type) { | ||
358 | case AUDIT_GET: | ||
359 | status_set.enabled = audit_enabled; | ||
360 | status_set.failure = audit_failure; | ||
361 | status_set.pid = audit_pid; | ||
362 | status_set.rate_limit = audit_rate_limit; | ||
363 | status_set.backlog_limit = audit_backlog_limit; | ||
364 | status_set.lost = atomic_read(&audit_lost); | ||
365 | status_set.backlog = atomic_read(&audit_backlog); | ||
366 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, | ||
367 | &status_set, sizeof(status_set)); | ||
368 | break; | ||
369 | case AUDIT_SET: | ||
370 | if (nlh->nlmsg_len < sizeof(struct audit_status)) | ||
371 | return -EINVAL; | ||
372 | status_get = (struct audit_status *)data; | ||
373 | if (status_get->mask & AUDIT_STATUS_ENABLED) { | ||
374 | err = audit_set_enabled(status_get->enabled); | ||
375 | if (err < 0) return err; | ||
376 | } | ||
377 | if (status_get->mask & AUDIT_STATUS_FAILURE) { | ||
378 | err = audit_set_failure(status_get->failure); | ||
379 | if (err < 0) return err; | ||
380 | } | ||
381 | if (status_get->mask & AUDIT_STATUS_PID) { | ||
382 | int old = audit_pid; | ||
383 | audit_pid = status_get->pid; | ||
384 | audit_log(current->audit_context, | ||
385 | "audit_pid=%d old=%d", audit_pid, old); | ||
386 | } | ||
387 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | ||
388 | audit_set_rate_limit(status_get->rate_limit); | ||
389 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | ||
390 | audit_set_backlog_limit(status_get->backlog_limit); | ||
391 | break; | ||
392 | case AUDIT_USER: | ||
393 | ab = audit_log_start(NULL); | ||
394 | if (!ab) | ||
395 | break; /* audit_panic has been called */ | ||
396 | audit_log_format(ab, | ||
397 | "user pid=%d uid=%d length=%d msg='%.1024s'", | ||
398 | pid, uid, | ||
399 | (int)(nlh->nlmsg_len | ||
400 | - ((char *)data - (char *)nlh)), | ||
401 | (char *)data); | ||
402 | ab->type = AUDIT_USER; | ||
403 | ab->pid = pid; | ||
404 | audit_log_end(ab); | ||
405 | break; | ||
406 | case AUDIT_ADD: | ||
407 | case AUDIT_DEL: | ||
408 | if (nlh->nlmsg_len < sizeof(struct audit_rule)) | ||
409 | return -EINVAL; | ||
410 | /* fallthrough */ | ||
411 | case AUDIT_LIST: | ||
412 | #ifdef CONFIG_AUDITSYSCALL | ||
413 | err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, | ||
414 | uid, seq, data); | ||
415 | #else | ||
416 | err = -EOPNOTSUPP; | ||
417 | #endif | ||
418 | break; | ||
419 | default: | ||
420 | err = -EINVAL; | ||
421 | break; | ||
422 | } | ||
423 | |||
424 | return err < 0 ? err : 0; | ||
425 | } | ||
426 | |||
427 | /* Get message from skb (based on rtnetlink_rcv_skb). Each message is | ||
428 | * processed by audit_receive_msg. Malformed skbs with wrong length are | ||
429 | * discarded silently. */ | ||
430 | static int audit_receive_skb(struct sk_buff *skb) | ||
431 | { | ||
432 | int err; | ||
433 | struct nlmsghdr *nlh; | ||
434 | u32 rlen; | ||
435 | |||
436 | while (skb->len >= NLMSG_SPACE(0)) { | ||
437 | nlh = (struct nlmsghdr *)skb->data; | ||
438 | if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) | ||
439 | return 0; | ||
440 | rlen = NLMSG_ALIGN(nlh->nlmsg_len); | ||
441 | if (rlen > skb->len) | ||
442 | rlen = skb->len; | ||
443 | if ((err = audit_receive_msg(skb, nlh))) { | ||
444 | netlink_ack(skb, nlh, err); | ||
445 | } else if (nlh->nlmsg_flags & NLM_F_ACK) | ||
446 | netlink_ack(skb, nlh, 0); | ||
447 | skb_pull(skb, rlen); | ||
448 | } | ||
449 | return 0; | ||
450 | } | ||
451 | |||
452 | /* Receive messages from netlink socket. */ | ||
453 | static void audit_receive(struct sock *sk, int length) | ||
454 | { | ||
455 | struct sk_buff *skb; | ||
456 | |||
457 | if (down_trylock(&audit_netlink_sem)) | ||
458 | return; | ||
459 | |||
460 | /* FIXME: this must not cause starvation */ | ||
461 | while ((skb = skb_dequeue(&sk->sk_receive_queue))) { | ||
462 | if (audit_receive_skb(skb) && skb->len) | ||
463 | skb_queue_head(&sk->sk_receive_queue, skb); | ||
464 | else | ||
465 | kfree_skb(skb); | ||
466 | } | ||
467 | up(&audit_netlink_sem); | ||
468 | } | ||
469 | |||
470 | /* Move data from tmp buffer into an skb. This is an extra copy, and | ||
471 | * that is unfortunate. However, the copy will only occur when a record | ||
472 | * is being written to user space, which is already a high-overhead | ||
473 | * operation. (Elimination of the copy is possible, for example, by | ||
474 | * writing directly into a pre-allocated skb, at the cost of wasting | ||
475 | * memory.) */ | ||
476 | static void audit_log_move(struct audit_buffer *ab) | ||
477 | { | ||
478 | struct sk_buff *skb; | ||
479 | char *start; | ||
480 | int extra = ab->nlh ? 0 : NLMSG_SPACE(0); | ||
481 | |||
482 | /* possible resubmission */ | ||
483 | if (ab->len == 0) | ||
484 | return; | ||
485 | |||
486 | skb = skb_peek(&ab->sklist); | ||
487 | if (!skb || skb_tailroom(skb) <= ab->len + extra) { | ||
488 | skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); | ||
489 | if (!skb) { | ||
490 | ab->len = 0; /* Lose information in ab->tmp */ | ||
491 | audit_log_lost("out of memory in audit_log_move"); | ||
492 | return; | ||
493 | } | ||
494 | __skb_queue_tail(&ab->sklist, skb); | ||
495 | if (!ab->nlh) | ||
496 | ab->nlh = (struct nlmsghdr *)skb_put(skb, | ||
497 | NLMSG_SPACE(0)); | ||
498 | } | ||
499 | start = skb_put(skb, ab->len); | ||
500 | memcpy(start, ab->tmp, ab->len); | ||
501 | ab->len = 0; | ||
502 | } | ||
503 | |||
504 | /* Iterate over the skbuff in the audit_buffer, sending their contents | ||
505 | * to user space. */ | ||
506 | static inline int audit_log_drain(struct audit_buffer *ab) | ||
507 | { | ||
508 | struct sk_buff *skb; | ||
509 | |||
510 | while ((skb = skb_dequeue(&ab->sklist))) { | ||
511 | int retval = 0; | ||
512 | |||
513 | if (audit_pid) { | ||
514 | if (ab->nlh) { | ||
515 | ab->nlh->nlmsg_len = ab->total; | ||
516 | ab->nlh->nlmsg_type = ab->type; | ||
517 | ab->nlh->nlmsg_flags = 0; | ||
518 | ab->nlh->nlmsg_seq = 0; | ||
519 | ab->nlh->nlmsg_pid = ab->pid; | ||
520 | } | ||
521 | skb_get(skb); /* because netlink_* frees */ | ||
522 | retval = netlink_unicast(audit_sock, skb, audit_pid, | ||
523 | MSG_DONTWAIT); | ||
524 | } | ||
525 | if (retval == -EAGAIN && ab->count < 5) { | ||
526 | ++ab->count; | ||
527 | skb_queue_tail(&ab->sklist, skb); | ||
528 | audit_log_end_irq(ab); | ||
529 | return 1; | ||
530 | } | ||
531 | if (retval < 0) { | ||
532 | if (retval == -ECONNREFUSED) { | ||
533 | printk(KERN_ERR | ||
534 | "audit: *NO* daemon at audit_pid=%d\n", | ||
535 | audit_pid); | ||
536 | audit_pid = 0; | ||
537 | } else | ||
538 | audit_log_lost("netlink socket too busy"); | ||
539 | } | ||
540 | if (!audit_pid) { /* No daemon */ | ||
541 | int offset = ab->nlh ? NLMSG_SPACE(0) : 0; | ||
542 | int len = skb->len - offset; | ||
543 | printk(KERN_ERR "%*.*s\n", | ||
544 | len, len, skb->data + offset); | ||
545 | } | ||
546 | kfree_skb(skb); | ||
547 | ab->nlh = NULL; | ||
548 | } | ||
549 | return 0; | ||
550 | } | ||
551 | |||
552 | /* Initialize audit support at boot time. */ | ||
553 | static int __init audit_init(void) | ||
554 | { | ||
555 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | ||
556 | audit_default ? "enabled" : "disabled"); | ||
557 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); | ||
558 | if (!audit_sock) | ||
559 | audit_panic("cannot initialize netlink socket"); | ||
560 | |||
561 | audit_initialized = 1; | ||
562 | audit_enabled = audit_default; | ||
563 | audit_log(NULL, "initialized"); | ||
564 | return 0; | ||
565 | } | ||
566 | |||
567 | #else | ||
568 | /* Without CONFIG_NET, we have no skbuffs. For now, print what we have | ||
569 | * in the buffer. */ | ||
570 | static void audit_log_move(struct audit_buffer *ab) | ||
571 | { | ||
572 | printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp); | ||
573 | ab->len = 0; | ||
574 | } | ||
575 | |||
576 | static inline int audit_log_drain(struct audit_buffer *ab) | ||
577 | { | ||
578 | return 0; | ||
579 | } | ||
580 | |||
581 | /* Initialize audit support at boot time. */ | ||
582 | int __init audit_init(void) | ||
583 | { | ||
584 | printk(KERN_INFO "audit: initializing WITHOUT netlink support\n"); | ||
585 | audit_sock = NULL; | ||
586 | audit_pid = 0; | ||
587 | |||
588 | audit_initialized = 1; | ||
589 | audit_enabled = audit_default; | ||
590 | audit_log(NULL, "initialized"); | ||
591 | return 0; | ||
592 | } | ||
593 | #endif | ||
594 | |||
595 | __initcall(audit_init); | ||
596 | |||
597 | /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ | ||
598 | static int __init audit_enable(char *str) | ||
599 | { | ||
600 | audit_default = !!simple_strtol(str, NULL, 0); | ||
601 | printk(KERN_INFO "audit: %s%s\n", | ||
602 | audit_default ? "enabled" : "disabled", | ||
603 | audit_initialized ? "" : " (after initialization)"); | ||
604 | if (audit_initialized) | ||
605 | audit_enabled = audit_default; | ||
606 | return 0; | ||
607 | } | ||
608 | |||
609 | __setup("audit=", audit_enable); | ||
610 | |||
611 | |||
612 | /* Obtain an audit buffer. This routine does locking to obtain the | ||
613 | * audit buffer, but then no locking is required for calls to | ||
614 | * audit_log_*format. If the tsk is a task that is currently in a | ||
615 | * syscall, then the syscall is marked as auditable and an audit record | ||
616 | * will be written at syscall exit. If there is no associated task, tsk | ||
617 | * should be NULL. */ | ||
618 | struct audit_buffer *audit_log_start(struct audit_context *ctx) | ||
619 | { | ||
620 | struct audit_buffer *ab = NULL; | ||
621 | unsigned long flags; | ||
622 | struct timespec t; | ||
623 | int serial = 0; | ||
624 | |||
625 | if (!audit_initialized) | ||
626 | return NULL; | ||
627 | |||
628 | if (audit_backlog_limit | ||
629 | && atomic_read(&audit_backlog) > audit_backlog_limit) { | ||
630 | if (audit_rate_check()) | ||
631 | printk(KERN_WARNING | ||
632 | "audit: audit_backlog=%d > " | ||
633 | "audit_backlog_limit=%d\n", | ||
634 | atomic_read(&audit_backlog), | ||
635 | audit_backlog_limit); | ||
636 | audit_log_lost("backlog limit exceeded"); | ||
637 | return NULL; | ||
638 | } | ||
639 | |||
640 | spin_lock_irqsave(&audit_freelist_lock, flags); | ||
641 | if (!list_empty(&audit_freelist)) { | ||
642 | ab = list_entry(audit_freelist.next, | ||
643 | struct audit_buffer, list); | ||
644 | list_del(&ab->list); | ||
645 | --audit_freelist_count; | ||
646 | } | ||
647 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | ||
648 | |||
649 | if (!ab) | ||
650 | ab = kmalloc(sizeof(*ab), GFP_ATOMIC); | ||
651 | if (!ab) { | ||
652 | audit_log_lost("out of memory in audit_log_start"); | ||
653 | return NULL; | ||
654 | } | ||
655 | |||
656 | atomic_inc(&audit_backlog); | ||
657 | skb_queue_head_init(&ab->sklist); | ||
658 | |||
659 | ab->ctx = ctx; | ||
660 | ab->len = 0; | ||
661 | ab->nlh = NULL; | ||
662 | ab->total = 0; | ||
663 | ab->type = AUDIT_KERNEL; | ||
664 | ab->pid = 0; | ||
665 | ab->count = 0; | ||
666 | |||
667 | #ifdef CONFIG_AUDITSYSCALL | ||
668 | if (ab->ctx) | ||
669 | audit_get_stamp(ab->ctx, &t, &serial); | ||
670 | else | ||
671 | #endif | ||
672 | t = CURRENT_TIME; | ||
673 | |||
674 | audit_log_format(ab, "audit(%lu.%03lu:%u): ", | ||
675 | t.tv_sec, t.tv_nsec/1000000, serial); | ||
676 | return ab; | ||
677 | } | ||
678 | |||
679 | |||
680 | /* Format an audit message into the audit buffer. If there isn't enough | ||
681 | * room in the audit buffer, more room will be allocated and vsnprintf | ||
682 | * will be called a second time. Currently, we assume that a printk | ||
683 | * can't format a message larger than 1024 bytes, so we don't either. */ | ||
684 | static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, | ||
685 | va_list args) | ||
686 | { | ||
687 | int len, avail; | ||
688 | |||
689 | if (!ab) | ||
690 | return; | ||
691 | |||
692 | avail = sizeof(ab->tmp) - ab->len; | ||
693 | if (avail <= 0) { | ||
694 | audit_log_move(ab); | ||
695 | avail = sizeof(ab->tmp) - ab->len; | ||
696 | } | ||
697 | len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); | ||
698 | if (len >= avail) { | ||
699 | /* The printk buffer is 1024 bytes long, so if we get | ||
700 | * here and AUDIT_BUFSIZ is at least 1024, then we can | ||
701 | * log everything that printk could have logged. */ | ||
702 | audit_log_move(ab); | ||
703 | avail = sizeof(ab->tmp) - ab->len; | ||
704 | len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); | ||
705 | } | ||
706 | ab->len += (len < avail) ? len : avail; | ||
707 | ab->total += (len < avail) ? len : avail; | ||
708 | } | ||
709 | |||
710 | /* Format a message into the audit buffer. All the work is done in | ||
711 | * audit_log_vformat. */ | ||
712 | void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) | ||
713 | { | ||
714 | va_list args; | ||
715 | |||
716 | if (!ab) | ||
717 | return; | ||
718 | va_start(args, fmt); | ||
719 | audit_log_vformat(ab, fmt, args); | ||
720 | va_end(args); | ||
721 | } | ||
722 | |||
723 | /* This is a helper-function to print the d_path without using a static | ||
724 | * buffer or allocating another buffer in addition to the one in | ||
725 | * audit_buffer. */ | ||
726 | void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | ||
727 | struct dentry *dentry, struct vfsmount *vfsmnt) | ||
728 | { | ||
729 | char *p; | ||
730 | int len, avail; | ||
731 | |||
732 | if (prefix) audit_log_format(ab, " %s", prefix); | ||
733 | |||
734 | if (ab->len > 128) | ||
735 | audit_log_move(ab); | ||
736 | avail = sizeof(ab->tmp) - ab->len; | ||
737 | p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail); | ||
738 | if (IS_ERR(p)) { | ||
739 | /* FIXME: can we save some information here? */ | ||
740 | audit_log_format(ab, "<toolong>"); | ||
741 | } else { | ||
742 | /* path isn't at start of buffer */ | ||
743 | len = (ab->tmp + sizeof(ab->tmp) - 1) - p; | ||
744 | memmove(ab->tmp + ab->len, p, len); | ||
745 | ab->len += len; | ||
746 | ab->total += len; | ||
747 | } | ||
748 | } | ||
749 | |||
750 | /* Remove queued messages from the audit_txlist and send them to userspace. */ | ||
751 | static void audit_tasklet_handler(unsigned long arg) | ||
752 | { | ||
753 | LIST_HEAD(list); | ||
754 | struct audit_buffer *ab; | ||
755 | unsigned long flags; | ||
756 | |||
757 | spin_lock_irqsave(&audit_txlist_lock, flags); | ||
758 | list_splice_init(&audit_txlist, &list); | ||
759 | spin_unlock_irqrestore(&audit_txlist_lock, flags); | ||
760 | |||
761 | while (!list_empty(&list)) { | ||
762 | ab = list_entry(list.next, struct audit_buffer, list); | ||
763 | list_del(&ab->list); | ||
764 | audit_log_end_fast(ab); | ||
765 | } | ||
766 | } | ||
767 | |||
768 | static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0); | ||
769 | |||
770 | /* The netlink_* functions cannot be called inside an irq context, so | ||
771 | * the audit buffer is placed on a queue and a tasklet is scheduled to | ||
772 | * remove them from the queue outside the irq context. May be called in | ||
773 | * any context. */ | ||
774 | static void audit_log_end_irq(struct audit_buffer *ab) | ||
775 | { | ||
776 | unsigned long flags; | ||
777 | |||
778 | if (!ab) | ||
779 | return; | ||
780 | spin_lock_irqsave(&audit_txlist_lock, flags); | ||
781 | list_add_tail(&ab->list, &audit_txlist); | ||
782 | spin_unlock_irqrestore(&audit_txlist_lock, flags); | ||
783 | |||
784 | tasklet_schedule(&audit_tasklet); | ||
785 | } | ||
786 | |||
787 | /* Send the message in the audit buffer directly to user space. May not | ||
788 | * be called in an irq context. */ | ||
789 | static void audit_log_end_fast(struct audit_buffer *ab) | ||
790 | { | ||
791 | unsigned long flags; | ||
792 | |||
793 | BUG_ON(in_irq()); | ||
794 | if (!ab) | ||
795 | return; | ||
796 | if (!audit_rate_check()) { | ||
797 | audit_log_lost("rate limit exceeded"); | ||
798 | } else { | ||
799 | audit_log_move(ab); | ||
800 | if (audit_log_drain(ab)) | ||
801 | return; | ||
802 | } | ||
803 | |||
804 | atomic_dec(&audit_backlog); | ||
805 | spin_lock_irqsave(&audit_freelist_lock, flags); | ||
806 | if (++audit_freelist_count > AUDIT_MAXFREE) | ||
807 | kfree(ab); | ||
808 | else | ||
809 | list_add(&ab->list, &audit_freelist); | ||
810 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | ||
811 | } | ||
812 | |||
813 | /* Send or queue the message in the audit buffer, depending on the | ||
814 | * current context. (A convenience function that may be called in any | ||
815 | * context.) */ | ||
816 | void audit_log_end(struct audit_buffer *ab) | ||
817 | { | ||
818 | if (in_irq()) | ||
819 | audit_log_end_irq(ab); | ||
820 | else | ||
821 | audit_log_end_fast(ab); | ||
822 | } | ||
823 | |||
824 | /* Log an audit record. This is a convenience function that calls | ||
825 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | ||
826 | * called in any context. */ | ||
827 | void audit_log(struct audit_context *ctx, const char *fmt, ...) | ||
828 | { | ||
829 | struct audit_buffer *ab; | ||
830 | va_list args; | ||
831 | |||
832 | ab = audit_log_start(ctx); | ||
833 | if (ab) { | ||
834 | va_start(args, fmt); | ||
835 | audit_log_vformat(ab, fmt, args); | ||
836 | va_end(args); | ||
837 | audit_log_end(ab); | ||
838 | } | ||
839 | } | ||
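As a usage note, the audit_log_start()/audit_log_format()/audit_log_end() sequence and the one-shot audit_log() defined above are equivalent for simple records. A hedged sketch of a kernel-side caller (the function name and message fields are made up for illustration):

	/* Hypothetical caller, only to illustrate the API defined above. */
	static void example_audit_event(struct audit_context *ctx, int err)
	{
		struct audit_buffer *ab;

		ab = audit_log_start(ctx);
		if (!ab)
			return;	/* not initialized, out of memory, or backlog limit hit */
		audit_log_format(ab, "example_event res=%d", err);
		audit_log_end(ab);	/* sent now, or queued to the tasklet if in_irq() */
	}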
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000000..8c454852d6a5
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,1015 @@
1 | /* auditsc.c -- System-call auditing support -*- linux-c -*- | ||
2 | * Handles all system-call specific auditing features. | ||
3 | * | ||
4 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | ||
5 | * All Rights Reserved. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or | ||
10 | * (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
20 | * | ||
21 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> | ||
22 | * | ||
23 | * Many of the ideas implemented here are from Stephen C. Tweedie, | ||
24 | * especially the idea of avoiding a copy by using getname. | ||
25 | * | ||
26 | * The method for actual interception of syscall entry and exit (not in | ||
27 | * this file -- see entry.S) is based on a GPL'd patch written by | ||
28 | * okir@suse.de and Copyright 2003 SuSE Linux AG. | ||
29 | * | ||
30 | */ | ||
31 | |||
32 | #include <linux/init.h> | ||
33 | #include <asm/atomic.h> | ||
34 | #include <asm/types.h> | ||
35 | #include <linux/mm.h> | ||
36 | #include <linux/module.h> | ||
37 | |||
38 | #include <linux/audit.h> | ||
39 | #include <linux/personality.h> | ||
40 | #include <linux/time.h> | ||
41 | #include <asm/unistd.h> | ||
42 | |||
43 | /* 0 = no checking | ||
44 | 1 = put_count checking | ||
45 | 2 = verbose put_count checking | ||
46 | */ | ||
47 | #define AUDIT_DEBUG 0 | ||
48 | |||
49 | /* No syscall auditing will take place unless audit_enabled != 0. */ | ||
50 | extern int audit_enabled; | ||
51 | |||
52 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | ||
53 | * for saving names from getname(). */ | ||
54 | #define AUDIT_NAMES 20 | ||
55 | |||
56 | /* AUDIT_NAMES_RESERVED is the number of slots in the audit_context | ||
57 | * that we hold back from being used for nameless inodes from | ||
58 | * path_lookup. */ | ||
59 | #define AUDIT_NAMES_RESERVED 7 | ||
60 | |||
61 | /* At task start time, the audit_state is set in the audit_context using | ||
62 | a per-task filter. At syscall entry, the audit_state is augmented by | ||
63 | the syscall filter. */ | ||
64 | enum audit_state { | ||
65 | AUDIT_DISABLED, /* Do not create per-task audit_context. | ||
66 | * No syscall-specific audit records can | ||
67 | * be generated. */ | ||
68 | AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, | ||
69 | * but don't necessarily fill it in at | ||
70 | * syscall entry time (i.e., filter | ||
71 | * instead). */ | ||
72 | AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, | ||
73 | * and always fill it in at syscall | ||
74 | * entry time. This makes a full | ||
75 | * syscall record available if some | ||
76 | * other part of the kernel decides it | ||
77 | * should be recorded. */ | ||
78 | AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, | ||
79 | * always fill it in at syscall entry | ||
80 | * time, and always write out the audit | ||
81 | * record at syscall exit time. */ | ||
82 | }; | ||
83 | |||
84 | /* When fs/namei.c:getname() is called, we store the pointer in name and | ||
85 | * we don't let putname() free it (instead we free all of the saved | ||
86 | * pointers at syscall exit time). | ||
87 | * | ||
88 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ | ||
89 | struct audit_names { | ||
90 | const char *name; | ||
91 | unsigned long ino; | ||
92 | dev_t dev; | ||
93 | umode_t mode; | ||
94 | uid_t uid; | ||
95 | gid_t gid; | ||
96 | dev_t rdev; | ||
97 | }; | ||
98 | |||
99 | struct audit_aux_data { | ||
100 | struct audit_aux_data *next; | ||
101 | int type; | ||
102 | }; | ||
103 | |||
104 | #define AUDIT_AUX_IPCPERM 0 | ||
105 | |||
106 | struct audit_aux_data_ipcctl { | ||
107 | struct audit_aux_data d; | ||
108 | struct ipc_perm p; | ||
109 | unsigned long qbytes; | ||
110 | uid_t uid; | ||
111 | gid_t gid; | ||
112 | mode_t mode; | ||
113 | }; | ||
114 | |||
115 | |||
116 | /* The per-task audit context. */ | ||
117 | struct audit_context { | ||
118 | int in_syscall; /* 1 if task is in a syscall */ | ||
119 | enum audit_state state; | ||
120 | unsigned int serial; /* serial number for record */ | ||
121 | struct timespec ctime; /* time of syscall entry */ | ||
122 | uid_t loginuid; /* login uid (identity) */ | ||
123 | int major; /* syscall number */ | ||
124 | unsigned long argv[4]; /* syscall arguments */ | ||
125 | int return_valid; /* return code is valid */ | ||
126 | int return_code;/* syscall return code */ | ||
127 | int auditable; /* 1 if record should be written */ | ||
128 | int name_count; | ||
129 | struct audit_names names[AUDIT_NAMES]; | ||
130 | struct audit_context *previous; /* For nested syscalls */ | ||
131 | struct audit_aux_data *aux; | ||
132 | |||
133 | /* Save things to print about task_struct */ | ||
134 | pid_t pid; | ||
135 | uid_t uid, euid, suid, fsuid; | ||
136 | gid_t gid, egid, sgid, fsgid; | ||
137 | unsigned long personality; | ||
138 | |||
139 | #if AUDIT_DEBUG | ||
140 | int put_count; | ||
141 | int ino_count; | ||
142 | #endif | ||
143 | }; | ||
144 | |||
145 | /* Public API */ | ||
146 | /* There are three lists of rules -- one to search at task creation | ||
147 | * time, one to search at syscall entry time, and another to search at | ||
148 | * syscall exit time. */ | ||
149 | static LIST_HEAD(audit_tsklist); | ||
150 | static LIST_HEAD(audit_entlist); | ||
151 | static LIST_HEAD(audit_extlist); | ||
152 | |||
153 | struct audit_entry { | ||
154 | struct list_head list; | ||
155 | struct rcu_head rcu; | ||
156 | struct audit_rule rule; | ||
157 | }; | ||
158 | |||
159 | /* Check to see if two rules are identical. It is called from | ||
160 | * audit_del_rule during AUDIT_DEL. */ | ||
161 | static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) | ||
162 | { | ||
163 | int i; | ||
164 | |||
165 | if (a->flags != b->flags) | ||
166 | return 1; | ||
167 | |||
168 | if (a->action != b->action) | ||
169 | return 1; | ||
170 | |||
171 | if (a->field_count != b->field_count) | ||
172 | return 1; | ||
173 | |||
174 | for (i = 0; i < a->field_count; i++) { | ||
175 | if (a->fields[i] != b->fields[i] | ||
176 | || a->values[i] != b->values[i]) | ||
177 | return 1; | ||
178 | } | ||
179 | |||
180 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | ||
181 | if (a->mask[i] != b->mask[i]) | ||
182 | return 1; | ||
183 | |||
184 | return 0; | ||
185 | } | ||
186 | |||
187 | /* Note that audit_add_rule and audit_del_rule are called via | ||
188 | * audit_receive() in audit.c, and are protected by | ||
189 | * audit_netlink_sem. */ | ||
190 | static inline int audit_add_rule(struct audit_entry *entry, | ||
191 | struct list_head *list) | ||
192 | { | ||
193 | if (entry->rule.flags & AUDIT_PREPEND) { | ||
194 | entry->rule.flags &= ~AUDIT_PREPEND; | ||
195 | list_add_rcu(&entry->list, list); | ||
196 | } else { | ||
197 | list_add_tail_rcu(&entry->list, list); | ||
198 | } | ||
199 | return 0; | ||
200 | } | ||
201 | |||
202 | static void audit_free_rule(struct rcu_head *head) | ||
203 | { | ||
204 | struct audit_entry *e = container_of(head, struct audit_entry, rcu); | ||
205 | kfree(e); | ||
206 | } | ||
207 | |||
208 | /* Note that audit_add_rule and audit_del_rule are called via | ||
209 | * audit_receive() in audit.c, and are protected by | ||
210 | * audit_netlink_sem. */ | ||
211 | static inline int audit_del_rule(struct audit_rule *rule, | ||
212 | struct list_head *list) | ||
213 | { | ||
214 | struct audit_entry *e; | ||
215 | |||
216 | /* Do not use the _rcu iterator here, since this is the only | ||
217 | * deletion routine. */ | ||
218 | list_for_each_entry(e, list, list) { | ||
219 | if (!audit_compare_rule(rule, &e->rule)) { | ||
220 | list_del_rcu(&e->list); | ||
221 | call_rcu(&e->rcu, audit_free_rule); | ||
222 | return 0; | ||
223 | } | ||
224 | } | ||
225 | return -EFAULT; /* No matching rule */ | ||
226 | } | ||
227 | |||
228 | #ifdef CONFIG_NET | ||
229 | /* Copy rule from user-space to kernel-space. Called during | ||
230 | * AUDIT_ADD. */ | ||
231 | static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) | ||
232 | { | ||
233 | int i; | ||
234 | |||
235 | if (s->action != AUDIT_NEVER | ||
236 | && s->action != AUDIT_POSSIBLE | ||
237 | && s->action != AUDIT_ALWAYS) | ||
238 | return -1; | ||
239 | if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) | ||
240 | return -1; | ||
241 | |||
242 | d->flags = s->flags; | ||
243 | d->action = s->action; | ||
244 | d->field_count = s->field_count; | ||
245 | for (i = 0; i < d->field_count; i++) { | ||
246 | d->fields[i] = s->fields[i]; | ||
247 | d->values[i] = s->values[i]; | ||
248 | } | ||
249 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; | ||
250 | return 0; | ||
251 | } | ||
252 | |||
253 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data) | ||
254 | { | ||
255 | u32 flags; | ||
256 | struct audit_entry *entry; | ||
257 | int err = 0; | ||
258 | |||
259 | switch (type) { | ||
260 | case AUDIT_LIST: | ||
261 | /* The *_rcu iterators are not needed here because we are | ||
262 | always called with audit_netlink_sem held. */ | ||
263 | list_for_each_entry(entry, &audit_tsklist, list) | ||
264 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
265 | &entry->rule, sizeof(entry->rule)); | ||
266 | list_for_each_entry(entry, &audit_entlist, list) | ||
267 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
268 | &entry->rule, sizeof(entry->rule)); | ||
269 | list_for_each_entry(entry, &audit_extlist, list) | ||
270 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
271 | &entry->rule, sizeof(entry->rule)); | ||
272 | audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | ||
273 | break; | ||
274 | case AUDIT_ADD: | ||
275 | if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) | ||
276 | return -ENOMEM; | ||
277 | if (audit_copy_rule(&entry->rule, data)) { | ||
278 | kfree(entry); | ||
279 | return -EINVAL; | ||
280 | } | ||
281 | flags = entry->rule.flags; | ||
282 | if (!err && (flags & AUDIT_PER_TASK)) | ||
283 | err = audit_add_rule(entry, &audit_tsklist); | ||
284 | if (!err && (flags & AUDIT_AT_ENTRY)) | ||
285 | err = audit_add_rule(entry, &audit_entlist); | ||
286 | if (!err && (flags & AUDIT_AT_EXIT)) | ||
287 | err = audit_add_rule(entry, &audit_extlist); | ||
288 | break; | ||
289 | case AUDIT_DEL: | ||
290 | flags =((struct audit_rule *)data)->flags; | ||
291 | if (!err && (flags & AUDIT_PER_TASK)) | ||
292 | err = audit_del_rule(data, &audit_tsklist); | ||
293 | if (!err && (flags & AUDIT_AT_ENTRY)) | ||
294 | err = audit_del_rule(data, &audit_entlist); | ||
295 | if (!err && (flags & AUDIT_AT_EXIT)) | ||
296 | err = audit_del_rule(data, &audit_extlist); | ||
297 | break; | ||
298 | default: | ||
299 | return -EINVAL; | ||
300 | } | ||
301 | |||
302 | return err; | ||
303 | } | ||
304 | #endif | ||
305 | |||
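A minimal sketch (not part of this file) of how a struct audit_rule destined for an AUDIT_ADD request could be filled in; every field and constant it touches is the one copied by audit_copy_rule() above and tested by audit_filter_rules()/audit_filter_syscall() below, and <linux/audit.h> is assumed to provide the definitions. The helper name and the uid value 500 are illustration only; the netlink delivery itself lives in audit.c and is not shown.

static void example_build_rule(struct audit_rule *r, int syscall_nr)
{
	memset(r, 0, sizeof(*r));

	r->flags  = AUDIT_AT_EXIT;	/* attach to the syscall-exit list */
	r->action = AUDIT_ALWAYS;	/* a match forces AUDIT_RECORD_CONTEXT */

	r->fields[0]   = AUDIT_UID;	/* match tasks whose uid ... */
	r->values[0]   = 500;		/* ... equals 500 */
	r->field_count = 1;

	/* arm the rule for exactly one syscall number */
	r->mask[AUDIT_WORD(syscall_nr)] |= AUDIT_BIT(syscall_nr);
}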
306 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | ||
307 | * otherwise. */ | ||
308 | static int audit_filter_rules(struct task_struct *tsk, | ||
309 | struct audit_rule *rule, | ||
310 | struct audit_context *ctx, | ||
311 | enum audit_state *state) | ||
312 | { | ||
313 | int i, j; | ||
314 | |||
315 | for (i = 0; i < rule->field_count; i++) { | ||
316 | u32 field = rule->fields[i] & ~AUDIT_NEGATE; | ||
317 | u32 value = rule->values[i]; | ||
318 | int result = 0; | ||
319 | |||
320 | switch (field) { | ||
321 | case AUDIT_PID: | ||
322 | result = (tsk->pid == value); | ||
323 | break; | ||
324 | case AUDIT_UID: | ||
325 | result = (tsk->uid == value); | ||
326 | break; | ||
327 | case AUDIT_EUID: | ||
328 | result = (tsk->euid == value); | ||
329 | break; | ||
330 | case AUDIT_SUID: | ||
331 | result = (tsk->suid == value); | ||
332 | break; | ||
333 | case AUDIT_FSUID: | ||
334 | result = (tsk->fsuid == value); | ||
335 | break; | ||
336 | case AUDIT_GID: | ||
337 | result = (tsk->gid == value); | ||
338 | break; | ||
339 | case AUDIT_EGID: | ||
340 | result = (tsk->egid == value); | ||
341 | break; | ||
342 | case AUDIT_SGID: | ||
343 | result = (tsk->sgid == value); | ||
344 | break; | ||
345 | case AUDIT_FSGID: | ||
346 | result = (tsk->fsgid == value); | ||
347 | break; | ||
348 | case AUDIT_PERS: | ||
349 | result = (tsk->personality == value); | ||
350 | break; | ||
351 | |||
352 | case AUDIT_EXIT: | ||
353 | if (ctx && ctx->return_valid) | ||
354 | result = (ctx->return_code == value); | ||
355 | break; | ||
356 | case AUDIT_SUCCESS: | ||
357 | if (ctx && ctx->return_valid) | ||
358 | result = (ctx->return_code >= 0); | ||
359 | break; | ||
360 | case AUDIT_DEVMAJOR: | ||
361 | if (ctx) { | ||
362 | for (j = 0; j < ctx->name_count; j++) { | ||
363 | if (MAJOR(ctx->names[j].dev)==value) { | ||
364 | ++result; | ||
365 | break; | ||
366 | } | ||
367 | } | ||
368 | } | ||
369 | break; | ||
370 | case AUDIT_DEVMINOR: | ||
371 | if (ctx) { | ||
372 | for (j = 0; j < ctx->name_count; j++) { | ||
373 | if (MINOR(ctx->names[j].dev)==value) { | ||
374 | ++result; | ||
375 | break; | ||
376 | } | ||
377 | } | ||
378 | } | ||
379 | break; | ||
380 | case AUDIT_INODE: | ||
381 | if (ctx) { | ||
382 | for (j = 0; j < ctx->name_count; j++) { | ||
383 | if (ctx->names[j].ino == value) { | ||
384 | ++result; | ||
385 | break; | ||
386 | } | ||
387 | } | ||
388 | } | ||
389 | break; | ||
390 | case AUDIT_LOGINUID: | ||
391 | result = 0; | ||
392 | if (ctx) | ||
393 | result = (ctx->loginuid == value); | ||
394 | break; | ||
395 | case AUDIT_ARG0: | ||
396 | case AUDIT_ARG1: | ||
397 | case AUDIT_ARG2: | ||
398 | case AUDIT_ARG3: | ||
399 | if (ctx) | ||
400 | result = (ctx->argv[field-AUDIT_ARG0]==value); | ||
401 | break; | ||
402 | } | ||
403 | |||
404 | if (rule->fields[i] & AUDIT_NEGATE) | ||
405 | result = !result; | ||
406 | if (!result) | ||
407 | return 0; | ||
408 | } | ||
409 | switch (rule->action) { | ||
410 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | ||
411 | case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; | ||
412 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | ||
413 | } | ||
414 | return 1; | ||
415 | } | ||
416 | |||
417 | /* At process creation time, we can determine if system-call auditing is | ||
418 | * completely disabled for this task. Since we only have the task | ||
419 | * structure at this point, we can only check uid and gid. | ||
420 | */ | ||
421 | static enum audit_state audit_filter_task(struct task_struct *tsk) | ||
422 | { | ||
423 | struct audit_entry *e; | ||
424 | enum audit_state state; | ||
425 | |||
426 | rcu_read_lock(); | ||
427 | list_for_each_entry_rcu(e, &audit_tsklist, list) { | ||
428 | if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { | ||
429 | rcu_read_unlock(); | ||
430 | return state; | ||
431 | } | ||
432 | } | ||
433 | rcu_read_unlock(); | ||
434 | return AUDIT_BUILD_CONTEXT; | ||
435 | } | ||
436 | |||
437 | /* At syscall entry and exit time, this filter is called if the | ||
438 | * audit_state is high enough that auditing can take place, but not so | ||
439 | * high that we already know we have to write an audit record (i.e., | ||
440 | * the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). | ||
441 | */ | ||
442 | static enum audit_state audit_filter_syscall(struct task_struct *tsk, | ||
443 | struct audit_context *ctx, | ||
444 | struct list_head *list) | ||
445 | { | ||
446 | struct audit_entry *e; | ||
447 | enum audit_state state; | ||
448 | int word = AUDIT_WORD(ctx->major); | ||
449 | int bit = AUDIT_BIT(ctx->major); | ||
450 | |||
451 | rcu_read_lock(); | ||
452 | list_for_each_entry_rcu(e, list, list) { | ||
453 | if ((e->rule.mask[word] & bit) == bit | ||
454 | && audit_filter_rules(tsk, &e->rule, ctx, &state)) { | ||
455 | rcu_read_unlock(); | ||
456 | return state; | ||
457 | } | ||
458 | } | ||
459 | rcu_read_unlock(); | ||
460 | return AUDIT_BUILD_CONTEXT; | ||
461 | } | ||
462 | |||
463 | /* This should be called with task_lock() held. */ | ||
464 | static inline struct audit_context *audit_get_context(struct task_struct *tsk, | ||
465 | int return_valid, | ||
466 | int return_code) | ||
467 | { | ||
468 | struct audit_context *context = tsk->audit_context; | ||
469 | |||
470 | if (likely(!context)) | ||
471 | return NULL; | ||
472 | context->return_valid = return_valid; | ||
473 | context->return_code = return_code; | ||
474 | |||
475 | if (context->in_syscall && !context->auditable) { | ||
476 | enum audit_state state; | ||
477 | state = audit_filter_syscall(tsk, context, &audit_extlist); | ||
478 | if (state == AUDIT_RECORD_CONTEXT) | ||
479 | context->auditable = 1; | ||
480 | } | ||
481 | |||
482 | context->pid = tsk->pid; | ||
483 | context->uid = tsk->uid; | ||
484 | context->gid = tsk->gid; | ||
485 | context->euid = tsk->euid; | ||
486 | context->suid = tsk->suid; | ||
487 | context->fsuid = tsk->fsuid; | ||
488 | context->egid = tsk->egid; | ||
489 | context->sgid = tsk->sgid; | ||
490 | context->fsgid = tsk->fsgid; | ||
491 | context->personality = tsk->personality; | ||
492 | tsk->audit_context = NULL; | ||
493 | return context; | ||
494 | } | ||
495 | |||
496 | static inline void audit_free_names(struct audit_context *context) | ||
497 | { | ||
498 | int i; | ||
499 | |||
500 | #if AUDIT_DEBUG == 2 | ||
501 | if (context->auditable | ||
502 | ||context->put_count + context->ino_count != context->name_count) { | ||
503 | printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" | ||
504 | " name_count=%d put_count=%d" | ||
505 | " ino_count=%d [NOT freeing]\n", | ||
506 | __LINE__, | ||
507 | context->serial, context->major, context->in_syscall, | ||
508 | context->name_count, context->put_count, | ||
509 | context->ino_count); | ||
510 | for (i = 0; i < context->name_count; i++) | ||
511 | printk(KERN_ERR "names[%d] = %p = %s\n", i, | ||
512 | context->names[i].name, | ||
513 | context->names[i].name); | ||
514 | dump_stack(); | ||
515 | return; | ||
516 | } | ||
517 | #endif | ||
518 | #if AUDIT_DEBUG | ||
519 | context->put_count = 0; | ||
520 | context->ino_count = 0; | ||
521 | #endif | ||
522 | |||
523 | for (i = 0; i < context->name_count; i++) | ||
524 | if (context->names[i].name) | ||
525 | __putname(context->names[i].name); | ||
526 | context->name_count = 0; | ||
527 | } | ||
528 | |||
529 | static inline void audit_free_aux(struct audit_context *context) | ||
530 | { | ||
531 | struct audit_aux_data *aux; | ||
532 | |||
533 | while ((aux = context->aux)) { | ||
534 | context->aux = aux->next; | ||
535 | kfree(aux); | ||
536 | } | ||
537 | } | ||
538 | |||
539 | static inline void audit_zero_context(struct audit_context *context, | ||
540 | enum audit_state state) | ||
541 | { | ||
542 | uid_t loginuid = context->loginuid; | ||
543 | |||
544 | memset(context, 0, sizeof(*context)); | ||
545 | context->state = state; | ||
546 | context->loginuid = loginuid; | ||
547 | } | ||
548 | |||
549 | static inline struct audit_context *audit_alloc_context(enum audit_state state) | ||
550 | { | ||
551 | struct audit_context *context; | ||
552 | |||
553 | if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) | ||
554 | return NULL; | ||
555 | audit_zero_context(context, state); | ||
556 | return context; | ||
557 | } | ||
558 | |||
559 | /* Filter on the task information and allocate a per-task audit context | ||
560 | * if necessary. Doing so turns on system call auditing for the | ||
561 | * specified task. This is called from copy_process, so no lock is | ||
562 | * needed. */ | ||
563 | int audit_alloc(struct task_struct *tsk) | ||
564 | { | ||
565 | struct audit_context *context; | ||
566 | enum audit_state state; | ||
567 | |||
568 | if (likely(!audit_enabled)) | ||
569 | return 0; /* Return if not auditing. */ | ||
570 | |||
571 | state = audit_filter_task(tsk); | ||
572 | if (likely(state == AUDIT_DISABLED)) | ||
573 | return 0; | ||
574 | |||
575 | if (!(context = audit_alloc_context(state))) { | ||
576 | audit_log_lost("out of memory in audit_alloc"); | ||
577 | return -ENOMEM; | ||
578 | } | ||
579 | |||
580 | /* Preserve login uid */ | ||
581 | context->loginuid = -1; | ||
582 | if (current->audit_context) | ||
583 | context->loginuid = current->audit_context->loginuid; | ||
584 | |||
585 | tsk->audit_context = context; | ||
586 | set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); | ||
587 | return 0; | ||
588 | } | ||
589 | |||
590 | static inline void audit_free_context(struct audit_context *context) | ||
591 | { | ||
592 | struct audit_context *previous; | ||
593 | int count = 0; | ||
594 | |||
595 | do { | ||
596 | previous = context->previous; | ||
597 | if (previous || (count && count < 10)) { | ||
598 | ++count; | ||
599 | printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" | ||
600 | " freeing multiple contexts (%d)\n", | ||
601 | context->serial, context->major, | ||
602 | context->name_count, count); | ||
603 | } | ||
604 | audit_free_names(context); | ||
605 | audit_free_aux(context); | ||
606 | kfree(context); | ||
607 | context = previous; | ||
608 | } while (context); | ||
609 | if (count >= 10) | ||
610 | printk(KERN_ERR "audit: freed %d contexts\n", count); | ||
611 | } | ||
612 | |||
613 | static void audit_log_exit(struct audit_context *context) | ||
614 | { | ||
615 | int i; | ||
616 | struct audit_buffer *ab; | ||
617 | |||
618 | ab = audit_log_start(context); | ||
619 | if (!ab) | ||
620 | return; /* audit_panic has been called */ | ||
621 | audit_log_format(ab, "syscall=%d", context->major); | ||
622 | if (context->personality != PER_LINUX) | ||
623 | audit_log_format(ab, " per=%lx", context->personality); | ||
624 | if (context->return_valid) | ||
625 | audit_log_format(ab, " exit=%d", context->return_code); | ||
626 | audit_log_format(ab, | ||
627 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | ||
628 | " pid=%d loginuid=%d uid=%d gid=%d" | ||
629 | " euid=%d suid=%d fsuid=%d" | ||
630 | " egid=%d sgid=%d fsgid=%d", | ||
631 | context->argv[0], | ||
632 | context->argv[1], | ||
633 | context->argv[2], | ||
634 | context->argv[3], | ||
635 | context->name_count, | ||
636 | context->pid, | ||
637 | context->loginuid, | ||
638 | context->uid, | ||
639 | context->gid, | ||
640 | context->euid, context->suid, context->fsuid, | ||
641 | context->egid, context->sgid, context->fsgid); | ||
642 | audit_log_end(ab); | ||
643 | while (context->aux) { | ||
644 | struct audit_aux_data *aux; | ||
645 | |||
646 | ab = audit_log_start(context); | ||
647 | if (!ab) | ||
648 | continue; /* audit_panic has been called */ | ||
649 | |||
650 | aux = context->aux; | ||
651 | context->aux = aux->next; | ||
652 | |||
653 | audit_log_format(ab, "auxitem=%d", aux->type); | ||
654 | switch (aux->type) { | ||
655 | case AUDIT_AUX_IPCPERM: { | ||
656 | struct audit_aux_data_ipcctl *axi = (void *)aux; | ||
657 | audit_log_format(ab, | ||
658 | " qbytes=%lx uid=%d gid=%d mode=%x", | ||
659 | axi->qbytes, axi->uid, axi->gid, axi->mode); | ||
660 | } | ||
661 | } | ||
662 | audit_log_end(ab); | ||
663 | kfree(aux); | ||
664 | } | ||
665 | |||
666 | for (i = 0; i < context->name_count; i++) { | ||
667 | ab = audit_log_start(context); | ||
668 | if (!ab) | ||
669 | continue; /* audit_panic has been called */ | ||
670 | audit_log_format(ab, "item=%d", i); | ||
671 | if (context->names[i].name) | ||
672 | audit_log_format(ab, " name=%s", | ||
673 | context->names[i].name); | ||
674 | if (context->names[i].ino != (unsigned long)-1) | ||
675 | audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" | ||
676 | " uid=%d gid=%d rdev=%02x:%02x", | ||
677 | context->names[i].ino, | ||
678 | MAJOR(context->names[i].dev), | ||
679 | MINOR(context->names[i].dev), | ||
680 | context->names[i].mode, | ||
681 | context->names[i].uid, | ||
682 | context->names[i].gid, | ||
683 | MAJOR(context->names[i].rdev), | ||
684 | MINOR(context->names[i].rdev)); | ||
685 | audit_log_end(ab); | ||
686 | } | ||
687 | } | ||
688 | |||
689 | /* Free a per-task audit context. Called from copy_process and | ||
690 | * __put_task_struct. */ | ||
691 | void audit_free(struct task_struct *tsk) | ||
692 | { | ||
693 | struct audit_context *context; | ||
694 | |||
695 | task_lock(tsk); | ||
696 | context = audit_get_context(tsk, 0, 0); | ||
697 | task_unlock(tsk); | ||
698 | |||
699 | if (likely(!context)) | ||
700 | return; | ||
701 | |||
702 | /* Check for system calls that do not go through the exit | ||
703 | * function (e.g., exit_group), then free the context block. */ | ||
704 | if (context->in_syscall && context->auditable) | ||
705 | audit_log_exit(context); | ||
706 | |||
707 | audit_free_context(context); | ||
708 | } | ||
709 | |||
710 | /* Compute a serial number for the audit record. Audit records are | ||
711 | * written to user-space as soon as they are generated, so a complete | ||
712 | * audit record may be written in several pieces. The timestamp of the | ||
713 | * record and this serial number are used by the user-space daemon to | ||
714 | * determine which pieces belong to the same audit record. The | ||
715 | * (timestamp,serial) tuple is unique for each syscall and is live from | ||
716 | * syscall entry to syscall exit. | ||
717 | * | ||
718 | * Atomic values are only guaranteed to be 24-bit, so we count down. | ||
719 | * | ||
720 | * NOTE: Another possibility is to store the formatted records off the | ||
721 | * audit context (for those records that have a context), and emit them | ||
722 | * all at syscall exit. However, this could delay the reporting of | ||
723 | * significant errors until syscall exit (or never, if the system | ||
724 | * halts). */ | ||
725 | static inline unsigned int audit_serial(void) | ||
726 | { | ||
727 | static atomic_t serial = ATOMIC_INIT(0xffffff); | ||
728 | unsigned int a, b; | ||
729 | |||
730 | do { | ||
731 | a = atomic_read(&serial); | ||
732 | if (atomic_dec_and_test(&serial)) | ||
733 | atomic_set(&serial, 0xffffff); | ||
734 | b = atomic_read(&serial); | ||
735 | } while (b != a - 1); | ||
736 | |||
737 | return 0xffffff - b; | ||
738 | } | ||
739 | |||
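A worked trace (assuming a single caller and the 0xffffff initializer above) shows why the down-counting atomic still yields increasing serial numbers:

	/* 1st call: a = 0xffffff, counter drops to 0xfffffe,
	 *           returns 0xffffff - 0xfffffe = 1
	 * 2nd call: a = 0xfffffe, counter drops to 0xfffffd, returns 2
	 * When the counter reaches zero it is reset to 0xffffff, the
	 * compare-with-(a - 1) retry fires, and the sequence restarts at 1.
	 */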
740 | /* Fill in audit context at syscall entry. This only happens if the | ||
741 | * audit context was created when the task was created and the state or | ||
742 | * filters demand the audit context be built. If the state from the | ||
743 | * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, | ||
744 | * then the record will be written at syscall exit time (otherwise, it | ||
745 | * will only be written if another part of the kernel requests that it | ||
746 | * be written). */ | ||
747 | void audit_syscall_entry(struct task_struct *tsk, int major, | ||
748 | unsigned long a1, unsigned long a2, | ||
749 | unsigned long a3, unsigned long a4) | ||
750 | { | ||
751 | struct audit_context *context = tsk->audit_context; | ||
752 | enum audit_state state; | ||
753 | |||
754 | BUG_ON(!context); | ||
755 | |||
756 | /* This happens only on certain architectures that make system | ||
757 | * calls in kernel_thread via the entry.S interface, instead of | ||
758 | * with direct calls. (If you are porting to a new | ||
759 | * architecture, hitting this condition can indicate that you | ||
760 | * got the _exit/_leave calls backward in entry.S.) | ||
761 | * | ||
762 | * i386 no | ||
763 | * x86_64 no | ||
764 | * ppc64 yes (see arch/ppc64/kernel/misc.S) | ||
765 | * | ||
766 | * This also happens with vm86 emulation in a non-nested manner | ||
767 | * (entries without exits), so this case must be caught. | ||
768 | */ | ||
769 | if (context->in_syscall) { | ||
770 | struct audit_context *newctx; | ||
771 | |||
772 | #if defined(__NR_vm86) && defined(__NR_vm86old) | ||
773 | /* vm86 mode should only be entered once */ | ||
774 | if (major == __NR_vm86 || major == __NR_vm86old) | ||
775 | return; | ||
776 | #endif | ||
777 | #if AUDIT_DEBUG | ||
778 | printk(KERN_ERR | ||
779 | "audit(:%d) pid=%d in syscall=%d;" | ||
780 | " entering syscall=%d\n", | ||
781 | context->serial, tsk->pid, context->major, major); | ||
782 | #endif | ||
783 | newctx = audit_alloc_context(context->state); | ||
784 | if (newctx) { | ||
785 | newctx->previous = context; | ||
786 | context = newctx; | ||
787 | tsk->audit_context = newctx; | ||
788 | } else { | ||
789 | /* If we can't alloc a new context, the best we | ||
790 | * can do is to leak memory (any pending putname | ||
791 | * will be lost). The only other alternative is | ||
792 | * to abandon auditing. */ | ||
793 | audit_zero_context(context, context->state); | ||
794 | } | ||
795 | } | ||
796 | BUG_ON(context->in_syscall || context->name_count); | ||
797 | |||
798 | if (!audit_enabled) | ||
799 | return; | ||
800 | |||
801 | context->major = major; | ||
802 | context->argv[0] = a1; | ||
803 | context->argv[1] = a2; | ||
804 | context->argv[2] = a3; | ||
805 | context->argv[3] = a4; | ||
806 | |||
807 | state = context->state; | ||
808 | if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) | ||
809 | state = audit_filter_syscall(tsk, context, &audit_entlist); | ||
810 | if (likely(state == AUDIT_DISABLED)) | ||
811 | return; | ||
812 | |||
813 | context->serial = audit_serial(); | ||
814 | context->ctime = CURRENT_TIME; | ||
815 | context->in_syscall = 1; | ||
816 | context->auditable = !!(state == AUDIT_RECORD_CONTEXT); | ||
817 | } | ||
818 | |||
819 | /* Tear down after system call. If the audit context has been marked as | ||
820 | * auditable (either because of the AUDIT_RECORD_CONTEXT state from | ||
821 | * filtering, or because some other part of the kernel wrote an audit | ||
822 | * message), then write out the syscall information. In all cases, | ||
823 | * free the names stored from getname(). */ | ||
824 | void audit_syscall_exit(struct task_struct *tsk, int return_code) | ||
825 | { | ||
826 | struct audit_context *context; | ||
827 | |||
828 | get_task_struct(tsk); | ||
829 | task_lock(tsk); | ||
830 | context = audit_get_context(tsk, 1, return_code); | ||
831 | task_unlock(tsk); | ||
832 | |||
833 | /* Not having a context here is ok, since the parent may have | ||
834 | * called __put_task_struct. */ | ||
835 | if (likely(!context)) | ||
836 | return; | ||
837 | |||
838 | if (context->in_syscall && context->auditable) | ||
839 | audit_log_exit(context); | ||
840 | |||
841 | context->in_syscall = 0; | ||
842 | context->auditable = 0; | ||
843 | if (context->previous) { | ||
844 | struct audit_context *new_context = context->previous; | ||
845 | context->previous = NULL; | ||
846 | audit_free_context(context); | ||
847 | tsk->audit_context = new_context; | ||
848 | } else { | ||
849 | audit_free_names(context); | ||
850 | audit_free_aux(context); | ||
851 | audit_zero_context(context, context->state); | ||
852 | tsk->audit_context = context; | ||
853 | } | ||
854 | put_task_struct(tsk); | ||
855 | } | ||
856 | |||
857 | /* Add a name to the list. Called from fs/namei.c:getname(). */ | ||
858 | void audit_getname(const char *name) | ||
859 | { | ||
860 | struct audit_context *context = current->audit_context; | ||
861 | |||
862 | if (!context || IS_ERR(name) || !name) | ||
863 | return; | ||
864 | |||
865 | if (!context->in_syscall) { | ||
866 | #if AUDIT_DEBUG == 2 | ||
867 | printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", | ||
868 | __FILE__, __LINE__, context->serial, name); | ||
869 | dump_stack(); | ||
870 | #endif | ||
871 | return; | ||
872 | } | ||
873 | BUG_ON(context->name_count >= AUDIT_NAMES); | ||
874 | context->names[context->name_count].name = name; | ||
875 | context->names[context->name_count].ino = (unsigned long)-1; | ||
876 | ++context->name_count; | ||
877 | } | ||
878 | |||
879 | /* Intercept a putname request. Called from | ||
880 | * include/linux/fs.h:putname(). If we have stored the name from | ||
881 | * getname in the audit context, then we delay the putname until syscall | ||
882 | * exit. */ | ||
883 | void audit_putname(const char *name) | ||
884 | { | ||
885 | struct audit_context *context = current->audit_context; | ||
886 | |||
887 | BUG_ON(!context); | ||
888 | if (!context->in_syscall) { | ||
889 | #if AUDIT_DEBUG == 2 | ||
890 | printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", | ||
891 | __FILE__, __LINE__, context->serial, name); | ||
892 | if (context->name_count) { | ||
893 | int i; | ||
894 | for (i = 0; i < context->name_count; i++) | ||
895 | printk(KERN_ERR "name[%d] = %p = %s\n", i, | ||
896 | context->names[i].name, | ||
897 | context->names[i].name); | ||
898 | } | ||
899 | #endif | ||
900 | __putname(name); | ||
901 | } | ||
902 | #if AUDIT_DEBUG | ||
903 | else { | ||
904 | ++context->put_count; | ||
905 | if (context->put_count > context->name_count) { | ||
906 | printk(KERN_ERR "%s:%d(:%d): major=%d" | ||
907 | " in_syscall=%d putname(%p) name_count=%d" | ||
908 | " put_count=%d\n", | ||
909 | __FILE__, __LINE__, | ||
910 | context->serial, context->major, | ||
911 | context->in_syscall, name, context->name_count, | ||
912 | context->put_count); | ||
913 | dump_stack(); | ||
914 | } | ||
915 | } | ||
916 | #endif | ||
917 | } | ||
918 | |||
919 | /* Store the inode and device from a lookup. Called from | ||
920 | * fs/namei.c:path_lookup(). */ | ||
921 | void audit_inode(const char *name, const struct inode *inode) | ||
922 | { | ||
923 | int idx; | ||
924 | struct audit_context *context = current->audit_context; | ||
925 | |||
926 | if (!context->in_syscall) | ||
927 | return; | ||
928 | if (context->name_count | ||
929 | && context->names[context->name_count-1].name | ||
930 | && context->names[context->name_count-1].name == name) | ||
931 | idx = context->name_count - 1; | ||
932 | else if (context->name_count > 1 | ||
933 | && context->names[context->name_count-2].name | ||
934 | && context->names[context->name_count-2].name == name) | ||
935 | idx = context->name_count - 2; | ||
936 | else { | ||
937 | /* FIXME: how much do we care about inodes that have no | ||
938 | * associated name? */ | ||
939 | if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED) | ||
940 | return; | ||
941 | idx = context->name_count++; | ||
942 | context->names[idx].name = NULL; | ||
943 | #if AUDIT_DEBUG | ||
944 | ++context->ino_count; | ||
945 | #endif | ||
946 | } | ||
947 | context->names[idx].ino = inode->i_ino; | ||
948 | context->names[idx].dev = inode->i_sb->s_dev; | ||
949 | context->names[idx].mode = inode->i_mode; | ||
950 | context->names[idx].uid = inode->i_uid; | ||
951 | context->names[idx].gid = inode->i_gid; | ||
952 | context->names[idx].rdev = inode->i_rdev; | ||
953 | } | ||
954 | |||
955 | void audit_get_stamp(struct audit_context *ctx, | ||
956 | struct timespec *t, int *serial) | ||
957 | { | ||
958 | if (ctx) { | ||
959 | t->tv_sec = ctx->ctime.tv_sec; | ||
960 | t->tv_nsec = ctx->ctime.tv_nsec; | ||
961 | *serial = ctx->serial; | ||
962 | ctx->auditable = 1; | ||
963 | } else { | ||
964 | *t = CURRENT_TIME; | ||
965 | *serial = 0; | ||
966 | } | ||
967 | } | ||
968 | |||
969 | extern int audit_set_type(struct audit_buffer *ab, int type); | ||
970 | |||
971 | int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid) | ||
972 | { | ||
973 | if (ctx) { | ||
974 | struct audit_buffer *ab; | ||
975 | |||
976 | ab = audit_log_start(NULL); | ||
977 | if (ab) { | ||
978 | audit_log_format(ab, "login pid=%d uid=%u " | ||
979 | "old loginuid=%u new loginuid=%u", | ||
980 | ctx->pid, ctx->uid, ctx->loginuid, loginuid); | ||
981 | audit_set_type(ab, AUDIT_LOGIN); | ||
982 | audit_log_end(ab); | ||
983 | } | ||
984 | ctx->loginuid = loginuid; | ||
985 | } | ||
986 | return 0; | ||
987 | } | ||
988 | |||
989 | uid_t audit_get_loginuid(struct audit_context *ctx) | ||
990 | { | ||
991 | return ctx ? ctx->loginuid : -1; | ||
992 | } | ||
993 | |||
994 | int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) | ||
995 | { | ||
996 | struct audit_aux_data_ipcctl *ax; | ||
997 | struct audit_context *context = current->audit_context; | ||
998 | |||
999 | if (likely(!context)) | ||
1000 | return 0; | ||
1001 | |||
1002 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); | ||
1003 | if (!ax) | ||
1004 | return -ENOMEM; | ||
1005 | |||
1006 | ax->qbytes = qbytes; | ||
1007 | ax->uid = uid; | ||
1008 | ax->gid = gid; | ||
1009 | ax->mode = mode; | ||
1010 | |||
1011 | ax->d.type = AUDIT_AUX_IPCPERM; | ||
1012 | ax->d.next = context->aux; | ||
1013 | context->aux = (void *)ax; | ||
1014 | return 0; | ||
1015 | } | ||
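audit_ipc_perms() above is the only producer of aux records so far, but it also demonstrates the general idiom: each specific record type embeds struct audit_aux_data as its first member so the generic context->aux pointer can be cast back, exactly as audit_log_exit() does for AUDIT_AUX_IPCPERM. A hedged sketch of how a further record type would plug in; AUDIT_AUX_EXAMPLE, struct audit_aux_data_example and audit_example() are hypothetical names, not defined anywhere in this patch.

#define AUDIT_AUX_EXAMPLE 1			/* hypothetical type id */

struct audit_aux_data_example {
	struct audit_aux_data	d;		/* must stay first for the cast */
	unsigned long		cookie;
};

static int audit_example(unsigned long cookie)
{
	struct audit_aux_data_example *ax;
	struct audit_context *context = current->audit_context;

	if (likely(!context))			/* task is not being audited */
		return 0;

	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
	if (!ax)
		return -ENOMEM;

	ax->cookie = cookie;

	ax->d.type = AUDIT_AUX_EXAMPLE;
	ax->d.next = context->aux;		/* push onto the per-context list */
	context->aux = (void *)ax;
	return 0;
}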
diff --git a/kernel/capability.c b/kernel/capability.c new file mode 100644 index 000000000000..64db1ee820c2 --- /dev/null +++ b/kernel/capability.c | |||
@@ -0,0 +1,220 @@ | |||
1 | /* | ||
2 | * linux/kernel/capability.c | ||
3 | * | ||
4 | * Copyright (C) 1997 Andrew Main <zefram@fysh.org> | ||
5 | * | ||
6 | * Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com> | ||
7 | * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> | ||
8 | */ | ||
9 | |||
10 | #include <linux/mm.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/security.h> | ||
13 | #include <linux/syscalls.h> | ||
14 | #include <asm/uaccess.h> | ||
15 | |||
16 | unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ | ||
17 | kernel_cap_t cap_bset = CAP_INIT_EFF_SET; | ||
18 | |||
19 | EXPORT_SYMBOL(securebits); | ||
20 | EXPORT_SYMBOL(cap_bset); | ||
21 | |||
22 | /* | ||
23 | * This lock protects task->cap_* for all tasks including current. | ||
24 | * Locking rule: acquire this prior to tasklist_lock. | ||
25 | */ | ||
26 | static DEFINE_SPINLOCK(task_capability_lock); | ||
27 | |||
28 | /* | ||
29 | * For sys_capget() and sys_capset(), any of the three | ||
30 | * capability set pointers may be NULL -- indicating that that set is | ||
31 | * uninteresting and/or not to be changed. | ||
32 | */ | ||
33 | |||
34 | /* | ||
35 | * sys_capget - get the capabilities of a given process. | ||
36 | */ | ||
37 | asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) | ||
38 | { | ||
39 | int ret = 0; | ||
40 | pid_t pid; | ||
41 | __u32 version; | ||
42 | task_t *target; | ||
43 | struct __user_cap_data_struct data; | ||
44 | |||
45 | if (get_user(version, &header->version)) | ||
46 | return -EFAULT; | ||
47 | |||
48 | if (version != _LINUX_CAPABILITY_VERSION) { | ||
49 | if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) | ||
50 | return -EFAULT; | ||
51 | return -EINVAL; | ||
52 | } | ||
53 | |||
54 | if (get_user(pid, &header->pid)) | ||
55 | return -EFAULT; | ||
56 | |||
57 | if (pid < 0) | ||
58 | return -EINVAL; | ||
59 | |||
60 | spin_lock(&task_capability_lock); | ||
61 | read_lock(&tasklist_lock); | ||
62 | |||
63 | if (pid && pid != current->pid) { | ||
64 | target = find_task_by_pid(pid); | ||
65 | if (!target) { | ||
66 | ret = -ESRCH; | ||
67 | goto out; | ||
68 | } | ||
69 | } else | ||
70 | target = current; | ||
71 | |||
72 | ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted); | ||
73 | |||
74 | out: | ||
75 | read_unlock(&tasklist_lock); | ||
76 | spin_unlock(&task_capability_lock); | ||
77 | |||
78 | if (!ret && copy_to_user(dataptr, &data, sizeof data)) | ||
79 | return -EFAULT; | ||
80 | |||
81 | return ret; | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | * cap_set_pg - set capabilities for all processes in a given process | ||
86 | * group. We call this holding task_capability_lock and tasklist_lock. | ||
87 | */ | ||
88 | static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, | ||
89 | kernel_cap_t *inheritable, | ||
90 | kernel_cap_t *permitted) | ||
91 | { | ||
92 | task_t *g, *target; | ||
93 | int ret = -EPERM; | ||
94 | int found = 0; | ||
95 | |||
96 | do_each_task_pid(pgrp, PIDTYPE_PGID, g) { | ||
97 | target = g; | ||
98 | while_each_thread(g, target) { | ||
99 | if (!security_capset_check(target, effective, | ||
100 | inheritable, | ||
101 | permitted)) { | ||
102 | security_capset_set(target, effective, | ||
103 | inheritable, | ||
104 | permitted); | ||
105 | ret = 0; | ||
106 | } | ||
107 | found = 1; | ||
108 | } | ||
109 | } while_each_task_pid(pgrp, PIDTYPE_PGID, g); | ||
110 | |||
111 | if (!found) | ||
112 | ret = 0; | ||
113 | return ret; | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * cap_set_all - set capabilities for all processes other than init | ||
118 | * and self. We call this holding task_capability_lock and tasklist_lock. | ||
119 | */ | ||
120 | static inline int cap_set_all(kernel_cap_t *effective, | ||
121 | kernel_cap_t *inheritable, | ||
122 | kernel_cap_t *permitted) | ||
123 | { | ||
124 | task_t *g, *target; | ||
125 | int ret = -EPERM; | ||
126 | int found = 0; | ||
127 | |||
128 | do_each_thread(g, target) { | ||
129 | if (target == current || target->pid == 1) | ||
130 | continue; | ||
131 | found = 1; | ||
132 | if (security_capset_check(target, effective, inheritable, | ||
133 | permitted)) | ||
134 | continue; | ||
135 | ret = 0; | ||
136 | security_capset_set(target, effective, inheritable, permitted); | ||
137 | } while_each_thread(g, target); | ||
138 | |||
139 | if (!found) | ||
140 | ret = 0; | ||
141 | return ret; | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | * sys_capset - set capabilities for a given process, all processes, or all | ||
146 | * processes in a given process group. | ||
147 | * | ||
148 | * The restrictions on setting capabilities are specified as: | ||
149 | * | ||
150 | * [pid is for the 'target' task. 'current' is the calling task.] | ||
151 | * | ||
152 | * I: any raised capabilities must be a subset of the (old current) permitted | ||
153 | * P: any raised capabilities must be a subset of the (old current) permitted | ||
154 | * E: must be set to a subset of (new target) permitted | ||
155 | */ | ||
156 | asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) | ||
157 | { | ||
158 | kernel_cap_t inheritable, permitted, effective; | ||
159 | __u32 version; | ||
160 | task_t *target; | ||
161 | int ret; | ||
162 | pid_t pid; | ||
163 | |||
164 | if (get_user(version, &header->version)) | ||
165 | return -EFAULT; | ||
166 | |||
167 | if (version != _LINUX_CAPABILITY_VERSION) { | ||
168 | if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) | ||
169 | return -EFAULT; | ||
170 | return -EINVAL; | ||
171 | } | ||
172 | |||
173 | if (get_user(pid, &header->pid)) | ||
174 | return -EFAULT; | ||
175 | |||
176 | if (pid && pid != current->pid && !capable(CAP_SETPCAP)) | ||
177 | return -EPERM; | ||
178 | |||
179 | if (copy_from_user(&effective, &data->effective, sizeof(effective)) || | ||
180 | copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) || | ||
181 | copy_from_user(&permitted, &data->permitted, sizeof(permitted))) | ||
182 | return -EFAULT; | ||
183 | |||
184 | spin_lock(&task_capability_lock); | ||
185 | read_lock(&tasklist_lock); | ||
186 | |||
187 | if (pid > 0 && pid != current->pid) { | ||
188 | target = find_task_by_pid(pid); | ||
189 | if (!target) { | ||
190 | ret = -ESRCH; | ||
191 | goto out; | ||
192 | } | ||
193 | } else | ||
194 | target = current; | ||
195 | |||
196 | ret = 0; | ||
197 | |||
198 | /* having verified that the proposed changes are legal, | ||
199 | we now put them into effect. */ | ||
200 | if (pid < 0) { | ||
201 | if (pid == -1) /* all procs other than current and init */ | ||
202 | ret = cap_set_all(&effective, &inheritable, &permitted); | ||
203 | |||
204 | else /* all procs in process group */ | ||
205 | ret = cap_set_pg(-pid, &effective, &inheritable, | ||
206 | &permitted); | ||
207 | } else { | ||
208 | ret = security_capset_check(target, &effective, &inheritable, | ||
209 | &permitted); | ||
210 | if (!ret) | ||
211 | security_capset_set(target, &effective, &inheritable, | ||
212 | &permitted); | ||
213 | } | ||
214 | |||
215 | out: | ||
216 | read_unlock(&tasklist_lock); | ||
217 | spin_unlock(&task_capability_lock); | ||
218 | |||
219 | return ret; | ||
220 | } | ||
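The version handshake in sys_capget()/sys_capset() above is visible from user space: passing an unknown version makes the kernel write back the version it speaks and fail with -EINVAL. A minimal user-space sketch of that handshake, assuming <linux/capability.h>, <sys/syscall.h> and the raw syscall(2) interface are available; it is not part of this patch.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
	struct __user_cap_header_struct hdr = { 0, 0 };	/* bogus version */
	struct __user_cap_data_struct data;

	if (syscall(SYS_capget, &hdr, &data) < 0)
		/* the failed call still told us which version to use */
		printf("kernel capability version: %#x\n", hdr.version);

	hdr.version = _LINUX_CAPABILITY_VERSION;
	hdr.pid = 0;					/* 0 means the calling process */
	if (syscall(SYS_capget, &hdr, &data) == 0)
		printf("eff=%#x perm=%#x inh=%#x\n",
		       data.effective, data.permitted, data.inheritable);
	return 0;
}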
diff --git a/kernel/compat.c b/kernel/compat.c new file mode 100644 index 000000000000..dad10656bf14 --- /dev/null +++ b/kernel/compat.c | |||
@@ -0,0 +1,860 @@ | |||
1 | /* | ||
2 | * linux/kernel/compat.c | ||
3 | * | ||
4 | * Kernel compatibility routines for e.g. 32-bit syscall support | ||
5 | * on 64-bit kernels. | ||
6 | * | ||
7 | * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License version 2 as | ||
11 | * published by the Free Software Foundation. | ||
12 | */ | ||
13 | |||
14 | #include <linux/linkage.h> | ||
15 | #include <linux/compat.h> | ||
16 | #include <linux/errno.h> | ||
17 | #include <linux/time.h> | ||
18 | #include <linux/signal.h> | ||
19 | #include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ | ||
20 | #include <linux/futex.h> /* for FUTEX_WAIT */ | ||
21 | #include <linux/syscalls.h> | ||
22 | #include <linux/unistd.h> | ||
23 | #include <linux/security.h> | ||
24 | |||
25 | #include <asm/uaccess.h> | ||
26 | #include <asm/bug.h> | ||
27 | |||
28 | int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) | ||
29 | { | ||
30 | return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || | ||
31 | __get_user(ts->tv_sec, &cts->tv_sec) || | ||
32 | __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; | ||
33 | } | ||
34 | |||
35 | int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) | ||
36 | { | ||
37 | return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || | ||
38 | __put_user(ts->tv_sec, &cts->tv_sec) || | ||
39 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; | ||
40 | } | ||
41 | |||
42 | static long compat_nanosleep_restart(struct restart_block *restart) | ||
43 | { | ||
44 | unsigned long expire = restart->arg0, now = jiffies; | ||
45 | struct compat_timespec __user *rmtp; | ||
46 | |||
47 | /* Did it expire while we handled signals? */ | ||
48 | if (!time_after(expire, now)) | ||
49 | return 0; | ||
50 | |||
51 | current->state = TASK_INTERRUPTIBLE; | ||
52 | expire = schedule_timeout(expire - now); | ||
53 | if (expire == 0) | ||
54 | return 0; | ||
55 | |||
56 | rmtp = (struct compat_timespec __user *)restart->arg1; | ||
57 | if (rmtp) { | ||
58 | struct compat_timespec ct; | ||
59 | struct timespec t; | ||
60 | |||
61 | jiffies_to_timespec(expire, &t); | ||
62 | ct.tv_sec = t.tv_sec; | ||
63 | ct.tv_nsec = t.tv_nsec; | ||
64 | if (copy_to_user(rmtp, &ct, sizeof(ct))) | ||
65 | return -EFAULT; | ||
66 | } | ||
67 | /* The 'restart' block is already filled in */ | ||
68 | return -ERESTART_RESTARTBLOCK; | ||
69 | } | ||
70 | |||
71 | asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, | ||
72 | struct compat_timespec __user *rmtp) | ||
73 | { | ||
74 | struct timespec t; | ||
75 | struct restart_block *restart; | ||
76 | unsigned long expire; | ||
77 | |||
78 | if (get_compat_timespec(&t, rqtp)) | ||
79 | return -EFAULT; | ||
80 | |||
81 | if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) | ||
82 | return -EINVAL; | ||
83 | |||
84 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | ||
85 | current->state = TASK_INTERRUPTIBLE; | ||
86 | expire = schedule_timeout(expire); | ||
87 | if (expire == 0) | ||
88 | return 0; | ||
89 | |||
90 | if (rmtp) { | ||
91 | jiffies_to_timespec(expire, &t); | ||
92 | if (put_compat_timespec(&t, rmtp)) | ||
93 | return -EFAULT; | ||
94 | } | ||
95 | restart = ¤t_thread_info()->restart_block; | ||
96 | restart->fn = compat_nanosleep_restart; | ||
97 | restart->arg0 = jiffies + expire; | ||
98 | restart->arg1 = (unsigned long) rmtp; | ||
99 | return -ERESTART_RESTARTBLOCK; | ||
100 | } | ||
101 | |||
102 | static inline long get_compat_itimerval(struct itimerval *o, | ||
103 | struct compat_itimerval __user *i) | ||
104 | { | ||
105 | return (!access_ok(VERIFY_READ, i, sizeof(*i)) || | ||
106 | (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | | ||
107 | __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | | ||
108 | __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | | ||
109 | __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); | ||
110 | } | ||
111 | |||
112 | static inline long put_compat_itimerval(struct compat_itimerval __user *o, | ||
113 | struct itimerval *i) | ||
114 | { | ||
115 | return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || | ||
116 | (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | | ||
117 | __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | | ||
118 | __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) | | ||
119 | __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); | ||
120 | } | ||
121 | |||
122 | asmlinkage long compat_sys_getitimer(int which, | ||
123 | struct compat_itimerval __user *it) | ||
124 | { | ||
125 | struct itimerval kit; | ||
126 | int error; | ||
127 | |||
128 | error = do_getitimer(which, &kit); | ||
129 | if (!error && put_compat_itimerval(it, &kit)) | ||
130 | error = -EFAULT; | ||
131 | return error; | ||
132 | } | ||
133 | |||
134 | asmlinkage long compat_sys_setitimer(int which, | ||
135 | struct compat_itimerval __user *in, | ||
136 | struct compat_itimerval __user *out) | ||
137 | { | ||
138 | struct itimerval kin, kout; | ||
139 | int error; | ||
140 | |||
141 | if (in) { | ||
142 | if (get_compat_itimerval(&kin, in)) | ||
143 | return -EFAULT; | ||
144 | } else | ||
145 | memset(&kin, 0, sizeof(kin)); | ||
146 | |||
147 | error = do_setitimer(which, &kin, out ? &kout : NULL); | ||
148 | if (error || !out) | ||
149 | return error; | ||
150 | if (put_compat_itimerval(out, &kout)) | ||
151 | return -EFAULT; | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) | ||
156 | { | ||
157 | /* | ||
158 | * In the SMP world we might just be unlucky and have one of | ||
159 | * the times increment as we use it. Since the value is an | ||
160 | * atomically safe type this is just fine. Conceptually it's | ||
161 | * as if the syscall took an instant longer to occur. | ||
162 | */ | ||
163 | if (tbuf) { | ||
164 | struct compat_tms tmp; | ||
165 | struct task_struct *tsk = current; | ||
166 | struct task_struct *t; | ||
167 | cputime_t utime, stime, cutime, cstime; | ||
168 | |||
169 | read_lock(&tasklist_lock); | ||
170 | utime = tsk->signal->utime; | ||
171 | stime = tsk->signal->stime; | ||
172 | t = tsk; | ||
173 | do { | ||
174 | utime = cputime_add(utime, t->utime); | ||
175 | stime = cputime_add(stime, t->stime); | ||
176 | t = next_thread(t); | ||
177 | } while (t != tsk); | ||
178 | |||
179 | /* | ||
180 | * While we have tasklist_lock read-locked, no dying thread | ||
181 | * can be updating current->signal->[us]time. Instead, | ||
182 | * we got their counts included in the live thread loop. | ||
183 | * However, another thread can come in right now and | ||
184 | * do a wait call that updates current->signal->c[us]time. | ||
185 | * To make sure we always see that pair updated atomically, | ||
186 | * we take the siglock around fetching them. | ||
187 | */ | ||
188 | spin_lock_irq(&tsk->sighand->siglock); | ||
189 | cutime = tsk->signal->cutime; | ||
190 | cstime = tsk->signal->cstime; | ||
191 | spin_unlock_irq(&tsk->sighand->siglock); | ||
192 | read_unlock(&tasklist_lock); | ||
193 | |||
194 | tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); | ||
195 | tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); | ||
196 | tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); | ||
197 | tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); | ||
198 | if (copy_to_user(tbuf, &tmp, sizeof(tmp))) | ||
199 | return -EFAULT; | ||
200 | } | ||
201 | return compat_jiffies_to_clock_t(jiffies); | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * Assumption: old_sigset_t and compat_old_sigset_t are both | ||
206 | * types that can be passed to put_user()/get_user(). | ||
207 | */ | ||
208 | |||
209 | asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) | ||
210 | { | ||
211 | old_sigset_t s; | ||
212 | long ret; | ||
213 | mm_segment_t old_fs = get_fs(); | ||
214 | |||
215 | set_fs(KERNEL_DS); | ||
216 | ret = sys_sigpending((old_sigset_t __user *) &s); | ||
217 | set_fs(old_fs); | ||
218 | if (ret == 0) | ||
219 | ret = put_user(s, set); | ||
220 | return ret; | ||
221 | } | ||
222 | |||
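compat_sys_sigpending() above is the first of many wrappers in this file built on the same idiom: temporarily widening the address limit with set_fs(KERNEL_DS) so a native sys_* call that expects a __user pointer will accept a buffer on the kernel stack, then converting the result for the 32-bit caller. A stripped-down sketch of just that idiom; do_native_syscall() is a stand-in, not a real function.

static long example_compat_wrapper(void)
{
	long kbuf = 0;				/* kernel-side scratch buffer */
	mm_segment_t old_fs = get_fs();		/* remember the caller's limit */
	long ret;

	set_fs(KERNEL_DS);			/* __user checks now accept kernel pointers */
	ret = do_native_syscall((long __user *)&kbuf);
	set_fs(old_fs);				/* restore before touching real user memory */

	/* ... convert kbuf and copy it back to the 32-bit user buffer here ... */
	return ret;
}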
223 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | ||
224 | compat_old_sigset_t __user *oset) | ||
225 | { | ||
226 | old_sigset_t s; | ||
227 | long ret; | ||
228 | mm_segment_t old_fs; | ||
229 | |||
230 | if (set && get_user(s, set)) | ||
231 | return -EFAULT; | ||
232 | old_fs = get_fs(); | ||
233 | set_fs(KERNEL_DS); | ||
234 | ret = sys_sigprocmask(how, | ||
235 | set ? (old_sigset_t __user *) &s : NULL, | ||
236 | oset ? (old_sigset_t __user *) &s : NULL); | ||
237 | set_fs(old_fs); | ||
238 | if (ret == 0) | ||
239 | if (oset) | ||
240 | ret = put_user(s, oset); | ||
241 | return ret; | ||
242 | } | ||
243 | |||
244 | #ifdef CONFIG_FUTEX | ||
245 | asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val, | ||
246 | struct compat_timespec __user *utime, u32 __user *uaddr2, | ||
247 | int val3) | ||
248 | { | ||
249 | struct timespec t; | ||
250 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | ||
251 | int val2 = 0; | ||
252 | |||
253 | if ((op == FUTEX_WAIT) && utime) { | ||
254 | if (get_compat_timespec(&t, utime)) | ||
255 | return -EFAULT; | ||
256 | timeout = timespec_to_jiffies(&t) + 1; | ||
257 | } | ||
258 | if (op >= FUTEX_REQUEUE) | ||
259 | val2 = (int) (unsigned long) utime; | ||
260 | |||
261 | return do_futex((unsigned long)uaddr, op, val, timeout, | ||
262 | (unsigned long)uaddr2, val2, val3); | ||
263 | } | ||
264 | #endif | ||
265 | |||
266 | asmlinkage long compat_sys_setrlimit(unsigned int resource, | ||
267 | struct compat_rlimit __user *rlim) | ||
268 | { | ||
269 | struct rlimit r; | ||
270 | int ret; | ||
271 | mm_segment_t old_fs = get_fs (); | ||
272 | |||
273 | if (resource >= RLIM_NLIMITS) | ||
274 | return -EINVAL; | ||
275 | |||
276 | if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || | ||
277 | __get_user(r.rlim_cur, &rlim->rlim_cur) || | ||
278 | __get_user(r.rlim_max, &rlim->rlim_max)) | ||
279 | return -EFAULT; | ||
280 | |||
281 | if (r.rlim_cur == COMPAT_RLIM_INFINITY) | ||
282 | r.rlim_cur = RLIM_INFINITY; | ||
283 | if (r.rlim_max == COMPAT_RLIM_INFINITY) | ||
284 | r.rlim_max = RLIM_INFINITY; | ||
285 | set_fs(KERNEL_DS); | ||
286 | ret = sys_setrlimit(resource, (struct rlimit __user *) &r); | ||
287 | set_fs(old_fs); | ||
288 | return ret; | ||
289 | } | ||
290 | |||
291 | #ifdef COMPAT_RLIM_OLD_INFINITY | ||
292 | |||
293 | asmlinkage long compat_sys_old_getrlimit(unsigned int resource, | ||
294 | struct compat_rlimit __user *rlim) | ||
295 | { | ||
296 | struct rlimit r; | ||
297 | int ret; | ||
298 | mm_segment_t old_fs = get_fs(); | ||
299 | |||
300 | set_fs(KERNEL_DS); | ||
301 | ret = sys_old_getrlimit(resource, &r); | ||
302 | set_fs(old_fs); | ||
303 | |||
304 | if (!ret) { | ||
305 | if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) | ||
306 | r.rlim_cur = COMPAT_RLIM_INFINITY; | ||
307 | if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) | ||
308 | r.rlim_max = COMPAT_RLIM_INFINITY; | ||
309 | |||
310 | if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || | ||
311 | __put_user(r.rlim_cur, &rlim->rlim_cur) || | ||
312 | __put_user(r.rlim_max, &rlim->rlim_max)) | ||
313 | return -EFAULT; | ||
314 | } | ||
315 | return ret; | ||
316 | } | ||
317 | |||
318 | #endif | ||
319 | |||
320 | asmlinkage long compat_sys_getrlimit (unsigned int resource, | ||
321 | struct compat_rlimit __user *rlim) | ||
322 | { | ||
323 | struct rlimit r; | ||
324 | int ret; | ||
325 | mm_segment_t old_fs = get_fs(); | ||
326 | |||
327 | set_fs(KERNEL_DS); | ||
328 | ret = sys_getrlimit(resource, (struct rlimit __user *) &r); | ||
329 | set_fs(old_fs); | ||
330 | if (!ret) { | ||
331 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) | ||
332 | r.rlim_cur = COMPAT_RLIM_INFINITY; | ||
333 | if (r.rlim_max > COMPAT_RLIM_INFINITY) | ||
334 | r.rlim_max = COMPAT_RLIM_INFINITY; | ||
335 | |||
336 | if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || | ||
337 | __put_user(r.rlim_cur, &rlim->rlim_cur) || | ||
338 | __put_user(r.rlim_max, &rlim->rlim_max)) | ||
339 | return -EFAULT; | ||
340 | } | ||
341 | return ret; | ||
342 | } | ||
343 | |||
344 | int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) | ||
345 | { | ||
346 | if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || | ||
347 | __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || | ||
348 | __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || | ||
349 | __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || | ||
350 | __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || | ||
351 | __put_user(r->ru_maxrss, &ru->ru_maxrss) || | ||
352 | __put_user(r->ru_ixrss, &ru->ru_ixrss) || | ||
353 | __put_user(r->ru_idrss, &ru->ru_idrss) || | ||
354 | __put_user(r->ru_isrss, &ru->ru_isrss) || | ||
355 | __put_user(r->ru_minflt, &ru->ru_minflt) || | ||
356 | __put_user(r->ru_majflt, &ru->ru_majflt) || | ||
357 | __put_user(r->ru_nswap, &ru->ru_nswap) || | ||
358 | __put_user(r->ru_inblock, &ru->ru_inblock) || | ||
359 | __put_user(r->ru_oublock, &ru->ru_oublock) || | ||
360 | __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || | ||
361 | __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || | ||
362 | __put_user(r->ru_nsignals, &ru->ru_nsignals) || | ||
363 | __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || | ||
364 | __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) | ||
365 | return -EFAULT; | ||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) | ||
370 | { | ||
371 | struct rusage r; | ||
372 | int ret; | ||
373 | mm_segment_t old_fs = get_fs(); | ||
374 | |||
375 | set_fs(KERNEL_DS); | ||
376 | ret = sys_getrusage(who, (struct rusage __user *) &r); | ||
377 | set_fs(old_fs); | ||
378 | |||
379 | if (ret) | ||
380 | return ret; | ||
381 | |||
382 | if (put_compat_rusage(&r, ru)) | ||
383 | return -EFAULT; | ||
384 | |||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | asmlinkage long | ||
389 | compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, | ||
390 | struct compat_rusage __user *ru) | ||
391 | { | ||
392 | if (!ru) { | ||
393 | return sys_wait4(pid, stat_addr, options, NULL); | ||
394 | } else { | ||
395 | struct rusage r; | ||
396 | int ret; | ||
397 | unsigned int status; | ||
398 | mm_segment_t old_fs = get_fs(); | ||
399 | |||
400 | set_fs (KERNEL_DS); | ||
401 | ret = sys_wait4(pid, | ||
402 | (stat_addr ? | ||
403 | (unsigned int __user *) &status : NULL), | ||
404 | options, (struct rusage __user *) &r); | ||
405 | set_fs (old_fs); | ||
406 | |||
407 | if (ret > 0) { | ||
408 | if (put_compat_rusage(&r, ru)) | ||
409 | return -EFAULT; | ||
410 | if (stat_addr && put_user(status, stat_addr)) | ||
411 | return -EFAULT; | ||
412 | } | ||
413 | return ret; | ||
414 | } | ||
415 | } | ||
416 | |||
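compat_sys_wait4() forwards to the native sys_wait4() under set_fs(KERNEL_DS) so kernel-resident status and rusage buffers can pass through the __user-annotated parameters, then converts the rusage for the 32-bit caller via put_compat_rusage(). A small userspace sketch of the corresponding call (assumes glibc's wait4() wrapper):

    #include <stdio.h>
    #include <sys/resource.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            struct rusage ru;
            int status;
            pid_t pid = fork();

            if (pid < 0) {
                    perror("fork");
                    return 1;
            }
            if (pid == 0)
                    _exit(42);              /* child exits immediately */

            if (wait4(pid, &status, 0, &ru) == pid && WIFEXITED(status))
                    printf("child exited with %d, maxrss = %ld kB\n",
                           WEXITSTATUS(status), ru.ru_maxrss);
            return 0;
    }
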
417 | asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, | ||
418 | struct compat_siginfo __user *uinfo, int options, | ||
419 | struct compat_rusage __user *uru) | ||
420 | { | ||
421 | siginfo_t info; | ||
422 | struct rusage ru; | ||
423 | long ret; | ||
424 | mm_segment_t old_fs = get_fs(); | ||
425 | |||
426 | memset(&info, 0, sizeof(info)); | ||
427 | |||
428 | set_fs(KERNEL_DS); | ||
429 | ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, | ||
430 | uru ? (struct rusage __user *)&ru : NULL); | ||
431 | set_fs(old_fs); | ||
432 | |||
433 | if ((ret < 0) || (info.si_signo == 0)) | ||
434 | return ret; | ||
435 | |||
436 | if (uru) { | ||
437 | ret = put_compat_rusage(&ru, uru); | ||
438 | if (ret) | ||
439 | return ret; | ||
440 | } | ||
441 | |||
442 | BUG_ON(info.si_code & __SI_MASK); | ||
443 | info.si_code |= __SI_CHLD; | ||
444 | return copy_siginfo_to_user32(uinfo, &info); | ||
445 | } | ||
446 | |||
447 | static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, | ||
448 | unsigned len, cpumask_t *new_mask) | ||
449 | { | ||
450 | unsigned long *k; | ||
451 | |||
452 | if (len < sizeof(cpumask_t)) | ||
453 | memset(new_mask, 0, sizeof(cpumask_t)); | ||
454 | else if (len > sizeof(cpumask_t)) | ||
455 | len = sizeof(cpumask_t); | ||
456 | |||
457 | k = cpus_addr(*new_mask); | ||
458 | return compat_get_bitmap(k, user_mask_ptr, len * 8); | ||
459 | } | ||
460 | |||
461 | asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, | ||
462 | unsigned int len, | ||
463 | compat_ulong_t __user *user_mask_ptr) | ||
464 | { | ||
465 | cpumask_t new_mask; | ||
466 | int retval; | ||
467 | |||
468 | retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); | ||
469 | if (retval) | ||
470 | return retval; | ||
471 | |||
472 | return sched_setaffinity(pid, new_mask); | ||
473 | } | ||
474 | |||
475 | asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, | ||
476 | compat_ulong_t __user *user_mask_ptr) | ||
477 | { | ||
478 | int ret; | ||
479 | cpumask_t mask; | ||
480 | unsigned long *k; | ||
481 | unsigned int min_length = sizeof(cpumask_t); | ||
482 | |||
483 | if (NR_CPUS <= BITS_PER_COMPAT_LONG) | ||
484 | min_length = sizeof(compat_ulong_t); | ||
485 | |||
486 | if (len < min_length) | ||
487 | return -EINVAL; | ||
488 | |||
489 | ret = sched_getaffinity(pid, &mask); | ||
490 | if (ret < 0) | ||
491 | return ret; | ||
492 | |||
493 | k = cpus_addr(mask); | ||
494 | ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); | ||
495 | if (ret) | ||
496 | return ret; | ||
497 | |||
498 | return min_length; | ||
499 | } | ||
500 | |||
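Note that the kernel-side compat_sys_sched_getaffinity() returns the number of bytes written (min_length) rather than zero; the glibc wrapper normally hides that detail from applications. A minimal userspace sketch of querying the affinity mask (glibc wrapper; _GNU_SOURCE is needed for the CPU_* macros):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;
            int cpu, n = 0;

            CPU_ZERO(&set);
            if (sched_getaffinity(0, sizeof(set), &set) != 0) {  /* 0 = current task */
                    perror("sched_getaffinity");
                    return 1;
            }
            for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                    if (CPU_ISSET(cpu, &set))
                            n++;
            printf("this task may run on %d CPU(s)\n", n);
            return 0;
    }
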
501 | static int get_compat_itimerspec(struct itimerspec *dst, | ||
502 | struct compat_itimerspec __user *src) | ||
503 | { | ||
504 | if (get_compat_timespec(&dst->it_interval, &src->it_interval) || | ||
505 | get_compat_timespec(&dst->it_value, &src->it_value)) | ||
506 | return -EFAULT; | ||
507 | return 0; | ||
508 | } | ||
509 | |||
510 | static int put_compat_itimerspec(struct compat_itimerspec __user *dst, | ||
511 | struct itimerspec *src) | ||
512 | { | ||
513 | if (put_compat_timespec(&src->it_interval, &dst->it_interval) || | ||
514 | put_compat_timespec(&src->it_value, &dst->it_value)) | ||
515 | return -EFAULT; | ||
516 | return 0; | ||
517 | } | ||
518 | |||
519 | long compat_sys_timer_settime(timer_t timer_id, int flags, | ||
520 | struct compat_itimerspec __user *new, | ||
521 | struct compat_itimerspec __user *old) | ||
522 | { | ||
523 | long err; | ||
524 | mm_segment_t oldfs; | ||
525 | struct itimerspec newts, oldts; | ||
526 | |||
527 | if (!new) | ||
528 | return -EINVAL; | ||
529 | if (get_compat_itimerspec(&newts, new)) | ||
530 | return -EFAULT; | ||
531 | oldfs = get_fs(); | ||
532 | set_fs(KERNEL_DS); | ||
533 | err = sys_timer_settime(timer_id, flags, | ||
534 | (struct itimerspec __user *) &newts, | ||
535 | (struct itimerspec __user *) &oldts); | ||
536 | set_fs(oldfs); | ||
537 | if (!err && old && put_compat_itimerspec(old, &oldts)) | ||
538 | return -EFAULT; | ||
539 | return err; | ||
540 | } | ||
541 | |||
542 | long compat_sys_timer_gettime(timer_t timer_id, | ||
543 | struct compat_itimerspec __user *setting) | ||
544 | { | ||
545 | long err; | ||
546 | mm_segment_t oldfs; | ||
547 | struct itimerspec ts; | ||
548 | |||
549 | oldfs = get_fs(); | ||
550 | set_fs(KERNEL_DS); | ||
551 | err = sys_timer_gettime(timer_id, | ||
552 | (struct itimerspec __user *) &ts); | ||
553 | set_fs(oldfs); | ||
554 | if (!err && put_compat_itimerspec(setting, &ts)) | ||
555 | return -EFAULT; | ||
556 | return err; | ||
557 | } | ||
558 | |||
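The itimerspec conversion helpers and the timer_settime()/timer_gettime() wrappers above service the POSIX per-process timers; as the comment further down notes, only timer_create() needs architecture-specific handling because of the sigevent conversion. A userspace sketch of the call sequence these wrappers back (may need -lrt on older glibc; the handler name is only illustrative):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>
    #include <unistd.h>

    static void on_alarm(int sig)
    {
            (void)sig;                      /* nothing to do; just interrupt pause() */
    }

    int main(void)
    {
            struct sigevent sev;
            struct itimerspec its;
            timer_t t;

            signal(SIGALRM, on_alarm);

            memset(&sev, 0, sizeof(sev));
            sev.sigev_notify = SIGEV_SIGNAL;
            sev.sigev_signo = SIGALRM;
            if (timer_create(CLOCK_REALTIME, &sev, &t) != 0) {
                    perror("timer_create");
                    return 1;
            }

            memset(&its, 0, sizeof(its));
            its.it_value.tv_sec = 1;        /* one-shot, fires after one second */
            timer_settime(t, 0, &its, NULL);

            pause();                        /* wait for SIGALRM */
            puts("timer fired");
            timer_delete(t);
            return 0;
    }
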
559 | long compat_sys_clock_settime(clockid_t which_clock, | ||
560 | struct compat_timespec __user *tp) | ||
561 | { | ||
562 | long err; | ||
563 | mm_segment_t oldfs; | ||
564 | struct timespec ts; | ||
565 | |||
566 | if (get_compat_timespec(&ts, tp)) | ||
567 | return -EFAULT; | ||
568 | oldfs = get_fs(); | ||
569 | set_fs(KERNEL_DS); | ||
570 | err = sys_clock_settime(which_clock, | ||
571 | (struct timespec __user *) &ts); | ||
572 | set_fs(oldfs); | ||
573 | return err; | ||
574 | } | ||
575 | |||
576 | long compat_sys_clock_gettime(clockid_t which_clock, | ||
577 | struct compat_timespec __user *tp) | ||
578 | { | ||
579 | long err; | ||
580 | mm_segment_t oldfs; | ||
581 | struct timespec ts; | ||
582 | |||
583 | oldfs = get_fs(); | ||
584 | set_fs(KERNEL_DS); | ||
585 | err = sys_clock_gettime(which_clock, | ||
586 | (struct timespec __user *) &ts); | ||
587 | set_fs(oldfs); | ||
588 | if (!err && put_compat_timespec(&ts, tp)) | ||
589 | return -EFAULT; | ||
590 | return err; | ||
591 | } | ||
592 | |||
593 | long compat_sys_clock_getres(clockid_t which_clock, | ||
594 | struct compat_timespec __user *tp) | ||
595 | { | ||
596 | long err; | ||
597 | mm_segment_t oldfs; | ||
598 | struct timespec ts; | ||
599 | |||
600 | oldfs = get_fs(); | ||
601 | set_fs(KERNEL_DS); | ||
602 | err = sys_clock_getres(which_clock, | ||
603 | (struct timespec __user *) &ts); | ||
604 | set_fs(oldfs); | ||
605 | if (!err && tp && put_compat_timespec(&ts, tp)) | ||
606 | return -EFAULT; | ||
607 | return err; | ||
608 | } | ||
609 | |||
610 | long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, | ||
611 | struct compat_timespec __user *rqtp, | ||
612 | struct compat_timespec __user *rmtp) | ||
613 | { | ||
614 | long err; | ||
615 | mm_segment_t oldfs; | ||
616 | struct timespec in, out; | ||
617 | |||
618 | if (get_compat_timespec(&in, rqtp)) | ||
619 | return -EFAULT; | ||
620 | |||
621 | oldfs = get_fs(); | ||
622 | set_fs(KERNEL_DS); | ||
623 | err = sys_clock_nanosleep(which_clock, flags, | ||
624 | (struct timespec __user *) &in, | ||
625 | (struct timespec __user *) &out); | ||
626 | set_fs(oldfs); | ||
627 | if ((err == -ERESTART_RESTARTBLOCK) && rmtp && | ||
628 | put_compat_timespec(&out, rmtp)) | ||
629 | return -EFAULT; | ||
630 | return err; | ||
631 | } | ||
632 | |||
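Each of the clock_* wrappers above follows the same pattern: convert the 32-bit timespec in, call the native syscall under KERNEL_DS, convert the result back out. For clock_nanosleep() the remaining time is only copied back on -ERESTART_RESTARTBLOCK, i.e. when the sleep was interrupted. A short userspace sketch (may need -lrt on older glibc):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec now;
            struct timespec req = { 0, 100 * 1000 * 1000 };  /* ~100 ms */

            if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
                    perror("clock_gettime");
                    return 1;
            }
            printf("monotonic: %ld.%09ld\n", (long)now.tv_sec, now.tv_nsec);

            clock_nanosleep(CLOCK_MONOTONIC, 0, &req, NULL); /* relative sleep */
            return 0;
    }
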
633 | /* | ||
634 | * We currently only need the following fields from the sigevent | ||
635 | * structure: sigev_value, sigev_signo, sigev_notify and (sometimes | ||
636 | * sigev_notify_thread_id). The others are handled in user mode. | ||
637 | * We also assume that copying sigev_value.sival_int is sufficient | ||
638 | * to keep all the bits of sigev_value.sival_ptr intact. | ||
639 | */ | ||
640 | int get_compat_sigevent(struct sigevent *event, | ||
641 | const struct compat_sigevent __user *u_event) | ||
642 | { | ||
643 | memset(event, 0, sizeof(*event)); | ||
644 | return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) || | ||
645 | __get_user(event->sigev_value.sival_int, | ||
646 | &u_event->sigev_value.sival_int) || | ||
647 | __get_user(event->sigev_signo, &u_event->sigev_signo) || | ||
648 | __get_user(event->sigev_notify, &u_event->sigev_notify) || | ||
649 | __get_user(event->sigev_notify_thread_id, | ||
650 | &u_event->sigev_notify_thread_id)) | ||
651 | ? -EFAULT : 0; | ||
652 | } | ||
653 | |||
654 | /* timer_create is architecture specific because it needs sigevent conversion */ | ||
655 | |||
656 | long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, | ||
657 | unsigned long bitmap_size) | ||
658 | { | ||
659 | int i, j; | ||
660 | unsigned long m; | ||
661 | compat_ulong_t um; | ||
662 | unsigned long nr_compat_longs; | ||
663 | |||
664 | /* align bitmap up to nearest compat_long_t boundary */ | ||
665 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); | ||
666 | |||
667 | if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) | ||
668 | return -EFAULT; | ||
669 | |||
670 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | ||
671 | |||
672 | for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { | ||
673 | m = 0; | ||
674 | |||
675 | for (j = 0; j < sizeof(m)/sizeof(um); j++) { | ||
676 | /* | ||
677 | * We don't want to read past the end of the userspace | ||
678 | * bitmap. We must however ensure the end of the | ||
679 | * kernel bitmap is zeroed. | ||
680 | */ | ||
681 | if (nr_compat_longs-- > 0) { | ||
682 | if (__get_user(um, umask)) | ||
683 | return -EFAULT; | ||
684 | } else { | ||
685 | um = 0; | ||
686 | } | ||
687 | |||
688 | umask++; | ||
689 | m |= (long)um << (j * BITS_PER_COMPAT_LONG); | ||
690 | } | ||
691 | *mask++ = m; | ||
692 | } | ||
693 | |||
694 | return 0; | ||
695 | } | ||
696 | |||
697 | long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, | ||
698 | unsigned long bitmap_size) | ||
699 | { | ||
700 | int i, j; | ||
701 | unsigned long m; | ||
702 | compat_ulong_t um; | ||
703 | unsigned long nr_compat_longs; | ||
704 | |||
705 | /* align bitmap up to nearest compat_long_t boundary */ | ||
706 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); | ||
707 | |||
708 | if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) | ||
709 | return -EFAULT; | ||
710 | |||
711 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | ||
712 | |||
713 | for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { | ||
714 | m = *mask++; | ||
715 | |||
716 | for (j = 0; j < sizeof(m)/sizeof(um); j++) { | ||
717 | um = m; | ||
718 | |||
719 | /* | ||
720 | * We don't want to write past the end of the userspace | ||
721 | * bitmap. | ||
722 | */ | ||
723 | if (nr_compat_longs-- > 0) { | ||
724 | if (__put_user(um, umask)) | ||
725 | return -EFAULT; | ||
726 | } | ||
727 | |||
728 | umask++; | ||
729 | m >>= 4*sizeof(um); | ||
730 | m >>= 4*sizeof(um); | ||
731 | } | ||
732 | } | ||
733 | |||
734 | return 0; | ||
735 | } | ||
736 | |||
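compat_get_bitmap() and compat_put_bitmap() translate between native unsigned long bitmaps and arrays of 32-bit compat words, least significant word first, zero-filling (or silently dropping) whatever lies beyond the shorter of the two. The doubled `m >>= 4*sizeof(um)` is deliberate: it shifts by the full compat-word width in two halves so the shift count never equals the width of m on configurations where both types are 32 bits wide. A toy userspace illustration of the word split (assumes a 64-bit native long and a 32-bit compat long):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Illustration only: bits 0, 1 and 32 set in a 64-bit mask. */
            uint64_t mask = (1ULL << 0) | (1ULL << 1) | (1ULL << 32);
            unsigned int lo = (unsigned int)(mask & 0xffffffffu);
            unsigned int hi = (unsigned int)(mask >> 32);

            printf("compat words (low first): 0x%08x 0x%08x\n", lo, hi);
            return 0;
    }
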
737 | void | ||
738 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | ||
739 | { | ||
740 | switch (_NSIG_WORDS) { | ||
741 | #if defined (__COMPAT_ENDIAN_SWAP__) | ||
742 | case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); | ||
743 | case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); | ||
744 | case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); | ||
745 | case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); | ||
746 | #else | ||
747 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); | ||
748 | case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); | ||
749 | case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); | ||
750 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); | ||
751 | #endif | ||
752 | } | ||
753 | } | ||
754 | |||
755 | asmlinkage long | ||
756 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | ||
757 | struct compat_siginfo __user *uinfo, | ||
758 | struct compat_timespec __user *uts, compat_size_t sigsetsize) | ||
759 | { | ||
760 | compat_sigset_t s32; | ||
761 | sigset_t s; | ||
762 | int sig; | ||
763 | struct timespec t; | ||
764 | siginfo_t info; | ||
765 | long ret, timeout = 0; | ||
766 | |||
767 | if (sigsetsize != sizeof(sigset_t)) | ||
768 | return -EINVAL; | ||
769 | |||
770 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) | ||
771 | return -EFAULT; | ||
772 | sigset_from_compat(&s, &s32); | ||
773 | sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
774 | signotset(&s); | ||
775 | |||
776 | if (uts) { | ||
777 | if (get_compat_timespec (&t, uts)) | ||
778 | return -EFAULT; | ||
779 | if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 | ||
780 | || t.tv_sec < 0) | ||
781 | return -EINVAL; | ||
782 | } | ||
783 | |||
784 | spin_lock_irq(¤t->sighand->siglock); | ||
785 | sig = dequeue_signal(current, &s, &info); | ||
786 | if (!sig) { | ||
787 | timeout = MAX_SCHEDULE_TIMEOUT; | ||
788 | if (uts) | ||
789 | timeout = timespec_to_jiffies(&t) + | ||
790 | (t.tv_sec || t.tv_nsec); | ||
791 | if (timeout) { | ||
792 | current->real_blocked = current->blocked; | ||
793 | sigandsets(¤t->blocked, ¤t->blocked, &s); | ||
794 | |||
795 | recalc_sigpending(); | ||
796 | spin_unlock_irq(¤t->sighand->siglock); | ||
797 | |||
798 | current->state = TASK_INTERRUPTIBLE; | ||
799 | timeout = schedule_timeout(timeout); | ||
800 | |||
801 | spin_lock_irq(¤t->sighand->siglock); | ||
802 | sig = dequeue_signal(current, &s, &info); | ||
803 | current->blocked = current->real_blocked; | ||
804 | siginitset(¤t->real_blocked, 0); | ||
805 | recalc_sigpending(); | ||
806 | } | ||
807 | } | ||
808 | spin_unlock_irq(¤t->sighand->siglock); | ||
809 | |||
810 | if (sig) { | ||
811 | ret = sig; | ||
812 | if (uinfo) { | ||
813 | if (copy_siginfo_to_user32(uinfo, &info)) | ||
814 | ret = -EFAULT; | ||
815 | } | ||
816 | } else { | ||
817 | ret = timeout ? -EINTR : -EAGAIN; | ||
818 | } | ||
819 | return ret; | ||
820 | |||
821 | } | ||
822 | |||
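compat_sys_rt_sigtimedwait() open-codes the native sigtimedwait logic rather than calling it, since the sigset, siginfo and timespec all need conversion. A userspace sketch of the syscall it mirrors (the signal and timeout values are arbitrary):

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            sigset_t set;
            siginfo_t info;
            struct timespec timeout = { 2, 0 };     /* give up after two seconds */
            int sig;

            sigemptyset(&set);
            sigaddset(&set, SIGUSR1);
            sigprocmask(SIG_BLOCK, &set, NULL);     /* must be blocked to be waited for */

            sig = sigtimedwait(&set, &info, &timeout);
            if (sig > 0)
                    printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
            else
                    perror("sigtimedwait");         /* EAGAIN on timeout */
            return 0;
    }
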
823 | #ifdef __ARCH_WANT_COMPAT_SYS_TIME | ||
824 | |||
825 | /* compat_time_t is a 32 bit "long" and needs to get converted. */ | ||
826 | |||
827 | asmlinkage long compat_sys_time(compat_time_t __user * tloc) | ||
828 | { | ||
829 | compat_time_t i; | ||
830 | struct timeval tv; | ||
831 | |||
832 | do_gettimeofday(&tv); | ||
833 | i = tv.tv_sec; | ||
834 | |||
835 | if (tloc) { | ||
836 | if (put_user(i,tloc)) | ||
837 | i = -EFAULT; | ||
838 | } | ||
839 | return i; | ||
840 | } | ||
841 | |||
842 | asmlinkage long compat_sys_stime(compat_time_t __user *tptr) | ||
843 | { | ||
844 | struct timespec tv; | ||
845 | int err; | ||
846 | |||
847 | if (get_user(tv.tv_sec, tptr)) | ||
848 | return -EFAULT; | ||
849 | |||
850 | tv.tv_nsec = 0; | ||
851 | |||
852 | err = security_settime(&tv, NULL); | ||
853 | if (err) | ||
854 | return err; | ||
855 | |||
856 | do_settimeofday(&tv); | ||
857 | return 0; | ||
858 | } | ||
859 | |||
860 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ | ||
diff --git a/kernel/configs.c b/kernel/configs.c new file mode 100644 index 000000000000..986f7af31e0a --- /dev/null +++ b/kernel/configs.c | |||
@@ -0,0 +1,118 @@ | |||
1 | /* | ||
2 | * kernel/configs.c | ||
3 | * Echo the kernel .config file used to build the kernel | ||
4 | * | ||
5 | * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> | ||
6 | * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org> | ||
7 | * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> | ||
8 | * Copyright (C) 2002 Hewlett-Packard Company | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or (at | ||
13 | * your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, but | ||
16 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
18 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
19 | * details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program; if not, write to the Free Software | ||
23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/config.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/proc_fs.h> | ||
30 | #include <linux/seq_file.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <asm/uaccess.h> | ||
33 | |||
34 | /**************************************************/ | ||
35 | /* the actual current config file */ | ||
36 | |||
37 | /* | ||
38 | * Define kernel_config_data and kernel_config_data_size, which contain the | ||
39 | * wrapped and compressed configuration file. The file is first compressed | ||
40 | * with gzip and then bracketed by two eight-byte magic strings to allow | ||
41 | * extraction from a binary kernel image: | ||
42 | * | ||
43 | * IKCFG_ST | ||
44 | * <image> | ||
45 | * IKCFG_ED | ||
46 | */ | ||
47 | #define MAGIC_START "IKCFG_ST" | ||
48 | #define MAGIC_END "IKCFG_ED" | ||
49 | #include "config_data.h" | ||
50 | |||
51 | |||
52 | #define MAGIC_SIZE (sizeof(MAGIC_START) - 1) | ||
53 | #define kernel_config_data_size \ | ||
54 | (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) | ||
55 | |||
56 | #ifdef CONFIG_IKCONFIG_PROC | ||
57 | |||
58 | /**************************************************/ | ||
59 | /* globals and useful constants */ | ||
60 | |||
61 | static ssize_t | ||
62 | ikconfig_read_current(struct file *file, char __user *buf, | ||
63 | size_t len, loff_t * offset) | ||
64 | { | ||
65 | loff_t pos = *offset; | ||
66 | ssize_t count; | ||
67 | |||
68 | if (pos >= kernel_config_data_size) | ||
69 | return 0; | ||
70 | |||
71 | count = min(len, (size_t)(kernel_config_data_size - pos)); | ||
72 | if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count)) | ||
73 | return -EFAULT; | ||
74 | |||
75 | *offset += count; | ||
76 | return count; | ||
77 | } | ||
78 | |||
79 | static struct file_operations ikconfig_file_ops = { | ||
80 | .owner = THIS_MODULE, | ||
81 | .read = ikconfig_read_current, | ||
82 | }; | ||
83 | |||
84 | /***************************************************/ | ||
85 | /* ikconfig_init: start up everything we need to */ | ||
86 | |||
87 | static int __init ikconfig_init(void) | ||
88 | { | ||
89 | struct proc_dir_entry *entry; | ||
90 | |||
91 | /* create the current config file */ | ||
92 | entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, | ||
93 | &proc_root); | ||
94 | if (!entry) | ||
95 | return -ENOMEM; | ||
96 | |||
97 | entry->proc_fops = &ikconfig_file_ops; | ||
98 | entry->size = kernel_config_data_size; | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | /***************************************************/ | ||
104 | /* ikconfig_cleanup: clean up our mess */ | ||
105 | |||
106 | static void __exit ikconfig_cleanup(void) | ||
107 | { | ||
108 | remove_proc_entry("config.gz", &proc_root); | ||
109 | } | ||
110 | |||
111 | module_init(ikconfig_init); | ||
112 | module_exit(ikconfig_cleanup); | ||
113 | |||
114 | MODULE_LICENSE("GPL"); | ||
115 | MODULE_AUTHOR("Randy Dunlap"); | ||
116 | MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); | ||
117 | |||
118 | #endif /* CONFIG_IKCONFIG_PROC */ | ||
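With CONFIG_IKCONFIG_PROC enabled, the running kernel's configuration can be read back as a gzip stream, e.g. `zcat /proc/config.gz`. A minimal C sketch that simply copies the raw stream to stdout (decompression left to a pipe through gunzip):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/config.gz", "rb");
            char buf[4096];
            size_t n;

            if (!f) {
                    perror("/proc/config.gz");  /* kernel built without CONFIG_IKCONFIG_PROC? */
                    return 1;
            }
            while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
                    fwrite(buf, 1, n, stdout);
            fclose(f);
            return 0;
    }
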
diff --git a/kernel/cpu.c b/kernel/cpu.c new file mode 100644 index 000000000000..628f4ccda127 --- /dev/null +++ b/kernel/cpu.c | |||
@@ -0,0 +1,193 @@ | |||
1 | /* CPU control. | ||
2 | * (C) 2001, 2002, 2003, 2004 Rusty Russell | ||
3 | * | ||
4 | * This code is licenced under the GPL. | ||
5 | */ | ||
6 | #include <linux/proc_fs.h> | ||
7 | #include <linux/smp.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/notifier.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/unistd.h> | ||
12 | #include <linux/cpu.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/kthread.h> | ||
15 | #include <linux/stop_machine.h> | ||
16 | #include <asm/semaphore.h> | ||
17 | |||
18 | /* This protects CPUs going up and down... */ | ||
19 | DECLARE_MUTEX(cpucontrol); | ||
20 | |||
21 | static struct notifier_block *cpu_chain; | ||
22 | |||
23 | /* Need to know about CPUs going up/down? */ | ||
24 | int register_cpu_notifier(struct notifier_block *nb) | ||
25 | { | ||
26 | int ret; | ||
27 | |||
28 | if ((ret = down_interruptible(&cpucontrol)) != 0) | ||
29 | return ret; | ||
30 | ret = notifier_chain_register(&cpu_chain, nb); | ||
31 | up(&cpucontrol); | ||
32 | return ret; | ||
33 | } | ||
34 | EXPORT_SYMBOL(register_cpu_notifier); | ||
35 | |||
36 | void unregister_cpu_notifier(struct notifier_block *nb) | ||
37 | { | ||
38 | down(&cpucontrol); | ||
39 | notifier_chain_unregister(&cpu_chain, nb); | ||
40 | up(&cpucontrol); | ||
41 | } | ||
42 | EXPORT_SYMBOL(unregister_cpu_notifier); | ||
43 | |||
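register_cpu_notifier() is the hook other subsystems use to learn about CPUs coming and going. A hedged sketch of a minimal client module against this 2.6-era interface (the demo_* names are made up for illustration):

    #include <linux/module.h>
    #include <linux/notifier.h>
    #include <linux/cpu.h>

    static int demo_cpu_callback(struct notifier_block *nb,
                                 unsigned long action, void *hcpu)
    {
            unsigned int cpu = (unsigned long)hcpu;

            switch (action) {
            case CPU_ONLINE:
                    printk(KERN_INFO "demo: cpu %u is now online\n", cpu);
                    break;
            case CPU_DEAD:
                    printk(KERN_INFO "demo: cpu %u has gone offline\n", cpu);
                    break;
            }
            return NOTIFY_OK;
    }

    static struct notifier_block demo_cpu_nb = {
            .notifier_call = demo_cpu_callback,
    };

    static int __init demo_init(void)
    {
            return register_cpu_notifier(&demo_cpu_nb);
    }

    static void __exit demo_exit(void)
    {
            unregister_cpu_notifier(&demo_cpu_nb);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
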
44 | #ifdef CONFIG_HOTPLUG_CPU | ||
45 | static inline void check_for_tasks(int cpu) | ||
46 | { | ||
47 | struct task_struct *p; | ||
48 | |||
49 | write_lock_irq(&tasklist_lock); | ||
50 | for_each_process(p) { | ||
51 | if (task_cpu(p) == cpu && | ||
52 | (!cputime_eq(p->utime, cputime_zero) || | ||
53 | !cputime_eq(p->stime, cputime_zero))) | ||
54 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " | ||
55 | "(state = %ld, flags = %lx)\n", | ||
56 | p->comm, p->pid, cpu, p->state, p->flags); | ||
57 | } | ||
58 | write_unlock_irq(&tasklist_lock); | ||
59 | } | ||
60 | |||
61 | /* Take this CPU down. */ | ||
62 | static int take_cpu_down(void *unused) | ||
63 | { | ||
64 | int err; | ||
65 | |||
66 | /* Take offline: makes arch_cpu_down somewhat easier. */ | ||
67 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
68 | |||
69 | /* Ensure this CPU doesn't handle any more interrupts. */ | ||
70 | err = __cpu_disable(); | ||
71 | if (err < 0) | ||
72 | cpu_set(smp_processor_id(), cpu_online_map); | ||
73 | else | ||
74 | /* Force idle task to run as soon as we yield: it should | ||
75 | immediately notice cpu is offline and die quickly. */ | ||
76 | sched_idle_next(); | ||
77 | |||
78 | return err; | ||
79 | } | ||
80 | |||
81 | int cpu_down(unsigned int cpu) | ||
82 | { | ||
83 | int err; | ||
84 | struct task_struct *p; | ||
85 | cpumask_t old_allowed, tmp; | ||
86 | |||
87 | if ((err = lock_cpu_hotplug_interruptible()) != 0) | ||
88 | return err; | ||
89 | |||
90 | if (num_online_cpus() == 1) { | ||
91 | err = -EBUSY; | ||
92 | goto out; | ||
93 | } | ||
94 | |||
95 | if (!cpu_online(cpu)) { | ||
96 | err = -EINVAL; | ||
97 | goto out; | ||
98 | } | ||
99 | |||
100 | err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, | ||
101 | (void *)(long)cpu); | ||
102 | if (err == NOTIFY_BAD) { | ||
103 | printk("%s: attempt to take down CPU %u failed\n", | ||
104 | __FUNCTION__, cpu); | ||
105 | err = -EINVAL; | ||
106 | goto out; | ||
107 | } | ||
108 | |||
109 | /* Ensure that we are not runnable on dying cpu */ | ||
110 | old_allowed = current->cpus_allowed; | ||
111 | tmp = CPU_MASK_ALL; | ||
112 | cpu_clear(cpu, tmp); | ||
113 | set_cpus_allowed(current, tmp); | ||
114 | |||
115 | p = __stop_machine_run(take_cpu_down, NULL, cpu); | ||
116 | if (IS_ERR(p)) { | ||
117 | /* CPU didn't die: tell everyone. Can't complain. */ | ||
118 | if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, | ||
119 | (void *)(long)cpu) == NOTIFY_BAD) | ||
120 | BUG(); | ||
121 | |||
122 | err = PTR_ERR(p); | ||
123 | goto out_allowed; | ||
124 | } | ||
125 | |||
126 | if (cpu_online(cpu)) | ||
127 | goto out_thread; | ||
128 | |||
129 | /* Wait for it to sleep (leaving idle task). */ | ||
130 | while (!idle_cpu(cpu)) | ||
131 | yield(); | ||
132 | |||
133 | /* This actually kills the CPU. */ | ||
134 | __cpu_die(cpu); | ||
135 | |||
136 | /* Move it here so it can run. */ | ||
137 | kthread_bind(p, get_cpu()); | ||
138 | put_cpu(); | ||
139 | |||
140 | /* CPU is completely dead: tell everyone. Too late to complain. */ | ||
141 | if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) | ||
142 | == NOTIFY_BAD) | ||
143 | BUG(); | ||
144 | |||
145 | check_for_tasks(cpu); | ||
146 | |||
147 | out_thread: | ||
148 | err = kthread_stop(p); | ||
149 | out_allowed: | ||
150 | set_cpus_allowed(current, old_allowed); | ||
151 | out: | ||
152 | unlock_cpu_hotplug(); | ||
153 | return err; | ||
154 | } | ||
155 | #endif /*CONFIG_HOTPLUG_CPU*/ | ||
156 | |||
157 | int __devinit cpu_up(unsigned int cpu) | ||
158 | { | ||
159 | int ret; | ||
160 | void *hcpu = (void *)(long)cpu; | ||
161 | |||
162 | if ((ret = down_interruptible(&cpucontrol)) != 0) | ||
163 | return ret; | ||
164 | |||
165 | if (cpu_online(cpu) || !cpu_present(cpu)) { | ||
166 | ret = -EINVAL; | ||
167 | goto out; | ||
168 | } | ||
169 | ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); | ||
170 | if (ret == NOTIFY_BAD) { | ||
171 | printk("%s: attempt to bring up CPU %u failed\n", | ||
172 | __FUNCTION__, cpu); | ||
173 | ret = -EINVAL; | ||
174 | goto out_notify; | ||
175 | } | ||
176 | |||
177 | /* Arch-specific enabling code. */ | ||
178 | ret = __cpu_up(cpu); | ||
179 | if (ret != 0) | ||
180 | goto out_notify; | ||
181 | if (!cpu_online(cpu)) | ||
182 | BUG(); | ||
183 | |||
184 | /* Now call notifier in preparation. */ | ||
185 | notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); | ||
186 | |||
187 | out_notify: | ||
188 | if (ret != 0) | ||
189 | notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); | ||
190 | out: | ||
191 | up(&cpucontrol); | ||
192 | return ret; | ||
193 | } | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c new file mode 100644 index 000000000000..69792bbe2281 --- /dev/null +++ b/kernel/cpuset.c | |||
@@ -0,0 +1,1564 @@ | |||
1 | /* | ||
2 | * kernel/cpuset.c | ||
3 | * | ||
4 | * Processor and Memory placement constraints for sets of tasks. | ||
5 | * | ||
6 | * Copyright (C) 2003 BULL SA. | ||
7 | * Copyright (C) 2004 Silicon Graphics, Inc. | ||
8 | * | ||
9 | * Portions derived from Patrick Mochel's sysfs code. | ||
10 | * sysfs is Copyright (c) 2001-3 Patrick Mochel | ||
11 | * Portions Copyright (c) 2004 Silicon Graphics, Inc. | ||
12 | * | ||
13 | * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> | ||
14 | * 2003-10-22 Updates by Stephen Hemminger. | ||
15 | * 2004 May-July Rework by Paul Jackson <pj@sgi.com> | ||
16 | * | ||
17 | * This file is subject to the terms and conditions of the GNU General Public | ||
18 | * License. See the file COPYING in the main directory of the Linux | ||
19 | * distribution for more details. | ||
20 | */ | ||
21 | |||
22 | #include <linux/config.h> | ||
23 | #include <linux/cpu.h> | ||
24 | #include <linux/cpumask.h> | ||
25 | #include <linux/cpuset.h> | ||
26 | #include <linux/err.h> | ||
27 | #include <linux/errno.h> | ||
28 | #include <linux/file.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/init.h> | ||
31 | #include <linux/interrupt.h> | ||
32 | #include <linux/kernel.h> | ||
33 | #include <linux/kmod.h> | ||
34 | #include <linux/list.h> | ||
35 | #include <linux/mm.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/mount.h> | ||
38 | #include <linux/namei.h> | ||
39 | #include <linux/pagemap.h> | ||
40 | #include <linux/proc_fs.h> | ||
41 | #include <linux/sched.h> | ||
42 | #include <linux/seq_file.h> | ||
43 | #include <linux/slab.h> | ||
44 | #include <linux/smp_lock.h> | ||
45 | #include <linux/spinlock.h> | ||
46 | #include <linux/stat.h> | ||
47 | #include <linux/string.h> | ||
48 | #include <linux/time.h> | ||
49 | #include <linux/backing-dev.h> | ||
50 | #include <linux/sort.h> | ||
51 | |||
52 | #include <asm/uaccess.h> | ||
53 | #include <asm/atomic.h> | ||
54 | #include <asm/semaphore.h> | ||
55 | |||
56 | #define CPUSET_SUPER_MAGIC 0x27e0eb | ||
57 | |||
58 | struct cpuset { | ||
59 | unsigned long flags; /* "unsigned long" so bitops work */ | ||
60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | ||
61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | ||
62 | |||
63 | atomic_t count; /* count tasks using this cpuset */ | ||
64 | |||
65 | /* | ||
66 | * We link our 'sibling' struct into our parent's 'children'. | ||
67 | * Our children link their 'sibling' into our 'children'. | ||
68 | */ | ||
69 | struct list_head sibling; /* my parent's children */ | ||
70 | struct list_head children; /* my children */ | ||
71 | |||
72 | struct cpuset *parent; /* my parent */ | ||
73 | struct dentry *dentry; /* cpuset fs entry */ | ||
74 | |||
75 | /* | ||
76 | * Copy of global cpuset_mems_generation as of the most | ||
77 | * recent time this cpuset changed its mems_allowed. | ||
78 | */ | ||
79 | int mems_generation; | ||
80 | }; | ||
81 | |||
82 | /* bits in struct cpuset flags field */ | ||
83 | typedef enum { | ||
84 | CS_CPU_EXCLUSIVE, | ||
85 | CS_MEM_EXCLUSIVE, | ||
86 | CS_REMOVED, | ||
87 | CS_NOTIFY_ON_RELEASE | ||
88 | } cpuset_flagbits_t; | ||
89 | |||
90 | /* convenient tests for these bits */ | ||
91 | static inline int is_cpu_exclusive(const struct cpuset *cs) | ||
92 | { | ||
93 | return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags); | ||
94 | } | ||
95 | |||
96 | static inline int is_mem_exclusive(const struct cpuset *cs) | ||
97 | { | ||
98 | return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags); | ||
99 | } | ||
100 | |||
101 | static inline int is_removed(const struct cpuset *cs) | ||
102 | { | ||
103 | return !!test_bit(CS_REMOVED, &cs->flags); | ||
104 | } | ||
105 | |||
106 | static inline int notify_on_release(const struct cpuset *cs) | ||
107 | { | ||
108 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * Increment this atomic integer every time any cpuset changes its | ||
113 | * mems_allowed value. Users of cpusets can track this generation | ||
114 | * number, and avoid having to lock and reload mems_allowed unless | ||
115 | * the cpuset they're using changes generation. | ||
116 | * | ||
117 | * A single, global generation is needed because attach_task() could | ||
118 | * reattach a task to a different cpuset, which must not have its | ||
119 | * generation numbers aliased with those of that task's previous cpuset. | ||
120 | * | ||
121 | * Generations are needed for mems_allowed because one task cannot | ||
122 | * modify another's memory placement. So we must enable every task, | ||
123 | * on every visit to __alloc_pages(), to efficiently check whether | ||
124 | * its current->cpuset->mems_allowed has changed, requiring an update | ||
125 | * of its current->mems_allowed. | ||
126 | */ | ||
127 | static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); | ||
128 | |||
129 | static struct cpuset top_cpuset = { | ||
130 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), | ||
131 | .cpus_allowed = CPU_MASK_ALL, | ||
132 | .mems_allowed = NODE_MASK_ALL, | ||
133 | .count = ATOMIC_INIT(0), | ||
134 | .sibling = LIST_HEAD_INIT(top_cpuset.sibling), | ||
135 | .children = LIST_HEAD_INIT(top_cpuset.children), | ||
136 | .parent = NULL, | ||
137 | .dentry = NULL, | ||
138 | .mems_generation = 0, | ||
139 | }; | ||
140 | |||
141 | static struct vfsmount *cpuset_mount; | ||
142 | static struct super_block *cpuset_sb = NULL; | ||
143 | |||
144 | /* | ||
145 | * cpuset_sem should be held by anyone who is depending on the children | ||
146 | * or sibling lists of any cpuset, or performing non-atomic operations | ||
147 | * on the flags or *_allowed values of a cpuset, such as raising the | ||
148 | * CS_REMOVED flag bit iff it is not already raised, or reading and | ||
149 | * conditionally modifying the *_allowed values. One kernel global | ||
150 | * cpuset semaphore should be sufficient - these things don't change | ||
151 | * that much. | ||
152 | * | ||
153 | * The code that modifies cpusets holds cpuset_sem across the entire | ||
154 | * operation, from cpuset_common_file_write() down, single threading | ||
155 | * all cpuset modifications (except for counter manipulations from | ||
156 | * fork and exit) across the system. This presumes that cpuset | ||
157 | * modifications are rare - better kept simple and safe, even if slow. | ||
158 | * | ||
159 | * The code that reads cpusets, such as in cpuset_common_file_read() | ||
160 | * and below, only holds cpuset_sem across small pieces of code, such | ||
161 | * as when reading out possibly multi-word cpumasks and nodemasks, as | ||
162 | * the risks are less, and the desire for performance a little greater. | ||
163 | * The proc_cpuset_show() routine needs to hold cpuset_sem to ensure | ||
164 | * that no cs->dentry is NULL, as it walks up the cpuset tree to root. | ||
165 | * | ||
166 | * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't | ||
167 | * (usually) grab cpuset_sem. These are the two most performance | ||
168 | * critical pieces of code here. The exception occurs on exit(), | ||
169 | * if the last task using a cpuset exits, and the cpuset was marked | ||
170 | * notify_on_release. In that case, the cpuset_sem is taken, the | ||
171 | * path to the released cpuset calculated, and a usermode call made | ||
172 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | ||
173 | * relative to the root of cpuset file system) as the argument. | ||
174 | * | ||
175 | * A cpuset can only be deleted if both its 'count' of using tasks is | ||
176 | * zero, and its list of 'children' cpusets is empty. Since all tasks | ||
177 | * in the system use _some_ cpuset, and since there is always at least | ||
178 | * one task in the system (init, pid == 1), therefore, top_cpuset | ||
179 | * always has either children cpusets and/or using tasks. So no need | ||
180 | * for any special hack to ensure that top_cpuset cannot be deleted. | ||
181 | */ | ||
182 | |||
183 | static DECLARE_MUTEX(cpuset_sem); | ||
184 | |||
185 | /* | ||
186 | * A couple of forward declarations required, due to cyclic reference loop: | ||
187 | * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file | ||
188 | * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir. | ||
189 | */ | ||
190 | |||
191 | static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode); | ||
192 | static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry); | ||
193 | |||
194 | static struct backing_dev_info cpuset_backing_dev_info = { | ||
195 | .ra_pages = 0, /* No readahead */ | ||
196 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, | ||
197 | }; | ||
198 | |||
199 | static struct inode *cpuset_new_inode(mode_t mode) | ||
200 | { | ||
201 | struct inode *inode = new_inode(cpuset_sb); | ||
202 | |||
203 | if (inode) { | ||
204 | inode->i_mode = mode; | ||
205 | inode->i_uid = current->fsuid; | ||
206 | inode->i_gid = current->fsgid; | ||
207 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
208 | inode->i_blocks = 0; | ||
209 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
210 | inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; | ||
211 | } | ||
212 | return inode; | ||
213 | } | ||
214 | |||
215 | static void cpuset_diput(struct dentry *dentry, struct inode *inode) | ||
216 | { | ||
217 | /* is dentry a directory ? if so, kfree() associated cpuset */ | ||
218 | if (S_ISDIR(inode->i_mode)) { | ||
219 | struct cpuset *cs = dentry->d_fsdata; | ||
220 | BUG_ON(!(is_removed(cs))); | ||
221 | kfree(cs); | ||
222 | } | ||
223 | iput(inode); | ||
224 | } | ||
225 | |||
226 | static struct dentry_operations cpuset_dops = { | ||
227 | .d_iput = cpuset_diput, | ||
228 | }; | ||
229 | |||
230 | static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) | ||
231 | { | ||
232 | struct qstr qstr; | ||
233 | struct dentry *d; | ||
234 | |||
235 | qstr.name = name; | ||
236 | qstr.len = strlen(name); | ||
237 | qstr.hash = full_name_hash(name, qstr.len); | ||
238 | d = lookup_hash(&qstr, parent); | ||
239 | if (!IS_ERR(d)) | ||
240 | d->d_op = &cpuset_dops; | ||
241 | return d; | ||
242 | } | ||
243 | |||
244 | static void remove_dir(struct dentry *d) | ||
245 | { | ||
246 | struct dentry *parent = dget(d->d_parent); | ||
247 | |||
248 | d_delete(d); | ||
249 | simple_rmdir(parent->d_inode, d); | ||
250 | dput(parent); | ||
251 | } | ||
252 | |||
253 | /* | ||
254 | * NOTE : the dentry must have been dget()'ed | ||
255 | */ | ||
256 | static void cpuset_d_remove_dir(struct dentry *dentry) | ||
257 | { | ||
258 | struct list_head *node; | ||
259 | |||
260 | spin_lock(&dcache_lock); | ||
261 | node = dentry->d_subdirs.next; | ||
262 | while (node != &dentry->d_subdirs) { | ||
263 | struct dentry *d = list_entry(node, struct dentry, d_child); | ||
264 | list_del_init(node); | ||
265 | if (d->d_inode) { | ||
266 | d = dget_locked(d); | ||
267 | spin_unlock(&dcache_lock); | ||
268 | d_delete(d); | ||
269 | simple_unlink(dentry->d_inode, d); | ||
270 | dput(d); | ||
271 | spin_lock(&dcache_lock); | ||
272 | } | ||
273 | node = dentry->d_subdirs.next; | ||
274 | } | ||
275 | list_del_init(&dentry->d_child); | ||
276 | spin_unlock(&dcache_lock); | ||
277 | remove_dir(dentry); | ||
278 | } | ||
279 | |||
280 | static struct super_operations cpuset_ops = { | ||
281 | .statfs = simple_statfs, | ||
282 | .drop_inode = generic_delete_inode, | ||
283 | }; | ||
284 | |||
285 | static int cpuset_fill_super(struct super_block *sb, void *unused_data, | ||
286 | int unused_silent) | ||
287 | { | ||
288 | struct inode *inode; | ||
289 | struct dentry *root; | ||
290 | |||
291 | sb->s_blocksize = PAGE_CACHE_SIZE; | ||
292 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | ||
293 | sb->s_magic = CPUSET_SUPER_MAGIC; | ||
294 | sb->s_op = &cpuset_ops; | ||
295 | cpuset_sb = sb; | ||
296 | |||
297 | inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR); | ||
298 | if (inode) { | ||
299 | inode->i_op = &simple_dir_inode_operations; | ||
300 | inode->i_fop = &simple_dir_operations; | ||
301 | /* directories start off with i_nlink == 2 (for "." entry) */ | ||
302 | inode->i_nlink++; | ||
303 | } else { | ||
304 | return -ENOMEM; | ||
305 | } | ||
306 | |||
307 | root = d_alloc_root(inode); | ||
308 | if (!root) { | ||
309 | iput(inode); | ||
310 | return -ENOMEM; | ||
311 | } | ||
312 | sb->s_root = root; | ||
313 | return 0; | ||
314 | } | ||
315 | |||
316 | static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, | ||
317 | int flags, const char *unused_dev_name, | ||
318 | void *data) | ||
319 | { | ||
320 | return get_sb_single(fs_type, flags, data, cpuset_fill_super); | ||
321 | } | ||
322 | |||
323 | static struct file_system_type cpuset_fs_type = { | ||
324 | .name = "cpuset", | ||
325 | .get_sb = cpuset_get_sb, | ||
326 | .kill_sb = kill_litter_super, | ||
327 | }; | ||
328 | |||
329 | /* struct cftype: | ||
330 | * | ||
331 | * The files in the cpuset filesystem mostly have a very simple read/write | ||
332 | * handling, some common function will take care of it. Nevertheless some cases | ||
333 | * (read tasks) are special and therefore I define this structure for every | ||
334 | * kind of file. | ||
335 | * | ||
336 | * | ||
337 | * When reading/writing to a file: | ||
338 | * - the cpuset to use is file->f_dentry->d_parent->d_fsdata | ||
339 | * - the 'cftype' of the file is file->f_dentry->d_fsdata | ||
340 | */ | ||
341 | |||
342 | struct cftype { | ||
343 | char *name; | ||
344 | int private; | ||
345 | int (*open) (struct inode *inode, struct file *file); | ||
346 | ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes, | ||
347 | loff_t *ppos); | ||
348 | int (*write) (struct file *file, const char __user *buf, size_t nbytes, | ||
349 | loff_t *ppos); | ||
350 | int (*release) (struct inode *inode, struct file *file); | ||
351 | }; | ||
352 | |||
353 | static inline struct cpuset *__d_cs(struct dentry *dentry) | ||
354 | { | ||
355 | return dentry->d_fsdata; | ||
356 | } | ||
357 | |||
358 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
359 | { | ||
360 | return dentry->d_fsdata; | ||
361 | } | ||
362 | |||
363 | /* | ||
364 | * Call with cpuset_sem held. Writes path of cpuset into buf. | ||
365 | * Returns 0 on success, -errno on error. | ||
366 | */ | ||
367 | |||
368 | static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | ||
369 | { | ||
370 | char *start; | ||
371 | |||
372 | start = buf + buflen; | ||
373 | |||
374 | *--start = '\0'; | ||
375 | for (;;) { | ||
376 | int len = cs->dentry->d_name.len; | ||
377 | if ((start -= len) < buf) | ||
378 | return -ENAMETOOLONG; | ||
379 | memcpy(start, cs->dentry->d_name.name, len); | ||
380 | cs = cs->parent; | ||
381 | if (!cs) | ||
382 | break; | ||
383 | if (!cs->parent) | ||
384 | continue; | ||
385 | if (--start < buf) | ||
386 | return -ENAMETOOLONG; | ||
387 | *start = '/'; | ||
388 | } | ||
389 | memmove(buf, start, buf + buflen - start); | ||
390 | return 0; | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * Notify userspace when a cpuset is released, by running | ||
395 | * /sbin/cpuset_release_agent with the name of the cpuset (path | ||
396 | * relative to the root of cpuset file system) as the argument. | ||
397 | * | ||
398 | * Most likely, this user command will try to rmdir this cpuset. | ||
399 | * | ||
400 | * This races with the possibility that some other task will be | ||
401 | * attached to this cpuset before it is removed, or that some other | ||
402 | * user task will 'mkdir' a child cpuset of this cpuset. That's ok. | ||
403 | * The presumed 'rmdir' will fail quietly if this cpuset is no longer | ||
404 | * unused, and this cpuset will be reprieved from its death sentence, | ||
405 | * to continue to serve a useful existence. Next time it's released, | ||
406 | * we will get notified again, if it still has 'notify_on_release' set. | ||
407 | * | ||
408 | * Note final arg to call_usermodehelper() is 0 - that means | ||
409 | * don't wait. Since we are holding the global cpuset_sem here, | ||
410 | * and we are asking another thread (started from keventd) to rmdir a | ||
411 | * cpuset, we can't wait - or we'd deadlock with the removing thread | ||
412 | * on cpuset_sem. | ||
413 | */ | ||
414 | |||
415 | static int cpuset_release_agent(char *cpuset_str) | ||
416 | { | ||
417 | char *argv[3], *envp[3]; | ||
418 | int i; | ||
419 | |||
420 | i = 0; | ||
421 | argv[i++] = "/sbin/cpuset_release_agent"; | ||
422 | argv[i++] = cpuset_str; | ||
423 | argv[i] = NULL; | ||
424 | |||
425 | i = 0; | ||
426 | /* minimal command environment */ | ||
427 | envp[i++] = "HOME=/"; | ||
428 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
429 | envp[i] = NULL; | ||
430 | |||
431 | return call_usermodehelper(argv[0], argv, envp, 0); | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Either cs->count of using tasks transitioned to zero, or the | ||
436 | * cs->children list of child cpusets just became empty. If this | ||
437 | * cs is notify_on_release() and now both the user count is zero and | ||
438 | * the list of children is empty, send notice to user land. | ||
439 | */ | ||
440 | |||
441 | static void check_for_release(struct cpuset *cs) | ||
442 | { | ||
443 | if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && | ||
444 | list_empty(&cs->children)) { | ||
445 | char *buf; | ||
446 | |||
447 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
448 | if (!buf) | ||
449 | return; | ||
450 | if (cpuset_path(cs, buf, PAGE_SIZE) < 0) | ||
451 | goto out; | ||
452 | cpuset_release_agent(buf); | ||
453 | out: | ||
454 | kfree(buf); | ||
455 | } | ||
456 | } | ||
457 | |||
458 | /* | ||
459 | * Return in *pmask the portion of a cpuset's cpus_allowed that | ||
460 | * are online. If none are online, walk up the cpuset hierarchy | ||
461 | * until we find one that does have some online cpus. If we get | ||
462 | * all the way to the top and still haven't found any online cpus, | ||
463 | * return cpu_online_map. Or if passed a NULL cs from an exit'ing | ||
464 | * task, return cpu_online_map. | ||
465 | * | ||
466 | * One way or another, we guarantee to return some non-empty subset | ||
467 | * of cpu_online_map. | ||
468 | * | ||
469 | * Call with cpuset_sem held. | ||
470 | */ | ||
471 | |||
472 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | ||
473 | { | ||
474 | while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) | ||
475 | cs = cs->parent; | ||
476 | if (cs) | ||
477 | cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); | ||
478 | else | ||
479 | *pmask = cpu_online_map; | ||
480 | BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * Return in *pmask the portion of a cpuset's mems_allowed that | ||
485 | * are online. If none are online, walk up the cpuset hierarchy | ||
486 | * until we find one that does have some online mems. If we get | ||
487 | * all the way to the top and still haven't found any online mems, | ||
488 | * return node_online_map. | ||
489 | * | ||
490 | * One way or another, we guarantee to return some non-empty subset | ||
491 | * of node_online_map. | ||
492 | * | ||
493 | * Call with cpuset_sem held. | ||
494 | */ | ||
495 | |||
496 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | ||
497 | { | ||
498 | while (cs && !nodes_intersects(cs->mems_allowed, node_online_map)) | ||
499 | cs = cs->parent; | ||
500 | if (cs) | ||
501 | nodes_and(*pmask, cs->mems_allowed, node_online_map); | ||
502 | else | ||
503 | *pmask = node_online_map; | ||
504 | BUG_ON(!nodes_intersects(*pmask, node_online_map)); | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * Refresh the current task's mems_allowed and mems_generation from | ||
509 | * the current task's cpuset. Call with cpuset_sem held. | ||
510 | * | ||
511 | * Be sure to call refresh_mems() on any cpuset operation which | ||
512 | * (1) holds cpuset_sem, and (2) might possibly alloc memory. | ||
513 | * Call after obtaining cpuset_sem lock, before any possible | ||
514 | * allocation. Otherwise one risks trying to allocate memory | ||
515 | * while the task cpuset_mems_generation is not the same as | ||
516 | * the mems_generation in its cpuset, which would deadlock on | ||
517 | * cpuset_sem in cpuset_update_current_mems_allowed(). | ||
518 | * | ||
519 | * Since we hold cpuset_sem, once refresh_mems() is called, the | ||
520 | * test (current->cpuset_mems_generation != cs->mems_generation) | ||
521 | * in cpuset_update_current_mems_allowed() will remain false, | ||
522 | * until we drop cpuset_sem. Anyone else who would change our | ||
523 | * cpuset's mems_generation needs to lock cpuset_sem first. | ||
524 | */ | ||
525 | |||
526 | static void refresh_mems(void) | ||
527 | { | ||
528 | struct cpuset *cs = current->cpuset; | ||
529 | |||
530 | if (current->cpuset_mems_generation != cs->mems_generation) { | ||
531 | guarantee_online_mems(cs, ¤t->mems_allowed); | ||
532 | current->cpuset_mems_generation = cs->mems_generation; | ||
533 | } | ||
534 | } | ||
535 | |||
536 | /* | ||
537 | * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? | ||
538 | * | ||
539 | * One cpuset is a subset of another if all its allowed CPUs and | ||
540 | * Memory Nodes are a subset of the other, and its exclusive flags | ||
541 | * are only set if the other's are set. | ||
542 | */ | ||
543 | |||
544 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | ||
545 | { | ||
546 | return cpus_subset(p->cpus_allowed, q->cpus_allowed) && | ||
547 | nodes_subset(p->mems_allowed, q->mems_allowed) && | ||
548 | is_cpu_exclusive(p) <= is_cpu_exclusive(q) && | ||
549 | is_mem_exclusive(p) <= is_mem_exclusive(q); | ||
550 | } | ||
551 | |||
552 | /* | ||
553 | * validate_change() - Used to validate that any proposed cpuset change | ||
554 | * follows the structural rules for cpusets. | ||
555 | * | ||
556 | * If we replaced the flag and mask values of the current cpuset | ||
557 | * (cur) with those values in the trial cpuset (trial), would | ||
558 | * our various subset and exclusive rules still be valid? Presumes | ||
559 | * cpuset_sem held. | ||
560 | * | ||
561 | * 'cur' is the address of an actual, in-use cpuset. Operations | ||
562 | * such as list traversal that depend on the actual address of the | ||
563 | * cpuset in the list must use cur below, not trial. | ||
564 | * | ||
565 | * 'trial' is the address of bulk structure copy of cur, with | ||
566 | * perhaps one or more of the fields cpus_allowed, mems_allowed, | ||
567 | * or flags changed to new, trial values. | ||
568 | * | ||
569 | * Return 0 if valid, -errno if not. | ||
570 | */ | ||
571 | |||
572 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | ||
573 | { | ||
574 | struct cpuset *c, *par; | ||
575 | |||
576 | /* Each of our child cpusets must be a subset of us */ | ||
577 | list_for_each_entry(c, &cur->children, sibling) { | ||
578 | if (!is_cpuset_subset(c, trial)) | ||
579 | return -EBUSY; | ||
580 | } | ||
581 | |||
582 | /* Remaining checks don't apply to root cpuset */ | ||
583 | if ((par = cur->parent) == NULL) | ||
584 | return 0; | ||
585 | |||
586 | /* We must be a subset of our parent cpuset */ | ||
587 | if (!is_cpuset_subset(trial, par)) | ||
588 | return -EACCES; | ||
589 | |||
590 | /* If either I or some sibling (!= me) is exclusive, we can't overlap */ | ||
591 | list_for_each_entry(c, &par->children, sibling) { | ||
592 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | ||
593 | c != cur && | ||
594 | cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) | ||
595 | return -EINVAL; | ||
596 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && | ||
597 | c != cur && | ||
598 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) | ||
599 | return -EINVAL; | ||
600 | } | ||
601 | |||
602 | return 0; | ||
603 | } | ||
604 | |||
605 | static int update_cpumask(struct cpuset *cs, char *buf) | ||
606 | { | ||
607 | struct cpuset trialcs; | ||
608 | int retval; | ||
609 | |||
610 | trialcs = *cs; | ||
611 | retval = cpulist_parse(buf, trialcs.cpus_allowed); | ||
612 | if (retval < 0) | ||
613 | return retval; | ||
614 | cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); | ||
615 | if (cpus_empty(trialcs.cpus_allowed)) | ||
616 | return -ENOSPC; | ||
617 | retval = validate_change(cs, &trialcs); | ||
618 | if (retval == 0) | ||
619 | cs->cpus_allowed = trialcs.cpus_allowed; | ||
620 | return retval; | ||
621 | } | ||
622 | |||
623 | static int update_nodemask(struct cpuset *cs, char *buf) | ||
624 | { | ||
625 | struct cpuset trialcs; | ||
626 | int retval; | ||
627 | |||
628 | trialcs = *cs; | ||
629 | retval = nodelist_parse(buf, trialcs.mems_allowed); | ||
630 | if (retval < 0) | ||
631 | return retval; | ||
632 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); | ||
633 | if (nodes_empty(trialcs.mems_allowed)) | ||
634 | return -ENOSPC; | ||
635 | retval = validate_change(cs, &trialcs); | ||
636 | if (retval == 0) { | ||
637 | cs->mems_allowed = trialcs.mems_allowed; | ||
638 | atomic_inc(&cpuset_mems_generation); | ||
639 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | ||
640 | } | ||
641 | return retval; | ||
642 | } | ||
643 | |||
644 | /* | ||
645 | * update_flag - read a 0 or a 1 in a file and update associated flag | ||
646 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | ||
647 | * CS_NOTIFY_ON_RELEASE) | ||
648 | * cs: the cpuset to update | ||
649 | * buf: the buffer where we read the 0 or 1 | ||
650 | */ | ||
651 | |||
652 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | ||
653 | { | ||
654 | int turning_on; | ||
655 | struct cpuset trialcs; | ||
656 | int err; | ||
657 | |||
658 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); | ||
659 | |||
660 | trialcs = *cs; | ||
661 | if (turning_on) | ||
662 | set_bit(bit, &trialcs.flags); | ||
663 | else | ||
664 | clear_bit(bit, &trialcs.flags); | ||
665 | |||
666 | err = validate_change(cs, &trialcs); | ||
667 | if (err == 0) { | ||
668 | if (turning_on) | ||
669 | set_bit(bit, &cs->flags); | ||
670 | else | ||
671 | clear_bit(bit, &cs->flags); | ||
672 | } | ||
673 | return err; | ||
674 | } | ||
675 | |||
676 | static int attach_task(struct cpuset *cs, char *buf) | ||
677 | { | ||
678 | pid_t pid; | ||
679 | struct task_struct *tsk; | ||
680 | struct cpuset *oldcs; | ||
681 | cpumask_t cpus; | ||
682 | |||
683 | if (sscanf(buf, "%d", &pid) != 1) | ||
684 | return -EIO; | ||
685 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
686 | return -ENOSPC; | ||
687 | |||
688 | if (pid) { | ||
689 | read_lock(&tasklist_lock); | ||
690 | |||
691 | tsk = find_task_by_pid(pid); | ||
692 | if (!tsk) { | ||
693 | read_unlock(&tasklist_lock); | ||
694 | return -ESRCH; | ||
695 | } | ||
696 | |||
697 | get_task_struct(tsk); | ||
698 | read_unlock(&tasklist_lock); | ||
699 | |||
700 | if ((current->euid) && (current->euid != tsk->uid) | ||
701 | && (current->euid != tsk->suid)) { | ||
702 | put_task_struct(tsk); | ||
703 | return -EACCES; | ||
704 | } | ||
705 | } else { | ||
706 | tsk = current; | ||
707 | get_task_struct(tsk); | ||
708 | } | ||
709 | |||
710 | task_lock(tsk); | ||
711 | oldcs = tsk->cpuset; | ||
712 | if (!oldcs) { | ||
713 | task_unlock(tsk); | ||
714 | put_task_struct(tsk); | ||
715 | return -ESRCH; | ||
716 | } | ||
717 | atomic_inc(&cs->count); | ||
718 | tsk->cpuset = cs; | ||
719 | task_unlock(tsk); | ||
720 | |||
721 | guarantee_online_cpus(cs, &cpus); | ||
722 | set_cpus_allowed(tsk, cpus); | ||
723 | |||
724 | put_task_struct(tsk); | ||
725 | if (atomic_dec_and_test(&oldcs->count)) | ||
726 | check_for_release(oldcs); | ||
727 | return 0; | ||
728 | } | ||
729 | |||
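attach_task() is what runs when a pid is written to a cpuset's tasks file: the task's cpuset pointer is switched under task_lock() and its allowed CPUs are narrowed to the new cpuset. A userspace sketch of driving this through the cpuset filesystem (assumes it is mounted at /dev/cpuset and uses the conventional control-file names cpus, mems and tasks; error handling kept minimal):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");
            int ok;

            if (!f)
                    return -1;
            ok = (fputs(val, f) >= 0);
            if (fclose(f) != 0)
                    ok = 0;
            return ok ? 0 : -1;
    }

    int main(void)
    {
            char pid[32];

            if (mkdir("/dev/cpuset/demo", 0755) != 0 && errno != EEXIST) {
                    perror("mkdir");
                    return 1;
            }
            write_str("/dev/cpuset/demo/cpus", "0-1");      /* CPUs this set may use */
            write_str("/dev/cpuset/demo/mems", "0");        /* memory node 0 */

            snprintf(pid, sizeof(pid), "%d", getpid());
            return write_str("/dev/cpuset/demo/tasks", pid) ? 1 : 0;  /* attach self */
    }
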
730 | /* The various types of files and directories in a cpuset file system */ | ||
731 | |||
732 | typedef enum { | ||
733 | FILE_ROOT, | ||
734 | FILE_DIR, | ||
735 | FILE_CPULIST, | ||
736 | FILE_MEMLIST, | ||
737 | FILE_CPU_EXCLUSIVE, | ||
738 | FILE_MEM_EXCLUSIVE, | ||
739 | FILE_NOTIFY_ON_RELEASE, | ||
740 | FILE_TASKLIST, | ||
741 | } cpuset_filetype_t; | ||
742 | |||
743 | static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, | ||
744 | size_t nbytes, loff_t *unused_ppos) | ||
745 | { | ||
746 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | ||
747 | struct cftype *cft = __d_cft(file->f_dentry); | ||
748 | cpuset_filetype_t type = cft->private; | ||
749 | char *buffer; | ||
750 | int retval = 0; | ||
751 | |||
752 | /* Crude upper limit on largest legitimate cpulist user might write. */ | ||
753 | if (nbytes > 100 + 6 * NR_CPUS) | ||
754 | return -E2BIG; | ||
755 | |||
756 | /* +1 for nul-terminator */ | ||
757 | if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) | ||
758 | return -ENOMEM; | ||
759 | |||
760 | if (copy_from_user(buffer, userbuf, nbytes)) { | ||
761 | retval = -EFAULT; | ||
762 | goto out1; | ||
763 | } | ||
764 | buffer[nbytes] = 0; /* nul-terminate */ | ||
765 | |||
766 | down(&cpuset_sem); | ||
767 | |||
768 | if (is_removed(cs)) { | ||
769 | retval = -ENODEV; | ||
770 | goto out2; | ||
771 | } | ||
772 | |||
773 | switch (type) { | ||
774 | case FILE_CPULIST: | ||
775 | retval = update_cpumask(cs, buffer); | ||
776 | break; | ||
777 | case FILE_MEMLIST: | ||
778 | retval = update_nodemask(cs, buffer); | ||
779 | break; | ||
780 | case FILE_CPU_EXCLUSIVE: | ||
781 | retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); | ||
782 | break; | ||
783 | case FILE_MEM_EXCLUSIVE: | ||
784 | retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); | ||
785 | break; | ||
786 | case FILE_NOTIFY_ON_RELEASE: | ||
787 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); | ||
788 | break; | ||
789 | case FILE_TASKLIST: | ||
790 | retval = attach_task(cs, buffer); | ||
791 | break; | ||
792 | default: | ||
793 | retval = -EINVAL; | ||
794 | goto out2; | ||
795 | } | ||
796 | |||
797 | if (retval == 0) | ||
798 | retval = nbytes; | ||
799 | out2: | ||
800 | up(&cpuset_sem); | ||
801 | out1: | ||
802 | kfree(buffer); | ||
803 | return retval; | ||
804 | } | ||
805 | |||
806 | static ssize_t cpuset_file_write(struct file *file, const char __user *buf, | ||
807 | size_t nbytes, loff_t *ppos) | ||
808 | { | ||
809 | ssize_t retval = 0; | ||
810 | struct cftype *cft = __d_cft(file->f_dentry); | ||
811 | if (!cft) | ||
812 | return -ENODEV; | ||
813 | |||
814 | /* special function ? */ | ||
815 | if (cft->write) | ||
816 | retval = cft->write(file, buf, nbytes, ppos); | ||
817 | else | ||
818 | retval = cpuset_common_file_write(file, buf, nbytes, ppos); | ||
819 | |||
820 | return retval; | ||
821 | } | ||
822 | |||
823 | /* | ||
824 | * These ascii lists should be read in a single call, by using a user | ||
825 | * buffer large enough to hold the entire map. If read in smaller | ||
826 | * chunks, there is no guarantee of atomicity. Since the display format | ||
827 | * used, list of ranges of sequential numbers, is variable length, | ||
828 | * and since these maps can change value dynamically, one could read | ||
829 | * gibberish by doing partial reads while a list was changing. | ||
830 | * A single large read to a buffer that crosses a page boundary is | ||
831 | * ok, because the result being copied to user land is not recomputed | ||
832 | * across a page fault. | ||
833 | */ | ||
834 | |||
835 | static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | ||
836 | { | ||
837 | cpumask_t mask; | ||
838 | |||
839 | down(&cpuset_sem); | ||
840 | mask = cs->cpus_allowed; | ||
841 | up(&cpuset_sem); | ||
842 | |||
843 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | ||
844 | } | ||
845 | |||
846 | static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | ||
847 | { | ||
848 | nodemask_t mask; | ||
849 | |||
850 | down(&cpuset_sem); | ||
851 | mask = cs->mems_allowed; | ||
852 | up(&cpuset_sem); | ||
853 | |||
854 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | ||
855 | } | ||
856 | |||
857 | static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | ||
858 | size_t nbytes, loff_t *ppos) | ||
859 | { | ||
860 | struct cftype *cft = __d_cft(file->f_dentry); | ||
861 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | ||
862 | cpuset_filetype_t type = cft->private; | ||
863 | char *page; | ||
864 | ssize_t retval = 0; | ||
865 | char *s; | ||
866 | char *start; | ||
867 | size_t n; | ||
868 | |||
869 | if (!(page = (char *)__get_free_page(GFP_KERNEL))) | ||
870 | return -ENOMEM; | ||
871 | |||
872 | s = page; | ||
873 | |||
874 | switch (type) { | ||
875 | case FILE_CPULIST: | ||
876 | s += cpuset_sprintf_cpulist(s, cs); | ||
877 | break; | ||
878 | case FILE_MEMLIST: | ||
879 | s += cpuset_sprintf_memlist(s, cs); | ||
880 | break; | ||
881 | case FILE_CPU_EXCLUSIVE: | ||
882 | *s++ = is_cpu_exclusive(cs) ? '1' : '0'; | ||
883 | break; | ||
884 | case FILE_MEM_EXCLUSIVE: | ||
885 | *s++ = is_mem_exclusive(cs) ? '1' : '0'; | ||
886 | break; | ||
887 | case FILE_NOTIFY_ON_RELEASE: | ||
888 | *s++ = notify_on_release(cs) ? '1' : '0'; | ||
889 | break; | ||
890 | default: | ||
891 | retval = -EINVAL; | ||
892 | goto out; | ||
893 | } | ||
894 | *s++ = '\n'; | ||
895 | *s = '\0'; | ||
896 | |||
897 | start = page + *ppos; | ||
898 | n = s - start; | ||
899 | retval = n - copy_to_user(buf, start, min(n, nbytes)); | ||
900 | *ppos += retval; | ||
901 | out: | ||
902 | free_page((unsigned long)page); | ||
903 | return retval; | ||
904 | } | ||
905 | |||
906 | static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes, | ||
907 | loff_t *ppos) | ||
908 | { | ||
909 | ssize_t retval = 0; | ||
910 | struct cftype *cft = __d_cft(file->f_dentry); | ||
911 | if (!cft) | ||
912 | return -ENODEV; | ||
913 | |||
914 | /* special function ? */ | ||
915 | if (cft->read) | ||
916 | retval = cft->read(file, buf, nbytes, ppos); | ||
917 | else | ||
918 | retval = cpuset_common_file_read(file, buf, nbytes, ppos); | ||
919 | |||
920 | return retval; | ||
921 | } | ||
922 | |||
923 | static int cpuset_file_open(struct inode *inode, struct file *file) | ||
924 | { | ||
925 | int err; | ||
926 | struct cftype *cft; | ||
927 | |||
928 | err = generic_file_open(inode, file); | ||
929 | if (err) | ||
930 | return err; | ||
931 | |||
932 | cft = __d_cft(file->f_dentry); | ||
933 | if (!cft) | ||
934 | return -ENODEV; | ||
935 | if (cft->open) | ||
936 | err = cft->open(inode, file); | ||
937 | else | ||
938 | err = 0; | ||
939 | |||
940 | return err; | ||
941 | } | ||
942 | |||
943 | static int cpuset_file_release(struct inode *inode, struct file *file) | ||
944 | { | ||
945 | struct cftype *cft = __d_cft(file->f_dentry); | ||
946 | if (cft->release) | ||
947 | return cft->release(inode, file); | ||
948 | return 0; | ||
949 | } | ||
950 | |||
951 | static struct file_operations cpuset_file_operations = { | ||
952 | .read = cpuset_file_read, | ||
953 | .write = cpuset_file_write, | ||
954 | .llseek = generic_file_llseek, | ||
955 | .open = cpuset_file_open, | ||
956 | .release = cpuset_file_release, | ||
957 | }; | ||
958 | |||
959 | static struct inode_operations cpuset_dir_inode_operations = { | ||
960 | .lookup = simple_lookup, | ||
961 | .mkdir = cpuset_mkdir, | ||
962 | .rmdir = cpuset_rmdir, | ||
963 | }; | ||
964 | |||
965 | static int cpuset_create_file(struct dentry *dentry, int mode) | ||
966 | { | ||
967 | struct inode *inode; | ||
968 | |||
969 | if (!dentry) | ||
970 | return -ENOENT; | ||
971 | if (dentry->d_inode) | ||
972 | return -EEXIST; | ||
973 | |||
974 | inode = cpuset_new_inode(mode); | ||
975 | if (!inode) | ||
976 | return -ENOMEM; | ||
977 | |||
978 | if (S_ISDIR(mode)) { | ||
979 | inode->i_op = &cpuset_dir_inode_operations; | ||
980 | inode->i_fop = &simple_dir_operations; | ||
981 | |||
982 | /* start off with i_nlink == 2 (for "." entry) */ | ||
983 | inode->i_nlink++; | ||
984 | } else if (S_ISREG(mode)) { | ||
985 | inode->i_size = 0; | ||
986 | inode->i_fop = &cpuset_file_operations; | ||
987 | } | ||
988 | |||
989 | d_instantiate(dentry, inode); | ||
990 | dget(dentry); /* Extra count - pin the dentry in core */ | ||
991 | return 0; | ||
992 | } | ||
993 | |||
994 | /* | ||
995 | * cpuset_create_dir - create a directory for an object. | ||
996 | * cs: the cpuset we create the directory for. | ||
997 | * It must have a valid ->parent field, | ||
998 | * and we are going to fill in its ->dentry field. | ||
999 | * name: The name to give to the cpuset directory. Will be copied. | ||
1000 | * mode: mode to set on new directory. | ||
1001 | */ | ||
1002 | |||
1003 | static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode) | ||
1004 | { | ||
1005 | struct dentry *dentry = NULL; | ||
1006 | struct dentry *parent; | ||
1007 | int error = 0; | ||
1008 | |||
1009 | parent = cs->parent->dentry; | ||
1010 | dentry = cpuset_get_dentry(parent, name); | ||
1011 | if (IS_ERR(dentry)) | ||
1012 | return PTR_ERR(dentry); | ||
1013 | error = cpuset_create_file(dentry, S_IFDIR | mode); | ||
1014 | if (!error) { | ||
1015 | dentry->d_fsdata = cs; | ||
1016 | parent->d_inode->i_nlink++; | ||
1017 | cs->dentry = dentry; | ||
1018 | } | ||
1019 | dput(dentry); | ||
1020 | |||
1021 | return error; | ||
1022 | } | ||
1023 | |||
1024 | static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) | ||
1025 | { | ||
1026 | struct dentry *dentry; | ||
1027 | int error; | ||
1028 | |||
1029 | down(&dir->d_inode->i_sem); | ||
1030 | dentry = cpuset_get_dentry(dir, cft->name); | ||
1031 | if (!IS_ERR(dentry)) { | ||
1032 | error = cpuset_create_file(dentry, 0644 | S_IFREG); | ||
1033 | if (!error) | ||
1034 | dentry->d_fsdata = (void *)cft; | ||
1035 | dput(dentry); | ||
1036 | } else | ||
1037 | error = PTR_ERR(dentry); | ||
1038 | up(&dir->d_inode->i_sem); | ||
1039 | return error; | ||
1040 | } | ||
1041 | |||
1042 | /* | ||
1043 | * Stuff for reading the 'tasks' file. | ||
1044 | * | ||
1045 | * Reading this file can return large amounts of data if a cpuset has | ||
1046 | * *lots* of attached tasks. So it may need several calls to read(), | ||
1047 | * but we cannot guarantee that the information we produce is correct | ||
1048 | * unless we produce it entirely atomically. | ||
1049 | * | ||
1050 | * Upon tasks file open(), a struct ctr_struct is allocated, that | ||
1051 | * will have a pointer to an array (also allocated here). The struct | ||
1052 | * ctr_struct * is stored in file->private_data. Its resources will | ||
1053 | * be freed by release() when the file is closed. The array is used | ||
1054 | * to sprintf the PIDs and then used by read(). | ||
1055 | */ | ||
1056 | |||
1057 | /* cpusets_tasks_read array */ | ||
1058 | |||
1059 | struct ctr_struct { | ||
1060 | char *buf; | ||
1061 | int bufsz; | ||
1062 | }; | ||
1063 | |||
1064 | /* | ||
1065 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. | ||
1066 | * Return actual number of pids loaded. | ||
1067 | */ | ||
1068 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | ||
1069 | { | ||
1070 | int n = 0; | ||
1071 | struct task_struct *g, *p; | ||
1072 | |||
1073 | read_lock(&tasklist_lock); | ||
1074 | |||
1075 | do_each_thread(g, p) { | ||
1076 | if (p->cpuset == cs) { | ||
1077 | pidarray[n++] = p->pid; | ||
1078 | if (unlikely(n == npids)) | ||
1079 | goto array_full; | ||
1080 | } | ||
1081 | } while_each_thread(g, p); | ||
1082 | |||
1083 | array_full: | ||
1084 | read_unlock(&tasklist_lock); | ||
1085 | return n; | ||
1086 | } | ||
1087 | |||
1088 | static int cmppid(const void *a, const void *b) | ||
1089 | { | ||
1090 | return *(pid_t *)a - *(pid_t *)b; | ||
1091 | } | ||
1092 | |||
1093 | /* | ||
1094 | * Convert array 'a' of 'npids' pid_t's to a string of newline separated | ||
1095 | * decimal pids in 'buf'. Don't write more than 'sz' chars, but return | ||
1096 | * count 'cnt' of how many chars would be written if buf were large enough. | ||
1097 | */ | ||
1098 | static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | ||
1099 | { | ||
1100 | int cnt = 0; | ||
1101 | int i; | ||
1102 | |||
1103 | for (i = 0; i < npids; i++) | ||
1104 | cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); | ||
1105 | return cnt; | ||
1106 | } | ||
1107 | |||
1108 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | ||
1109 | { | ||
1110 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | ||
1111 | struct ctr_struct *ctr; | ||
1112 | pid_t *pidarray; | ||
1113 | int npids; | ||
1114 | char c; | ||
1115 | |||
1116 | if (!(file->f_mode & FMODE_READ)) | ||
1117 | return 0; | ||
1118 | |||
1119 | ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); | ||
1120 | if (!ctr) | ||
1121 | goto err0; | ||
1122 | |||
1123 | /* | ||
1124 | * If cpuset gets more users after we read count, we won't have | ||
1125 | * enough space - tough. This race is indistinguishable to the | ||
1126 | * caller from the case that the additional cpuset users didn't | ||
1127 | * show up until sometime later on. | ||
1128 | */ | ||
1129 | npids = atomic_read(&cs->count); | ||
1130 | pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); | ||
1131 | if (!pidarray) | ||
1132 | goto err1; | ||
1133 | |||
1134 | npids = pid_array_load(pidarray, npids, cs); | ||
1135 | sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); | ||
1136 | |||
1137 | /* Call pid_array_to_buf() twice, first just to get bufsz */ | ||
1138 | ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; | ||
1139 | ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); | ||
1140 | if (!ctr->buf) | ||
1141 | goto err2; | ||
1142 | ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); | ||
1143 | |||
1144 | kfree(pidarray); | ||
1145 | file->private_data = ctr; | ||
1146 | return 0; | ||
1147 | |||
1148 | err2: | ||
1149 | kfree(pidarray); | ||
1150 | err1: | ||
1151 | kfree(ctr); | ||
1152 | err0: | ||
1153 | return -ENOMEM; | ||
1154 | } | ||
1155 | |||
1156 | static ssize_t cpuset_tasks_read(struct file *file, char __user *buf, | ||
1157 | size_t nbytes, loff_t *ppos) | ||
1158 | { | ||
1159 | struct ctr_struct *ctr = file->private_data; | ||
1160 | |||
1161 | if (*ppos + nbytes > ctr->bufsz) | ||
1162 | nbytes = ctr->bufsz - *ppos; | ||
1163 | if (copy_to_user(buf, ctr->buf + *ppos, nbytes)) | ||
1164 | return -EFAULT; | ||
1165 | *ppos += nbytes; | ||
1166 | return nbytes; | ||
1167 | } | ||
1168 | |||
1169 | static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) | ||
1170 | { | ||
1171 | struct ctr_struct *ctr; | ||
1172 | |||
1173 | if (file->f_mode & FMODE_READ) { | ||
1174 | ctr = file->private_data; | ||
1175 | kfree(ctr->buf); | ||
1176 | kfree(ctr); | ||
1177 | } | ||
1178 | return 0; | ||
1179 | } | ||
1180 | |||
1181 | /* | ||
1182 | * for the common functions, 'private' gives the type of file | ||
1183 | */ | ||
1184 | |||
1185 | static struct cftype cft_tasks = { | ||
1186 | .name = "tasks", | ||
1187 | .open = cpuset_tasks_open, | ||
1188 | .read = cpuset_tasks_read, | ||
1189 | .release = cpuset_tasks_release, | ||
1190 | .private = FILE_TASKLIST, | ||
1191 | }; | ||
1192 | |||
1193 | static struct cftype cft_cpus = { | ||
1194 | .name = "cpus", | ||
1195 | .private = FILE_CPULIST, | ||
1196 | }; | ||
1197 | |||
1198 | static struct cftype cft_mems = { | ||
1199 | .name = "mems", | ||
1200 | .private = FILE_MEMLIST, | ||
1201 | }; | ||
1202 | |||
1203 | static struct cftype cft_cpu_exclusive = { | ||
1204 | .name = "cpu_exclusive", | ||
1205 | .private = FILE_CPU_EXCLUSIVE, | ||
1206 | }; | ||
1207 | |||
1208 | static struct cftype cft_mem_exclusive = { | ||
1209 | .name = "mem_exclusive", | ||
1210 | .private = FILE_MEM_EXCLUSIVE, | ||
1211 | }; | ||
1212 | |||
1213 | static struct cftype cft_notify_on_release = { | ||
1214 | .name = "notify_on_release", | ||
1215 | .private = FILE_NOTIFY_ON_RELEASE, | ||
1216 | }; | ||
1217 | |||
1218 | static int cpuset_populate_dir(struct dentry *cs_dentry) | ||
1219 | { | ||
1220 | int err; | ||
1221 | |||
1222 | if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0) | ||
1223 | return err; | ||
1224 | if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0) | ||
1225 | return err; | ||
1226 | if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0) | ||
1227 | return err; | ||
1228 | if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0) | ||
1229 | return err; | ||
1230 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) | ||
1231 | return err; | ||
1232 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) | ||
1233 | return err; | ||
1234 | return 0; | ||
1235 | } | ||
1236 | |||
1237 | /* | ||
1238 | * cpuset_create - create a cpuset | ||
1239 | * parent: cpuset that will be parent of the new cpuset. | ||
1240 | * name: name of the new cpuset. Will be strcpy'ed. | ||
1241 | * mode: mode to set on new inode | ||
1242 | * | ||
1243 | * Must be called with the semaphore on the parent inode held | ||
1244 | */ | ||
1245 | |||
1246 | static long cpuset_create(struct cpuset *parent, const char *name, int mode) | ||
1247 | { | ||
1248 | struct cpuset *cs; | ||
1249 | int err; | ||
1250 | |||
1251 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | ||
1252 | if (!cs) | ||
1253 | return -ENOMEM; | ||
1254 | |||
1255 | down(&cpuset_sem); | ||
1256 | refresh_mems(); | ||
1257 | cs->flags = 0; | ||
1258 | if (notify_on_release(parent)) | ||
1259 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | ||
1260 | cs->cpus_allowed = CPU_MASK_NONE; | ||
1261 | cs->mems_allowed = NODE_MASK_NONE; | ||
1262 | atomic_set(&cs->count, 0); | ||
1263 | INIT_LIST_HEAD(&cs->sibling); | ||
1264 | INIT_LIST_HEAD(&cs->children); | ||
1265 | atomic_inc(&cpuset_mems_generation); | ||
1266 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | ||
1267 | |||
1268 | cs->parent = parent; | ||
1269 | |||
1270 | list_add(&cs->sibling, &cs->parent->children); | ||
1271 | |||
1272 | err = cpuset_create_dir(cs, name, mode); | ||
1273 | if (err < 0) | ||
1274 | goto err; | ||
1275 | |||
1276 | /* | ||
1277 | * Release cpuset_sem before cpuset_populate_dir() because it | ||
1278 | * will down() this new directory's i_sem and if we race with | ||
1279 | * another mkdir, we might deadlock. | ||
1280 | */ | ||
1281 | up(&cpuset_sem); | ||
1282 | |||
1283 | err = cpuset_populate_dir(cs->dentry); | ||
1284 | /* If err < 0, we have a half-filled directory - oh well ;) */ | ||
1285 | return 0; | ||
1286 | err: | ||
1287 | list_del(&cs->sibling); | ||
1288 | up(&cpuset_sem); | ||
1289 | kfree(cs); | ||
1290 | return err; | ||
1291 | } | ||
1292 | |||
1293 | static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
1294 | { | ||
1295 | struct cpuset *c_parent = dentry->d_parent->d_fsdata; | ||
1296 | |||
1297 | /* the vfs holds inode->i_sem already */ | ||
1298 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); | ||
1299 | } | ||
1300 | |||
1301 | static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | ||
1302 | { | ||
1303 | struct cpuset *cs = dentry->d_fsdata; | ||
1304 | struct dentry *d; | ||
1305 | struct cpuset *parent; | ||
1306 | |||
1307 | /* the vfs already holds i_sem on both the parent and this inode */ | ||
1308 | |||
1309 | down(&cpuset_sem); | ||
1310 | refresh_mems(); | ||
1311 | if (atomic_read(&cs->count) > 0) { | ||
1312 | up(&cpuset_sem); | ||
1313 | return -EBUSY; | ||
1314 | } | ||
1315 | if (!list_empty(&cs->children)) { | ||
1316 | up(&cpuset_sem); | ||
1317 | return -EBUSY; | ||
1318 | } | ||
1319 | spin_lock(&cs->dentry->d_lock); | ||
1320 | parent = cs->parent; | ||
1321 | set_bit(CS_REMOVED, &cs->flags); | ||
1322 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | ||
1323 | if (list_empty(&parent->children)) | ||
1324 | check_for_release(parent); | ||
1325 | d = dget(cs->dentry); | ||
1326 | cs->dentry = NULL; | ||
1327 | spin_unlock(&d->d_lock); | ||
1328 | cpuset_d_remove_dir(d); | ||
1329 | dput(d); | ||
1330 | up(&cpuset_sem); | ||
1331 | return 0; | ||
1332 | } | ||
1333 | |||
1334 | /** | ||
1335 | * cpuset_init - initialize cpusets at system boot | ||
1336 | * | ||
1337 | * Description: Initialize top_cpuset and the cpuset internal file system. | ||
1338 | **/ | ||
1339 | |||
1340 | int __init cpuset_init(void) | ||
1341 | { | ||
1342 | struct dentry *root; | ||
1343 | int err; | ||
1344 | |||
1345 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | ||
1346 | top_cpuset.mems_allowed = NODE_MASK_ALL; | ||
1347 | |||
1348 | atomic_inc(&cpuset_mems_generation); | ||
1349 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); | ||
1350 | |||
1351 | init_task.cpuset = &top_cpuset; | ||
1352 | |||
1353 | err = register_filesystem(&cpuset_fs_type); | ||
1354 | if (err < 0) | ||
1355 | goto out; | ||
1356 | cpuset_mount = kern_mount(&cpuset_fs_type); | ||
1357 | if (IS_ERR(cpuset_mount)) { | ||
1358 | printk(KERN_ERR "cpuset: could not mount!\n"); | ||
1359 | err = PTR_ERR(cpuset_mount); | ||
1360 | cpuset_mount = NULL; | ||
1361 | goto out; | ||
1362 | } | ||
1363 | root = cpuset_mount->mnt_sb->s_root; | ||
1364 | root->d_fsdata = &top_cpuset; | ||
1365 | root->d_inode->i_nlink++; | ||
1366 | top_cpuset.dentry = root; | ||
1367 | root->d_inode->i_op = &cpuset_dir_inode_operations; | ||
1368 | err = cpuset_populate_dir(root); | ||
1369 | out: | ||
1370 | return err; | ||
1371 | } | ||
1372 | |||
1373 | /** | ||
1374 | * cpuset_init_smp - initialize cpus_allowed | ||
1375 | * | ||
1376 | * Description: Finish top cpuset after cpu, node maps are initialized | ||
1377 | **/ | ||
1378 | |||
1379 | void __init cpuset_init_smp(void) | ||
1380 | { | ||
1381 | top_cpuset.cpus_allowed = cpu_online_map; | ||
1382 | top_cpuset.mems_allowed = node_online_map; | ||
1383 | } | ||
1384 | |||
1385 | /** | ||
1386 | * cpuset_fork - attach newly forked task to its parent's cpuset. | ||
1387 | * @p: pointer to task_struct of forking parent process. | ||
1388 | * | ||
1389 | * Description: By default, on fork, a task inherits its | ||
1390 | * parent's cpuset. The pointer to the shared cpuset is | ||
1391 | * automatically copied in fork.c by dup_task_struct(). | ||
1392 | * This cpuset_fork() routine need only increment the usage | ||
1393 | * counter in that cpuset. | ||
1394 | **/ | ||
1395 | |||
1396 | void cpuset_fork(struct task_struct *tsk) | ||
1397 | { | ||
1398 | atomic_inc(&tsk->cpuset->count); | ||
1399 | } | ||
1400 | |||
1401 | /** | ||
1402 | * cpuset_exit - detach cpuset from exiting task | ||
1403 | * @tsk: pointer to task_struct of exiting process | ||
1404 | * | ||
1405 | * Description: Detach cpuset from @tsk and release it. | ||
1406 | * | ||
1407 | **/ | ||
1408 | |||
1409 | void cpuset_exit(struct task_struct *tsk) | ||
1410 | { | ||
1411 | struct cpuset *cs; | ||
1412 | |||
1413 | task_lock(tsk); | ||
1414 | cs = tsk->cpuset; | ||
1415 | tsk->cpuset = NULL; | ||
1416 | task_unlock(tsk); | ||
1417 | |||
1418 | if (atomic_dec_and_test(&cs->count)) { | ||
1419 | down(&cpuset_sem); | ||
1420 | check_for_release(cs); | ||
1421 | up(&cpuset_sem); | ||
1422 | } | ||
1423 | } | ||
1424 | |||
1425 | /** | ||
1426 | * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. | ||
1427 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | ||
1428 | * | ||
1429 | * Description: Returns the cpumask_t cpus_allowed of the cpuset | ||
1430 | * attached to the specified @tsk. Guaranteed to return some non-empty | ||
1431 | * subset of cpu_online_map, even if this means going outside the | ||
1432 | * task's cpuset. | ||
1433 | **/ | ||
1434 | |||
1435 | const cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | ||
1436 | { | ||
1437 | cpumask_t mask; | ||
1438 | |||
1439 | down(&cpuset_sem); | ||
1440 | task_lock((struct task_struct *)tsk); | ||
1441 | guarantee_online_cpus(tsk->cpuset, &mask); | ||
1442 | task_unlock((struct task_struct *)tsk); | ||
1443 | up(&cpuset_sem); | ||
1444 | |||
1445 | return mask; | ||
1446 | } | ||
1447 | |||
1448 | void cpuset_init_current_mems_allowed(void) | ||
1449 | { | ||
1450 | current->mems_allowed = NODE_MASK_ALL; | ||
1451 | } | ||
1452 | |||
1453 | /* | ||
1454 | * If the current task's cpuset's mems_allowed changed behind our backs, | ||
1455 | * update current->mems_allowed and mems_generation to the new value. | ||
1456 | * Do not call this routine if in_interrupt(). | ||
1457 | */ | ||
1458 | |||
1459 | void cpuset_update_current_mems_allowed(void) | ||
1460 | { | ||
1461 | struct cpuset *cs = current->cpuset; | ||
1462 | |||
1463 | if (!cs) | ||
1464 | return; /* task is exiting */ | ||
1465 | if (current->cpuset_mems_generation != cs->mems_generation) { | ||
1466 | down(&cpuset_sem); | ||
1467 | refresh_mems(); | ||
1468 | up(&cpuset_sem); | ||
1469 | } | ||
1470 | } | ||
1471 | |||
1472 | void cpuset_restrict_to_mems_allowed(unsigned long *nodes) | ||
1473 | { | ||
1474 | bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), | ||
1475 | MAX_NUMNODES); | ||
1476 | } | ||
1477 | |||
1478 | /* | ||
1479 | * Are any of the nodes on zonelist zl allowed in current->mems_allowed? | ||
1480 | */ | ||
1481 | int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | ||
1482 | { | ||
1483 | int i; | ||
1484 | |||
1485 | for (i = 0; zl->zones[i]; i++) { | ||
1486 | int nid = zl->zones[i]->zone_pgdat->node_id; | ||
1487 | |||
1488 | if (node_isset(nid, current->mems_allowed)) | ||
1489 | return 1; | ||
1490 | } | ||
1491 | return 0; | ||
1492 | } | ||
1493 | |||
1494 | /* | ||
1495 | * Is 'current' valid, and is zone z allowed in current->mems_allowed? | ||
1496 | */ | ||
1497 | int cpuset_zone_allowed(struct zone *z) | ||
1498 | { | ||
1499 | return in_interrupt() || | ||
1500 | node_isset(z->zone_pgdat->node_id, current->mems_allowed); | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * proc_cpuset_show() | ||
1505 | * - Print the task's cpuset path into the seq_file. | ||
1506 | * - Used for /proc/<pid>/cpuset. | ||
1507 | */ | ||
1508 | |||
1509 | static int proc_cpuset_show(struct seq_file *m, void *v) | ||
1510 | { | ||
1511 | struct cpuset *cs; | ||
1512 | struct task_struct *tsk; | ||
1513 | char *buf; | ||
1514 | int retval = 0; | ||
1515 | |||
1516 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
1517 | if (!buf) | ||
1518 | return -ENOMEM; | ||
1519 | |||
1520 | tsk = m->private; | ||
1521 | down(&cpuset_sem); | ||
1522 | task_lock(tsk); | ||
1523 | cs = tsk->cpuset; | ||
1524 | task_unlock(tsk); | ||
1525 | if (!cs) { | ||
1526 | retval = -EINVAL; | ||
1527 | goto out; | ||
1528 | } | ||
1529 | |||
1530 | retval = cpuset_path(cs, buf, PAGE_SIZE); | ||
1531 | if (retval < 0) | ||
1532 | goto out; | ||
1533 | seq_puts(m, buf); | ||
1534 | seq_putc(m, '\n'); | ||
1535 | out: | ||
1536 | up(&cpuset_sem); | ||
1537 | kfree(buf); | ||
1538 | return retval; | ||
1539 | } | ||
1540 | |||
1541 | static int cpuset_open(struct inode *inode, struct file *file) | ||
1542 | { | ||
1543 | struct task_struct *tsk = PROC_I(inode)->task; | ||
1544 | return single_open(file, proc_cpuset_show, tsk); | ||
1545 | } | ||
1546 | |||
1547 | struct file_operations proc_cpuset_operations = { | ||
1548 | .open = cpuset_open, | ||
1549 | .read = seq_read, | ||
1550 | .llseek = seq_lseek, | ||
1551 | .release = single_release, | ||
1552 | }; | ||
1553 | |||
1554 | /* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ | ||
1555 | char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) | ||
1556 | { | ||
1557 | buffer += sprintf(buffer, "Cpus_allowed:\t"); | ||
1558 | buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed); | ||
1559 | buffer += sprintf(buffer, "\n"); | ||
1560 | buffer += sprintf(buffer, "Mems_allowed:\t"); | ||
1561 | buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed); | ||
1562 | buffer += sprintf(buffer, "\n"); | ||
1563 | return buffer; | ||
1564 | } | ||
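The control files created above (cpus, mems, cpu_exclusive, mem_exclusive, notify_on_release and tasks) are driven entirely by ordinary filesystem operations, so the simplest way to exercise cpuset_mkdir(), cpuset_common_file_write() and attach_task() is a small userspace program. The sketch below is illustrative only: it assumes the cpuset filesystem has already been mounted at /dev/cpuset, and the directory name "demo" and the cpu/memory values are arbitrary.

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Write a short string into one of the cpuset control files. */
static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char pid[32];

	/* mkdir on the cpuset fs ends up in cpuset_mkdir() -> cpuset_create() */
	if (mkdir("/dev/cpuset/demo", 0755) && errno != EEXIST) {
		perror("mkdir /dev/cpuset/demo");
		return 1;
	}

	/* Handled by cpuset_common_file_write(): FILE_CPULIST and FILE_MEMLIST.
	 * Give the new cpuset some cpus and memory nodes before attaching tasks. */
	write_str("/dev/cpuset/demo/cpus", "0");
	write_str("/dev/cpuset/demo/mems", "0");

	/* Writing a pid into "tasks" goes through attach_task(), which switches
	 * tsk->cpuset and updates its cpus_allowed via set_cpus_allowed(). */
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_str("/dev/cpuset/demo/tasks", pid);

	return 0;
}

Reading the same files back goes through cpuset_common_file_read(), or cpuset_tasks_read() for the pre-formatted tasks buffer.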
diff --git a/kernel/dma.c b/kernel/dma.c new file mode 100644 index 000000000000..aef0a45b7893 --- /dev/null +++ b/kernel/dma.c | |||
@@ -0,0 +1,158 @@ | |||
1 | /* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ | ||
2 | * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. | ||
3 | * | ||
4 | * Written by Hennus Bergman, 1992. | ||
5 | * | ||
6 | * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma. | ||
7 | * In the previous version the reported device could end up being wrong, | ||
8 | * if a device requested a DMA channel that was already in use. | ||
9 | * [It also happened to remove the sizeof(char *) == sizeof(int) | ||
10 | * assumption introduced because of those /proc/dma patches. -- Hennus] | ||
11 | */ | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/kernel.h> | ||
14 | #include <linux/errno.h> | ||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/seq_file.h> | ||
18 | #include <linux/proc_fs.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <asm/dma.h> | ||
21 | #include <asm/system.h> | ||
22 | |||
23 | |||
24 | |||
25 | /* A note on resource allocation: | ||
26 | * | ||
27 | * All drivers needing DMA channels, should allocate and release them | ||
28 | * through the public routines `request_dma()' and `free_dma()'. | ||
29 | * | ||
30 | * In order to avoid problems, all processes should allocate resources in | ||
31 | * the same sequence and release them in the reverse order. | ||
32 | * | ||
33 | * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA. | ||
34 | * When releasing them, first release the DMA, then release the IRQ. | ||
35 | * If you don't, you may cause allocation requests to fail unnecessarily. | ||
36 | * This doesn't really matter now, but it will once we get real semaphores | ||
37 | * in the kernel. | ||
38 | */ | ||
39 | |||
40 | |||
41 | DEFINE_SPINLOCK(dma_spin_lock); | ||
42 | |||
43 | /* | ||
44 | * If our port doesn't define this, it has no PC-like DMA | ||
45 | */ | ||
46 | |||
47 | #ifdef MAX_DMA_CHANNELS | ||
48 | |||
49 | |||
50 | /* Channel n is busy iff dma_chan_busy[n].lock != 0. | ||
51 | * DMA0 used to be reserved for DRAM refresh, but apparently not any more... | ||
52 | * DMA4 is reserved for cascading. | ||
53 | */ | ||
54 | |||
55 | struct dma_chan { | ||
56 | int lock; | ||
57 | const char *device_id; | ||
58 | }; | ||
59 | |||
60 | static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { | ||
61 | [4] = { 1, "cascade" }, | ||
62 | }; | ||
63 | |||
64 | |||
65 | int request_dma(unsigned int dmanr, const char * device_id) | ||
66 | { | ||
67 | if (dmanr >= MAX_DMA_CHANNELS) | ||
68 | return -EINVAL; | ||
69 | |||
70 | if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0) | ||
71 | return -EBUSY; | ||
72 | |||
73 | dma_chan_busy[dmanr].device_id = device_id; | ||
74 | |||
75 | /* old flag was 0, now contains 1 to indicate busy */ | ||
76 | return 0; | ||
77 | } /* request_dma */ | ||
78 | |||
79 | |||
80 | void free_dma(unsigned int dmanr) | ||
81 | { | ||
82 | if (dmanr >= MAX_DMA_CHANNELS) { | ||
83 | printk(KERN_WARNING "Trying to free DMA%d\n", dmanr); | ||
84 | return; | ||
85 | } | ||
86 | |||
87 | if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { | ||
88 | printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr); | ||
89 | return; | ||
90 | } | ||
91 | |||
92 | } /* free_dma */ | ||
93 | |||
94 | #else | ||
95 | |||
96 | int request_dma(unsigned int dmanr, const char *device_id) | ||
97 | { | ||
98 | return -EINVAL; | ||
99 | } | ||
100 | |||
101 | void free_dma(unsigned int dmanr) | ||
102 | { | ||
103 | } | ||
104 | |||
105 | #endif | ||
106 | |||
107 | #ifdef CONFIG_PROC_FS | ||
108 | |||
109 | #ifdef MAX_DMA_CHANNELS | ||
110 | static int proc_dma_show(struct seq_file *m, void *v) | ||
111 | { | ||
112 | int i; | ||
113 | |||
114 | for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) { | ||
115 | if (dma_chan_busy[i].lock) { | ||
116 | seq_printf(m, "%2d: %s\n", i, | ||
117 | dma_chan_busy[i].device_id); | ||
118 | } | ||
119 | } | ||
120 | return 0; | ||
121 | } | ||
122 | #else | ||
123 | static int proc_dma_show(struct seq_file *m, void *v) | ||
124 | { | ||
125 | seq_puts(m, "No DMA\n"); | ||
126 | return 0; | ||
127 | } | ||
128 | #endif /* MAX_DMA_CHANNELS */ | ||
129 | |||
130 | static int proc_dma_open(struct inode *inode, struct file *file) | ||
131 | { | ||
132 | return single_open(file, proc_dma_show, NULL); | ||
133 | } | ||
134 | |||
135 | static struct file_operations proc_dma_operations = { | ||
136 | .open = proc_dma_open, | ||
137 | .read = seq_read, | ||
138 | .llseek = seq_lseek, | ||
139 | .release = single_release, | ||
140 | }; | ||
141 | |||
142 | static int __init proc_dma_init(void) | ||
143 | { | ||
144 | struct proc_dir_entry *e; | ||
145 | |||
146 | e = create_proc_entry("dma", 0, NULL); | ||
147 | if (e) | ||
148 | e->proc_fops = &proc_dma_operations; | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | __initcall(proc_dma_init); | ||
154 | #endif | ||
155 | |||
156 | EXPORT_SYMBOL(request_dma); | ||
157 | EXPORT_SYMBOL(free_dma); | ||
158 | EXPORT_SYMBOL(dma_spin_lock); | ||
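The allocation note near the top of kernel/dma.c asks drivers to claim the IRQ before the DMA channel and to release them in the reverse order. A minimal sketch of that pairing, written against the 2.6-era request_irq() prototype, is below; MYDEV_IRQ, MYDEV_DMA, the "mydev" name and the empty handler are placeholders, not anything defined by this patch.

#include <linux/interrupt.h>
#include <asm/dma.h>

#define MYDEV_IRQ	5	/* placeholder resource numbers */
#define MYDEV_DMA	3

static irqreturn_t mydev_isr(int irq, void *dev_id, struct pt_regs *regs)
{
	return IRQ_HANDLED;	/* real interrupt handling would go here */
}

static int mydev_claim_resources(void)
{
	int err;

	/* IRQ first ... */
	err = request_irq(MYDEV_IRQ, mydev_isr, 0, "mydev", NULL);
	if (err)
		return err;

	/* ... then the DMA channel */
	err = request_dma(MYDEV_DMA, "mydev");
	if (err) {
		free_irq(MYDEV_IRQ, NULL);
		return err;
	}
	return 0;
}

static void mydev_release_resources(void)
{
	free_dma(MYDEV_DMA);		/* release the DMA channel first ... */
	free_irq(MYDEV_IRQ, NULL);	/* ... then the IRQ */
}

With MAX_DMA_CHANNELS defined, the claimed channel and its device_id string then show up in /proc/dma via proc_dma_show().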
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c new file mode 100644 index 000000000000..867d6dbeb574 --- /dev/null +++ b/kernel/exec_domain.c | |||
@@ -0,0 +1,209 @@ | |||
1 | /* | ||
2 | * Handling of different ABIs (personalities). | ||
3 | * | ||
4 | * We group personalities into execution domains which have their | ||
5 | * own handlers for kernel entry points, signal mapping, etc... | ||
6 | * | ||
7 | * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) | ||
8 | */ | ||
9 | |||
10 | #include <linux/config.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/kmod.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/personality.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/sysctl.h> | ||
19 | #include <linux/types.h> | ||
20 | |||
21 | |||
22 | static void default_handler(int, struct pt_regs *); | ||
23 | |||
24 | static struct exec_domain *exec_domains = &default_exec_domain; | ||
25 | static DEFINE_RWLOCK(exec_domains_lock); | ||
26 | |||
27 | |||
28 | static u_long ident_map[32] = { | ||
29 | 0, 1, 2, 3, 4, 5, 6, 7, | ||
30 | 8, 9, 10, 11, 12, 13, 14, 15, | ||
31 | 16, 17, 18, 19, 20, 21, 22, 23, | ||
32 | 24, 25, 26, 27, 28, 29, 30, 31 | ||
33 | }; | ||
34 | |||
35 | struct exec_domain default_exec_domain = { | ||
36 | .name = "Linux", /* name */ | ||
37 | .handler = default_handler, /* lcall7 causes a seg fault. */ | ||
38 | .pers_low = 0, /* PER_LINUX personality. */ | ||
39 | .pers_high = 0, /* PER_LINUX personality. */ | ||
40 | .signal_map = ident_map, /* Identity map signals. */ | ||
41 | .signal_invmap = ident_map, /* - both ways. */ | ||
42 | }; | ||
43 | |||
44 | |||
45 | static void | ||
46 | default_handler(int segment, struct pt_regs *regp) | ||
47 | { | ||
48 | set_personality(0); | ||
49 | |||
50 | if (current_thread_info()->exec_domain->handler != default_handler) | ||
51 | current_thread_info()->exec_domain->handler(segment, regp); | ||
52 | else | ||
53 | send_sig(SIGSEGV, current, 1); | ||
54 | } | ||
55 | |||
56 | static struct exec_domain * | ||
57 | lookup_exec_domain(u_long personality) | ||
58 | { | ||
59 | struct exec_domain * ep; | ||
60 | u_long pers = personality(personality); | ||
61 | |||
62 | read_lock(&exec_domains_lock); | ||
63 | for (ep = exec_domains; ep; ep = ep->next) { | ||
64 | if (pers >= ep->pers_low && pers <= ep->pers_high) | ||
65 | if (try_module_get(ep->module)) | ||
66 | goto out; | ||
67 | } | ||
68 | |||
69 | #ifdef CONFIG_KMOD | ||
70 | read_unlock(&exec_domains_lock); | ||
71 | request_module("personality-%ld", pers); | ||
72 | read_lock(&exec_domains_lock); | ||
73 | |||
74 | for (ep = exec_domains; ep; ep = ep->next) { | ||
75 | if (pers >= ep->pers_low && pers <= ep->pers_high) | ||
76 | if (try_module_get(ep->module)) | ||
77 | goto out; | ||
78 | } | ||
79 | #endif | ||
80 | |||
81 | ep = &default_exec_domain; | ||
82 | out: | ||
83 | read_unlock(&exec_domains_lock); | ||
84 | return (ep); | ||
85 | } | ||
86 | |||
87 | int | ||
88 | register_exec_domain(struct exec_domain *ep) | ||
89 | { | ||
90 | struct exec_domain *tmp; | ||
91 | int err = -EBUSY; | ||
92 | |||
93 | if (ep == NULL) | ||
94 | return -EINVAL; | ||
95 | |||
96 | if (ep->next != NULL) | ||
97 | return -EBUSY; | ||
98 | |||
99 | write_lock(&exec_domains_lock); | ||
100 | for (tmp = exec_domains; tmp; tmp = tmp->next) { | ||
101 | if (tmp == ep) | ||
102 | goto out; | ||
103 | } | ||
104 | |||
105 | ep->next = exec_domains; | ||
106 | exec_domains = ep; | ||
107 | err = 0; | ||
108 | |||
109 | out: | ||
110 | write_unlock(&exec_domains_lock); | ||
111 | return (err); | ||
112 | } | ||
113 | |||
114 | int | ||
115 | unregister_exec_domain(struct exec_domain *ep) | ||
116 | { | ||
117 | struct exec_domain **epp; | ||
118 | |||
119 | epp = &exec_domains; | ||
120 | write_lock(&exec_domains_lock); | ||
121 | for (epp = &exec_domains; *epp; epp = &(*epp)->next) { | ||
122 | if (ep == *epp) | ||
123 | goto unregister; | ||
124 | } | ||
125 | write_unlock(&exec_domains_lock); | ||
126 | return -EINVAL; | ||
127 | |||
128 | unregister: | ||
129 | *epp = ep->next; | ||
130 | ep->next = NULL; | ||
131 | write_unlock(&exec_domains_lock); | ||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | int | ||
136 | __set_personality(u_long personality) | ||
137 | { | ||
138 | struct exec_domain *ep, *oep; | ||
139 | |||
140 | ep = lookup_exec_domain(personality); | ||
141 | if (ep == current_thread_info()->exec_domain) { | ||
142 | current->personality = personality; | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | if (atomic_read(¤t->fs->count) != 1) { | ||
147 | struct fs_struct *fsp, *ofsp; | ||
148 | |||
149 | fsp = copy_fs_struct(current->fs); | ||
150 | if (fsp == NULL) { | ||
151 | module_put(ep->module); | ||
152 | return -ENOMEM; | ||
153 | } | ||
154 | |||
155 | task_lock(current); | ||
156 | ofsp = current->fs; | ||
157 | current->fs = fsp; | ||
158 | task_unlock(current); | ||
159 | |||
160 | put_fs_struct(ofsp); | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * At that point we are guaranteed to be the sole owner of | ||
165 | * current->fs. | ||
166 | */ | ||
167 | |||
168 | current->personality = personality; | ||
169 | oep = current_thread_info()->exec_domain; | ||
170 | current_thread_info()->exec_domain = ep; | ||
171 | set_fs_altroot(); | ||
172 | |||
173 | module_put(oep->module); | ||
174 | return 0; | ||
175 | } | ||
176 | |||
177 | int | ||
178 | get_exec_domain_list(char *page) | ||
179 | { | ||
180 | struct exec_domain *ep; | ||
181 | int len = 0; | ||
182 | |||
183 | read_lock(&exec_domains_lock); | ||
184 | for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next) | ||
185 | len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n", | ||
186 | ep->pers_low, ep->pers_high, ep->name, | ||
187 | module_name(ep->module)); | ||
188 | read_unlock(&exec_domains_lock); | ||
189 | return (len); | ||
190 | } | ||
191 | |||
192 | asmlinkage long | ||
193 | sys_personality(u_long personality) | ||
194 | { | ||
195 | u_long old = current->personality; | ||
196 | |||
197 | if (personality != 0xffffffff) { | ||
198 | set_personality(personality); | ||
199 | if (current->personality != personality) | ||
200 | return -EINVAL; | ||
201 | } | ||
202 | |||
203 | return (long)old; | ||
204 | } | ||
205 | |||
206 | |||
207 | EXPORT_SYMBOL(register_exec_domain); | ||
208 | EXPORT_SYMBOL(unregister_exec_domain); | ||
209 | EXPORT_SYMBOL(__set_personality); | ||
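Since register_exec_domain() and unregister_exec_domain() are exported, an ABI emulation module would hook in roughly as sketched below. This is an assumption-laden illustration, not part of the patch: PER_DEMO is an arbitrary, assumed-unused personality number, the handler does nothing, and the identity signal map mirrors the ident_map used by default_exec_domain above.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/ptrace.h>

#define PER_DEMO	0x0020	/* assumed, otherwise unused personality value */

static void demo_handler(int segment, struct pt_regs *regs)
{
	/* entry point for the emulated ABI (e.g. lcall7) would be handled here */
}

static unsigned long demo_signal_map[32] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
};

static struct exec_domain demo_exec_domain = {
	.name		= "Demo",
	.handler	= demo_handler,
	.pers_low	= PER_DEMO,
	.pers_high	= PER_DEMO,
	.signal_map	= demo_signal_map,
	.signal_invmap	= demo_signal_map,	/* identity map both ways */
	.module		= THIS_MODULE,
};

static int __init demo_init(void)
{
	return register_exec_domain(&demo_exec_domain);
}

static void __exit demo_exit(void)
{
	unregister_exec_domain(&demo_exec_domain);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

A later sys_personality(PER_DEMO) call would then reach this domain through lookup_exec_domain() and __set_personality().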
diff --git a/kernel/exit.c b/kernel/exit.c new file mode 100644 index 000000000000..6dd4ebe1dd90 --- /dev/null +++ b/kernel/exit.c | |||
@@ -0,0 +1,1527 @@ | |||
1 | /* | ||
2 | * linux/kernel/exit.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | #include <linux/config.h> | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/interrupt.h> | ||
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/completion.h> | ||
14 | #include <linux/personality.h> | ||
15 | #include <linux/tty.h> | ||
16 | #include <linux/namespace.h> | ||
17 | #include <linux/key.h> | ||
18 | #include <linux/security.h> | ||
19 | #include <linux/cpu.h> | ||
20 | #include <linux/acct.h> | ||
21 | #include <linux/file.h> | ||
22 | #include <linux/binfmts.h> | ||
23 | #include <linux/ptrace.h> | ||
24 | #include <linux/profile.h> | ||
25 | #include <linux/mount.h> | ||
26 | #include <linux/proc_fs.h> | ||
27 | #include <linux/mempolicy.h> | ||
28 | #include <linux/cpuset.h> | ||
29 | #include <linux/syscalls.h> | ||
30 | |||
31 | #include <asm/uaccess.h> | ||
32 | #include <asm/unistd.h> | ||
33 | #include <asm/pgtable.h> | ||
34 | #include <asm/mmu_context.h> | ||
35 | |||
36 | extern void sem_exit (void); | ||
37 | extern struct task_struct *child_reaper; | ||
38 | |||
39 | int getrusage(struct task_struct *, int, struct rusage __user *); | ||
40 | |||
41 | static void __unhash_process(struct task_struct *p) | ||
42 | { | ||
43 | nr_threads--; | ||
44 | detach_pid(p, PIDTYPE_PID); | ||
45 | detach_pid(p, PIDTYPE_TGID); | ||
46 | if (thread_group_leader(p)) { | ||
47 | detach_pid(p, PIDTYPE_PGID); | ||
48 | detach_pid(p, PIDTYPE_SID); | ||
49 | if (p->pid) | ||
50 | __get_cpu_var(process_counts)--; | ||
51 | } | ||
52 | |||
53 | REMOVE_LINKS(p); | ||
54 | } | ||
55 | |||
56 | void release_task(struct task_struct * p) | ||
57 | { | ||
58 | int zap_leader; | ||
59 | task_t *leader; | ||
60 | struct dentry *proc_dentry; | ||
61 | |||
62 | repeat: | ||
63 | atomic_dec(&p->user->processes); | ||
64 | spin_lock(&p->proc_lock); | ||
65 | proc_dentry = proc_pid_unhash(p); | ||
66 | write_lock_irq(&tasklist_lock); | ||
67 | if (unlikely(p->ptrace)) | ||
68 | __ptrace_unlink(p); | ||
69 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | ||
70 | __exit_signal(p); | ||
71 | __exit_sighand(p); | ||
72 | __unhash_process(p); | ||
73 | |||
74 | /* | ||
75 | * If we are the last non-leader member of the thread | ||
76 | * group, and the leader is zombie, then notify the | ||
77 | * group leader's parent process. (if it wants notification.) | ||
78 | */ | ||
79 | zap_leader = 0; | ||
80 | leader = p->group_leader; | ||
81 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { | ||
82 | BUG_ON(leader->exit_signal == -1); | ||
83 | do_notify_parent(leader, leader->exit_signal); | ||
84 | /* | ||
85 | * If we were the last child thread and the leader has | ||
86 | * exited already, and the leader's parent ignores SIGCHLD, | ||
87 | * then we are the one who should release the leader. | ||
88 | * | ||
89 | * do_notify_parent() will have marked it self-reaping in | ||
90 | * that case. | ||
91 | */ | ||
92 | zap_leader = (leader->exit_signal == -1); | ||
93 | } | ||
94 | |||
95 | sched_exit(p); | ||
96 | write_unlock_irq(&tasklist_lock); | ||
97 | spin_unlock(&p->proc_lock); | ||
98 | proc_pid_flush(proc_dentry); | ||
99 | release_thread(p); | ||
100 | put_task_struct(p); | ||
101 | |||
102 | p = leader; | ||
103 | if (unlikely(zap_leader)) | ||
104 | goto repeat; | ||
105 | } | ||
106 | |||
107 | /* we are using it only for SMP init */ | ||
108 | |||
109 | void unhash_process(struct task_struct *p) | ||
110 | { | ||
111 | struct dentry *proc_dentry; | ||
112 | |||
113 | spin_lock(&p->proc_lock); | ||
114 | proc_dentry = proc_pid_unhash(p); | ||
115 | write_lock_irq(&tasklist_lock); | ||
116 | __unhash_process(p); | ||
117 | write_unlock_irq(&tasklist_lock); | ||
118 | spin_unlock(&p->proc_lock); | ||
119 | proc_pid_flush(proc_dentry); | ||
120 | } | ||
121 | |||
122 | /* | ||
123 | * This checks not only the pgrp, but falls back on the pid if no | ||
124 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly | ||
125 | * without this... | ||
126 | */ | ||
127 | int session_of_pgrp(int pgrp) | ||
128 | { | ||
129 | struct task_struct *p; | ||
130 | int sid = -1; | ||
131 | |||
132 | read_lock(&tasklist_lock); | ||
133 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | ||
134 | if (p->signal->session > 0) { | ||
135 | sid = p->signal->session; | ||
136 | goto out; | ||
137 | } | ||
138 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); | ||
139 | p = find_task_by_pid(pgrp); | ||
140 | if (p) | ||
141 | sid = p->signal->session; | ||
142 | out: | ||
143 | read_unlock(&tasklist_lock); | ||
144 | |||
145 | return sid; | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * Determine if a process group is "orphaned", according to the POSIX | ||
150 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | ||
151 | * by terminal-generated stop signals. Newly orphaned process groups are | ||
152 | * to receive a SIGHUP and a SIGCONT. | ||
153 | * | ||
154 | * "I ask you, have you ever known what it is to be an orphan?" | ||
155 | */ | ||
156 | static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) | ||
157 | { | ||
158 | struct task_struct *p; | ||
159 | int ret = 1; | ||
160 | |||
161 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | ||
162 | if (p == ignored_task | ||
163 | || p->exit_state | ||
164 | || p->real_parent->pid == 1) | ||
165 | continue; | ||
166 | if (process_group(p->real_parent) != pgrp | ||
167 | && p->real_parent->signal->session == p->signal->session) { | ||
168 | ret = 0; | ||
169 | break; | ||
170 | } | ||
171 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); | ||
172 | return ret; /* (sighing) "Often!" */ | ||
173 | } | ||
174 | |||
175 | int is_orphaned_pgrp(int pgrp) | ||
176 | { | ||
177 | int retval; | ||
178 | |||
179 | read_lock(&tasklist_lock); | ||
180 | retval = will_become_orphaned_pgrp(pgrp, NULL); | ||
181 | read_unlock(&tasklist_lock); | ||
182 | |||
183 | return retval; | ||
184 | } | ||
185 | |||
186 | static inline int has_stopped_jobs(int pgrp) | ||
187 | { | ||
188 | int retval = 0; | ||
189 | struct task_struct *p; | ||
190 | |||
191 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | ||
192 | if (p->state != TASK_STOPPED) | ||
193 | continue; | ||
194 | |||
195 | /* If p is stopped by a debugger on a signal that won't | ||
196 | stop it, then don't count p as stopped. This isn't | ||
197 | perfect but it's a good approximation. */ | ||
198 | if (unlikely (p->ptrace) | ||
199 | && p->exit_code != SIGSTOP | ||
200 | && p->exit_code != SIGTSTP | ||
201 | && p->exit_code != SIGTTOU | ||
202 | && p->exit_code != SIGTTIN) | ||
203 | continue; | ||
204 | |||
205 | retval = 1; | ||
206 | break; | ||
207 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); | ||
208 | return retval; | ||
209 | } | ||
210 | |||
211 | /** | ||
212 | * reparent_to_init() - Reparent the calling kernel thread to the init task. | ||
213 | * | ||
214 | * If a kernel thread is launched as a result of a system call, or if | ||
215 | * it ever exits, it should generally reparent itself to init so that | ||
216 | * it is correctly cleaned up on exit. | ||
217 | * | ||
218 | * The various task state fields, such as scheduling policy and priority, may have | ||
219 | * been inherited from a user process, so we reset them to sane values here. | ||
220 | * | ||
221 | * NOTE that reparent_to_init() gives the caller full capabilities. | ||
222 | */ | ||
223 | void reparent_to_init(void) | ||
224 | { | ||
225 | write_lock_irq(&tasklist_lock); | ||
226 | |||
227 | ptrace_unlink(current); | ||
228 | /* Reparent to init */ | ||
229 | REMOVE_LINKS(current); | ||
230 | current->parent = child_reaper; | ||
231 | current->real_parent = child_reaper; | ||
232 | SET_LINKS(current); | ||
233 | |||
234 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | ||
235 | current->exit_signal = SIGCHLD; | ||
236 | |||
237 | if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) | ||
238 | set_user_nice(current, 0); | ||
239 | /* cpus_allowed? */ | ||
240 | /* rt_priority? */ | ||
241 | /* signals? */ | ||
242 | security_task_reparent_to_init(current); | ||
243 | memcpy(current->signal->rlim, init_task.signal->rlim, | ||
244 | sizeof(current->signal->rlim)); | ||
245 | atomic_inc(&(INIT_USER->__count)); | ||
246 | write_unlock_irq(&tasklist_lock); | ||
247 | switch_uid(INIT_USER); | ||
248 | } | ||
249 | |||
250 | void __set_special_pids(pid_t session, pid_t pgrp) | ||
251 | { | ||
252 | struct task_struct *curr = current; | ||
253 | |||
254 | if (curr->signal->session != session) { | ||
255 | detach_pid(curr, PIDTYPE_SID); | ||
256 | curr->signal->session = session; | ||
257 | attach_pid(curr, PIDTYPE_SID, session); | ||
258 | } | ||
259 | if (process_group(curr) != pgrp) { | ||
260 | detach_pid(curr, PIDTYPE_PGID); | ||
261 | curr->signal->pgrp = pgrp; | ||
262 | attach_pid(curr, PIDTYPE_PGID, pgrp); | ||
263 | } | ||
264 | } | ||
265 | |||
266 | void set_special_pids(pid_t session, pid_t pgrp) | ||
267 | { | ||
268 | write_lock_irq(&tasklist_lock); | ||
269 | __set_special_pids(session, pgrp); | ||
270 | write_unlock_irq(&tasklist_lock); | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * Let kernel threads use this to say that they | ||
275 | * allow a certain signal (since daemonize() will | ||
276 | * have disabled all of them by default). | ||
277 | */ | ||
278 | int allow_signal(int sig) | ||
279 | { | ||
280 | if (sig < 1 || sig > _NSIG) | ||
281 | return -EINVAL; | ||
282 | |||
283 | spin_lock_irq(¤t->sighand->siglock); | ||
284 | sigdelset(¤t->blocked, sig); | ||
285 | if (!current->mm) { | ||
286 | /* Kernel threads handle their own signals. | ||
287 | Let the signal code know it'll be handled, so | ||
288 | that they don't get converted to SIGKILL or | ||
289 | just silently dropped */ | ||
290 | current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; | ||
291 | } | ||
292 | recalc_sigpending(); | ||
293 | spin_unlock_irq(¤t->sighand->siglock); | ||
294 | return 0; | ||
295 | } | ||
296 | |||
297 | EXPORT_SYMBOL(allow_signal); | ||
298 | |||
299 | int disallow_signal(int sig) | ||
300 | { | ||
301 | if (sig < 1 || sig > _NSIG) | ||
302 | return -EINVAL; | ||
303 | |||
304 | spin_lock_irq(¤t->sighand->siglock); | ||
305 | sigaddset(¤t->blocked, sig); | ||
306 | recalc_sigpending(); | ||
307 | spin_unlock_irq(¤t->sighand->siglock); | ||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | EXPORT_SYMBOL(disallow_signal); | ||
312 | |||
313 | /* | ||
314 | * Put all the gunge required to become a kernel thread without | ||
315 | * attached user resources in one place where it belongs. | ||
316 | */ | ||
317 | |||
318 | void daemonize(const char *name, ...) | ||
319 | { | ||
320 | va_list args; | ||
321 | struct fs_struct *fs; | ||
322 | sigset_t blocked; | ||
323 | |||
324 | va_start(args, name); | ||
325 | vsnprintf(current->comm, sizeof(current->comm), name, args); | ||
326 | va_end(args); | ||
327 | |||
328 | /* | ||
329 | * If we were started as a result of loading a module, close all of the | ||
330 | * user space pages. We don't need them, and if we didn't close them | ||
331 | * they would be locked into memory. | ||
332 | */ | ||
333 | exit_mm(current); | ||
334 | |||
335 | set_special_pids(1, 1); | ||
336 | down(&tty_sem); | ||
337 | current->signal->tty = NULL; | ||
338 | up(&tty_sem); | ||
339 | |||
340 | /* Block and flush all signals */ | ||
341 | sigfillset(&blocked); | ||
342 | sigprocmask(SIG_BLOCK, &blocked, NULL); | ||
343 | flush_signals(current); | ||
344 | |||
345 | /* Become as one with the init task */ | ||
346 | |||
347 | exit_fs(current); /* current->fs->count--; */ | ||
348 | fs = init_task.fs; | ||
349 | current->fs = fs; | ||
350 | atomic_inc(&fs->count); | ||
351 | exit_files(current); | ||
352 | current->files = init_task.files; | ||
353 | atomic_inc(¤t->files->count); | ||
354 | |||
355 | reparent_to_init(); | ||
356 | } | ||
357 | |||
358 | EXPORT_SYMBOL(daemonize); | ||
359 | |||
360 | static inline void close_files(struct files_struct * files) | ||
361 | { | ||
362 | int i, j; | ||
363 | |||
364 | j = 0; | ||
365 | for (;;) { | ||
366 | unsigned long set; | ||
367 | i = j * __NFDBITS; | ||
368 | if (i >= files->max_fdset || i >= files->max_fds) | ||
369 | break; | ||
370 | set = files->open_fds->fds_bits[j++]; | ||
371 | while (set) { | ||
372 | if (set & 1) { | ||
373 | struct file * file = xchg(&files->fd[i], NULL); | ||
374 | if (file) | ||
375 | filp_close(file, files); | ||
376 | } | ||
377 | i++; | ||
378 | set >>= 1; | ||
379 | } | ||
380 | } | ||
381 | } | ||
382 | |||
383 | struct files_struct *get_files_struct(struct task_struct *task) | ||
384 | { | ||
385 | struct files_struct *files; | ||
386 | |||
387 | task_lock(task); | ||
388 | files = task->files; | ||
389 | if (files) | ||
390 | atomic_inc(&files->count); | ||
391 | task_unlock(task); | ||
392 | |||
393 | return files; | ||
394 | } | ||
395 | |||
396 | void fastcall put_files_struct(struct files_struct *files) | ||
397 | { | ||
398 | if (atomic_dec_and_test(&files->count)) { | ||
399 | close_files(files); | ||
400 | /* | ||
401 | * Free the fd and fdset arrays if we expanded them. | ||
402 | */ | ||
403 | if (files->fd != &files->fd_array[0]) | ||
404 | free_fd_array(files->fd, files->max_fds); | ||
405 | if (files->max_fdset > __FD_SETSIZE) { | ||
406 | free_fdset(files->open_fds, files->max_fdset); | ||
407 | free_fdset(files->close_on_exec, files->max_fdset); | ||
408 | } | ||
409 | kmem_cache_free(files_cachep, files); | ||
410 | } | ||
411 | } | ||
412 | |||
413 | EXPORT_SYMBOL(put_files_struct); | ||
414 | |||
415 | static inline void __exit_files(struct task_struct *tsk) | ||
416 | { | ||
417 | struct files_struct * files = tsk->files; | ||
418 | |||
419 | if (files) { | ||
420 | task_lock(tsk); | ||
421 | tsk->files = NULL; | ||
422 | task_unlock(tsk); | ||
423 | put_files_struct(files); | ||
424 | } | ||
425 | } | ||
426 | |||
427 | void exit_files(struct task_struct *tsk) | ||
428 | { | ||
429 | __exit_files(tsk); | ||
430 | } | ||
431 | |||
432 | static inline void __put_fs_struct(struct fs_struct *fs) | ||
433 | { | ||
434 | /* No need to hold fs->lock if we are killing it */ | ||
435 | if (atomic_dec_and_test(&fs->count)) { | ||
436 | dput(fs->root); | ||
437 | mntput(fs->rootmnt); | ||
438 | dput(fs->pwd); | ||
439 | mntput(fs->pwdmnt); | ||
440 | if (fs->altroot) { | ||
441 | dput(fs->altroot); | ||
442 | mntput(fs->altrootmnt); | ||
443 | } | ||
444 | kmem_cache_free(fs_cachep, fs); | ||
445 | } | ||
446 | } | ||
447 | |||
448 | void put_fs_struct(struct fs_struct *fs) | ||
449 | { | ||
450 | __put_fs_struct(fs); | ||
451 | } | ||
452 | |||
453 | static inline void __exit_fs(struct task_struct *tsk) | ||
454 | { | ||
455 | struct fs_struct * fs = tsk->fs; | ||
456 | |||
457 | if (fs) { | ||
458 | task_lock(tsk); | ||
459 | tsk->fs = NULL; | ||
460 | task_unlock(tsk); | ||
461 | __put_fs_struct(fs); | ||
462 | } | ||
463 | } | ||
464 | |||
465 | void exit_fs(struct task_struct *tsk) | ||
466 | { | ||
467 | __exit_fs(tsk); | ||
468 | } | ||
469 | |||
470 | EXPORT_SYMBOL_GPL(exit_fs); | ||
471 | |||
472 | /* | ||
473 | * Turn us into a lazy TLB process if we | ||
474 | * aren't already.. | ||
475 | */ | ||
476 | void exit_mm(struct task_struct * tsk) | ||
477 | { | ||
478 | struct mm_struct *mm = tsk->mm; | ||
479 | |||
480 | mm_release(tsk, mm); | ||
481 | if (!mm) | ||
482 | return; | ||
483 | /* | ||
484 | * Serialize with any possible pending coredump. | ||
485 | * We must hold mmap_sem around checking core_waiters | ||
486 | * and clearing tsk->mm. The core-inducing thread | ||
487 | * will increment core_waiters for each thread in the | ||
488 | * group with ->mm != NULL. | ||
489 | */ | ||
490 | down_read(&mm->mmap_sem); | ||
491 | if (mm->core_waiters) { | ||
492 | up_read(&mm->mmap_sem); | ||
493 | down_write(&mm->mmap_sem); | ||
494 | if (!--mm->core_waiters) | ||
495 | complete(mm->core_startup_done); | ||
496 | up_write(&mm->mmap_sem); | ||
497 | |||
498 | wait_for_completion(&mm->core_done); | ||
499 | down_read(&mm->mmap_sem); | ||
500 | } | ||
501 | atomic_inc(&mm->mm_count); | ||
502 | if (mm != tsk->active_mm) BUG(); | ||
503 | /* more a memory barrier than a real lock */ | ||
504 | task_lock(tsk); | ||
505 | tsk->mm = NULL; | ||
506 | up_read(&mm->mmap_sem); | ||
507 | enter_lazy_tlb(mm, current); | ||
508 | task_unlock(tsk); | ||
509 | mmput(mm); | ||
510 | } | ||
511 | |||
512 | static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) | ||
513 | { | ||
514 | /* | ||
515 | * Make sure we're not reparenting to ourselves and that | ||
516 | * the parent is not a zombie. | ||
517 | */ | ||
518 | BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); | ||
519 | p->real_parent = reaper; | ||
520 | if (p->parent == p->real_parent) | ||
521 | BUG(); | ||
522 | } | ||
523 | |||
524 | static inline void reparent_thread(task_t *p, task_t *father, int traced) | ||
525 | { | ||
526 | /* We don't want people slaying init. */ | ||
527 | if (p->exit_signal != -1) | ||
528 | p->exit_signal = SIGCHLD; | ||
529 | |||
530 | if (p->pdeath_signal) | ||
531 | /* We already hold the tasklist_lock here. */ | ||
532 | group_send_sig_info(p->pdeath_signal, (void *) 0, p); | ||
533 | |||
534 | /* Move the child from its dying parent to the new one. */ | ||
535 | if (unlikely(traced)) { | ||
536 | /* Preserve ptrace links if someone else is tracing this child. */ | ||
537 | list_del_init(&p->ptrace_list); | ||
538 | if (p->parent != p->real_parent) | ||
539 | list_add(&p->ptrace_list, &p->real_parent->ptrace_children); | ||
540 | } else { | ||
541 | /* If this child is being traced, then we're the one tracing it | ||
542 | * anyway, so let go of it. | ||
543 | */ | ||
544 | p->ptrace = 0; | ||
545 | list_del_init(&p->sibling); | ||
546 | p->parent = p->real_parent; | ||
547 | list_add_tail(&p->sibling, &p->parent->children); | ||
548 | |||
549 | /* If we'd notified the old parent about this child's death, | ||
550 | * also notify the new parent. | ||
551 | */ | ||
552 | if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && | ||
553 | thread_group_empty(p)) | ||
554 | do_notify_parent(p, p->exit_signal); | ||
555 | else if (p->state == TASK_TRACED) { | ||
556 | /* | ||
557 | * If it was at a trace stop, turn it into | ||
558 | * a normal stop since it's no longer being | ||
559 | * traced. | ||
560 | */ | ||
561 | ptrace_untrace(p); | ||
562 | } | ||
563 | } | ||
564 | |||
565 | /* | ||
566 | * process group orphan check | ||
567 | * Case ii: Our child is in a different pgrp | ||
568 | * than we are, and it was the only connection | ||
569 | * outside, so the child pgrp is now orphaned. | ||
570 | */ | ||
571 | if ((process_group(p) != process_group(father)) && | ||
572 | (p->signal->session == father->signal->session)) { | ||
573 | int pgrp = process_group(p); | ||
574 | |||
575 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { | ||
576 | __kill_pg_info(SIGHUP, (void *)1, pgrp); | ||
577 | __kill_pg_info(SIGCONT, (void *)1, pgrp); | ||
578 | } | ||
579 | } | ||
580 | } | ||
581 | |||
582 | /* | ||
583 | * When we die, we re-parent all our children. | ||
584 | * Try to give them to another thread in our thread | ||
585 | * group, and if no such member exists, give it to | ||
586 | * the global child reaper process (ie "init") | ||
587 | */ | ||
588 | static inline void forget_original_parent(struct task_struct * father, | ||
589 | struct list_head *to_release) | ||
590 | { | ||
591 | struct task_struct *p, *reaper = father; | ||
592 | struct list_head *_p, *_n; | ||
593 | |||
594 | do { | ||
595 | reaper = next_thread(reaper); | ||
596 | if (reaper == father) { | ||
597 | reaper = child_reaper; | ||
598 | break; | ||
599 | } | ||
600 | } while (reaper->exit_state); | ||
601 | |||
602 | /* | ||
603 | * There are only two places where our children can be: | ||
604 | * | ||
605 | * - in our child list | ||
606 | * - in our ptraced child list | ||
607 | * | ||
608 | * Search them and reparent children. | ||
609 | */ | ||
610 | list_for_each_safe(_p, _n, &father->children) { | ||
611 | int ptrace; | ||
612 | p = list_entry(_p,struct task_struct,sibling); | ||
613 | |||
614 | ptrace = p->ptrace; | ||
615 | |||
616 | /* if father isn't the real parent, then ptrace must be enabled */ | ||
617 | BUG_ON(father != p->real_parent && !ptrace); | ||
618 | |||
619 | if (father == p->real_parent) { | ||
620 | /* reparent to the reaper; the real parent is us */ | ||
621 | choose_new_parent(p, reaper, child_reaper); | ||
622 | reparent_thread(p, father, 0); | ||
623 | } else { | ||
624 | /* reparent ptraced task to its real parent */ | ||
625 | __ptrace_unlink (p); | ||
626 | if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && | ||
627 | thread_group_empty(p)) | ||
628 | do_notify_parent(p, p->exit_signal); | ||
629 | } | ||
630 | |||
631 | /* | ||
632 | * If the ptraced child is a zombie with exit_signal == -1, | ||
633 | * we must collect it before we exit, or it will remain a | ||
634 | * zombie forever, since we prevented it from reaping itself | ||
635 | * while it was being traced by us, so that we could see it in wait4. | ||
636 | */ | ||
637 | if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) | ||
638 | list_add(&p->ptrace_list, to_release); | ||
639 | } | ||
640 | list_for_each_safe(_p, _n, &father->ptrace_children) { | ||
641 | p = list_entry(_p,struct task_struct,ptrace_list); | ||
642 | choose_new_parent(p, reaper, child_reaper); | ||
643 | reparent_thread(p, father, 1); | ||
644 | } | ||
645 | } | ||
646 | |||
647 | /* | ||
648 | * Send signals to all our closest relatives so that they know | ||
649 | * to properly mourn us.. | ||
650 | */ | ||
651 | static void exit_notify(struct task_struct *tsk) | ||
652 | { | ||
653 | int state; | ||
654 | struct task_struct *t; | ||
655 | struct list_head ptrace_dead, *_p, *_n; | ||
656 | |||
657 | if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) | ||
658 | && !thread_group_empty(tsk)) { | ||
659 | /* | ||
660 | * This occurs when there was a race between our exit | ||
661 | * syscall and a group signal choosing us as the one to | ||
662 | * wake up. It could be that we are the only thread | ||
663 | * alerted to check for pending signals, but another thread | ||
664 | * should be woken now to take the signal since we will not. | ||
665 | * Now we'll wake all the threads in the group just to make | ||
666 | * sure someone gets all the pending signals. | ||
667 | */ | ||
668 | read_lock(&tasklist_lock); | ||
669 | spin_lock_irq(&tsk->sighand->siglock); | ||
670 | for (t = next_thread(tsk); t != tsk; t = next_thread(t)) | ||
671 | if (!signal_pending(t) && !(t->flags & PF_EXITING)) { | ||
672 | recalc_sigpending_tsk(t); | ||
673 | if (signal_pending(t)) | ||
674 | signal_wake_up(t, 0); | ||
675 | } | ||
676 | spin_unlock_irq(&tsk->sighand->siglock); | ||
677 | read_unlock(&tasklist_lock); | ||
678 | } | ||
679 | |||
680 | write_lock_irq(&tasklist_lock); | ||
681 | |||
682 | /* | ||
683 | * This does two things: | ||
684 | * | ||
685 | * A. Make init inherit all the child processes | ||
686 | * B. Check to see if any process groups have become orphaned | ||
687 | * as a result of our exiting, and if they have any stopped | ||
688 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
689 | */ | ||
690 | |||
691 | INIT_LIST_HEAD(&ptrace_dead); | ||
692 | forget_original_parent(tsk, &ptrace_dead); | ||
693 | BUG_ON(!list_empty(&tsk->children)); | ||
694 | BUG_ON(!list_empty(&tsk->ptrace_children)); | ||
695 | |||
696 | /* | ||
697 | * Check to see if any process groups have become orphaned | ||
698 | * as a result of our exiting, and if they have any stopped | ||
699 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
700 | * | ||
701 | * Case i: Our father is in a different pgrp than we are | ||
702 | * and we were the only connection outside, so our pgrp | ||
703 | * is about to become orphaned. | ||
704 | */ | ||
705 | |||
706 | t = tsk->real_parent; | ||
707 | |||
708 | if ((process_group(t) != process_group(tsk)) && | ||
709 | (t->signal->session == tsk->signal->session) && | ||
710 | will_become_orphaned_pgrp(process_group(tsk), tsk) && | ||
711 | has_stopped_jobs(process_group(tsk))) { | ||
712 | __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); | ||
713 | __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); | ||
714 | } | ||
715 | |||
716 | /* Let father know we died | ||
717 | * | ||
718 | * Thread signals are configurable, but you aren't going to use | ||
719 | * that to send signals to arbitrary processes. | ||
720 | * That stops right now. | ||
721 | * | ||
722 | * If the parent exec id doesn't match the exec id we saved | ||
723 | * when we started then we know the parent has changed security | ||
724 | * domain. | ||
725 | * | ||
726 | * If our self_exec_id doesn't match our parent_exec_id then | ||
727 | * we have changed execution domain, as these two values started | ||
728 | * out the same after a fork. | ||
729 | * | ||
730 | */ | ||
731 | |||
732 | if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && | ||
733 | ( tsk->parent_exec_id != t->self_exec_id || | ||
734 | tsk->self_exec_id != tsk->parent_exec_id) | ||
735 | && !capable(CAP_KILL)) | ||
736 | tsk->exit_signal = SIGCHLD; | ||
737 | |||
738 | |||
739 | /* If something other than our normal parent is ptracing us, then | ||
740 | * send it a SIGCHLD instead of honoring exit_signal. exit_signal | ||
741 | * only has special meaning to our real parent. | ||
742 | */ | ||
743 | if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { | ||
744 | int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; | ||
745 | do_notify_parent(tsk, signal); | ||
746 | } else if (tsk->ptrace) { | ||
747 | do_notify_parent(tsk, SIGCHLD); | ||
748 | } | ||
749 | |||
750 | state = EXIT_ZOMBIE; | ||
751 | if (tsk->exit_signal == -1 && | ||
752 | (likely(tsk->ptrace == 0) || | ||
753 | unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) | ||
754 | state = EXIT_DEAD; | ||
755 | tsk->exit_state = state; | ||
756 | |||
757 | write_unlock_irq(&tasklist_lock); | ||
758 | |||
759 | list_for_each_safe(_p, _n, &ptrace_dead) { | ||
760 | list_del_init(_p); | ||
761 | t = list_entry(_p,struct task_struct,ptrace_list); | ||
762 | release_task(t); | ||
763 | } | ||
764 | |||
765 | /* If the process is dead, release it - nobody will wait for it */ | ||
766 | if (state == EXIT_DEAD) | ||
767 | release_task(tsk); | ||
768 | |||
769 | /* PF_DEAD causes final put_task_struct after we schedule. */ | ||
770 | preempt_disable(); | ||
771 | tsk->flags |= PF_DEAD; | ||
772 | } | ||
773 | |||
774 | fastcall NORET_TYPE void do_exit(long code) | ||
775 | { | ||
776 | struct task_struct *tsk = current; | ||
777 | int group_dead; | ||
778 | |||
779 | profile_task_exit(tsk); | ||
780 | |||
781 | if (unlikely(in_interrupt())) | ||
782 | panic("Aiee, killing interrupt handler!"); | ||
783 | if (unlikely(!tsk->pid)) | ||
784 | panic("Attempted to kill the idle task!"); | ||
785 | if (unlikely(tsk->pid == 1)) | ||
786 | panic("Attempted to kill init!"); | ||
787 | if (tsk->io_context) | ||
788 | exit_io_context(); | ||
789 | |||
790 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { | ||
791 | current->ptrace_message = code; | ||
792 | ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); | ||
793 | } | ||
794 | |||
795 | tsk->flags |= PF_EXITING; | ||
796 | |||
797 | /* | ||
798 | * Make sure we don't try to process any timer firings | ||
799 | * while we are already exiting. | ||
800 | */ | ||
801 | tsk->it_virt_expires = cputime_zero; | ||
802 | tsk->it_prof_expires = cputime_zero; | ||
803 | tsk->it_sched_expires = 0; | ||
804 | |||
805 | if (unlikely(in_atomic())) | ||
806 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | ||
807 | current->comm, current->pid, | ||
808 | preempt_count()); | ||
809 | |||
810 | acct_update_integrals(tsk); | ||
811 | update_mem_hiwater(tsk); | ||
812 | group_dead = atomic_dec_and_test(&tsk->signal->live); | ||
813 | if (group_dead) { | ||
814 | del_timer_sync(&tsk->signal->real_timer); | ||
815 | acct_process(code); | ||
816 | } | ||
817 | exit_mm(tsk); | ||
818 | |||
819 | exit_sem(tsk); | ||
820 | __exit_files(tsk); | ||
821 | __exit_fs(tsk); | ||
822 | exit_namespace(tsk); | ||
823 | exit_thread(); | ||
824 | cpuset_exit(tsk); | ||
825 | exit_keys(tsk); | ||
826 | |||
827 | if (group_dead && tsk->signal->leader) | ||
828 | disassociate_ctty(1); | ||
829 | |||
830 | module_put(tsk->thread_info->exec_domain->module); | ||
831 | if (tsk->binfmt) | ||
832 | module_put(tsk->binfmt->module); | ||
833 | |||
834 | tsk->exit_code = code; | ||
835 | exit_notify(tsk); | ||
836 | #ifdef CONFIG_NUMA | ||
837 | mpol_free(tsk->mempolicy); | ||
838 | tsk->mempolicy = NULL; | ||
839 | #endif | ||
840 | |||
841 | BUG_ON(!(current->flags & PF_DEAD)); | ||
842 | schedule(); | ||
843 | BUG(); | ||
844 | /* Avoid "noreturn function does return". */ | ||
845 | for (;;) ; | ||
846 | } | ||
847 | |||
848 | NORET_TYPE void complete_and_exit(struct completion *comp, long code) | ||
849 | { | ||
850 | if (comp) | ||
851 | complete(comp); | ||
852 | |||
853 | do_exit(code); | ||
854 | } | ||
855 | |||
856 | EXPORT_SYMBOL(complete_and_exit); | ||
857 | |||
858 | asmlinkage long sys_exit(int error_code) | ||
859 | { | ||
860 | do_exit((error_code&0xff)<<8); | ||
861 | } | ||
862 | |||
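Aside: the (error_code & 0xff) << 8 encoding in sys_exit() above is what the standard wait-status macros undo on the userspace side. A minimal sketch of the round trip (not part of this patch, plain POSIX userspace):

/* Sketch only: the low byte passed to _exit() is masked and shifted by
 * sys_exit() above, and WEXITSTATUS() extracts it again in the parent. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0)
		_exit(44);	/* kernel keeps only the low byte, shifted into bits 8-15 */

	int status;
	if (waitpid(pid, &status, 0) == pid && WIFEXITED(status))
		printf("child exit status: %d\n", WEXITSTATUS(status)); /* prints 44 */
	return 0;
}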
863 | task_t fastcall *next_thread(const task_t *p) | ||
864 | { | ||
865 | return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); | ||
866 | } | ||
867 | |||
868 | EXPORT_SYMBOL(next_thread); | ||
869 | |||
870 | /* | ||
871 | * Take down every thread in the group. This is called by fatal signals | ||
872 | * as well as by sys_exit_group (below). | ||
873 | */ | ||
874 | NORET_TYPE void | ||
875 | do_group_exit(int exit_code) | ||
876 | { | ||
877 | BUG_ON(exit_code & 0x80); /* core dumps don't get here */ | ||
878 | |||
879 | if (current->signal->flags & SIGNAL_GROUP_EXIT) | ||
880 | exit_code = current->signal->group_exit_code; | ||
881 | else if (!thread_group_empty(current)) { | ||
882 | struct signal_struct *const sig = current->signal; | ||
883 | struct sighand_struct *const sighand = current->sighand; | ||
884 | read_lock(&tasklist_lock); | ||
885 | spin_lock_irq(&sighand->siglock); | ||
886 | if (sig->flags & SIGNAL_GROUP_EXIT) | ||
887 | /* Another thread got here before we took the lock. */ | ||
888 | exit_code = sig->group_exit_code; | ||
889 | else { | ||
890 | sig->flags = SIGNAL_GROUP_EXIT; | ||
891 | sig->group_exit_code = exit_code; | ||
892 | zap_other_threads(current); | ||
893 | } | ||
894 | spin_unlock_irq(&sighand->siglock); | ||
895 | read_unlock(&tasklist_lock); | ||
896 | } | ||
897 | |||
898 | do_exit(exit_code); | ||
899 | /* NOTREACHED */ | ||
900 | } | ||
901 | |||
902 | /* | ||
903 | * this kills every thread in the thread group. Note that any externally | ||
904 | * wait4()-ing process will get the correct exit code - even if this | ||
905 | * thread is not the thread group leader. | ||
906 | */ | ||
907 | asmlinkage void sys_exit_group(int error_code) | ||
908 | { | ||
909 | do_group_exit((error_code & 0xff) << 8); | ||
910 | } | ||
911 | |||
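The comment above sys_exit_group() can be observed from userspace: glibc's exit() ends in sys_exit_group, so calling it from any thread takes down the whole group, and a wait4()-ing parent still sees that code even though the exiting thread was not the group leader. A hedged sketch (assumes POSIX threads; link with -lpthread):

/* Sketch only: exit(7) from a secondary thread terminates every thread
 * in the group, and the parent's waitpid() reports status 7. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static void *worker(void *arg)
{
	exit(7);		/* whole thread group exits, not just this thread */
}

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		pthread_t t;
		pthread_create(&t, NULL, worker, NULL);
		pause();	/* the group leader never returns on its own */
	}

	int status;
	if (waitpid(pid, &status, 0) == pid && WIFEXITED(status))
		printf("group exit code: %d\n", WEXITSTATUS(status)); /* prints 7 */
	return 0;
}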
912 | static int eligible_child(pid_t pid, int options, task_t *p) | ||
913 | { | ||
914 | if (pid > 0) { | ||
915 | if (p->pid != pid) | ||
916 | return 0; | ||
917 | } else if (!pid) { | ||
918 | if (process_group(p) != process_group(current)) | ||
919 | return 0; | ||
920 | } else if (pid != -1) { | ||
921 | if (process_group(p) != -pid) | ||
922 | return 0; | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * Do not consider detached threads that are | ||
927 | * not ptraced: | ||
928 | */ | ||
929 | if (p->exit_signal == -1 && !p->ptrace) | ||
930 | return 0; | ||
931 | |||
932 | /* Wait for all children (clone and not) if __WALL is set; | ||
933 | * otherwise, wait for clone children *only* if __WCLONE is | ||
934 | * set; otherwise, wait for non-clone children *only*. (Note: | ||
935 | * A "clone" child here is one that reports to its parent | ||
936 | * using a signal other than SIGCHLD.) */ | ||
937 | if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) | ||
938 | && !(options & __WALL)) | ||
939 | return 0; | ||
940 | /* | ||
941 | * Do not consider thread group leaders that are | ||
942 | * in a non-empty thread group: | ||
943 | */ | ||
944 | if (current->tgid != p->tgid && delay_group_leader(p)) | ||
945 | return 2; | ||
946 | |||
947 | if (security_task_wait(p)) | ||
948 | return 0; | ||
949 | |||
950 | return 1; | ||
951 | } | ||
952 | |||
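The __WCLONE/__WALL test in eligible_child() is visible from userspace: a child created with clone() and no SIGCHLD termination signal is skipped by a plain waitpid() and is only reaped when __WCLONE (or __WALL) is passed. A hedged sketch, assuming glibc's clone() wrapper, the __WCLONE flag exposed by <sys/wait.h> under _GNU_SOURCE, and a downward-growing stack:

/* Sketch only: with termination signal 0 this is a "clone child" in the
 * sense of eligible_child(), so waitpid() needs __WCLONE to see it. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int child_fn(void *arg)
{
	return 42;
}

int main(void)
{
	size_t stack_size = 64 * 1024;
	char *stack = malloc(stack_size);
	/* termination signal = 0 (not SIGCHLD): a "clone child" for wait purposes */
	pid_t pid = clone(child_fn, stack + stack_size, 0, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}

	int status;
	/* waitpid(pid, &status, 0) would not report this child at all */
	if (waitpid(pid, &status, __WCLONE) == pid && WIFEXITED(status))
		printf("clone child exited with %d\n", WEXITSTATUS(status));
	free(stack);
	return 0;
}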
953 | static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, | ||
954 | int why, int status, | ||
955 | struct siginfo __user *infop, | ||
956 | struct rusage __user *rusagep) | ||
957 | { | ||
958 | int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; | ||
959 | put_task_struct(p); | ||
960 | if (!retval) | ||
961 | retval = put_user(SIGCHLD, &infop->si_signo); | ||
962 | if (!retval) | ||
963 | retval = put_user(0, &infop->si_errno); | ||
964 | if (!retval) | ||
965 | retval = put_user((short)why, &infop->si_code); | ||
966 | if (!retval) | ||
967 | retval = put_user(pid, &infop->si_pid); | ||
968 | if (!retval) | ||
969 | retval = put_user(uid, &infop->si_uid); | ||
970 | if (!retval) | ||
971 | retval = put_user(status, &infop->si_status); | ||
972 | if (!retval) | ||
973 | retval = pid; | ||
974 | return retval; | ||
975 | } | ||
976 | |||
977 | /* | ||
978 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold | ||
979 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | ||
980 | * the lock and this task is uninteresting. If we return nonzero, we have | ||
981 | * released the lock and the system call should return. | ||
982 | */ | ||
983 | static int wait_task_zombie(task_t *p, int noreap, | ||
984 | struct siginfo __user *infop, | ||
985 | int __user *stat_addr, struct rusage __user *ru) | ||
986 | { | ||
987 | unsigned long state; | ||
988 | int retval; | ||
989 | int status; | ||
990 | |||
991 | if (unlikely(noreap)) { | ||
992 | pid_t pid = p->pid; | ||
993 | uid_t uid = p->uid; | ||
994 | int exit_code = p->exit_code; | ||
995 | int why, status; | ||
996 | |||
997 | if (unlikely(p->exit_state != EXIT_ZOMBIE)) | ||
998 | return 0; | ||
999 | if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) | ||
1000 | return 0; | ||
1001 | get_task_struct(p); | ||
1002 | read_unlock(&tasklist_lock); | ||
1003 | if ((exit_code & 0x7f) == 0) { | ||
1004 | why = CLD_EXITED; | ||
1005 | status = exit_code >> 8; | ||
1006 | } else { | ||
1007 | why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; | ||
1008 | status = exit_code & 0x7f; | ||
1009 | } | ||
1010 | return wait_noreap_copyout(p, pid, uid, why, | ||
1011 | status, infop, ru); | ||
1012 | } | ||
1013 | |||
1014 | /* | ||
1015 | * Try to move the task's state to DEAD | ||
1016 | * only one thread is allowed to do this: | ||
1017 | */ | ||
1018 | state = xchg(&p->exit_state, EXIT_DEAD); | ||
1019 | if (state != EXIT_ZOMBIE) { | ||
1020 | BUG_ON(state != EXIT_DEAD); | ||
1021 | return 0; | ||
1022 | } | ||
1023 | if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) { | ||
1024 | /* | ||
1025 | * This can only happen in a race with a ptraced thread | ||
1026 | * dying on another processor. | ||
1027 | */ | ||
1028 | return 0; | ||
1029 | } | ||
1030 | |||
1031 | if (likely(p->real_parent == p->parent) && likely(p->signal)) { | ||
1032 | /* | ||
1033 | * The resource counters for the group leader are in its | ||
1034 | * own task_struct. Those for dead threads in the group | ||
1035 | * are in its signal_struct, as are those for the child | ||
1036 | * processes it has previously reaped. All these | ||
1037 | * accumulate in the parent's signal_struct c* fields. | ||
1038 | * | ||
1039 | * We don't bother to take a lock here to protect these | ||
1040 | * p->signal fields, because they are only touched by | ||
1041 | * __exit_signal, which runs with tasklist_lock | ||
1042 | * write-locked anyway, and so is excluded here. We do | ||
1043 | * need to protect the access to p->parent->signal fields, | ||
1044 | * as other threads in the parent group can be right | ||
1045 | * here reaping other children at the same time. | ||
1046 | */ | ||
1047 | spin_lock_irq(&p->parent->sighand->siglock); | ||
1048 | p->parent->signal->cutime = | ||
1049 | cputime_add(p->parent->signal->cutime, | ||
1050 | cputime_add(p->utime, | ||
1051 | cputime_add(p->signal->utime, | ||
1052 | p->signal->cutime))); | ||
1053 | p->parent->signal->cstime = | ||
1054 | cputime_add(p->parent->signal->cstime, | ||
1055 | cputime_add(p->stime, | ||
1056 | cputime_add(p->signal->stime, | ||
1057 | p->signal->cstime))); | ||
1058 | p->parent->signal->cmin_flt += | ||
1059 | p->min_flt + p->signal->min_flt + p->signal->cmin_flt; | ||
1060 | p->parent->signal->cmaj_flt += | ||
1061 | p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt; | ||
1062 | p->parent->signal->cnvcsw += | ||
1063 | p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw; | ||
1064 | p->parent->signal->cnivcsw += | ||
1065 | p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw; | ||
1066 | spin_unlock_irq(&p->parent->sighand->siglock); | ||
1067 | } | ||
1068 | |||
1069 | /* | ||
1070 | * Now we are sure this task is interesting, and no other | ||
1071 | * thread can reap it because we set its state to EXIT_DEAD. | ||
1072 | */ | ||
1073 | read_unlock(&tasklist_lock); | ||
1074 | |||
1075 | retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; | ||
1076 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | ||
1077 | ? p->signal->group_exit_code : p->exit_code; | ||
1078 | if (!retval && stat_addr) | ||
1079 | retval = put_user(status, stat_addr); | ||
1080 | if (!retval && infop) | ||
1081 | retval = put_user(SIGCHLD, &infop->si_signo); | ||
1082 | if (!retval && infop) | ||
1083 | retval = put_user(0, &infop->si_errno); | ||
1084 | if (!retval && infop) { | ||
1085 | int why; | ||
1086 | |||
1087 | if ((status & 0x7f) == 0) { | ||
1088 | why = CLD_EXITED; | ||
1089 | status >>= 8; | ||
1090 | } else { | ||
1091 | why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; | ||
1092 | status &= 0x7f; | ||
1093 | } | ||
1094 | retval = put_user((short)why, &infop->si_code); | ||
1095 | if (!retval) | ||
1096 | retval = put_user(status, &infop->si_status); | ||
1097 | } | ||
1098 | if (!retval && infop) | ||
1099 | retval = put_user(p->pid, &infop->si_pid); | ||
1100 | if (!retval && infop) | ||
1101 | retval = put_user(p->uid, &infop->si_uid); | ||
1102 | if (retval) { | ||
1103 | // TODO: is this safe? | ||
1104 | p->exit_state = EXIT_ZOMBIE; | ||
1105 | return retval; | ||
1106 | } | ||
1107 | retval = p->pid; | ||
1108 | if (p->real_parent != p->parent) { | ||
1109 | write_lock_irq(&tasklist_lock); | ||
1110 | /* Double-check with lock held. */ | ||
1111 | if (p->real_parent != p->parent) { | ||
1112 | __ptrace_unlink(p); | ||
1113 | // TODO: is this safe? | ||
1114 | p->exit_state = EXIT_ZOMBIE; | ||
1115 | /* | ||
1116 | * If this is not a detached task, notify the parent. | ||
1117 | * If it's still not detached after that, don't release | ||
1118 | * it now. | ||
1119 | */ | ||
1120 | if (p->exit_signal != -1) { | ||
1121 | do_notify_parent(p, p->exit_signal); | ||
1122 | if (p->exit_signal != -1) | ||
1123 | p = NULL; | ||
1124 | } | ||
1125 | } | ||
1126 | write_unlock_irq(&tasklist_lock); | ||
1127 | } | ||
1128 | if (p != NULL) | ||
1129 | release_task(p); | ||
1130 | BUG_ON(!retval); | ||
1131 | return retval; | ||
1132 | } | ||
1133 | |||
1134 | /* | ||
1135 | * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold | ||
1136 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | ||
1137 | * the lock and this task is uninteresting. If we return nonzero, we have | ||
1138 | * released the lock and the system call should return. | ||
1139 | */ | ||
1140 | static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, | ||
1141 | struct siginfo __user *infop, | ||
1142 | int __user *stat_addr, struct rusage __user *ru) | ||
1143 | { | ||
1144 | int retval, exit_code; | ||
1145 | |||
1146 | if (!p->exit_code) | ||
1147 | return 0; | ||
1148 | if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && | ||
1149 | p->signal && p->signal->group_stop_count > 0) | ||
1150 | /* | ||
1151 | * A group stop is in progress and this is the group leader. | ||
1152 | * We won't report until all threads have stopped. | ||
1153 | */ | ||
1154 | return 0; | ||
1155 | |||
1156 | /* | ||
1157 | * Now we are pretty sure this task is interesting. | ||
1158 | * Make sure it doesn't get reaped out from under us while we | ||
1159 | * give up the lock and then examine it below. We don't want to | ||
1160 | * keep holding onto the tasklist_lock while we call getrusage and | ||
1161 | * possibly take page faults for user memory. | ||
1162 | */ | ||
1163 | get_task_struct(p); | ||
1164 | read_unlock(&tasklist_lock); | ||
1165 | |||
1166 | if (unlikely(noreap)) { | ||
1167 | pid_t pid = p->pid; | ||
1168 | uid_t uid = p->uid; | ||
1169 | int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; | ||
1170 | |||
1171 | exit_code = p->exit_code; | ||
1172 | if (unlikely(!exit_code) || | ||
1173 | unlikely(p->state > TASK_STOPPED)) | ||
1174 | goto bail_ref; | ||
1175 | return wait_noreap_copyout(p, pid, uid, | ||
1176 | why, (exit_code << 8) | 0x7f, | ||
1177 | infop, ru); | ||
1178 | } | ||
1179 | |||
1180 | write_lock_irq(&tasklist_lock); | ||
1181 | |||
1182 | /* | ||
1183 | * This uses xchg to be atomic with the thread resuming and setting | ||
1184 | * it. It must also be done with the write lock held to prevent a | ||
1185 | * race with the EXIT_ZOMBIE case. | ||
1186 | */ | ||
1187 | exit_code = xchg(&p->exit_code, 0); | ||
1188 | if (unlikely(p->exit_state)) { | ||
1189 | /* | ||
1190 | * The task resumed and then died. Let the next iteration | ||
1191 | * catch it in EXIT_ZOMBIE. Note that exit_code might | ||
1192 | * already be zero here if it resumed and did _exit(0). | ||
1193 | * The task itself is dead and won't touch exit_code again; | ||
1194 | * other processors in this function are locked out. | ||
1195 | */ | ||
1196 | p->exit_code = exit_code; | ||
1197 | exit_code = 0; | ||
1198 | } | ||
1199 | if (unlikely(exit_code == 0)) { | ||
1200 | /* | ||
1201 | * Another thread in this function got to it first, or it | ||
1202 | * resumed, or it resumed and then died. | ||
1203 | */ | ||
1204 | write_unlock_irq(&tasklist_lock); | ||
1205 | bail_ref: | ||
1206 | put_task_struct(p); | ||
1207 | /* | ||
1208 | * We are returning to the wait loop without having successfully | ||
1209 | * removed the process and having released the lock. We cannot | ||
1210 | * continue, since the "p" task pointer is potentially stale. | ||
1211 | * | ||
1212 | * Return -EAGAIN, and do_wait() will restart the loop from the | ||
1213 | * beginning. Do _not_ re-acquire the lock. | ||
1214 | */ | ||
1215 | return -EAGAIN; | ||
1216 | } | ||
1217 | |||
1218 | /* move to end of parent's list to avoid starvation */ | ||
1219 | remove_parent(p); | ||
1220 | add_parent(p, p->parent); | ||
1221 | |||
1222 | write_unlock_irq(&tasklist_lock); | ||
1223 | |||
1224 | retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; | ||
1225 | if (!retval && stat_addr) | ||
1226 | retval = put_user((exit_code << 8) | 0x7f, stat_addr); | ||
1227 | if (!retval && infop) | ||
1228 | retval = put_user(SIGCHLD, &infop->si_signo); | ||
1229 | if (!retval && infop) | ||
1230 | retval = put_user(0, &infop->si_errno); | ||
1231 | if (!retval && infop) | ||
1232 | retval = put_user((short)((p->ptrace & PT_PTRACED) | ||
1233 | ? CLD_TRAPPED : CLD_STOPPED), | ||
1234 | &infop->si_code); | ||
1235 | if (!retval && infop) | ||
1236 | retval = put_user(exit_code, &infop->si_status); | ||
1237 | if (!retval && infop) | ||
1238 | retval = put_user(p->pid, &infop->si_pid); | ||
1239 | if (!retval && infop) | ||
1240 | retval = put_user(p->uid, &infop->si_uid); | ||
1241 | if (!retval) | ||
1242 | retval = p->pid; | ||
1243 | put_task_struct(p); | ||
1244 | |||
1245 | BUG_ON(!retval); | ||
1246 | return retval; | ||
1247 | } | ||
1248 | |||
1249 | /* | ||
1250 | * Handle do_wait work for one task in a live, non-stopped state. | ||
1251 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | ||
1252 | * the lock and this task is uninteresting. If we return nonzero, we have | ||
1253 | * released the lock and the system call should return. | ||
1254 | */ | ||
1255 | static int wait_task_continued(task_t *p, int noreap, | ||
1256 | struct siginfo __user *infop, | ||
1257 | int __user *stat_addr, struct rusage __user *ru) | ||
1258 | { | ||
1259 | int retval; | ||
1260 | pid_t pid; | ||
1261 | uid_t uid; | ||
1262 | |||
1263 | if (unlikely(!p->signal)) | ||
1264 | return 0; | ||
1265 | |||
1266 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) | ||
1267 | return 0; | ||
1268 | |||
1269 | spin_lock_irq(&p->sighand->siglock); | ||
1270 | /* Re-check with the lock held. */ | ||
1271 | if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { | ||
1272 | spin_unlock_irq(&p->sighand->siglock); | ||
1273 | return 0; | ||
1274 | } | ||
1275 | if (!noreap) | ||
1276 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | ||
1277 | spin_unlock_irq(&p->sighand->siglock); | ||
1278 | |||
1279 | pid = p->pid; | ||
1280 | uid = p->uid; | ||
1281 | get_task_struct(p); | ||
1282 | read_unlock(&tasklist_lock); | ||
1283 | |||
1284 | if (!infop) { | ||
1285 | retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; | ||
1286 | put_task_struct(p); | ||
1287 | if (!retval && stat_addr) | ||
1288 | retval = put_user(0xffff, stat_addr); | ||
1289 | if (!retval) | ||
1290 | retval = p->pid; | ||
1291 | } else { | ||
1292 | retval = wait_noreap_copyout(p, pid, uid, | ||
1293 | CLD_CONTINUED, SIGCONT, | ||
1294 | infop, ru); | ||
1295 | BUG_ON(retval == 0); | ||
1296 | } | ||
1297 | |||
1298 | return retval; | ||
1299 | } | ||
1300 | |||
1301 | |||
1302 | static inline int my_ptrace_child(struct task_struct *p) | ||
1303 | { | ||
1304 | if (!(p->ptrace & PT_PTRACED)) | ||
1305 | return 0; | ||
1306 | if (!(p->ptrace & PT_ATTACHED)) | ||
1307 | return 1; | ||
1308 | /* | ||
1309 | * This child was PTRACE_ATTACH'd. We should be seeing it only if | ||
1310 | * we are the attacher. If we are the real parent, this is a race | ||
1311 | * inside ptrace_attach. It is waiting for the tasklist_lock, | ||
1312 | * which we hold, in order to switch the parent links, but it has | ||
1313 | * already set the flags in p->ptrace. | ||
1314 | */ | ||
1315 | return (p->parent != p->real_parent); | ||
1316 | } | ||
1317 | |||
1318 | static long do_wait(pid_t pid, int options, struct siginfo __user *infop, | ||
1319 | int __user *stat_addr, struct rusage __user *ru) | ||
1320 | { | ||
1321 | DECLARE_WAITQUEUE(wait, current); | ||
1322 | struct task_struct *tsk; | ||
1323 | int flag, retval; | ||
1324 | |||
1325 | add_wait_queue(¤t->signal->wait_chldexit,&wait); | ||
1326 | repeat: | ||
1327 | /* | ||
1328 | * We will set this flag if we see any child that might later | ||
1329 | * match our criteria, even if we are not able to reap it yet. | ||
1330 | */ | ||
1331 | flag = 0; | ||
1332 | current->state = TASK_INTERRUPTIBLE; | ||
1333 | read_lock(&tasklist_lock); | ||
1334 | tsk = current; | ||
1335 | do { | ||
1336 | struct task_struct *p; | ||
1337 | struct list_head *_p; | ||
1338 | int ret; | ||
1339 | |||
1340 | list_for_each(_p,&tsk->children) { | ||
1341 | p = list_entry(_p,struct task_struct,sibling); | ||
1342 | |||
1343 | ret = eligible_child(pid, options, p); | ||
1344 | if (!ret) | ||
1345 | continue; | ||
1346 | |||
1347 | switch (p->state) { | ||
1348 | case TASK_TRACED: | ||
1349 | if (!my_ptrace_child(p)) | ||
1350 | continue; | ||
1351 | /*FALLTHROUGH*/ | ||
1352 | case TASK_STOPPED: | ||
1353 | /* | ||
1354 | * It's stopped now, so it might later | ||
1355 | * continue, exit, or stop again. | ||
1356 | */ | ||
1357 | flag = 1; | ||
1358 | if (!(options & WUNTRACED) && | ||
1359 | !my_ptrace_child(p)) | ||
1360 | continue; | ||
1361 | retval = wait_task_stopped(p, ret == 2, | ||
1362 | (options & WNOWAIT), | ||
1363 | infop, | ||
1364 | stat_addr, ru); | ||
1365 | if (retval == -EAGAIN) | ||
1366 | goto repeat; | ||
1367 | if (retval != 0) /* He released the lock. */ | ||
1368 | goto end; | ||
1369 | break; | ||
1370 | default: | ||
1371 | // case EXIT_DEAD: | ||
1372 | if (p->exit_state == EXIT_DEAD) | ||
1373 | continue; | ||
1374 | // case EXIT_ZOMBIE: | ||
1375 | if (p->exit_state == EXIT_ZOMBIE) { | ||
1376 | /* | ||
1377 | * Eligible but we cannot release | ||
1378 | * it yet: | ||
1379 | */ | ||
1380 | if (ret == 2) | ||
1381 | goto check_continued; | ||
1382 | if (!likely(options & WEXITED)) | ||
1383 | continue; | ||
1384 | retval = wait_task_zombie( | ||
1385 | p, (options & WNOWAIT), | ||
1386 | infop, stat_addr, ru); | ||
1387 | /* He released the lock. */ | ||
1388 | if (retval != 0) | ||
1389 | goto end; | ||
1390 | break; | ||
1391 | } | ||
1392 | check_continued: | ||
1393 | /* | ||
1394 | * It's running now, so it might later | ||
1395 | * exit, stop, or stop and then continue. | ||
1396 | */ | ||
1397 | flag = 1; | ||
1398 | if (!unlikely(options & WCONTINUED)) | ||
1399 | continue; | ||
1400 | retval = wait_task_continued( | ||
1401 | p, (options & WNOWAIT), | ||
1402 | infop, stat_addr, ru); | ||
1403 | if (retval != 0) /* He released the lock. */ | ||
1404 | goto end; | ||
1405 | break; | ||
1406 | } | ||
1407 | } | ||
1408 | if (!flag) { | ||
1409 | list_for_each(_p, &tsk->ptrace_children) { | ||
1410 | p = list_entry(_p, struct task_struct, | ||
1411 | ptrace_list); | ||
1412 | if (!eligible_child(pid, options, p)) | ||
1413 | continue; | ||
1414 | flag = 1; | ||
1415 | break; | ||
1416 | } | ||
1417 | } | ||
1418 | if (options & __WNOTHREAD) | ||
1419 | break; | ||
1420 | tsk = next_thread(tsk); | ||
1421 | if (tsk->signal != current->signal) | ||
1422 | BUG(); | ||
1423 | } while (tsk != current); | ||
1424 | |||
1425 | read_unlock(&tasklist_lock); | ||
1426 | if (flag) { | ||
1427 | retval = 0; | ||
1428 | if (options & WNOHANG) | ||
1429 | goto end; | ||
1430 | retval = -ERESTARTSYS; | ||
1431 | if (signal_pending(current)) | ||
1432 | goto end; | ||
1433 | schedule(); | ||
1434 | goto repeat; | ||
1435 | } | ||
1436 | retval = -ECHILD; | ||
1437 | end: | ||
1438 | current->state = TASK_RUNNING; | ||
1439 | remove_wait_queue(¤t->signal->wait_chldexit,&wait); | ||
1440 | if (infop) { | ||
1441 | if (retval > 0) | ||
1442 | retval = 0; | ||
1443 | else { | ||
1444 | /* | ||
1445 | * For a WNOHANG return, clear out all the fields | ||
1446 | * we would set so the user can easily tell the | ||
1447 | * difference. | ||
1448 | */ | ||
1449 | if (!retval) | ||
1450 | retval = put_user(0, &infop->si_signo); | ||
1451 | if (!retval) | ||
1452 | retval = put_user(0, &infop->si_errno); | ||
1453 | if (!retval) | ||
1454 | retval = put_user(0, &infop->si_code); | ||
1455 | if (!retval) | ||
1456 | retval = put_user(0, &infop->si_pid); | ||
1457 | if (!retval) | ||
1458 | retval = put_user(0, &infop->si_uid); | ||
1459 | if (!retval) | ||
1460 | retval = put_user(0, &infop->si_status); | ||
1461 | } | ||
1462 | } | ||
1463 | return retval; | ||
1464 | } | ||
1465 | |||
1466 | asmlinkage long sys_waitid(int which, pid_t pid, | ||
1467 | struct siginfo __user *infop, int options, | ||
1468 | struct rusage __user *ru) | ||
1469 | { | ||
1470 | long ret; | ||
1471 | |||
1472 | if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) | ||
1473 | return -EINVAL; | ||
1474 | if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) | ||
1475 | return -EINVAL; | ||
1476 | |||
1477 | switch (which) { | ||
1478 | case P_ALL: | ||
1479 | pid = -1; | ||
1480 | break; | ||
1481 | case P_PID: | ||
1482 | if (pid <= 0) | ||
1483 | return -EINVAL; | ||
1484 | break; | ||
1485 | case P_PGID: | ||
1486 | if (pid <= 0) | ||
1487 | return -EINVAL; | ||
1488 | pid = -pid; | ||
1489 | break; | ||
1490 | default: | ||
1491 | return -EINVAL; | ||
1492 | } | ||
1493 | |||
1494 | ret = do_wait(pid, options, infop, NULL, ru); | ||
1495 | |||
1496 | /* avoid REGPARM breakage on x86: */ | ||
1497 | prevent_tail_call(ret); | ||
1498 | return ret; | ||
1499 | } | ||
1500 | |||
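The infop-clearing at the end of do_wait() above is what lets a waitid() caller using WNOHANG tell "no child ready yet" (si_pid left at 0) apart from a real report. A small userspace sketch (not part of this patch):

/* Sketch only: with WNOHANG, waitid() returns 0 either way; a zeroed
 * si_pid means no child had anything to report yet. */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		sleep(1);
		_exit(3);
	}

	siginfo_t info;
	memset(&info, 0, sizeof(info));

	/* Non-blocking poll: si_pid stays 0 because the child is still running. */
	waitid(P_ALL, 0, &info, WEXITED | WNOHANG);
	printf("after WNOHANG poll: si_pid = %d\n", (int)info.si_pid);

	/* Blocking wait: now si_pid/si_status are filled in. */
	waitid(P_ALL, 0, &info, WEXITED);
	printf("child %d exited with status %d\n",
	       (int)info.si_pid, info.si_status);
	return 0;
}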
1501 | asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, | ||
1502 | int options, struct rusage __user *ru) | ||
1503 | { | ||
1504 | long ret; | ||
1505 | |||
1506 | if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| | ||
1507 | __WNOTHREAD|__WCLONE|__WALL)) | ||
1508 | return -EINVAL; | ||
1509 | ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); | ||
1510 | |||
1511 | /* avoid REGPARM breakage on x86: */ | ||
1512 | prevent_tail_call(ret); | ||
1513 | return ret; | ||
1514 | } | ||
1515 | |||
1516 | #ifdef __ARCH_WANT_SYS_WAITPID | ||
1517 | |||
1518 | /* | ||
1519 | * sys_waitpid() remains for compatibility. waitpid() should be | ||
1520 | * implemented by calling sys_wait4() from libc.a. | ||
1521 | */ | ||
1522 | asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) | ||
1523 | { | ||
1524 | return sys_wait4(pid, stat_addr, options, NULL); | ||
1525 | } | ||
1526 | |||
1527 | #endif | ||
diff --git a/kernel/extable.c b/kernel/extable.c new file mode 100644 index 000000000000..7501b531ceed --- /dev/null +++ b/kernel/extable.c | |||
@@ -0,0 +1,67 @@ | |||
1 | /* Rewritten by Rusty Russell, on the backs of many others... | ||
2 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <asm/uaccess.h> | ||
21 | #include <asm/sections.h> | ||
22 | |||
23 | extern struct exception_table_entry __start___ex_table[]; | ||
24 | extern struct exception_table_entry __stop___ex_table[]; | ||
25 | |||
26 | /* Sort the kernel's built-in exception table */ | ||
27 | void __init sort_main_extable(void) | ||
28 | { | ||
29 | sort_extable(__start___ex_table, __stop___ex_table); | ||
30 | } | ||
31 | |||
32 | /* Given an address, look for it in the exception tables. */ | ||
33 | const struct exception_table_entry *search_exception_tables(unsigned long addr) | ||
34 | { | ||
35 | const struct exception_table_entry *e; | ||
36 | |||
37 | e = search_extable(__start___ex_table, __stop___ex_table-1, addr); | ||
38 | if (!e) | ||
39 | e = search_module_extables(addr); | ||
40 | return e; | ||
41 | } | ||
42 | |||
43 | static int core_kernel_text(unsigned long addr) | ||
44 | { | ||
45 | if (addr >= (unsigned long)_stext && | ||
46 | addr <= (unsigned long)_etext) | ||
47 | return 1; | ||
48 | |||
49 | if (addr >= (unsigned long)_sinittext && | ||
50 | addr <= (unsigned long)_einittext) | ||
51 | return 1; | ||
52 | return 0; | ||
53 | } | ||
54 | |||
55 | int __kernel_text_address(unsigned long addr) | ||
56 | { | ||
57 | if (core_kernel_text(addr)) | ||
58 | return 1; | ||
59 | return __module_text_address(addr) != NULL; | ||
60 | } | ||
61 | |||
62 | int kernel_text_address(unsigned long addr) | ||
63 | { | ||
64 | if (core_kernel_text(addr)) | ||
65 | return 1; | ||
66 | return module_text_address(addr) != NULL; | ||
67 | } | ||
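sort_main_extable() above exists so that lookups can binary-search the (insn, fixup) pairs instead of scanning them. A simplified, self-contained sketch of that kind of lookup follows; the struct layout and function names here are illustrative only, not the kernel's actual search_extable():

/* Simplified sketch of a sorted-exception-table lookup. The table must be
 * sorted by ->insn, which is what sort_main_extable() guarantees for the
 * kernel's real table. Returns NULL if addr has no fixup. */
#include <stdio.h>

struct ex_entry {
	unsigned long insn;	/* faulting instruction address */
	unsigned long fixup;	/* address to resume at */
};

static const struct ex_entry *
ex_search(const struct ex_entry *first, const struct ex_entry *last,
	  unsigned long addr)
{
	while (first <= last) {
		const struct ex_entry *mid = first + (last - first) / 2;

		if (mid->insn == addr)
			return mid;
		if (mid->insn < addr)
			first = mid + 1;
		else
			last = mid - 1;
	}
	return NULL;
}

int main(void)
{
	static const struct ex_entry table[] = {
		{ 0x1000, 0x5000 }, { 0x1008, 0x5010 }, { 0x1020, 0x5020 },
	};
	const struct ex_entry *e = ex_search(table, table + 2, 0x1008);

	printf("fixup: %#lx\n", e ? e->fixup : 0UL);
	return 0;
}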
diff --git a/kernel/fork.c b/kernel/fork.c new file mode 100644 index 000000000000..f42a17f88699 --- /dev/null +++ b/kernel/fork.c | |||
@@ -0,0 +1,1274 @@ | |||
1 | /* | ||
2 | * linux/kernel/fork.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * 'fork.c' contains the help-routines for the 'fork' system call | ||
9 | * (see also entry.S and others). | ||
10 | * Fork is rather simple, once you get the hang of it, but the memory | ||
11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' | ||
12 | */ | ||
13 | |||
14 | #include <linux/config.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/unistd.h> | ||
18 | #include <linux/smp_lock.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/vmalloc.h> | ||
21 | #include <linux/completion.h> | ||
22 | #include <linux/namespace.h> | ||
23 | #include <linux/personality.h> | ||
24 | #include <linux/mempolicy.h> | ||
25 | #include <linux/sem.h> | ||
26 | #include <linux/file.h> | ||
27 | #include <linux/key.h> | ||
28 | #include <linux/binfmts.h> | ||
29 | #include <linux/mman.h> | ||
30 | #include <linux/fs.h> | ||
31 | #include <linux/cpu.h> | ||
32 | #include <linux/cpuset.h> | ||
33 | #include <linux/security.h> | ||
34 | #include <linux/swap.h> | ||
35 | #include <linux/syscalls.h> | ||
36 | #include <linux/jiffies.h> | ||
37 | #include <linux/futex.h> | ||
38 | #include <linux/ptrace.h> | ||
39 | #include <linux/mount.h> | ||
40 | #include <linux/audit.h> | ||
41 | #include <linux/profile.h> | ||
42 | #include <linux/rmap.h> | ||
43 | #include <linux/acct.h> | ||
44 | |||
45 | #include <asm/pgtable.h> | ||
46 | #include <asm/pgalloc.h> | ||
47 | #include <asm/uaccess.h> | ||
48 | #include <asm/mmu_context.h> | ||
49 | #include <asm/cacheflush.h> | ||
50 | #include <asm/tlbflush.h> | ||
51 | |||
52 | /* | ||
53 | * Counters protected by write_lock_irq(&tasklist_lock) | ||
54 | */ | ||
55 | unsigned long total_forks; /* Handle normal Linux uptimes. */ | ||
56 | int nr_threads; /* The idle threads do not count.. */ | ||
57 | |||
58 | int max_threads; /* tunable limit on nr_threads */ | ||
59 | |||
60 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; | ||
61 | |||
62 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ | ||
63 | |||
64 | EXPORT_SYMBOL(tasklist_lock); | ||
65 | |||
66 | int nr_processes(void) | ||
67 | { | ||
68 | int cpu; | ||
69 | int total = 0; | ||
70 | |||
71 | for_each_online_cpu(cpu) | ||
72 | total += per_cpu(process_counts, cpu); | ||
73 | |||
74 | return total; | ||
75 | } | ||
76 | |||
77 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | ||
78 | # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) | ||
79 | # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) | ||
80 | static kmem_cache_t *task_struct_cachep; | ||
81 | #endif | ||
82 | |||
83 | /* SLAB cache for signal_struct structures (tsk->signal) */ | ||
84 | kmem_cache_t *signal_cachep; | ||
85 | |||
86 | /* SLAB cache for sighand_struct structures (tsk->sighand) */ | ||
87 | kmem_cache_t *sighand_cachep; | ||
88 | |||
89 | /* SLAB cache for files_struct structures (tsk->files) */ | ||
90 | kmem_cache_t *files_cachep; | ||
91 | |||
92 | /* SLAB cache for fs_struct structures (tsk->fs) */ | ||
93 | kmem_cache_t *fs_cachep; | ||
94 | |||
95 | /* SLAB cache for vm_area_struct structures */ | ||
96 | kmem_cache_t *vm_area_cachep; | ||
97 | |||
98 | /* SLAB cache for mm_struct structures (tsk->mm) */ | ||
99 | static kmem_cache_t *mm_cachep; | ||
100 | |||
101 | void free_task(struct task_struct *tsk) | ||
102 | { | ||
103 | free_thread_info(tsk->thread_info); | ||
104 | free_task_struct(tsk); | ||
105 | } | ||
106 | EXPORT_SYMBOL(free_task); | ||
107 | |||
108 | void __put_task_struct(struct task_struct *tsk) | ||
109 | { | ||
110 | WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); | ||
111 | WARN_ON(atomic_read(&tsk->usage)); | ||
112 | WARN_ON(tsk == current); | ||
113 | |||
114 | if (unlikely(tsk->audit_context)) | ||
115 | audit_free(tsk); | ||
116 | security_task_free(tsk); | ||
117 | free_uid(tsk->user); | ||
118 | put_group_info(tsk->group_info); | ||
119 | |||
120 | if (!profile_handoff_task(tsk)) | ||
121 | free_task(tsk); | ||
122 | } | ||
123 | |||
124 | void __init fork_init(unsigned long mempages) | ||
125 | { | ||
126 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | ||
127 | #ifndef ARCH_MIN_TASKALIGN | ||
128 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | ||
129 | #endif | ||
130 | /* create a slab on which task_structs can be allocated */ | ||
131 | task_struct_cachep = | ||
132 | kmem_cache_create("task_struct", sizeof(struct task_struct), | ||
133 | ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); | ||
134 | #endif | ||
135 | |||
136 | /* | ||
137 | * The default maximum number of threads is set to a safe | ||
138 | * value: the thread structures can take up at most half | ||
139 | * of memory. | ||
140 | */ | ||
141 | max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); | ||
142 | |||
143 | /* | ||
144 | * we need to allow at least 20 threads to boot a system | ||
145 | */ | ||
146 | if(max_threads < 20) | ||
147 | max_threads = 20; | ||
148 | |||
149 | init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; | ||
150 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; | ||
151 | init_task.signal->rlim[RLIMIT_SIGPENDING] = | ||
152 | init_task.signal->rlim[RLIMIT_NPROC]; | ||
153 | } | ||
154 | |||
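As a rough worked example of the max_threads formula above (assuming 4 KiB pages and an 8 KiB THREAD_SIZE purely for illustration; both vary by architecture): a machine with 512 MiB of memory has 131072 pages, the divisor is 8 * 8192 / 4096 = 16, so max_threads = 131072 / 16 = 8192, and RLIMIT_NPROC defaults to half of that, 4096.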
155 | static struct task_struct *dup_task_struct(struct task_struct *orig) | ||
156 | { | ||
157 | struct task_struct *tsk; | ||
158 | struct thread_info *ti; | ||
159 | |||
160 | prepare_to_copy(orig); | ||
161 | |||
162 | tsk = alloc_task_struct(); | ||
163 | if (!tsk) | ||
164 | return NULL; | ||
165 | |||
166 | ti = alloc_thread_info(tsk); | ||
167 | if (!ti) { | ||
168 | free_task_struct(tsk); | ||
169 | return NULL; | ||
170 | } | ||
171 | |||
172 | *ti = *orig->thread_info; | ||
173 | *tsk = *orig; | ||
174 | tsk->thread_info = ti; | ||
175 | ti->task = tsk; | ||
176 | |||
177 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | ||
178 | atomic_set(&tsk->usage,2); | ||
179 | return tsk; | ||
180 | } | ||
181 | |||
182 | #ifdef CONFIG_MMU | ||
183 | static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | ||
184 | { | ||
185 | struct vm_area_struct * mpnt, *tmp, **pprev; | ||
186 | struct rb_node **rb_link, *rb_parent; | ||
187 | int retval; | ||
188 | unsigned long charge; | ||
189 | struct mempolicy *pol; | ||
190 | |||
191 | down_write(&oldmm->mmap_sem); | ||
192 | flush_cache_mm(current->mm); | ||
193 | mm->locked_vm = 0; | ||
194 | mm->mmap = NULL; | ||
195 | mm->mmap_cache = NULL; | ||
196 | mm->free_area_cache = oldmm->mmap_base; | ||
197 | mm->map_count = 0; | ||
198 | set_mm_counter(mm, rss, 0); | ||
199 | set_mm_counter(mm, anon_rss, 0); | ||
200 | cpus_clear(mm->cpu_vm_mask); | ||
201 | mm->mm_rb = RB_ROOT; | ||
202 | rb_link = &mm->mm_rb.rb_node; | ||
203 | rb_parent = NULL; | ||
204 | pprev = &mm->mmap; | ||
205 | |||
206 | for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { | ||
207 | struct file *file; | ||
208 | |||
209 | if (mpnt->vm_flags & VM_DONTCOPY) { | ||
210 | __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, | ||
211 | -vma_pages(mpnt)); | ||
212 | continue; | ||
213 | } | ||
214 | charge = 0; | ||
215 | if (mpnt->vm_flags & VM_ACCOUNT) { | ||
216 | unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | ||
217 | if (security_vm_enough_memory(len)) | ||
218 | goto fail_nomem; | ||
219 | charge = len; | ||
220 | } | ||
221 | tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | ||
222 | if (!tmp) | ||
223 | goto fail_nomem; | ||
224 | *tmp = *mpnt; | ||
225 | pol = mpol_copy(vma_policy(mpnt)); | ||
226 | retval = PTR_ERR(pol); | ||
227 | if (IS_ERR(pol)) | ||
228 | goto fail_nomem_policy; | ||
229 | vma_set_policy(tmp, pol); | ||
230 | tmp->vm_flags &= ~VM_LOCKED; | ||
231 | tmp->vm_mm = mm; | ||
232 | tmp->vm_next = NULL; | ||
233 | anon_vma_link(tmp); | ||
234 | file = tmp->vm_file; | ||
235 | if (file) { | ||
236 | struct inode *inode = file->f_dentry->d_inode; | ||
237 | get_file(file); | ||
238 | if (tmp->vm_flags & VM_DENYWRITE) | ||
239 | atomic_dec(&inode->i_writecount); | ||
240 | |||
241 | /* insert tmp into the share list, just after mpnt */ | ||
242 | spin_lock(&file->f_mapping->i_mmap_lock); | ||
243 | tmp->vm_truncate_count = mpnt->vm_truncate_count; | ||
244 | flush_dcache_mmap_lock(file->f_mapping); | ||
245 | vma_prio_tree_add(tmp, mpnt); | ||
246 | flush_dcache_mmap_unlock(file->f_mapping); | ||
247 | spin_unlock(&file->f_mapping->i_mmap_lock); | ||
248 | } | ||
249 | |||
250 | /* | ||
251 | * Link in the new vma and copy the page table entries: | ||
252 | * link in first so that swapoff can see swap entries, | ||
253 | * and try_to_unmap_one's find_vma find the new vma. | ||
254 | */ | ||
255 | spin_lock(&mm->page_table_lock); | ||
256 | *pprev = tmp; | ||
257 | pprev = &tmp->vm_next; | ||
258 | |||
259 | __vma_link_rb(mm, tmp, rb_link, rb_parent); | ||
260 | rb_link = &tmp->vm_rb.rb_right; | ||
261 | rb_parent = &tmp->vm_rb; | ||
262 | |||
263 | mm->map_count++; | ||
264 | retval = copy_page_range(mm, current->mm, tmp); | ||
265 | spin_unlock(&mm->page_table_lock); | ||
266 | |||
267 | if (tmp->vm_ops && tmp->vm_ops->open) | ||
268 | tmp->vm_ops->open(tmp); | ||
269 | |||
270 | if (retval) | ||
271 | goto out; | ||
272 | } | ||
273 | retval = 0; | ||
274 | |||
275 | out: | ||
276 | flush_tlb_mm(current->mm); | ||
277 | up_write(&oldmm->mmap_sem); | ||
278 | return retval; | ||
279 | fail_nomem_policy: | ||
280 | kmem_cache_free(vm_area_cachep, tmp); | ||
281 | fail_nomem: | ||
282 | retval = -ENOMEM; | ||
283 | vm_unacct_memory(charge); | ||
284 | goto out; | ||
285 | } | ||
286 | |||
287 | static inline int mm_alloc_pgd(struct mm_struct * mm) | ||
288 | { | ||
289 | mm->pgd = pgd_alloc(mm); | ||
290 | if (unlikely(!mm->pgd)) | ||
291 | return -ENOMEM; | ||
292 | return 0; | ||
293 | } | ||
294 | |||
295 | static inline void mm_free_pgd(struct mm_struct * mm) | ||
296 | { | ||
297 | pgd_free(mm->pgd); | ||
298 | } | ||
299 | #else | ||
300 | #define dup_mmap(mm, oldmm) (0) | ||
301 | #define mm_alloc_pgd(mm) (0) | ||
302 | #define mm_free_pgd(mm) | ||
303 | #endif /* CONFIG_MMU */ | ||
304 | |||
305 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); | ||
306 | |||
307 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) | ||
308 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) | ||
309 | |||
310 | #include <linux/init_task.h> | ||
311 | |||
312 | static struct mm_struct * mm_init(struct mm_struct * mm) | ||
313 | { | ||
314 | atomic_set(&mm->mm_users, 1); | ||
315 | atomic_set(&mm->mm_count, 1); | ||
316 | init_rwsem(&mm->mmap_sem); | ||
317 | INIT_LIST_HEAD(&mm->mmlist); | ||
318 | mm->core_waiters = 0; | ||
319 | mm->nr_ptes = 0; | ||
320 | spin_lock_init(&mm->page_table_lock); | ||
321 | rwlock_init(&mm->ioctx_list_lock); | ||
322 | mm->ioctx_list = NULL; | ||
323 | mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); | ||
324 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
325 | |||
326 | if (likely(!mm_alloc_pgd(mm))) { | ||
327 | mm->def_flags = 0; | ||
328 | return mm; | ||
329 | } | ||
330 | free_mm(mm); | ||
331 | return NULL; | ||
332 | } | ||
333 | |||
334 | /* | ||
335 | * Allocate and initialize an mm_struct. | ||
336 | */ | ||
337 | struct mm_struct * mm_alloc(void) | ||
338 | { | ||
339 | struct mm_struct * mm; | ||
340 | |||
341 | mm = allocate_mm(); | ||
342 | if (mm) { | ||
343 | memset(mm, 0, sizeof(*mm)); | ||
344 | mm = mm_init(mm); | ||
345 | } | ||
346 | return mm; | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * Called when the last reference to the mm | ||
351 | * is dropped: either by a lazy thread or by | ||
352 | * mmput. Free the page directory and the mm. | ||
353 | */ | ||
354 | void fastcall __mmdrop(struct mm_struct *mm) | ||
355 | { | ||
356 | BUG_ON(mm == &init_mm); | ||
357 | mm_free_pgd(mm); | ||
358 | destroy_context(mm); | ||
359 | free_mm(mm); | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * Decrement the use count and release all resources for an mm. | ||
364 | */ | ||
365 | void mmput(struct mm_struct *mm) | ||
366 | { | ||
367 | if (atomic_dec_and_test(&mm->mm_users)) { | ||
368 | exit_aio(mm); | ||
369 | exit_mmap(mm); | ||
370 | if (!list_empty(&mm->mmlist)) { | ||
371 | spin_lock(&mmlist_lock); | ||
372 | list_del(&mm->mmlist); | ||
373 | spin_unlock(&mmlist_lock); | ||
374 | } | ||
375 | put_swap_token(mm); | ||
376 | mmdrop(mm); | ||
377 | } | ||
378 | } | ||
379 | EXPORT_SYMBOL_GPL(mmput); | ||
380 | |||
381 | /** | ||
382 | * get_task_mm - acquire a reference to the task's mm | ||
383 | * | ||
384 | * Returns %NULL if the task has no mm, or if PF_BORROWED_MM is set | ||
385 | * (meaning this kernel workthread has transiently adopted a user mm | ||
386 | * with use_mm, to do its AIO). Otherwise returns a reference to the | ||
387 | * mm after bumping up the use count. The caller must release the mm | ||
388 | * via mmput() after use. Typically used by /proc and ptrace. | ||
389 | */ | ||
390 | struct mm_struct *get_task_mm(struct task_struct *task) | ||
391 | { | ||
392 | struct mm_struct *mm; | ||
393 | |||
394 | task_lock(task); | ||
395 | mm = task->mm; | ||
396 | if (mm) { | ||
397 | if (task->flags & PF_BORROWED_MM) | ||
398 | mm = NULL; | ||
399 | else | ||
400 | atomic_inc(&mm->mm_users); | ||
401 | } | ||
402 | task_unlock(task); | ||
403 | return mm; | ||
404 | } | ||
405 | EXPORT_SYMBOL_GPL(get_task_mm); | ||
406 | |||
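The usual calling pattern for get_task_mm()/mmput() described in the comment above, as a kernel-style sketch; the function name here is hypothetical and not part of this patch:

/* Illustrative only: take a reference, use the mm, drop the reference. */
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/sched.h>

static long hypothetical_rss_of(struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);
	long rss;

	if (!mm)
		return -EINVAL;		/* kernel thread, or mm only borrowed */

	rss = get_mm_counter(mm, rss);	/* safe: we hold an mm_users reference */
	mmput(mm);
	return rss;
}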
407 | /* Please note the differences between mmput and mm_release. | ||
408 | * mmput is called whenever we stop holding onto a mm_struct, | ||
409 | * whether on error or success. | ||
410 | * | ||
411 | * mm_release is called after a mm_struct has been removed | ||
412 | * from the current process. | ||
413 | * | ||
414 | * This difference is important for error handling, when we | ||
415 | * only half set up a mm_struct for a new process and need to restore | ||
416 | * the old one, because we mmput the new mm_struct before | ||
417 | * restoring the old one... | ||
418 | * Eric Biederman 10 January 1998 | ||
419 | */ | ||
420 | void mm_release(struct task_struct *tsk, struct mm_struct *mm) | ||
421 | { | ||
422 | struct completion *vfork_done = tsk->vfork_done; | ||
423 | |||
424 | /* Get rid of any cached register state */ | ||
425 | deactivate_mm(tsk, mm); | ||
426 | |||
427 | /* notify parent sleeping on vfork() */ | ||
428 | if (vfork_done) { | ||
429 | tsk->vfork_done = NULL; | ||
430 | complete(vfork_done); | ||
431 | } | ||
432 | if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { | ||
433 | u32 __user * tidptr = tsk->clear_child_tid; | ||
434 | tsk->clear_child_tid = NULL; | ||
435 | |||
436 | /* | ||
437 | * We don't check the error code - if userspace has | ||
438 | * not set up a proper pointer then tough luck. | ||
439 | */ | ||
440 | put_user(0, tidptr); | ||
441 | sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); | ||
442 | } | ||
443 | } | ||
444 | |||
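The clear_child_tid/futex step in mm_release() above is the mechanism thread join is built on: the kernel zeroes the tid word and FUTEX_WAKEs it when the child exits. A hedged userspace sketch, assuming glibc's clone() wrapper and the raw futex syscall (names and flags as documented for clone(2)/futex(2)):

/* Sketch only: CLONE_CHILD_CLEARTID asks the kernel to clear 'ctid' and
 * futex-wake it at child exit -- the mm_release() path above. The parent
 * sleeps on that word instead of polling. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static volatile pid_t ctid;

static int child_fn(void *arg)
{
	usleep(100 * 1000);
	return 0;			/* exit -> kernel clears and wakes ctid */
}

int main(void)
{
	size_t stack_size = 64 * 1024;
	char *stack = malloc(stack_size);
	pid_t pid, val;

	/* CLONE_PARENT_SETTID fills ctid before clone() returns here;
	 * CLONE_CHILD_CLEARTID makes the kernel zero it at child exit. */
	pid = clone(child_fn, stack + stack_size,
		    CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD,
		    NULL, (pid_t *)&ctid, NULL, (pid_t *)&ctid);
	if (pid < 0) {
		perror("clone");
		return 1;
	}

	/* Wait until the kernel clears the tid word and wakes us. */
	while ((val = ctid) != 0)
		syscall(SYS_futex, &ctid, FUTEX_WAIT, val, NULL, NULL, 0);
	printf("child %d has exited (tid word cleared)\n", pid);

	waitpid(pid, NULL, 0);		/* still reap the zombie */
	free(stack);
	return 0;
}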
445 | static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | ||
446 | { | ||
447 | struct mm_struct * mm, *oldmm; | ||
448 | int retval; | ||
449 | |||
450 | tsk->min_flt = tsk->maj_flt = 0; | ||
451 | tsk->nvcsw = tsk->nivcsw = 0; | ||
452 | |||
453 | tsk->mm = NULL; | ||
454 | tsk->active_mm = NULL; | ||
455 | |||
456 | /* | ||
457 | * Are we cloning a kernel thread? | ||
458 | * | ||
459 | * We need to steal an active VM for that. | ||
460 | */ | ||
461 | oldmm = current->mm; | ||
462 | if (!oldmm) | ||
463 | return 0; | ||
464 | |||
465 | if (clone_flags & CLONE_VM) { | ||
466 | atomic_inc(&oldmm->mm_users); | ||
467 | mm = oldmm; | ||
468 | /* | ||
469 | * There are cases where the PTL is held to ensure no | ||
470 | * new threads start up in user mode using an mm, which | ||
471 | * allows optimizing out IPIs; the tlb_gather_mmu code | ||
472 | * is an example. | ||
473 | */ | ||
474 | spin_unlock_wait(&oldmm->page_table_lock); | ||
475 | goto good_mm; | ||
476 | } | ||
477 | |||
478 | retval = -ENOMEM; | ||
479 | mm = allocate_mm(); | ||
480 | if (!mm) | ||
481 | goto fail_nomem; | ||
482 | |||
483 | /* Copy the current MM stuff.. */ | ||
484 | memcpy(mm, oldmm, sizeof(*mm)); | ||
485 | if (!mm_init(mm)) | ||
486 | goto fail_nomem; | ||
487 | |||
488 | if (init_new_context(tsk,mm)) | ||
489 | goto fail_nocontext; | ||
490 | |||
491 | retval = dup_mmap(mm, oldmm); | ||
492 | if (retval) | ||
493 | goto free_pt; | ||
494 | |||
495 | mm->hiwater_rss = get_mm_counter(mm,rss); | ||
496 | mm->hiwater_vm = mm->total_vm; | ||
497 | |||
498 | good_mm: | ||
499 | tsk->mm = mm; | ||
500 | tsk->active_mm = mm; | ||
501 | return 0; | ||
502 | |||
503 | free_pt: | ||
504 | mmput(mm); | ||
505 | fail_nomem: | ||
506 | return retval; | ||
507 | |||
508 | fail_nocontext: | ||
509 | /* | ||
510 | * If init_new_context() failed, we cannot use mmput() to free the mm | ||
511 | * because it calls destroy_context() | ||
512 | */ | ||
513 | mm_free_pgd(mm); | ||
514 | free_mm(mm); | ||
515 | return retval; | ||
516 | } | ||
517 | |||
518 | static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) | ||
519 | { | ||
520 | struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); | ||
521 | /* We don't need to lock fs - think why ;-) */ | ||
522 | if (fs) { | ||
523 | atomic_set(&fs->count, 1); | ||
524 | rwlock_init(&fs->lock); | ||
525 | fs->umask = old->umask; | ||
526 | read_lock(&old->lock); | ||
527 | fs->rootmnt = mntget(old->rootmnt); | ||
528 | fs->root = dget(old->root); | ||
529 | fs->pwdmnt = mntget(old->pwdmnt); | ||
530 | fs->pwd = dget(old->pwd); | ||
531 | if (old->altroot) { | ||
532 | fs->altrootmnt = mntget(old->altrootmnt); | ||
533 | fs->altroot = dget(old->altroot); | ||
534 | } else { | ||
535 | fs->altrootmnt = NULL; | ||
536 | fs->altroot = NULL; | ||
537 | } | ||
538 | read_unlock(&old->lock); | ||
539 | } | ||
540 | return fs; | ||
541 | } | ||
542 | |||
543 | struct fs_struct *copy_fs_struct(struct fs_struct *old) | ||
544 | { | ||
545 | return __copy_fs_struct(old); | ||
546 | } | ||
547 | |||
548 | EXPORT_SYMBOL_GPL(copy_fs_struct); | ||
549 | |||
550 | static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) | ||
551 | { | ||
552 | if (clone_flags & CLONE_FS) { | ||
553 | atomic_inc(¤t->fs->count); | ||
554 | return 0; | ||
555 | } | ||
556 | tsk->fs = __copy_fs_struct(current->fs); | ||
557 | if (!tsk->fs) | ||
558 | return -ENOMEM; | ||
559 | return 0; | ||
560 | } | ||
561 | |||
562 | static int count_open_files(struct files_struct *files, int size) | ||
563 | { | ||
564 | int i; | ||
565 | |||
566 | /* Find the last open fd */ | ||
567 | for (i = size/(8*sizeof(long)); i > 0; ) { | ||
568 | if (files->open_fds->fds_bits[--i]) | ||
569 | break; | ||
570 | } | ||
571 | i = (i+1) * 8 * sizeof(long); | ||
572 | return i; | ||
573 | } | ||
574 | |||
575 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | ||
576 | { | ||
577 | struct files_struct *oldf, *newf; | ||
578 | struct file **old_fds, **new_fds; | ||
579 | int open_files, size, i, error = 0, expand; | ||
580 | |||
581 | /* | ||
582 | * A background process may not have any files ... | ||
583 | */ | ||
584 | oldf = current->files; | ||
585 | if (!oldf) | ||
586 | goto out; | ||
587 | |||
588 | if (clone_flags & CLONE_FILES) { | ||
589 | atomic_inc(&oldf->count); | ||
590 | goto out; | ||
591 | } | ||
592 | |||
593 | /* | ||
594 | * Note: we may be using current for both targets (See exec.c) | ||
595 | * This works because we cache current->files (old) as oldf. Don't | ||
596 | * break this. | ||
597 | */ | ||
598 | tsk->files = NULL; | ||
599 | error = -ENOMEM; | ||
600 | newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | ||
601 | if (!newf) | ||
602 | goto out; | ||
603 | |||
604 | atomic_set(&newf->count, 1); | ||
605 | |||
606 | spin_lock_init(&newf->file_lock); | ||
607 | newf->next_fd = 0; | ||
608 | newf->max_fds = NR_OPEN_DEFAULT; | ||
609 | newf->max_fdset = __FD_SETSIZE; | ||
610 | newf->close_on_exec = &newf->close_on_exec_init; | ||
611 | newf->open_fds = &newf->open_fds_init; | ||
612 | newf->fd = &newf->fd_array[0]; | ||
613 | |||
614 | spin_lock(&oldf->file_lock); | ||
615 | |||
616 | open_files = count_open_files(oldf, oldf->max_fdset); | ||
617 | expand = 0; | ||
618 | |||
619 | /* | ||
620 | * Check whether we need to allocate a larger fd array or fd set. | ||
621 | * Note: we're not a clone task, so the open count won't change. | ||
622 | */ | ||
623 | if (open_files > newf->max_fdset) { | ||
624 | newf->max_fdset = 0; | ||
625 | expand = 1; | ||
626 | } | ||
627 | if (open_files > newf->max_fds) { | ||
628 | newf->max_fds = 0; | ||
629 | expand = 1; | ||
630 | } | ||
631 | |||
632 | /* if the old fdset gets grown now, we'll only copy up to "size" fds */ | ||
633 | if (expand) { | ||
634 | spin_unlock(&oldf->file_lock); | ||
635 | spin_lock(&newf->file_lock); | ||
636 | error = expand_files(newf, open_files-1); | ||
637 | spin_unlock(&newf->file_lock); | ||
638 | if (error < 0) | ||
639 | goto out_release; | ||
640 | spin_lock(&oldf->file_lock); | ||
641 | } | ||
642 | |||
643 | old_fds = oldf->fd; | ||
644 | new_fds = newf->fd; | ||
645 | |||
646 | memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); | ||
647 | memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); | ||
648 | |||
649 | for (i = open_files; i != 0; i--) { | ||
650 | struct file *f = *old_fds++; | ||
651 | if (f) { | ||
652 | get_file(f); | ||
653 | } else { | ||
654 | /* | ||
655 | * The fd may be claimed in the fd bitmap but not yet | ||
656 | * instantiated in the files array if a sibling thread | ||
657 | * is partway through open(). So make sure that this | ||
658 | * fd is available to the new process. | ||
659 | */ | ||
660 | FD_CLR(open_files - i, newf->open_fds); | ||
661 | } | ||
662 | *new_fds++ = f; | ||
663 | } | ||
664 | spin_unlock(&oldf->file_lock); | ||
665 | |||
666 | /* compute the remainder to be cleared */ | ||
667 | size = (newf->max_fds - open_files) * sizeof(struct file *); | ||
668 | |||
669 | /* This is long-word aligned, so we could use an optimized version */ | ||
670 | memset(new_fds, 0, size); | ||
671 | |||
672 | if (newf->max_fdset > open_files) { | ||
673 | int left = (newf->max_fdset-open_files)/8; | ||
674 | int start = open_files / (8 * sizeof(unsigned long)); | ||
675 | |||
676 | memset(&newf->open_fds->fds_bits[start], 0, left); | ||
677 | memset(&newf->close_on_exec->fds_bits[start], 0, left); | ||
678 | } | ||
679 | |||
680 | tsk->files = newf; | ||
681 | error = 0; | ||
682 | out: | ||
683 | return error; | ||
684 | |||
685 | out_release: | ||
686 | free_fdset (newf->close_on_exec, newf->max_fdset); | ||
687 | free_fdset (newf->open_fds, newf->max_fdset); | ||
688 | free_fd_array(newf->fd, newf->max_fds); | ||
689 | kmem_cache_free(files_cachep, newf); | ||
690 | goto out; | ||
691 | } | ||
692 | |||
693 | /* | ||
694 | * Helper to unshare the files of the current task. | ||
695 | * We don't want to expose copy_files internals to | ||
696 | * the exec layer of the kernel. | ||
697 | */ | ||
698 | |||
699 | int unshare_files(void) | ||
700 | { | ||
701 | struct files_struct *files = current->files; | ||
702 | int rc; | ||
703 | |||
704 | if(!files) | ||
705 | BUG(); | ||
706 | |||
707 | /* This can race, but the race only causes us to make a copy we | ||
708 | don't need and then drop it */ | ||
709 | if(atomic_read(&files->count) == 1) | ||
710 | { | ||
711 | atomic_inc(&files->count); | ||
712 | return 0; | ||
713 | } | ||
714 | rc = copy_files(0, current); | ||
715 | if(rc) | ||
716 | current->files = files; | ||
717 | return rc; | ||
718 | } | ||
719 | |||
720 | EXPORT_SYMBOL(unshare_files); | ||
721 | |||
722 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) | ||
723 | { | ||
724 | struct sighand_struct *sig; | ||
725 | |||
726 | if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { | ||
727 | atomic_inc(&current->sighand->count); | ||
728 | return 0; | ||
729 | } | ||
730 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); | ||
731 | tsk->sighand = sig; | ||
732 | if (!sig) | ||
733 | return -ENOMEM; | ||
734 | spin_lock_init(&sig->siglock); | ||
735 | atomic_set(&sig->count, 1); | ||
736 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); | ||
737 | return 0; | ||
738 | } | ||
739 | |||
740 | static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) | ||
741 | { | ||
742 | struct signal_struct *sig; | ||
743 | int ret; | ||
744 | |||
745 | if (clone_flags & CLONE_THREAD) { | ||
746 | atomic_inc(&current->signal->count); | ||
747 | atomic_inc(&current->signal->live); | ||
748 | return 0; | ||
749 | } | ||
750 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); | ||
751 | tsk->signal = sig; | ||
752 | if (!sig) | ||
753 | return -ENOMEM; | ||
754 | |||
755 | ret = copy_thread_group_keys(tsk); | ||
756 | if (ret < 0) { | ||
757 | kmem_cache_free(signal_cachep, sig); | ||
758 | return ret; | ||
759 | } | ||
760 | |||
761 | atomic_set(&sig->count, 1); | ||
762 | atomic_set(&sig->live, 1); | ||
763 | init_waitqueue_head(&sig->wait_chldexit); | ||
764 | sig->flags = 0; | ||
765 | sig->group_exit_code = 0; | ||
766 | sig->group_exit_task = NULL; | ||
767 | sig->group_stop_count = 0; | ||
768 | sig->curr_target = NULL; | ||
769 | init_sigpending(&sig->shared_pending); | ||
770 | INIT_LIST_HEAD(&sig->posix_timers); | ||
771 | |||
772 | sig->it_real_value = sig->it_real_incr = 0; | ||
773 | sig->real_timer.function = it_real_fn; | ||
774 | sig->real_timer.data = (unsigned long) tsk; | ||
775 | init_timer(&sig->real_timer); | ||
776 | |||
777 | sig->it_virt_expires = cputime_zero; | ||
778 | sig->it_virt_incr = cputime_zero; | ||
779 | sig->it_prof_expires = cputime_zero; | ||
780 | sig->it_prof_incr = cputime_zero; | ||
781 | |||
782 | sig->tty = current->signal->tty; | ||
783 | sig->pgrp = process_group(current); | ||
784 | sig->session = current->signal->session; | ||
785 | sig->leader = 0; /* session leadership doesn't inherit */ | ||
786 | sig->tty_old_pgrp = 0; | ||
787 | |||
788 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; | ||
789 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; | ||
790 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; | ||
791 | sig->sched_time = 0; | ||
792 | INIT_LIST_HEAD(&sig->cpu_timers[0]); | ||
793 | INIT_LIST_HEAD(&sig->cpu_timers[1]); | ||
794 | INIT_LIST_HEAD(&sig->cpu_timers[2]); | ||
795 | |||
796 | task_lock(current->group_leader); | ||
797 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); | ||
798 | task_unlock(current->group_leader); | ||
799 | |||
800 | if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { | ||
801 | /* | ||
802 | * New sole thread in the process gets an expiry time | ||
803 | * of the whole CPU time limit. | ||
804 | */ | ||
805 | tsk->it_prof_expires = | ||
806 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | ||
807 | } | ||
808 | |||
809 | return 0; | ||
810 | } | ||
811 | |||
812 | static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) | ||
813 | { | ||
814 | unsigned long new_flags = p->flags; | ||
815 | |||
816 | new_flags &= ~PF_SUPERPRIV; | ||
817 | new_flags |= PF_FORKNOEXEC; | ||
818 | if (!(clone_flags & CLONE_PTRACE)) | ||
819 | p->ptrace = 0; | ||
820 | p->flags = new_flags; | ||
821 | } | ||
822 | |||
823 | asmlinkage long sys_set_tid_address(int __user *tidptr) | ||
824 | { | ||
825 | current->clear_child_tid = tidptr; | ||
826 | |||
827 | return current->pid; | ||
828 | } | ||
829 | |||
830 | /* | ||
831 | * This creates a new process as a copy of the old one, | ||
832 | * but does not actually start it yet. | ||
833 | * | ||
834 | * It copies the registers, and all the appropriate | ||
835 | * parts of the process environment (as per the clone | ||
836 | * flags). The actual kick-off is left to the caller. | ||
837 | */ | ||
838 | static task_t *copy_process(unsigned long clone_flags, | ||
839 | unsigned long stack_start, | ||
840 | struct pt_regs *regs, | ||
841 | unsigned long stack_size, | ||
842 | int __user *parent_tidptr, | ||
843 | int __user *child_tidptr, | ||
844 | int pid) | ||
845 | { | ||
846 | int retval; | ||
847 | struct task_struct *p = NULL; | ||
848 | |||
849 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | ||
850 | return ERR_PTR(-EINVAL); | ||
851 | |||
852 | /* | ||
853 | * Thread groups must share signals as well, and detached threads | ||
854 | * can only be started up within the thread group. | ||
855 | */ | ||
856 | if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) | ||
857 | return ERR_PTR(-EINVAL); | ||
858 | |||
859 | /* | ||
860 | * Shared signal handlers imply shared VM. By way of the above, | ||
861 | * thread groups also imply shared VM. Blocking this case allows | ||
862 | * for various simplifications in other code. | ||
863 | */ | ||
864 | if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) | ||
865 | return ERR_PTR(-EINVAL); | ||
866 | |||
867 | retval = security_task_create(clone_flags); | ||
868 | if (retval) | ||
869 | goto fork_out; | ||
870 | |||
871 | retval = -ENOMEM; | ||
872 | p = dup_task_struct(current); | ||
873 | if (!p) | ||
874 | goto fork_out; | ||
875 | |||
876 | retval = -EAGAIN; | ||
877 | if (atomic_read(&p->user->processes) >= | ||
878 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { | ||
879 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && | ||
880 | p->user != &root_user) | ||
881 | goto bad_fork_free; | ||
882 | } | ||
883 | |||
884 | atomic_inc(&p->user->__count); | ||
885 | atomic_inc(&p->user->processes); | ||
886 | get_group_info(p->group_info); | ||
887 | |||
888 | /* | ||
889 | * If multiple threads are within copy_process(), then this check | ||
890 | * triggers too late. This doesn't hurt; the check is only there | ||
891 | * to stop root fork bombs. | ||
892 | */ | ||
893 | if (nr_threads >= max_threads) | ||
894 | goto bad_fork_cleanup_count; | ||
895 | |||
896 | if (!try_module_get(p->thread_info->exec_domain->module)) | ||
897 | goto bad_fork_cleanup_count; | ||
898 | |||
899 | if (p->binfmt && !try_module_get(p->binfmt->module)) | ||
900 | goto bad_fork_cleanup_put_domain; | ||
901 | |||
902 | p->did_exec = 0; | ||
903 | copy_flags(clone_flags, p); | ||
904 | p->pid = pid; | ||
905 | retval = -EFAULT; | ||
906 | if (clone_flags & CLONE_PARENT_SETTID) | ||
907 | if (put_user(p->pid, parent_tidptr)) | ||
908 | goto bad_fork_cleanup; | ||
909 | |||
910 | p->proc_dentry = NULL; | ||
911 | |||
912 | INIT_LIST_HEAD(&p->children); | ||
913 | INIT_LIST_HEAD(&p->sibling); | ||
914 | p->vfork_done = NULL; | ||
915 | spin_lock_init(&p->alloc_lock); | ||
916 | spin_lock_init(&p->proc_lock); | ||
917 | |||
918 | clear_tsk_thread_flag(p, TIF_SIGPENDING); | ||
919 | init_sigpending(&p->pending); | ||
920 | |||
921 | p->utime = cputime_zero; | ||
922 | p->stime = cputime_zero; | ||
923 | p->sched_time = 0; | ||
924 | p->rchar = 0; /* I/O counter: bytes read */ | ||
925 | p->wchar = 0; /* I/O counter: bytes written */ | ||
926 | p->syscr = 0; /* I/O counter: read syscalls */ | ||
927 | p->syscw = 0; /* I/O counter: write syscalls */ | ||
928 | acct_clear_integrals(p); | ||
929 | |||
930 | p->it_virt_expires = cputime_zero; | ||
931 | p->it_prof_expires = cputime_zero; | ||
932 | p->it_sched_expires = 0; | ||
933 | INIT_LIST_HEAD(&p->cpu_timers[0]); | ||
934 | INIT_LIST_HEAD(&p->cpu_timers[1]); | ||
935 | INIT_LIST_HEAD(&p->cpu_timers[2]); | ||
936 | |||
937 | p->lock_depth = -1; /* -1 = no lock */ | ||
938 | do_posix_clock_monotonic_gettime(&p->start_time); | ||
939 | p->security = NULL; | ||
940 | p->io_context = NULL; | ||
941 | p->io_wait = NULL; | ||
942 | p->audit_context = NULL; | ||
943 | #ifdef CONFIG_NUMA | ||
944 | p->mempolicy = mpol_copy(p->mempolicy); | ||
945 | if (IS_ERR(p->mempolicy)) { | ||
946 | retval = PTR_ERR(p->mempolicy); | ||
947 | p->mempolicy = NULL; | ||
948 | goto bad_fork_cleanup; | ||
949 | } | ||
950 | #endif | ||
951 | |||
952 | p->tgid = p->pid; | ||
953 | if (clone_flags & CLONE_THREAD) | ||
954 | p->tgid = current->tgid; | ||
955 | |||
956 | if ((retval = security_task_alloc(p))) | ||
957 | goto bad_fork_cleanup_policy; | ||
958 | if ((retval = audit_alloc(p))) | ||
959 | goto bad_fork_cleanup_security; | ||
960 | /* copy all the process information */ | ||
961 | if ((retval = copy_semundo(clone_flags, p))) | ||
962 | goto bad_fork_cleanup_audit; | ||
963 | if ((retval = copy_files(clone_flags, p))) | ||
964 | goto bad_fork_cleanup_semundo; | ||
965 | if ((retval = copy_fs(clone_flags, p))) | ||
966 | goto bad_fork_cleanup_files; | ||
967 | if ((retval = copy_sighand(clone_flags, p))) | ||
968 | goto bad_fork_cleanup_fs; | ||
969 | if ((retval = copy_signal(clone_flags, p))) | ||
970 | goto bad_fork_cleanup_sighand; | ||
971 | if ((retval = copy_mm(clone_flags, p))) | ||
972 | goto bad_fork_cleanup_signal; | ||
973 | if ((retval = copy_keys(clone_flags, p))) | ||
974 | goto bad_fork_cleanup_mm; | ||
975 | if ((retval = copy_namespace(clone_flags, p))) | ||
976 | goto bad_fork_cleanup_keys; | ||
977 | retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); | ||
978 | if (retval) | ||
979 | goto bad_fork_cleanup_namespace; | ||
980 | |||
981 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | ||
982 | /* | ||
983 | * Clear TID on mm_release()? | ||
984 | */ | ||
985 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; | ||
986 | |||
987 | /* | ||
988 | * Syscall tracing should be turned off in the child regardless | ||
989 | * of CLONE_PTRACE. | ||
990 | */ | ||
991 | clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); | ||
992 | |||
993 | /* Our parent execution domain becomes the current domain. | ||
994 | These must match for thread signalling to apply. */ | ||
995 | |||
996 | p->parent_exec_id = p->self_exec_id; | ||
997 | |||
998 | /* ok, now we should be set up.. */ | ||
999 | p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); | ||
1000 | p->pdeath_signal = 0; | ||
1001 | p->exit_state = 0; | ||
1002 | |||
1003 | /* Perform scheduler related setup */ | ||
1004 | sched_fork(p); | ||
1005 | |||
1006 | /* | ||
1007 | * Ok, make it visible to the rest of the system. | ||
1008 | * We don't wake it up yet. | ||
1009 | */ | ||
1010 | p->group_leader = p; | ||
1011 | INIT_LIST_HEAD(&p->ptrace_children); | ||
1012 | INIT_LIST_HEAD(&p->ptrace_list); | ||
1013 | |||
1014 | /* Need tasklist lock for parent etc handling! */ | ||
1015 | write_lock_irq(&tasklist_lock); | ||
1016 | |||
1017 | /* | ||
1018 | * The task hasn't been attached yet, so cpus_allowed mask cannot | ||
1019 | * have changed. The cpus_allowed mask of the parent may have | ||
1020 | * changed after it was first copied, and it may then move to | ||
1021 | * another CPU - so we re-copy it here and set the child's CPU to | ||
1022 | * the parent's CPU. This avoids a lot of nasty races. | ||
1023 | */ | ||
1024 | p->cpus_allowed = current->cpus_allowed; | ||
1025 | set_task_cpu(p, smp_processor_id()); | ||
1026 | |||
1027 | /* | ||
1028 | * Check for pending SIGKILL! The new thread should not be allowed | ||
1029 | * to slip out of an OOM kill. (or normal SIGKILL.) | ||
1030 | */ | ||
1031 | if (sigismember(&current->pending.signal, SIGKILL)) { | ||
1032 | write_unlock_irq(&tasklist_lock); | ||
1033 | retval = -EINTR; | ||
1034 | goto bad_fork_cleanup_namespace; | ||
1035 | } | ||
1036 | |||
1037 | /* CLONE_PARENT re-uses the old parent */ | ||
1038 | if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) | ||
1039 | p->real_parent = current->real_parent; | ||
1040 | else | ||
1041 | p->real_parent = current; | ||
1042 | p->parent = p->real_parent; | ||
1043 | |||
1044 | if (clone_flags & CLONE_THREAD) { | ||
1045 | spin_lock(&current->sighand->siglock); | ||
1046 | /* | ||
1047 | * Important: if an exit-all has been started then | ||
1048 | * do not create this new thread - the whole thread | ||
1049 | * group is supposed to exit anyway. | ||
1050 | */ | ||
1051 | if (current->signal->flags & SIGNAL_GROUP_EXIT) { | ||
1052 | spin_unlock(&current->sighand->siglock); | ||
1053 | write_unlock_irq(&tasklist_lock); | ||
1054 | retval = -EAGAIN; | ||
1055 | goto bad_fork_cleanup_namespace; | ||
1056 | } | ||
1057 | p->group_leader = current->group_leader; | ||
1058 | |||
1059 | if (current->signal->group_stop_count > 0) { | ||
1060 | /* | ||
1061 | * There is an all-stop in progress for the group. | ||
1062 | * We ourselves will stop as soon as we check signals. | ||
1063 | * Make the new thread part of that group stop too. | ||
1064 | */ | ||
1065 | current->signal->group_stop_count++; | ||
1066 | set_tsk_thread_flag(p, TIF_SIGPENDING); | ||
1067 | } | ||
1068 | |||
1069 | if (!cputime_eq(current->signal->it_virt_expires, | ||
1070 | cputime_zero) || | ||
1071 | !cputime_eq(current->signal->it_prof_expires, | ||
1072 | cputime_zero) || | ||
1073 | current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || | ||
1074 | !list_empty(&current->signal->cpu_timers[0]) || | ||
1075 | !list_empty(&current->signal->cpu_timers[1]) || | ||
1076 | !list_empty(&current->signal->cpu_timers[2])) { | ||
1077 | /* | ||
1078 | * Have child wake up on its first tick to check | ||
1079 | * for process CPU timers. | ||
1080 | */ | ||
1081 | p->it_prof_expires = jiffies_to_cputime(1); | ||
1082 | } | ||
1083 | |||
1084 | spin_unlock(&current->sighand->siglock); | ||
1085 | } | ||
1086 | |||
1087 | SET_LINKS(p); | ||
1088 | if (unlikely(p->ptrace & PT_PTRACED)) | ||
1089 | __ptrace_link(p, current->parent); | ||
1090 | |||
1091 | cpuset_fork(p); | ||
1092 | |||
1093 | attach_pid(p, PIDTYPE_PID, p->pid); | ||
1094 | attach_pid(p, PIDTYPE_TGID, p->tgid); | ||
1095 | if (thread_group_leader(p)) { | ||
1096 | attach_pid(p, PIDTYPE_PGID, process_group(p)); | ||
1097 | attach_pid(p, PIDTYPE_SID, p->signal->session); | ||
1098 | if (p->pid) | ||
1099 | __get_cpu_var(process_counts)++; | ||
1100 | } | ||
1101 | |||
1102 | nr_threads++; | ||
1103 | total_forks++; | ||
1104 | write_unlock_irq(&tasklist_lock); | ||
1105 | retval = 0; | ||
1106 | |||
1107 | fork_out: | ||
1108 | if (retval) | ||
1109 | return ERR_PTR(retval); | ||
1110 | return p; | ||
1111 | |||
1112 | bad_fork_cleanup_namespace: | ||
1113 | exit_namespace(p); | ||
1114 | bad_fork_cleanup_keys: | ||
1115 | exit_keys(p); | ||
1116 | bad_fork_cleanup_mm: | ||
1117 | if (p->mm) | ||
1118 | mmput(p->mm); | ||
1119 | bad_fork_cleanup_signal: | ||
1120 | exit_signal(p); | ||
1121 | bad_fork_cleanup_sighand: | ||
1122 | exit_sighand(p); | ||
1123 | bad_fork_cleanup_fs: | ||
1124 | exit_fs(p); /* blocking */ | ||
1125 | bad_fork_cleanup_files: | ||
1126 | exit_files(p); /* blocking */ | ||
1127 | bad_fork_cleanup_semundo: | ||
1128 | exit_sem(p); | ||
1129 | bad_fork_cleanup_audit: | ||
1130 | audit_free(p); | ||
1131 | bad_fork_cleanup_security: | ||
1132 | security_task_free(p); | ||
1133 | bad_fork_cleanup_policy: | ||
1134 | #ifdef CONFIG_NUMA | ||
1135 | mpol_free(p->mempolicy); | ||
1136 | #endif | ||
1137 | bad_fork_cleanup: | ||
1138 | if (p->binfmt) | ||
1139 | module_put(p->binfmt->module); | ||
1140 | bad_fork_cleanup_put_domain: | ||
1141 | module_put(p->thread_info->exec_domain->module); | ||
1142 | bad_fork_cleanup_count: | ||
1143 | put_group_info(p->group_info); | ||
1144 | atomic_dec(&p->user->processes); | ||
1145 | free_uid(p->user); | ||
1146 | bad_fork_free: | ||
1147 | free_task(p); | ||
1148 | goto fork_out; | ||
1149 | } | ||
1150 | |||
1151 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | ||
1152 | { | ||
1153 | memset(regs, 0, sizeof(struct pt_regs)); | ||
1154 | return regs; | ||
1155 | } | ||
1156 | |||
1157 | task_t * __devinit fork_idle(int cpu) | ||
1158 | { | ||
1159 | task_t *task; | ||
1160 | struct pt_regs regs; | ||
1161 | |||
1162 | task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0); | ||
1163 | if (!task) | ||
1164 | return ERR_PTR(-ENOMEM); | ||
1165 | init_idle(task, cpu); | ||
1166 | unhash_process(task); | ||
1167 | return task; | ||
1168 | } | ||
1169 | |||
1170 | static inline int fork_traceflag (unsigned clone_flags) | ||
1171 | { | ||
1172 | if (clone_flags & CLONE_UNTRACED) | ||
1173 | return 0; | ||
1174 | else if (clone_flags & CLONE_VFORK) { | ||
1175 | if (current->ptrace & PT_TRACE_VFORK) | ||
1176 | return PTRACE_EVENT_VFORK; | ||
1177 | } else if ((clone_flags & CSIGNAL) != SIGCHLD) { | ||
1178 | if (current->ptrace & PT_TRACE_CLONE) | ||
1179 | return PTRACE_EVENT_CLONE; | ||
1180 | } else if (current->ptrace & PT_TRACE_FORK) | ||
1181 | return PTRACE_EVENT_FORK; | ||
1182 | |||
1183 | return 0; | ||
1184 | } | ||
1185 | |||
1186 | /* | ||
1187 | * Ok, this is the main fork-routine. | ||
1188 | * | ||
1189 | * It copies the process, and if successful kick-starts | ||
1190 | * it and waits for it to finish using the VM if required. | ||
1191 | */ | ||
1192 | long do_fork(unsigned long clone_flags, | ||
1193 | unsigned long stack_start, | ||
1194 | struct pt_regs *regs, | ||
1195 | unsigned long stack_size, | ||
1196 | int __user *parent_tidptr, | ||
1197 | int __user *child_tidptr) | ||
1198 | { | ||
1199 | struct task_struct *p; | ||
1200 | int trace = 0; | ||
1201 | long pid = alloc_pidmap(); | ||
1202 | |||
1203 | if (pid < 0) | ||
1204 | return -EAGAIN; | ||
1205 | if (unlikely(current->ptrace)) { | ||
1206 | trace = fork_traceflag (clone_flags); | ||
1207 | if (trace) | ||
1208 | clone_flags |= CLONE_PTRACE; | ||
1209 | } | ||
1210 | |||
1211 | p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); | ||
1212 | /* | ||
1213 | * Do this prior waking up the new thread - the thread pointer | ||
1214 | * might get invalid after that point, if the thread exits quickly. | ||
1215 | */ | ||
1216 | if (!IS_ERR(p)) { | ||
1217 | struct completion vfork; | ||
1218 | |||
1219 | if (clone_flags & CLONE_VFORK) { | ||
1220 | p->vfork_done = &vfork; | ||
1221 | init_completion(&vfork); | ||
1222 | } | ||
1223 | |||
1224 | if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { | ||
1225 | /* | ||
1226 | * We'll start up with an immediate SIGSTOP. | ||
1227 | */ | ||
1228 | sigaddset(&p->pending.signal, SIGSTOP); | ||
1229 | set_tsk_thread_flag(p, TIF_SIGPENDING); | ||
1230 | } | ||
1231 | |||
1232 | if (!(clone_flags & CLONE_STOPPED)) | ||
1233 | wake_up_new_task(p, clone_flags); | ||
1234 | else | ||
1235 | p->state = TASK_STOPPED; | ||
1236 | |||
1237 | if (unlikely (trace)) { | ||
1238 | current->ptrace_message = pid; | ||
1239 | ptrace_notify ((trace << 8) | SIGTRAP); | ||
1240 | } | ||
1241 | |||
1242 | if (clone_flags & CLONE_VFORK) { | ||
1243 | wait_for_completion(&vfork); | ||
1244 | if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) | ||
1245 | ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); | ||
1246 | } | ||
1247 | } else { | ||
1248 | free_pidmap(pid); | ||
1249 | pid = PTR_ERR(p); | ||
1250 | } | ||
1251 | return pid; | ||
1252 | } | ||
1253 | |||
1254 | void __init proc_caches_init(void) | ||
1255 | { | ||
1256 | sighand_cachep = kmem_cache_create("sighand_cache", | ||
1257 | sizeof(struct sighand_struct), 0, | ||
1258 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | ||
1259 | signal_cachep = kmem_cache_create("signal_cache", | ||
1260 | sizeof(struct signal_struct), 0, | ||
1261 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | ||
1262 | files_cachep = kmem_cache_create("files_cache", | ||
1263 | sizeof(struct files_struct), 0, | ||
1264 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | ||
1265 | fs_cachep = kmem_cache_create("fs_cache", | ||
1266 | sizeof(struct fs_struct), 0, | ||
1267 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | ||
1268 | vm_area_cachep = kmem_cache_create("vm_area_struct", | ||
1269 | sizeof(struct vm_area_struct), 0, | ||
1270 | SLAB_PANIC, NULL, NULL); | ||
1271 | mm_cachep = kmem_cache_create("mm_struct", | ||
1272 | sizeof(struct mm_struct), 0, | ||
1273 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | ||
1274 | } | ||
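
For readers approaching copy_process()/do_fork() from the other side of the syscall boundary, the sketch below shows how the clone_flags checked above map onto the glibc clone() wrapper. It is a hypothetical userspace illustration, not part of this patch: the flag combinations only approximate what fork(), vfork() and pthread_create() pass down, and the stack size and names are invented.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static int child_fn(void *arg)
{
	printf("child says: %s\n", (const char *)arg);
	return 0;
}

int main(void)
{
	size_t stack_sz = 64 * 1024;
	char *stack = malloc(stack_sz);	/* clone() needs a child stack */

	if (!stack)
		return 1;

	/* SIGCHLD alone behaves like fork(): copy_mm(), copy_files() and
	 * friends duplicate the parent's state into the child. */
	int pid = clone(child_fn, stack + stack_sz, SIGCHLD, "fork-like");

	/* A thread-like child would instead pass
	 *   CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD,
	 * which copy_process() only accepts in that combination: CLONE_THREAD
	 * requires CLONE_SIGHAND, and CLONE_SIGHAND requires CLONE_VM. */

	printf("clone() returned pid %d\n", pid);
	return 0;
}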
diff --git a/kernel/futex.c b/kernel/futex.c new file mode 100644 index 000000000000..7b54a672d0ad --- /dev/null +++ b/kernel/futex.c | |||
@@ -0,0 +1,798 @@ | |||
1 | /* | ||
2 | * Fast Userspace Mutexes (which I call "Futexes!"). | ||
3 | * (C) Rusty Russell, IBM 2002 | ||
4 | * | ||
5 | * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar | ||
6 | * (C) Copyright 2003 Red Hat Inc, All Rights Reserved | ||
7 | * | ||
8 | * Removed page pinning, fix privately mapped COW pages and other cleanups | ||
9 | * (C) Copyright 2003, 2004 Jamie Lokier | ||
10 | * | ||
11 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | ||
12 | * enough at me, Linus for the original (flawed) idea, Matthew | ||
13 | * Kirkwood for proof-of-concept implementation. | ||
14 | * | ||
15 | * "The futexes are also cursed." | ||
16 | * "But they come in a choice of three flavours!" | ||
17 | * | ||
18 | * This program is free software; you can redistribute it and/or modify | ||
19 | * it under the terms of the GNU General Public License as published by | ||
20 | * the Free Software Foundation; either version 2 of the License, or | ||
21 | * (at your option) any later version. | ||
22 | * | ||
23 | * This program is distributed in the hope that it will be useful, | ||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
26 | * GNU General Public License for more details. | ||
27 | * | ||
28 | * You should have received a copy of the GNU General Public License | ||
29 | * along with this program; if not, write to the Free Software | ||
30 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
31 | */ | ||
32 | #include <linux/slab.h> | ||
33 | #include <linux/poll.h> | ||
34 | #include <linux/fs.h> | ||
35 | #include <linux/file.h> | ||
36 | #include <linux/jhash.h> | ||
37 | #include <linux/init.h> | ||
38 | #include <linux/futex.h> | ||
39 | #include <linux/mount.h> | ||
40 | #include <linux/pagemap.h> | ||
41 | #include <linux/syscalls.h> | ||
42 | |||
43 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | ||
44 | |||
45 | /* | ||
46 | * Futexes are matched on equal values of this key. | ||
47 | * The key type depends on whether it's a shared or private mapping. | ||
48 | * Don't rearrange members without looking at hash_futex(). | ||
49 | * | ||
50 | * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. | ||
51 | * We set bit 0 to indicate if it's an inode-based key. | ||
52 | */ | ||
53 | union futex_key { | ||
54 | struct { | ||
55 | unsigned long pgoff; | ||
56 | struct inode *inode; | ||
57 | int offset; | ||
58 | } shared; | ||
59 | struct { | ||
60 | unsigned long uaddr; | ||
61 | struct mm_struct *mm; | ||
62 | int offset; | ||
63 | } private; | ||
64 | struct { | ||
65 | unsigned long word; | ||
66 | void *ptr; | ||
67 | int offset; | ||
68 | } both; | ||
69 | }; | ||
70 | |||
71 | /* | ||
72 | * We use this hashed waitqueue instead of a normal wait_queue_t, so | ||
73 | * we can wake only the relevant ones (hashed queues may be shared). | ||
74 | * | ||
75 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. | ||
76 | * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. | ||
77 | * The order of wakeup is always to make the first condition true, then | ||
78 | * wake up q->waiters, then make the second condition true. | ||
79 | */ | ||
80 | struct futex_q { | ||
81 | struct list_head list; | ||
82 | wait_queue_head_t waiters; | ||
83 | |||
84 | /* Which hash list lock to use. */ | ||
85 | spinlock_t *lock_ptr; | ||
86 | |||
87 | /* Key which the futex is hashed on. */ | ||
88 | union futex_key key; | ||
89 | |||
90 | /* For fd, sigio sent using these. */ | ||
91 | int fd; | ||
92 | struct file *filp; | ||
93 | }; | ||
94 | |||
95 | /* | ||
96 | * Split the global futex_lock into every hash list lock. | ||
97 | */ | ||
98 | struct futex_hash_bucket { | ||
99 | spinlock_t lock; | ||
100 | struct list_head chain; | ||
101 | }; | ||
102 | |||
103 | static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; | ||
104 | |||
105 | /* Futex-fs vfsmount entry: */ | ||
106 | static struct vfsmount *futex_mnt; | ||
107 | |||
108 | /* | ||
109 | * We hash on the keys returned from get_futex_key (see below). | ||
110 | */ | ||
111 | static struct futex_hash_bucket *hash_futex(union futex_key *key) | ||
112 | { | ||
113 | u32 hash = jhash2((u32*)&key->both.word, | ||
114 | (sizeof(key->both.word)+sizeof(key->both.ptr))/4, | ||
115 | key->both.offset); | ||
116 | return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Return 1 if two futex_keys are equal, 0 otherwise. | ||
121 | */ | ||
122 | static inline int match_futex(union futex_key *key1, union futex_key *key2) | ||
123 | { | ||
124 | return (key1->both.word == key2->both.word | ||
125 | && key1->both.ptr == key2->both.ptr | ||
126 | && key1->both.offset == key2->both.offset); | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * Get parameters which are the keys for a futex. | ||
131 | * | ||
132 | * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, | ||
133 | * offset_within_page). For private mappings, it's (uaddr, current->mm). | ||
134 | * We can usually work out the index without swapping in the page. | ||
135 | * | ||
136 | * Returns: 0, or negative error code. | ||
137 | * The key words are stored in *key on success. | ||
138 | * | ||
139 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. | ||
140 | */ | ||
141 | static int get_futex_key(unsigned long uaddr, union futex_key *key) | ||
142 | { | ||
143 | struct mm_struct *mm = current->mm; | ||
144 | struct vm_area_struct *vma; | ||
145 | struct page *page; | ||
146 | int err; | ||
147 | |||
148 | /* | ||
149 | * The futex address must be "naturally" aligned. | ||
150 | */ | ||
151 | key->both.offset = uaddr % PAGE_SIZE; | ||
152 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) | ||
153 | return -EINVAL; | ||
154 | uaddr -= key->both.offset; | ||
155 | |||
156 | /* | ||
157 | * The futex is hashed differently depending on whether | ||
158 | * it's in a shared or private mapping. So check vma first. | ||
159 | */ | ||
160 | vma = find_extend_vma(mm, uaddr); | ||
161 | if (unlikely(!vma)) | ||
162 | return -EFAULT; | ||
163 | |||
164 | /* | ||
165 | * Permissions. | ||
166 | */ | ||
167 | if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) | ||
168 | return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; | ||
169 | |||
170 | /* | ||
171 | * Private mappings are handled in a simple way. | ||
172 | * | ||
173 | * NOTE: When userspace waits on a MAP_SHARED mapping, even if | ||
174 | * it's a read-only handle, it's expected that futexes attach to | ||
175 | * the object not the particular process. Therefore we use | ||
176 | * VM_MAYSHARE here, not VM_SHARED which is restricted to shared | ||
177 | * mappings of _writable_ handles. | ||
178 | */ | ||
179 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { | ||
180 | key->private.mm = mm; | ||
181 | key->private.uaddr = uaddr; | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * Linear file mappings are also simple. | ||
187 | */ | ||
188 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | ||
189 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | ||
190 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | ||
191 | key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) | ||
192 | + vma->vm_pgoff); | ||
193 | return 0; | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * We could walk the page table to read the non-linear | ||
198 | * pte, and get the page index without fetching the page | ||
199 | * from swap. But that's a lot of code to duplicate here | ||
200 | * for a rare case, so we simply fetch the page. | ||
201 | */ | ||
202 | |||
203 | /* | ||
204 | * Do a quick atomic lookup first - this is the fastpath. | ||
205 | */ | ||
206 | spin_lock(&current->mm->page_table_lock); | ||
207 | page = follow_page(mm, uaddr, 0); | ||
208 | if (likely(page != NULL)) { | ||
209 | key->shared.pgoff = | ||
210 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
211 | spin_unlock(&current->mm->page_table_lock); | ||
212 | return 0; | ||
213 | } | ||
214 | spin_unlock(&current->mm->page_table_lock); | ||
215 | |||
216 | /* | ||
217 | * Do it the general way. | ||
218 | */ | ||
219 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | ||
220 | if (err >= 0) { | ||
221 | key->shared.pgoff = | ||
222 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
223 | put_page(page); | ||
224 | return 0; | ||
225 | } | ||
226 | return err; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * Take a reference to the resource addressed by a key. | ||
231 | * Can be called while holding spinlocks. | ||
232 | * | ||
233 | * NOTE: mmap_sem MUST be held between get_futex_key() and calling this | ||
234 | * function, if it is called at all. mmap_sem keeps key->shared.inode valid. | ||
235 | */ | ||
236 | static inline void get_key_refs(union futex_key *key) | ||
237 | { | ||
238 | if (key->both.ptr != 0) { | ||
239 | if (key->both.offset & 1) | ||
240 | atomic_inc(&key->shared.inode->i_count); | ||
241 | else | ||
242 | atomic_inc(&key->private.mm->mm_count); | ||
243 | } | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Drop a reference to the resource addressed by a key. | ||
248 | * The hash bucket spinlock must not be held. | ||
249 | */ | ||
250 | static void drop_key_refs(union futex_key *key) | ||
251 | { | ||
252 | if (key->both.ptr != 0) { | ||
253 | if (key->both.offset & 1) | ||
254 | iput(key->shared.inode); | ||
255 | else | ||
256 | mmdrop(key->private.mm); | ||
257 | } | ||
258 | } | ||
259 | |||
260 | static inline int get_futex_value_locked(int *dest, int __user *from) | ||
261 | { | ||
262 | int ret; | ||
263 | |||
264 | inc_preempt_count(); | ||
265 | ret = __copy_from_user_inatomic(dest, from, sizeof(int)); | ||
266 | dec_preempt_count(); | ||
267 | |||
268 | return ret ? -EFAULT : 0; | ||
269 | } | ||
270 | |||
271 | /* | ||
272 | * The hash bucket lock must be held when this is called. | ||
273 | * Afterwards, the futex_q must not be accessed. | ||
274 | */ | ||
275 | static void wake_futex(struct futex_q *q) | ||
276 | { | ||
277 | list_del_init(&q->list); | ||
278 | if (q->filp) | ||
279 | send_sigio(&q->filp->f_owner, q->fd, POLL_IN); | ||
280 | /* | ||
281 | * The lock in wake_up_all() is a crucial memory barrier after the | ||
282 | * list_del_init() and also before assigning to q->lock_ptr. | ||
283 | */ | ||
284 | wake_up_all(&q->waiters); | ||
285 | /* | ||
286 | * The waiting task can free the futex_q as soon as this is written, | ||
287 | * without taking any locks. This must come last. | ||
288 | */ | ||
289 | q->lock_ptr = NULL; | ||
290 | } | ||
291 | |||
292 | /* | ||
293 | * Wake up all waiters hashed on the physical page that is mapped | ||
294 | * to this virtual address: | ||
295 | */ | ||
296 | static int futex_wake(unsigned long uaddr, int nr_wake) | ||
297 | { | ||
298 | union futex_key key; | ||
299 | struct futex_hash_bucket *bh; | ||
300 | struct list_head *head; | ||
301 | struct futex_q *this, *next; | ||
302 | int ret; | ||
303 | |||
304 | down_read(&current->mm->mmap_sem); | ||
305 | |||
306 | ret = get_futex_key(uaddr, &key); | ||
307 | if (unlikely(ret != 0)) | ||
308 | goto out; | ||
309 | |||
310 | bh = hash_futex(&key); | ||
311 | spin_lock(&bh->lock); | ||
312 | head = &bh->chain; | ||
313 | |||
314 | list_for_each_entry_safe(this, next, head, list) { | ||
315 | if (match_futex (&this->key, &key)) { | ||
316 | wake_futex(this); | ||
317 | if (++ret >= nr_wake) | ||
318 | break; | ||
319 | } | ||
320 | } | ||
321 | |||
322 | spin_unlock(&bh->lock); | ||
323 | out: | ||
324 | up_read(&current->mm->mmap_sem); | ||
325 | return ret; | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Requeue all waiters hashed on one physical page to another | ||
330 | * physical page. | ||
331 | */ | ||
332 | static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | ||
333 | int nr_wake, int nr_requeue, int *valp) | ||
334 | { | ||
335 | union futex_key key1, key2; | ||
336 | struct futex_hash_bucket *bh1, *bh2; | ||
337 | struct list_head *head1; | ||
338 | struct futex_q *this, *next; | ||
339 | int ret, drop_count = 0; | ||
340 | |||
341 | retry: | ||
342 | down_read(&current->mm->mmap_sem); | ||
343 | |||
344 | ret = get_futex_key(uaddr1, &key1); | ||
345 | if (unlikely(ret != 0)) | ||
346 | goto out; | ||
347 | ret = get_futex_key(uaddr2, &key2); | ||
348 | if (unlikely(ret != 0)) | ||
349 | goto out; | ||
350 | |||
351 | bh1 = hash_futex(&key1); | ||
352 | bh2 = hash_futex(&key2); | ||
353 | |||
354 | if (bh1 < bh2) | ||
355 | spin_lock(&bh1->lock); | ||
356 | spin_lock(&bh2->lock); | ||
357 | if (bh1 > bh2) | ||
358 | spin_lock(&bh1->lock); | ||
359 | |||
360 | if (likely(valp != NULL)) { | ||
361 | int curval; | ||
362 | |||
363 | ret = get_futex_value_locked(&curval, (int __user *)uaddr1); | ||
364 | |||
365 | if (unlikely(ret)) { | ||
366 | spin_unlock(&bh1->lock); | ||
367 | if (bh1 != bh2) | ||
368 | spin_unlock(&bh2->lock); | ||
369 | |||
370 | /* If we would have faulted, release mmap_sem, fault | ||
371 | * it in and start all over again. | ||
372 | */ | ||
373 | up_read(&current->mm->mmap_sem); | ||
374 | |||
375 | ret = get_user(curval, (int __user *)uaddr1); | ||
376 | |||
377 | if (!ret) | ||
378 | goto retry; | ||
379 | |||
380 | return ret; | ||
381 | } | ||
382 | if (curval != *valp) { | ||
383 | ret = -EAGAIN; | ||
384 | goto out_unlock; | ||
385 | } | ||
386 | } | ||
387 | |||
388 | head1 = &bh1->chain; | ||
389 | list_for_each_entry_safe(this, next, head1, list) { | ||
390 | if (!match_futex (&this->key, &key1)) | ||
391 | continue; | ||
392 | if (++ret <= nr_wake) { | ||
393 | wake_futex(this); | ||
394 | } else { | ||
395 | list_move_tail(&this->list, &bh2->chain); | ||
396 | this->lock_ptr = &bh2->lock; | ||
397 | this->key = key2; | ||
398 | get_key_refs(&key2); | ||
399 | drop_count++; | ||
400 | |||
401 | if (ret - nr_wake >= nr_requeue) | ||
402 | break; | ||
403 | /* Make sure to stop if key1 == key2 */ | ||
404 | if (head1 == &bh2->chain && head1 != &next->list) | ||
405 | head1 = &this->list; | ||
406 | } | ||
407 | } | ||
408 | |||
409 | out_unlock: | ||
410 | spin_unlock(&bh1->lock); | ||
411 | if (bh1 != bh2) | ||
412 | spin_unlock(&bh2->lock); | ||
413 | |||
414 | /* drop_key_refs() must be called outside the spinlocks. */ | ||
415 | while (--drop_count >= 0) | ||
416 | drop_key_refs(&key1); | ||
417 | |||
418 | out: | ||
419 | up_read(&current->mm->mmap_sem); | ||
420 | return ret; | ||
421 | } | ||
422 | |||
423 | /* The key must be already stored in q->key. */ | ||
424 | static inline struct futex_hash_bucket * | ||
425 | queue_lock(struct futex_q *q, int fd, struct file *filp) | ||
426 | { | ||
427 | struct futex_hash_bucket *bh; | ||
428 | |||
429 | q->fd = fd; | ||
430 | q->filp = filp; | ||
431 | |||
432 | init_waitqueue_head(&q->waiters); | ||
433 | |||
434 | get_key_refs(&q->key); | ||
435 | bh = hash_futex(&q->key); | ||
436 | q->lock_ptr = &bh->lock; | ||
437 | |||
438 | spin_lock(&bh->lock); | ||
439 | return bh; | ||
440 | } | ||
441 | |||
442 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) | ||
443 | { | ||
444 | list_add_tail(&q->list, &bh->chain); | ||
445 | spin_unlock(&bh->lock); | ||
446 | } | ||
447 | |||
448 | static inline void | ||
449 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | ||
450 | { | ||
451 | spin_unlock(&bh->lock); | ||
452 | drop_key_refs(&q->key); | ||
453 | } | ||
454 | |||
455 | /* | ||
456 | * queue_me and unqueue_me must be called as a pair, each | ||
457 | * exactly once. They are called with the hashed spinlock held. | ||
458 | */ | ||
459 | |||
460 | /* The key must be already stored in q->key. */ | ||
461 | static void queue_me(struct futex_q *q, int fd, struct file *filp) | ||
462 | { | ||
463 | struct futex_hash_bucket *bh; | ||
464 | bh = queue_lock(q, fd, filp); | ||
465 | __queue_me(q, bh); | ||
466 | } | ||
467 | |||
468 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ | ||
469 | static int unqueue_me(struct futex_q *q) | ||
470 | { | ||
471 | int ret = 0; | ||
472 | spinlock_t *lock_ptr; | ||
473 | |||
474 | /* In the common case we don't take the spinlock, which is nice. */ | ||
475 | retry: | ||
476 | lock_ptr = q->lock_ptr; | ||
477 | if (lock_ptr != 0) { | ||
478 | spin_lock(lock_ptr); | ||
479 | /* | ||
480 | * q->lock_ptr can change between reading it and | ||
481 | * spin_lock(), causing us to take the wrong lock. This | ||
482 | * corrects the race condition. | ||
483 | * | ||
484 | * Reasoning goes like this: if we have the wrong lock, | ||
485 | * q->lock_ptr must have changed (maybe several times) | ||
486 | * between reading it and the spin_lock(). It can | ||
487 | * change again after the spin_lock() but only if it was | ||
488 | * already changed before the spin_lock(). It cannot, | ||
489 | * however, change back to the original value. Therefore | ||
490 | * we can detect whether we acquired the correct lock. | ||
491 | */ | ||
492 | if (unlikely(lock_ptr != q->lock_ptr)) { | ||
493 | spin_unlock(lock_ptr); | ||
494 | goto retry; | ||
495 | } | ||
496 | WARN_ON(list_empty(&q->list)); | ||
497 | list_del(&q->list); | ||
498 | spin_unlock(lock_ptr); | ||
499 | ret = 1; | ||
500 | } | ||
501 | |||
502 | drop_key_refs(&q->key); | ||
503 | return ret; | ||
504 | } | ||
505 | |||
506 | static int futex_wait(unsigned long uaddr, int val, unsigned long time) | ||
507 | { | ||
508 | DECLARE_WAITQUEUE(wait, current); | ||
509 | int ret, curval; | ||
510 | struct futex_q q; | ||
511 | struct futex_hash_bucket *bh; | ||
512 | |||
513 | retry: | ||
514 | down_read(&current->mm->mmap_sem); | ||
515 | |||
516 | ret = get_futex_key(uaddr, &q.key); | ||
517 | if (unlikely(ret != 0)) | ||
518 | goto out_release_sem; | ||
519 | |||
520 | bh = queue_lock(&q, -1, NULL); | ||
521 | |||
522 | /* | ||
523 | * Access the page AFTER the futex is queued. | ||
524 | * Order is important: | ||
525 | * | ||
526 | * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); | ||
527 | * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } | ||
528 | * | ||
529 | * The basic logical guarantee of a futex is that it blocks ONLY | ||
530 | * if cond(var) is known to be true at the time of blocking, for | ||
531 | * any cond. If we queued after testing *uaddr, that would open | ||
532 | * a race condition where we could block indefinitely with | ||
533 | * cond(var) false, which would violate the guarantee. | ||
534 | * | ||
535 | * A consequence is that futex_wait() can return zero and absorb | ||
536 | * a wakeup when *uaddr != val on entry to the syscall. This is | ||
537 | * rare, but normal. | ||
538 | * | ||
539 | * We hold the mmap semaphore, so the mapping cannot have changed | ||
540 | * since we looked it up in get_futex_key. | ||
541 | */ | ||
542 | |||
543 | ret = get_futex_value_locked(&curval, (int __user *)uaddr); | ||
544 | |||
545 | if (unlikely(ret)) { | ||
546 | queue_unlock(&q, bh); | ||
547 | |||
548 | /* If we would have faulted, release mmap_sem, fault it in and | ||
549 | * start all over again. | ||
550 | */ | ||
551 | up_read(&current->mm->mmap_sem); | ||
552 | |||
553 | ret = get_user(curval, (int __user *)uaddr); | ||
554 | |||
555 | if (!ret) | ||
556 | goto retry; | ||
557 | return ret; | ||
558 | } | ||
559 | if (curval != val) { | ||
560 | ret = -EWOULDBLOCK; | ||
561 | queue_unlock(&q, bh); | ||
562 | goto out_release_sem; | ||
563 | } | ||
564 | |||
565 | /* Only actually queue if *uaddr contained val. */ | ||
566 | __queue_me(&q, bh); | ||
567 | |||
568 | /* | ||
569 | * Now the futex is queued and we have checked the data, we | ||
570 | * don't want to hold mmap_sem while we sleep. | ||
571 | */ | ||
572 | up_read(&current->mm->mmap_sem); | ||
573 | |||
574 | /* | ||
575 | * There might have been scheduling since the queue_me(), as we | ||
576 | * cannot hold a spinlock across the get_user() in case it | ||
577 | * faults, and we cannot just set TASK_INTERRUPTIBLE state when | ||
578 | * queueing ourselves into the futex hash. This code thus has to | ||
579 | * rely on the futex_wake() code removing us from hash when it | ||
580 | * wakes us up. | ||
581 | */ | ||
582 | |||
583 | /* add_wait_queue is the barrier after __set_current_state. */ | ||
584 | __set_current_state(TASK_INTERRUPTIBLE); | ||
585 | add_wait_queue(&q.waiters, &wait); | ||
586 | /* | ||
587 | * !list_empty() is safe here without any lock. | ||
588 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. | ||
589 | */ | ||
590 | if (likely(!list_empty(&q.list))) | ||
591 | time = schedule_timeout(time); | ||
592 | __set_current_state(TASK_RUNNING); | ||
593 | |||
594 | /* | ||
595 | * NOTE: we don't remove ourselves from the waitqueue because | ||
596 | * we are the only user of it. | ||
597 | */ | ||
598 | |||
599 | /* If we were woken (and unqueued), we succeeded, whatever. */ | ||
600 | if (!unqueue_me(&q)) | ||
601 | return 0; | ||
602 | if (time == 0) | ||
603 | return -ETIMEDOUT; | ||
604 | /* We expect signal_pending(current), but another thread may | ||
605 | * have handled it for us already. */ | ||
606 | return -EINTR; | ||
607 | |||
608 | out_release_sem: | ||
609 | up_read(&current->mm->mmap_sem); | ||
610 | return ret; | ||
611 | } | ||
612 | |||
613 | static int futex_close(struct inode *inode, struct file *filp) | ||
614 | { | ||
615 | struct futex_q *q = filp->private_data; | ||
616 | |||
617 | unqueue_me(q); | ||
618 | kfree(q); | ||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | /* This is one-shot: once it's gone off you need a new fd */ | ||
623 | static unsigned int futex_poll(struct file *filp, | ||
624 | struct poll_table_struct *wait) | ||
625 | { | ||
626 | struct futex_q *q = filp->private_data; | ||
627 | int ret = 0; | ||
628 | |||
629 | poll_wait(filp, &q->waiters, wait); | ||
630 | |||
631 | /* | ||
632 | * list_empty() is safe here without any lock. | ||
633 | * q->lock_ptr != 0 is not safe, because of ordering against wakeup. | ||
634 | */ | ||
635 | if (list_empty(&q->list)) | ||
636 | ret = POLLIN | POLLRDNORM; | ||
637 | |||
638 | return ret; | ||
639 | } | ||
640 | |||
641 | static struct file_operations futex_fops = { | ||
642 | .release = futex_close, | ||
643 | .poll = futex_poll, | ||
644 | }; | ||
645 | |||
646 | /* | ||
647 | * Signal allows caller to avoid the race which would occur if they | ||
648 | * set the sigio stuff up afterwards. | ||
649 | */ | ||
650 | static int futex_fd(unsigned long uaddr, int signal) | ||
651 | { | ||
652 | struct futex_q *q; | ||
653 | struct file *filp; | ||
654 | int ret, err; | ||
655 | |||
656 | ret = -EINVAL; | ||
657 | if (signal < 0 || signal > _NSIG) | ||
658 | goto out; | ||
659 | |||
660 | ret = get_unused_fd(); | ||
661 | if (ret < 0) | ||
662 | goto out; | ||
663 | filp = get_empty_filp(); | ||
664 | if (!filp) { | ||
665 | put_unused_fd(ret); | ||
666 | ret = -ENFILE; | ||
667 | goto out; | ||
668 | } | ||
669 | filp->f_op = &futex_fops; | ||
670 | filp->f_vfsmnt = mntget(futex_mnt); | ||
671 | filp->f_dentry = dget(futex_mnt->mnt_root); | ||
672 | filp->f_mapping = filp->f_dentry->d_inode->i_mapping; | ||
673 | |||
674 | if (signal) { | ||
675 | int err; | ||
676 | err = f_setown(filp, current->pid, 1); | ||
677 | if (err < 0) { | ||
678 | put_unused_fd(ret); | ||
679 | put_filp(filp); | ||
680 | ret = err; | ||
681 | goto out; | ||
682 | } | ||
683 | filp->f_owner.signum = signal; | ||
684 | } | ||
685 | |||
686 | q = kmalloc(sizeof(*q), GFP_KERNEL); | ||
687 | if (!q) { | ||
688 | put_unused_fd(ret); | ||
689 | put_filp(filp); | ||
690 | ret = -ENOMEM; | ||
691 | goto out; | ||
692 | } | ||
693 | |||
694 | down_read(&current->mm->mmap_sem); | ||
695 | err = get_futex_key(uaddr, &q->key); | ||
696 | |||
697 | if (unlikely(err != 0)) { | ||
698 | up_read(&current->mm->mmap_sem); | ||
699 | put_unused_fd(ret); | ||
700 | put_filp(filp); | ||
701 | kfree(q); | ||
702 | return err; | ||
703 | } | ||
704 | |||
705 | /* | ||
706 | * queue_me() must be called before releasing mmap_sem, because | ||
707 | * key->shared.inode needs to be referenced while holding it. | ||
708 | */ | ||
709 | filp->private_data = q; | ||
710 | |||
711 | queue_me(q, ret, filp); | ||
712 | up_read(&current->mm->mmap_sem); | ||
713 | |||
714 | /* Now we map fd to filp, so userspace can access it */ | ||
715 | fd_install(ret, filp); | ||
716 | out: | ||
717 | return ret; | ||
718 | } | ||
719 | |||
720 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | ||
721 | unsigned long uaddr2, int val2, int val3) | ||
722 | { | ||
723 | int ret; | ||
724 | |||
725 | switch (op) { | ||
726 | case FUTEX_WAIT: | ||
727 | ret = futex_wait(uaddr, val, timeout); | ||
728 | break; | ||
729 | case FUTEX_WAKE: | ||
730 | ret = futex_wake(uaddr, val); | ||
731 | break; | ||
732 | case FUTEX_FD: | ||
733 | /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ | ||
734 | ret = futex_fd(uaddr, val); | ||
735 | break; | ||
736 | case FUTEX_REQUEUE: | ||
737 | ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); | ||
738 | break; | ||
739 | case FUTEX_CMP_REQUEUE: | ||
740 | ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); | ||
741 | break; | ||
742 | default: | ||
743 | ret = -ENOSYS; | ||
744 | } | ||
745 | return ret; | ||
746 | } | ||
747 | |||
748 | |||
749 | asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, | ||
750 | struct timespec __user *utime, u32 __user *uaddr2, | ||
751 | int val3) | ||
752 | { | ||
753 | struct timespec t; | ||
754 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | ||
755 | int val2 = 0; | ||
756 | |||
757 | if ((op == FUTEX_WAIT) && utime) { | ||
758 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | ||
759 | return -EFAULT; | ||
760 | timeout = timespec_to_jiffies(&t) + 1; | ||
761 | } | ||
762 | /* | ||
763 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. | ||
764 | */ | ||
765 | if (op >= FUTEX_REQUEUE) | ||
766 | val2 = (int) (unsigned long) utime; | ||
767 | |||
768 | return do_futex((unsigned long)uaddr, op, val, timeout, | ||
769 | (unsigned long)uaddr2, val2, val3); | ||
770 | } | ||
771 | |||
772 | static struct super_block * | ||
773 | futexfs_get_sb(struct file_system_type *fs_type, | ||
774 | int flags, const char *dev_name, void *data) | ||
775 | { | ||
776 | return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); | ||
777 | } | ||
778 | |||
779 | static struct file_system_type futex_fs_type = { | ||
780 | .name = "futexfs", | ||
781 | .get_sb = futexfs_get_sb, | ||
782 | .kill_sb = kill_anon_super, | ||
783 | }; | ||
784 | |||
785 | static int __init init(void) | ||
786 | { | ||
787 | unsigned int i; | ||
788 | |||
789 | register_filesystem(&futex_fs_type); | ||
790 | futex_mnt = kern_mount(&futex_fs_type); | ||
791 | |||
792 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | ||
793 | INIT_LIST_HEAD(&futex_queues[i].chain); | ||
794 | spin_lock_init(&futex_queues[i].lock); | ||
795 | } | ||
796 | return 0; | ||
797 | } | ||
798 | __initcall(init); | ||
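
As a companion to the futex_wait()/futex_wake() paths above, here is a hypothetical userspace sketch of the waiter/waker protocol described in the comment inside futex_wait(). It is not part of this patch: the sys_futex() wrapper, the futex_word variable and the error handling are illustrative assumptions, though FUTEX_WAIT/FUTEX_WAKE and the six-argument syscall form are the real interface.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>

static int futex_word;	/* the shared variable; put it in a MAP_SHARED page
			 * to wake waiters in other processes */

static long sys_futex(int *uaddr, int op, int val,
		      const struct timespec *timeout)
{
	return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

/* Waiter: re-check the condition, then sleep only if it is still unmet. */
static void wait_for_nonzero(void)
{
	int val;

	while ((val = futex_word) == 0) {
		/* The kernel blocks only if *uaddr still equals val;
		 * otherwise FUTEX_WAIT fails with EWOULDBLOCK and we retry. */
		if (sys_futex(&futex_word, FUTEX_WAIT, val, NULL) == -1 &&
		    errno != EWOULDBLOCK && errno != EINTR)
			break;
	}
}

/* Waker: make the condition true first, then wake one sleeper. */
static void post(void)
{
	futex_word = 1;
	sys_futex(&futex_word, FUTEX_WAKE, 1, NULL);
}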
diff --git a/kernel/intermodule.c b/kernel/intermodule.c new file mode 100644 index 000000000000..388977f3e9b7 --- /dev/null +++ b/kernel/intermodule.c | |||
@@ -0,0 +1,182 @@ | |||
1 | /* Deprecated, do not use. Moved from module.c to here. --RR */ | ||
2 | |||
3 | /* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */ | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/kmod.h> | ||
6 | #include <linux/spinlock.h> | ||
7 | #include <linux/list.h> | ||
8 | #include <linux/slab.h> | ||
9 | |||
10 | /* inter_module functions are always available, even when the kernel is | ||
11 | * compiled without modules. Consumers of inter_module_xxx routines | ||
12 | * will always work, even when both ends are built into the kernel; this | ||
13 | * approach removes lots of #ifdefs in mainline code. | ||
14 | */ | ||
15 | |||
16 | static struct list_head ime_list = LIST_HEAD_INIT(ime_list); | ||
17 | static DEFINE_SPINLOCK(ime_lock); | ||
18 | static int kmalloc_failed; | ||
19 | |||
20 | struct inter_module_entry { | ||
21 | struct list_head list; | ||
22 | const char *im_name; | ||
23 | struct module *owner; | ||
24 | const void *userdata; | ||
25 | }; | ||
26 | |||
27 | /** | ||
28 | * inter_module_register - register a new set of inter module data. | ||
29 | * @im_name: an arbitrary string to identify the data, must be unique | ||
30 | * @owner: module that is registering the data, always use THIS_MODULE | ||
31 | * @userdata: pointer to arbitrary userdata to be registered | ||
32 | * | ||
33 | * Description: Check that the im_name has not already been registered, | ||
34 | * complain if it has. For new data, add it to the inter_module_entry | ||
35 | * list. | ||
36 | */ | ||
37 | void inter_module_register(const char *im_name, struct module *owner, const void *userdata) | ||
38 | { | ||
39 | struct list_head *tmp; | ||
40 | struct inter_module_entry *ime, *ime_new; | ||
41 | |||
42 | if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { | ||
43 | /* Overloaded kernel, not fatal */ | ||
44 | printk(KERN_ERR | ||
45 | "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", | ||
46 | im_name); | ||
47 | kmalloc_failed = 1; | ||
48 | return; | ||
49 | } | ||
50 | memset(ime_new, 0, sizeof(*ime_new)); | ||
51 | ime_new->im_name = im_name; | ||
52 | ime_new->owner = owner; | ||
53 | ime_new->userdata = userdata; | ||
54 | |||
55 | spin_lock(&ime_lock); | ||
56 | list_for_each(tmp, &ime_list) { | ||
57 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
58 | if (strcmp(ime->im_name, im_name) == 0) { | ||
59 | spin_unlock(&ime_lock); | ||
60 | kfree(ime_new); | ||
61 | /* Program logic error, fatal */ | ||
62 | printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); | ||
63 | BUG(); | ||
64 | } | ||
65 | } | ||
66 | list_add(&(ime_new->list), &ime_list); | ||
67 | spin_unlock(&ime_lock); | ||
68 | } | ||
69 | |||
70 | /** | ||
71 | * inter_module_unregister - unregister a set of inter module data. | ||
72 | * @im_name: an arbitrary string to identify the data, must be unique | ||
73 | * | ||
74 | * Description: Check that the im_name has been registered, complain if | ||
75 | * it has not. For existing data, remove it from the | ||
76 | * inter_module_entry list. | ||
77 | */ | ||
78 | void inter_module_unregister(const char *im_name) | ||
79 | { | ||
80 | struct list_head *tmp; | ||
81 | struct inter_module_entry *ime; | ||
82 | |||
83 | spin_lock(&ime_lock); | ||
84 | list_for_each(tmp, &ime_list) { | ||
85 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
86 | if (strcmp(ime->im_name, im_name) == 0) { | ||
87 | list_del(&(ime->list)); | ||
88 | spin_unlock(&ime_lock); | ||
89 | kfree(ime); | ||
90 | return; | ||
91 | } | ||
92 | } | ||
93 | spin_unlock(&ime_lock); | ||
94 | if (kmalloc_failed) { | ||
95 | printk(KERN_ERR | ||
96 | "inter_module_unregister: no entry for '%s', " | ||
97 | "probably caused by previous kmalloc failure\n", | ||
98 | im_name); | ||
99 | return; | ||
100 | } | ||
101 | else { | ||
102 | /* Program logic error, fatal */ | ||
103 | printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); | ||
104 | BUG(); | ||
105 | } | ||
106 | } | ||
107 | |||
108 | /** | ||
109 | * inter_module_get - return arbitrary userdata from another module. | ||
110 | * @im_name: an arbitrary string to identify the data, must be unique | ||
111 | * | ||
112 | * Description: If the im_name has not been registered, return NULL. | ||
113 | * Try to increment the use count on the owning module, if that fails | ||
114 | * then return NULL. Otherwise return the userdata. | ||
115 | */ | ||
116 | static const void *inter_module_get(const char *im_name) | ||
117 | { | ||
118 | struct list_head *tmp; | ||
119 | struct inter_module_entry *ime; | ||
120 | const void *result = NULL; | ||
121 | |||
122 | spin_lock(&ime_lock); | ||
123 | list_for_each(tmp, &ime_list) { | ||
124 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
125 | if (strcmp(ime->im_name, im_name) == 0) { | ||
126 | if (try_module_get(ime->owner)) | ||
127 | result = ime->userdata; | ||
128 | break; | ||
129 | } | ||
130 | } | ||
131 | spin_unlock(&ime_lock); | ||
132 | return(result); | ||
133 | } | ||
134 | |||
135 | /** | ||
136 | * inter_module_get_request - im get with automatic request_module. | ||
137 | * @im_name: an arbitrary string to identify the data, must be unique | ||
138 | * @modname: module that is expected to register im_name | ||
139 | * | ||
140 | * Description: If inter_module_get fails, do request_module then retry. | ||
141 | */ | ||
142 | const void *inter_module_get_request(const char *im_name, const char *modname) | ||
143 | { | ||
144 | const void *result = inter_module_get(im_name); | ||
145 | if (!result) { | ||
146 | request_module("%s", modname); | ||
147 | result = inter_module_get(im_name); | ||
148 | } | ||
149 | return(result); | ||
150 | } | ||
151 | |||
152 | /** | ||
153 | * inter_module_put - release use of data from another module. | ||
154 | * @im_name: an arbitrary string to identify the data, must be unique | ||
155 | * | ||
156 | * Description: If the im_name has not been registered, complain, | ||
157 | * otherwise decrement the use count on the owning module. | ||
158 | */ | ||
159 | void inter_module_put(const char *im_name) | ||
160 | { | ||
161 | struct list_head *tmp; | ||
162 | struct inter_module_entry *ime; | ||
163 | |||
164 | spin_lock(&ime_lock); | ||
165 | list_for_each(tmp, &ime_list) { | ||
166 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
167 | if (strcmp(ime->im_name, im_name) == 0) { | ||
168 | if (ime->owner) | ||
169 | module_put(ime->owner); | ||
170 | spin_unlock(&ime_lock); | ||
171 | return; | ||
172 | } | ||
173 | } | ||
174 | spin_unlock(&ime_lock); | ||
175 | printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); | ||
176 | BUG(); | ||
177 | } | ||
178 | |||
179 | EXPORT_SYMBOL(inter_module_register); | ||
180 | EXPORT_SYMBOL(inter_module_unregister); | ||
181 | EXPORT_SYMBOL(inter_module_get_request); | ||
182 | EXPORT_SYMBOL(inter_module_put); | ||
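As an editorial aside, a short usage sketch may make the pairing of these calls clearer. Everything named below (struct provider_ops, the "provider_ops" string, the "provider" module name, do_work) is illustrative and not part of this interface; only the inter_module_* calls come from the code above.

    /* Hypothetical exporter module: publishes a pointer under an agreed name. */
    #include <linux/module.h>
    #include <linux/init.h>
    #include <linux/errno.h>

    struct provider_ops {
            int (*do_work)(int arg);        /* illustrative callback */
    };

    static struct provider_ops my_ops;

    static int __init provider_init(void)
    {
            /* Consumers that grab this will pin us via try_module_get(). */
            inter_module_register("provider_ops", THIS_MODULE, &my_ops);
            return 0;
    }

    static void __exit provider_exit(void)
    {
            inter_module_unregister("provider_ops");
    }

    module_init(provider_init);
    module_exit(provider_exit);

    /* Consumer side (conceptually a second module): */
    static int consumer_use(void)
    {
            const struct provider_ops *ops;

            /* Loads the "provider" module on demand if the name is not yet registered. */
            ops = inter_module_get_request("provider_ops", "provider");
            if (!ops)
                    return -ENODEV;
            /* ... call through ops ... */
            inter_module_put("provider_ops");       /* drop the module reference */
            return 0;
    }

The value of inter_module_get_request() over a plain lookup is the implicit request_module(): the consumer can be loaded first and still find its provider.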
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile new file mode 100644 index 000000000000..49378738ff5e --- /dev/null +++ b/kernel/irq/Makefile | |||
@@ -0,0 +1,5 @@ | |||
1 | |||
2 | obj-y := handle.o manage.o spurious.o | ||
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | ||
4 | obj-$(CONFIG_PROC_FS) += proc.o | ||
5 | |||
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c new file mode 100644 index 000000000000..98d62d8efeaf --- /dev/null +++ b/kernel/irq/autoprobe.c | |||
@@ -0,0 +1,189 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/autoprobe.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the interrupt probing code and driver APIs. | ||
7 | */ | ||
8 | |||
9 | #include <linux/irq.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/interrupt.h> | ||
12 | |||
13 | /* | ||
14 | * Autodetection depends on the fact that any interrupt that | ||
15 | * comes in on an unassigned handler will get stuck with | ||

16 | * "IRQ_WAITING" cleared and the interrupt disabled. | ||
17 | */ | ||
18 | static DECLARE_MUTEX(probe_sem); | ||
19 | |||
20 | /** | ||
21 | * probe_irq_on - begin an interrupt autodetect | ||
22 | * | ||
23 | * Commence probing for an interrupt. The interrupts are scanned | ||
24 | * and a mask of potential interrupt lines is returned. | ||
25 | * | ||
26 | */ | ||
27 | unsigned long probe_irq_on(void) | ||
28 | { | ||
29 | unsigned long val, delay; | ||
30 | irq_desc_t *desc; | ||
31 | unsigned int i; | ||
32 | |||
33 | down(&probe_sem); | ||
34 | /* | ||
35 | * something may have generated an irq long ago and we want to | ||
36 | * flush such a longstanding irq before considering it as spurious. | ||
37 | */ | ||
38 | for (i = NR_IRQS-1; i > 0; i--) { | ||
39 | desc = irq_desc + i; | ||
40 | |||
41 | spin_lock_irq(&desc->lock); | ||
42 | if (!irq_desc[i].action) | ||
43 | irq_desc[i].handler->startup(i); | ||
44 | spin_unlock_irq(&desc->lock); | ||
45 | } | ||
46 | |||
47 | /* Wait for longstanding interrupts to trigger. */ | ||
48 | for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) | ||
49 | /* about 20ms delay */ barrier(); | ||
50 | |||
51 | /* | ||
52 | * enable any unassigned irqs | ||
53 | * (we must startup again here because if a longstanding irq | ||
54 | * happened in the previous stage, it may have masked itself) | ||
55 | */ | ||
56 | for (i = NR_IRQS-1; i > 0; i--) { | ||
57 | desc = irq_desc + i; | ||
58 | |||
59 | spin_lock_irq(&desc->lock); | ||
60 | if (!desc->action) { | ||
61 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | ||
62 | if (desc->handler->startup(i)) | ||
63 | desc->status |= IRQ_PENDING; | ||
64 | } | ||
65 | spin_unlock_irq(&desc->lock); | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Wait for spurious interrupts to trigger | ||
70 | */ | ||
71 | for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) | ||
72 | /* about 100ms delay */ barrier(); | ||
73 | |||
74 | /* | ||
75 | * Now filter out any obviously spurious interrupts | ||
76 | */ | ||
77 | val = 0; | ||
78 | for (i = 0; i < NR_IRQS; i++) { | ||
79 | irq_desc_t *desc = irq_desc + i; | ||
80 | unsigned int status; | ||
81 | |||
82 | spin_lock_irq(&desc->lock); | ||
83 | status = desc->status; | ||
84 | |||
85 | if (status & IRQ_AUTODETECT) { | ||
86 | /* It triggered already - consider it spurious. */ | ||
87 | if (!(status & IRQ_WAITING)) { | ||
88 | desc->status = status & ~IRQ_AUTODETECT; | ||
89 | desc->handler->shutdown(i); | ||
90 | } else | ||
91 | if (i < 32) | ||
92 | val |= 1 << i; | ||
93 | } | ||
94 | spin_unlock_irq(&desc->lock); | ||
95 | } | ||
96 | |||
97 | return val; | ||
98 | } | ||
99 | |||
100 | EXPORT_SYMBOL(probe_irq_on); | ||
101 | |||
102 | /** | ||
103 | * probe_irq_mask - scan a bitmap of interrupt lines | ||
104 | * @val: mask of interrupts to consider | ||
105 | * | ||
106 | * Scan the interrupt lines and return a bitmap of active | ||
107 | * autodetect interrupts. The interrupt probe logic state | ||
108 | * is then returned to its previous value. | ||
109 | * | ||
110 | * Note: we need to scan all the irq's even though we will | ||
111 | * only return autodetect irq numbers - just so that we reset | ||
112 | * them all to a known state. | ||
113 | */ | ||
114 | unsigned int probe_irq_mask(unsigned long val) | ||
115 | { | ||
116 | unsigned int mask; | ||
117 | int i; | ||
118 | |||
119 | mask = 0; | ||
120 | for (i = 0; i < NR_IRQS; i++) { | ||
121 | irq_desc_t *desc = irq_desc + i; | ||
122 | unsigned int status; | ||
123 | |||
124 | spin_lock_irq(&desc->lock); | ||
125 | status = desc->status; | ||
126 | |||
127 | if (status & IRQ_AUTODETECT) { | ||
128 | if (i < 16 && !(status & IRQ_WAITING)) | ||
129 | mask |= 1 << i; | ||
130 | |||
131 | desc->status = status & ~IRQ_AUTODETECT; | ||
132 | desc->handler->shutdown(i); | ||
133 | } | ||
134 | spin_unlock_irq(&desc->lock); | ||
135 | } | ||
136 | up(&probe_sem); | ||
137 | |||
138 | return mask & val; | ||
139 | } | ||
140 | EXPORT_SYMBOL(probe_irq_mask); | ||
141 | |||
142 | /** | ||
143 | * probe_irq_off - end an interrupt autodetect | ||
144 | * @val: mask of potential interrupts (unused) | ||
145 | * | ||
146 | * Scans the unused interrupt lines and returns the line which | ||
147 | * appears to have triggered the interrupt. If no interrupt was | ||
148 | * found then zero is returned. If more than one interrupt is | ||
149 | * found then minus the first candidate is returned to indicate | ||
150 | * there is doubt. | ||
151 | * | ||
152 | * The interrupt probe logic state is returned to its previous | ||
153 | * value. | ||
154 | * | ||
155 | * BUGS: When used in a module (which arguably shouldn't happen) | ||
156 | * nothing prevents two IRQ probe callers from overlapping. The | ||
157 | * results of this are non-optimal. | ||
158 | */ | ||
159 | int probe_irq_off(unsigned long val) | ||
160 | { | ||
161 | int i, irq_found = 0, nr_irqs = 0; | ||
162 | |||
163 | for (i = 0; i < NR_IRQS; i++) { | ||
164 | irq_desc_t *desc = irq_desc + i; | ||
165 | unsigned int status; | ||
166 | |||
167 | spin_lock_irq(&desc->lock); | ||
168 | status = desc->status; | ||
169 | |||
170 | if (status & IRQ_AUTODETECT) { | ||
171 | if (!(status & IRQ_WAITING)) { | ||
172 | if (!nr_irqs) | ||
173 | irq_found = i; | ||
174 | nr_irqs++; | ||
175 | } | ||
176 | desc->status = status & ~IRQ_AUTODETECT; | ||
177 | desc->handler->shutdown(i); | ||
178 | } | ||
179 | spin_unlock_irq(&desc->lock); | ||
180 | } | ||
181 | up(&probe_sem); | ||
182 | |||
183 | if (nr_irqs > 1) | ||
184 | irq_found = -irq_found; | ||
185 | return irq_found; | ||
186 | } | ||
187 | |||
188 | EXPORT_SYMBOL(probe_irq_off); | ||
189 | |||
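A hedged sketch of how a legacy ISA-style driver would use this probing API follows; my_board_trigger_irq() is a hypothetical helper that makes the hardware raise its interrupt, and the 20 ms settle time is arbitrary.

    #include <linux/interrupt.h>
    #include <linux/delay.h>
    #include <linux/errno.h>

    extern void my_board_trigger_irq(void);     /* hypothetical hardware poke */

    static int my_probe_irq(void)
    {
            unsigned long mask;
            int irq;

            mask = probe_irq_on();          /* begin autodetection on unassigned lines */
            my_board_trigger_irq();         /* make the card raise its interrupt */
            mdelay(20);                     /* give the interrupt time to arrive */
            irq = probe_irq_off(mask);      /* 0: nothing fired, <0: several lines fired */

            if (irq <= 0)
                    return -ENODEV;         /* probe failed or was ambiguous */
            return irq;                     /* the line the board is wired to */
    }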
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c new file mode 100644 index 000000000000..2fb0e46e11f3 --- /dev/null +++ b/kernel/irq/handle.c | |||
@@ -0,0 +1,193 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/handle.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the core interrupt handling code. | ||
7 | */ | ||
8 | |||
9 | #include <linux/irq.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/random.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/kernel_stat.h> | ||
14 | |||
15 | #include "internals.h" | ||
16 | |||
17 | /* | ||
18 | * Linux has a controller-independent interrupt architecture. | ||
19 | * Every controller has a 'controller-template', that is used | ||
20 | * by the main code to do the right thing. Each driver-visible | ||
21 | * interrupt source is transparently wired to the appropriate | ||
22 | * controller. Thus drivers need not be aware of the | ||
23 | * interrupt-controller. | ||
24 | * | ||
25 | * The code is designed to be easily extended with new/different | ||
26 | * interrupt controllers, without having to do assembly magic or | ||
27 | * having to touch the generic code. | ||
28 | * | ||
29 | * Controller mappings for all interrupt sources: | ||
30 | */ | ||
31 | irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { | ||
32 | [0 ... NR_IRQS-1] = { | ||
33 | .handler = &no_irq_type, | ||
34 | .lock = SPIN_LOCK_UNLOCKED | ||
35 | } | ||
36 | }; | ||
37 | |||
38 | /* | ||
39 | * Generic 'no controller' code | ||
40 | */ | ||
41 | static void end_none(unsigned int irq) { } | ||
42 | static void enable_none(unsigned int irq) { } | ||
43 | static void disable_none(unsigned int irq) { } | ||
44 | static void shutdown_none(unsigned int irq) { } | ||
45 | static unsigned int startup_none(unsigned int irq) { return 0; } | ||
46 | |||
47 | static void ack_none(unsigned int irq) | ||
48 | { | ||
49 | /* | ||
50 | * 'what should we do if we get a hw irq event on an illegal vector'. | ||
51 | * each architecture has to answer this for itself. | ||
52 | */ | ||
53 | ack_bad_irq(irq); | ||
54 | } | ||
55 | |||
56 | struct hw_interrupt_type no_irq_type = { | ||
57 | .typename = "none", | ||
58 | .startup = startup_none, | ||
59 | .shutdown = shutdown_none, | ||
60 | .enable = enable_none, | ||
61 | .disable = disable_none, | ||
62 | .ack = ack_none, | ||
63 | .end = end_none, | ||
64 | .set_affinity = NULL | ||
65 | }; | ||
66 | |||
67 | /* | ||
68 | * Special, empty irq handler: | ||
69 | */ | ||
70 | irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) | ||
71 | { | ||
72 | return IRQ_NONE; | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Have got an event to handle: | ||
77 | */ | ||
78 | fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, | ||
79 | struct irqaction *action) | ||
80 | { | ||
81 | int ret, retval = 0, status = 0; | ||
82 | |||
83 | if (!(action->flags & SA_INTERRUPT)) | ||
84 | local_irq_enable(); | ||
85 | |||
86 | do { | ||
87 | ret = action->handler(irq, action->dev_id, regs); | ||
88 | if (ret == IRQ_HANDLED) | ||
89 | status |= action->flags; | ||
90 | retval |= ret; | ||
91 | action = action->next; | ||
92 | } while (action); | ||
93 | |||
94 | if (status & SA_SAMPLE_RANDOM) | ||
95 | add_interrupt_randomness(irq); | ||
96 | local_irq_disable(); | ||
97 | |||
98 | return retval; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * do_IRQ handles all normal device IRQ's (the special | ||
103 | * SMP cross-CPU interrupts have their own specific | ||
104 | * handlers). | ||
105 | */ | ||
106 | fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | ||
107 | { | ||
108 | irq_desc_t *desc = irq_desc + irq; | ||
109 | struct irqaction * action; | ||
110 | unsigned int status; | ||
111 | |||
112 | kstat_this_cpu.irqs[irq]++; | ||
113 | if (desc->status & IRQ_PER_CPU) { | ||
114 | irqreturn_t action_ret; | ||
115 | |||
116 | /* | ||
117 | * No locking required for CPU-local interrupts: | ||
118 | */ | ||
119 | desc->handler->ack(irq); | ||
120 | action_ret = handle_IRQ_event(irq, regs, desc->action); | ||
121 | if (!noirqdebug) | ||
122 | note_interrupt(irq, desc, action_ret); | ||
123 | desc->handler->end(irq); | ||
124 | return 1; | ||
125 | } | ||
126 | |||
127 | spin_lock(&desc->lock); | ||
128 | desc->handler->ack(irq); | ||
129 | /* | ||
130 | * REPLAY is when Linux resends an IRQ that was dropped earlier | ||
131 | * WAITING is used by probe to mark irqs that are being tested | ||
132 | */ | ||
133 | status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); | ||
134 | status |= IRQ_PENDING; /* we _want_ to handle it */ | ||
135 | |||
136 | /* | ||
137 | * If the IRQ is disabled for whatever reason, we cannot | ||
138 | * use the action we have. | ||
139 | */ | ||
140 | action = NULL; | ||
141 | if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { | ||
142 | action = desc->action; | ||
143 | status &= ~IRQ_PENDING; /* we commit to handling */ | ||
144 | status |= IRQ_INPROGRESS; /* we are handling it */ | ||
145 | } | ||
146 | desc->status = status; | ||
147 | |||
148 | /* | ||
149 | * If there is no IRQ handler or it was disabled, exit early. | ||
150 | * Since we set PENDING, if another processor is handling | ||
151 | * a different instance of this same irq, the other processor | ||
152 | * will take care of it. | ||
153 | */ | ||
154 | if (unlikely(!action)) | ||
155 | goto out; | ||
156 | |||
157 | /* | ||
158 | * Edge triggered interrupts need to remember | ||
159 | * pending events. | ||
160 | * This applies to any hw interrupts that allow a second | ||
161 | * instance of the same irq to arrive while we are in do_IRQ | ||
162 | * or in the handler. But the code here only handles the _second_ | ||
163 | * instance of the irq, not the third or fourth. So it is mostly | ||
164 | * useful for irq hardware that does not mask cleanly in an | ||
165 | * SMP environment. | ||
166 | */ | ||
167 | for (;;) { | ||
168 | irqreturn_t action_ret; | ||
169 | |||
170 | spin_unlock(&desc->lock); | ||
171 | |||
172 | action_ret = handle_IRQ_event(irq, regs, action); | ||
173 | |||
174 | spin_lock(&desc->lock); | ||
175 | if (!noirqdebug) | ||
176 | note_interrupt(irq, desc, action_ret); | ||
177 | if (likely(!(desc->status & IRQ_PENDING))) | ||
178 | break; | ||
179 | desc->status &= ~IRQ_PENDING; | ||
180 | } | ||
181 | desc->status &= ~IRQ_INPROGRESS; | ||
182 | |||
183 | out: | ||
184 | /* | ||
185 | * The ->end() handler has to deal with interrupts which got | ||
186 | * disabled while the handler was running. | ||
187 | */ | ||
188 | desc->handler->end(irq); | ||
189 | spin_unlock(&desc->lock); | ||
190 | |||
191 | return 1; | ||
192 | } | ||
193 | |||
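Since handle_IRQ_event() above ORs together the return values of every irqaction on the line, each handler must report honestly whether its own device raised the interrupt. A minimal handler sketch against this 2.6.12-era prototype is shown below; struct my_dev and the MY_* register bits are made up for illustration.

    #include <linux/interrupt.h>
    #include <linux/types.h>
    #include <asm/io.h>

    /* Hypothetical per-device state; the register layout below is invented. */
    struct my_dev {
            void __iomem *regs;
    };

    #define MY_STATUS       0x00    /* status register offset (illustrative) */
    #define MY_IRQ_PENDING  0x01    /* "this device raised an interrupt" bit */

    static irqreturn_t my_interrupt(int irq, void *dev_id, struct pt_regs *regs)
    {
            struct my_dev *dev = dev_id;
            u32 status = readl(dev->regs + MY_STATUS);

            if (!(status & MY_IRQ_PENDING))
                    return IRQ_NONE;        /* not ours; feeds note_interrupt()'s accounting */

            writel(MY_IRQ_PENDING, dev->regs + MY_STATUS);  /* acknowledge the device */
            /* ... service the hardware ... */
            return IRQ_HANDLED;             /* lets handle_IRQ_event() credit action->flags */
    }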
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h new file mode 100644 index 000000000000..46feba630266 --- /dev/null +++ b/kernel/irq/internals.h | |||
@@ -0,0 +1,18 @@ | |||
1 | /* | ||
2 | * IRQ subsystem internal functions and variables: | ||
3 | */ | ||
4 | |||
5 | extern int noirqdebug; | ||
6 | |||
7 | #ifdef CONFIG_PROC_FS | ||
8 | extern void register_irq_proc(unsigned int irq); | ||
9 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); | ||
10 | extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); | ||
11 | #else | ||
12 | static inline void register_irq_proc(unsigned int irq) { } | ||
13 | static inline void register_handler_proc(unsigned int irq, | ||
14 | struct irqaction *action) { } | ||
15 | static inline void unregister_handler_proc(unsigned int irq, | ||
16 | struct irqaction *action) { } | ||
17 | #endif | ||
18 | |||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c new file mode 100644 index 000000000000..5202e4c4a5b6 --- /dev/null +++ b/kernel/irq/manage.c | |||
@@ -0,0 +1,349 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/manage.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains driver APIs to the irq subsystem. | ||
7 | */ | ||
8 | |||
9 | #include <linux/irq.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/random.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | |||
14 | #include "internals.h" | ||
15 | |||
16 | #ifdef CONFIG_SMP | ||
17 | |||
18 | cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; | ||
19 | |||
20 | /** | ||
21 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | ||
22 | * | ||
23 | * This function waits for any pending IRQ handlers for this interrupt | ||
24 | * to complete before returning. If you use this function while | ||
25 | * holding a resource the IRQ handler may need, you will deadlock. | ||
26 | * | ||
27 | * This function may be called - with care - from IRQ context. | ||
28 | */ | ||
29 | void synchronize_irq(unsigned int irq) | ||
30 | { | ||
31 | struct irq_desc *desc = irq_desc + irq; | ||
32 | |||
33 | while (desc->status & IRQ_INPROGRESS) | ||
34 | cpu_relax(); | ||
35 | } | ||
36 | |||
37 | EXPORT_SYMBOL(synchronize_irq); | ||
38 | |||
39 | #endif | ||
40 | |||
41 | /** | ||
42 | * disable_irq_nosync - disable an irq without waiting | ||
43 | * @irq: Interrupt to disable | ||
44 | * | ||
45 | * Disable the selected interrupt line. Disables and Enables are | ||
46 | * nested. | ||
47 | * Unlike disable_irq(), this function does not ensure existing | ||
48 | * instances of the IRQ handler have completed before returning. | ||
49 | * | ||
50 | * This function may be called from IRQ context. | ||
51 | */ | ||
52 | void disable_irq_nosync(unsigned int irq) | ||
53 | { | ||
54 | irq_desc_t *desc = irq_desc + irq; | ||
55 | unsigned long flags; | ||
56 | |||
57 | spin_lock_irqsave(&desc->lock, flags); | ||
58 | if (!desc->depth++) { | ||
59 | desc->status |= IRQ_DISABLED; | ||
60 | desc->handler->disable(irq); | ||
61 | } | ||
62 | spin_unlock_irqrestore(&desc->lock, flags); | ||
63 | } | ||
64 | |||
65 | EXPORT_SYMBOL(disable_irq_nosync); | ||
66 | |||
67 | /** | ||
68 | * disable_irq - disable an irq and wait for completion | ||
69 | * @irq: Interrupt to disable | ||
70 | * | ||
71 | * Disable the selected interrupt line. Enables and Disables are | ||
72 | * nested. | ||
73 | * This function waits for any pending IRQ handlers for this interrupt | ||
74 | * to complete before returning. If you use this function while | ||
75 | * holding a resource the IRQ handler may need, you will deadlock. | ||
76 | * | ||
77 | * This function may be called - with care - from IRQ context. | ||
78 | */ | ||
79 | void disable_irq(unsigned int irq) | ||
80 | { | ||
81 | irq_desc_t *desc = irq_desc + irq; | ||
82 | |||
83 | disable_irq_nosync(irq); | ||
84 | if (desc->action) | ||
85 | synchronize_irq(irq); | ||
86 | } | ||
87 | |||
88 | EXPORT_SYMBOL(disable_irq); | ||
89 | |||
90 | /** | ||
91 | * enable_irq - enable handling of an irq | ||
92 | * @irq: Interrupt to enable | ||
93 | * | ||
94 | * Undoes the effect of one call to disable_irq(). If this | ||
95 | * matches the last disable, processing of interrupts on this | ||
96 | * IRQ line is re-enabled. | ||
97 | * | ||
98 | * This function may be called from IRQ context. | ||
99 | */ | ||
100 | void enable_irq(unsigned int irq) | ||
101 | { | ||
102 | irq_desc_t *desc = irq_desc + irq; | ||
103 | unsigned long flags; | ||
104 | |||
105 | spin_lock_irqsave(&desc->lock, flags); | ||
106 | switch (desc->depth) { | ||
107 | case 0: | ||
108 | WARN_ON(1); | ||
109 | break; | ||
110 | case 1: { | ||
111 | unsigned int status = desc->status & ~IRQ_DISABLED; | ||
112 | |||
113 | desc->status = status; | ||
114 | if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | ||
115 | desc->status = status | IRQ_REPLAY; | ||
116 | hw_resend_irq(desc->handler,irq); | ||
117 | } | ||
118 | desc->handler->enable(irq); | ||
119 | /* fall-through */ | ||
120 | } | ||
121 | default: | ||
122 | desc->depth--; | ||
123 | } | ||
124 | spin_unlock_irqrestore(&desc->lock, flags); | ||
125 | } | ||
126 | |||
127 | EXPORT_SYMBOL(enable_irq); | ||
128 | |||
129 | /* | ||
130 | * Internal function that tells the architecture code whether a | ||
131 | * particular irq has been exclusively allocated or is available | ||
132 | * for driver use. | ||
133 | */ | ||
134 | int can_request_irq(unsigned int irq, unsigned long irqflags) | ||
135 | { | ||
136 | struct irqaction *action; | ||
137 | |||
138 | if (irq >= NR_IRQS) | ||
139 | return 0; | ||
140 | |||
141 | action = irq_desc[irq].action; | ||
142 | if (action) | ||
143 | if (irqflags & action->flags & SA_SHIRQ) | ||
144 | action = NULL; | ||
145 | |||
146 | return !action; | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Internal function to register an irqaction - typically used to | ||
151 | * allocate special interrupts that are part of the architecture. | ||
152 | */ | ||
153 | int setup_irq(unsigned int irq, struct irqaction * new) | ||
154 | { | ||
155 | struct irq_desc *desc = irq_desc + irq; | ||
156 | struct irqaction *old, **p; | ||
157 | unsigned long flags; | ||
158 | int shared = 0; | ||
159 | |||
160 | if (desc->handler == &no_irq_type) | ||
161 | return -ENOSYS; | ||
162 | /* | ||
163 | * Some drivers like serial.c use request_irq() heavily, | ||
164 | * so we have to be careful not to interfere with a | ||
165 | * running system. | ||
166 | */ | ||
167 | if (new->flags & SA_SAMPLE_RANDOM) { | ||
168 | /* | ||
169 | * This function might sleep, we want to call it first, | ||
170 | * outside of the atomic block. | ||
171 | * Yes, this might clear the entropy pool if the wrong | ||
172 | * driver is loaded without actually installing a new | ||
173 | * handler, but that is hardly a problem: only the | ||
174 | * sysadmin is able to do this. | ||
175 | */ | ||
176 | rand_initialize_irq(irq); | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * The following block of code has to be executed atomically | ||
181 | */ | ||
182 | spin_lock_irqsave(&desc->lock,flags); | ||
183 | p = &desc->action; | ||
184 | if ((old = *p) != NULL) { | ||
185 | /* Can't share interrupts unless both agree to */ | ||
186 | if (!(old->flags & new->flags & SA_SHIRQ)) { | ||
187 | spin_unlock_irqrestore(&desc->lock,flags); | ||
188 | return -EBUSY; | ||
189 | } | ||
190 | |||
191 | /* add new interrupt at end of irq queue */ | ||
192 | do { | ||
193 | p = &old->next; | ||
194 | old = *p; | ||
195 | } while (old); | ||
196 | shared = 1; | ||
197 | } | ||
198 | |||
199 | *p = new; | ||
200 | |||
201 | if (!shared) { | ||
202 | desc->depth = 0; | ||
203 | desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | | ||
204 | IRQ_WAITING | IRQ_INPROGRESS); | ||
205 | if (desc->handler->startup) | ||
206 | desc->handler->startup(irq); | ||
207 | else | ||
208 | desc->handler->enable(irq); | ||
209 | } | ||
210 | spin_unlock_irqrestore(&desc->lock,flags); | ||
211 | |||
212 | new->irq = irq; | ||
213 | register_irq_proc(irq); | ||
214 | new->dir = NULL; | ||
215 | register_handler_proc(irq, new); | ||
216 | |||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | /** | ||
221 | * free_irq - free an interrupt | ||
222 | * @irq: Interrupt line to free | ||
223 | * @dev_id: Device identity to free | ||
224 | * | ||
225 | * Remove an interrupt handler. The handler is removed and if the | ||
226 | * interrupt line is no longer in use by any driver it is disabled. | ||
227 | * On a shared IRQ the caller must ensure the interrupt is disabled | ||
228 | * on the card it drives before calling this function. The function | ||
229 | * does not return until any executing interrupts for this IRQ | ||
230 | * have completed. | ||
231 | * | ||
232 | * This function must not be called from interrupt context. | ||
233 | */ | ||
234 | void free_irq(unsigned int irq, void *dev_id) | ||
235 | { | ||
236 | struct irq_desc *desc; | ||
237 | struct irqaction **p; | ||
238 | unsigned long flags; | ||
239 | |||
240 | if (irq >= NR_IRQS) | ||
241 | return; | ||
242 | |||
243 | desc = irq_desc + irq; | ||
244 | spin_lock_irqsave(&desc->lock,flags); | ||
245 | p = &desc->action; | ||
246 | for (;;) { | ||
247 | struct irqaction * action = *p; | ||
248 | |||
249 | if (action) { | ||
250 | struct irqaction **pp = p; | ||
251 | |||
252 | p = &action->next; | ||
253 | if (action->dev_id != dev_id) | ||
254 | continue; | ||
255 | |||
256 | /* Found it - now remove it from the list of entries */ | ||
257 | *pp = action->next; | ||
258 | if (!desc->action) { | ||
259 | desc->status |= IRQ_DISABLED; | ||
260 | if (desc->handler->shutdown) | ||
261 | desc->handler->shutdown(irq); | ||
262 | else | ||
263 | desc->handler->disable(irq); | ||
264 | } | ||
265 | spin_unlock_irqrestore(&desc->lock,flags); | ||
266 | unregister_handler_proc(irq, action); | ||
267 | |||
268 | /* Make sure it's not being used on another CPU */ | ||
269 | synchronize_irq(irq); | ||
270 | kfree(action); | ||
271 | return; | ||
272 | } | ||
273 | printk(KERN_ERR "Trying to free free IRQ%d\n",irq); | ||
274 | spin_unlock_irqrestore(&desc->lock,flags); | ||
275 | return; | ||
276 | } | ||
277 | } | ||
278 | |||
279 | EXPORT_SYMBOL(free_irq); | ||
280 | |||
281 | /** | ||
282 | * request_irq - allocate an interrupt line | ||
283 | * @irq: Interrupt line to allocate | ||
284 | * @handler: Function to be called when the IRQ occurs | ||
285 | * @irqflags: Interrupt type flags | ||
286 | * @devname: An ascii name for the claiming device | ||
287 | * @dev_id: A cookie passed back to the handler function | ||
288 | * | ||
289 | * This call allocates interrupt resources and enables the | ||
290 | * interrupt line and IRQ handling. From the point this | ||
291 | * call is made your handler function may be invoked. Since | ||
292 | * your handler function must clear any interrupt the board | ||
293 | * raises, you must take care both to initialise your hardware | ||
294 | * and to set up the interrupt handler in the right order. | ||
295 | * | ||
296 | * Dev_id must be globally unique. Normally the address of the | ||
297 | * device data structure is used as the cookie. Since the handler | ||
298 | * receives this value, it makes sense to use it. | ||
299 | * | ||
300 | * If your interrupt is shared, you must pass a non-NULL dev_id | ||
301 | * as this is required when freeing the interrupt. | ||
302 | * | ||
303 | * Flags: | ||
304 | * | ||
305 | * SA_SHIRQ Interrupt is shared | ||
306 | * SA_INTERRUPT Disable local interrupts while processing | ||
307 | * SA_SAMPLE_RANDOM The interrupt can be used for entropy | ||
308 | * | ||
309 | */ | ||
310 | int request_irq(unsigned int irq, | ||
311 | irqreturn_t (*handler)(int, void *, struct pt_regs *), | ||
312 | unsigned long irqflags, const char * devname, void *dev_id) | ||
313 | { | ||
314 | struct irqaction * action; | ||
315 | int retval; | ||
316 | |||
317 | /* | ||
318 | * Sanity-check: shared interrupts must pass in a real dev-ID, | ||
319 | * otherwise we'll have trouble later trying to figure out | ||
320 | * which interrupt is which (messes up the interrupt freeing | ||
321 | * logic etc). | ||
322 | */ | ||
323 | if ((irqflags & SA_SHIRQ) && !dev_id) | ||
324 | return -EINVAL; | ||
325 | if (irq >= NR_IRQS) | ||
326 | return -EINVAL; | ||
327 | if (!handler) | ||
328 | return -EINVAL; | ||
329 | |||
330 | action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); | ||
331 | if (!action) | ||
332 | return -ENOMEM; | ||
333 | |||
334 | action->handler = handler; | ||
335 | action->flags = irqflags; | ||
336 | cpus_clear(action->mask); | ||
337 | action->name = devname; | ||
338 | action->next = NULL; | ||
339 | action->dev_id = dev_id; | ||
340 | |||
341 | retval = setup_irq(irq, action); | ||
342 | if (retval) | ||
343 | kfree(action); | ||
344 | |||
345 | return retval; | ||
346 | } | ||
347 | |||
348 | EXPORT_SYMBOL(request_irq); | ||
349 | |||
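Tying the kerneldoc above together, a driver would typically wire its handler up as sketched here. The handler stub, struct my_dev and the "mydrv" name are placeholders; the SA_* interrupt flags come from the signal headers in this kernel generation.

    #include <linux/interrupt.h>
    #include <linux/signal.h>       /* SA_SHIRQ, SA_INTERRUPT, SA_SAMPLE_RANDOM */

    struct my_dev { int dummy; };
    static struct my_dev my_device;         /* hypothetical per-device state, used as dev_id */

    /* Stub for self-containment; see the fuller handler sketch after handle.c.
     * On a shared (SA_SHIRQ) line it would have to return IRQ_NONE whenever the
     * device did not raise the interrupt. */
    static irqreturn_t my_interrupt(int irq, void *dev_id, struct pt_regs *regs)
    {
            return IRQ_HANDLED;
    }

    static int my_attach(unsigned int irq)
    {
            int err;

            /* "mydrv" appears in /proc/interrupts and under /proc/irq/<irq>/ (see proc.c). */
            err = request_irq(irq, my_interrupt, SA_SAMPLE_RANDOM, "mydrv", &my_device);
            if (err)
                    return err;     /* typically -EBUSY (line taken) or -EINVAL */
            return 0;
    }

    static void my_detach(unsigned int irq)
    {
            free_irq(irq, &my_device);      /* dev_id must match the request_irq() cookie */
    }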
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c new file mode 100644 index 000000000000..85d08daa6600 --- /dev/null +++ b/kernel/irq/proc.c | |||
@@ -0,0 +1,159 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/proc.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the /proc/irq/ handling code. | ||
7 | */ | ||
8 | |||
9 | #include <linux/irq.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/interrupt.h> | ||
12 | |||
13 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | ||
14 | |||
15 | #ifdef CONFIG_SMP | ||
16 | |||
17 | /* | ||
18 | * The /proc/irq/<irq>/smp_affinity values: | ||
19 | */ | ||
20 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; | ||
21 | |||
22 | void __attribute__((weak)) | ||
23 | proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | ||
24 | { | ||
25 | irq_affinity[irq] = mask_val; | ||
26 | irq_desc[irq].handler->set_affinity(irq, mask_val); | ||
27 | } | ||
28 | |||
29 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | ||
30 | int count, int *eof, void *data) | ||
31 | { | ||
32 | int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); | ||
33 | |||
34 | if (count - len < 2) | ||
35 | return -EINVAL; | ||
36 | len += sprintf(page + len, "\n"); | ||
37 | return len; | ||
38 | } | ||
39 | |||
40 | int no_irq_affinity; | ||
41 | static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | ||
42 | unsigned long count, void *data) | ||
43 | { | ||
44 | unsigned int irq = (int)(long)data, full_count = count, err; | ||
45 | cpumask_t new_value, tmp; | ||
46 | |||
47 | if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) | ||
48 | return -EIO; | ||
49 | |||
50 | err = cpumask_parse(buffer, count, new_value); | ||
51 | if (err) | ||
52 | return err; | ||
53 | |||
54 | /* | ||
55 | * Do not allow disabling IRQs completely - it's too easy a | ||
56 | * way to make the system unusable accidentally :-) At least | ||
57 | * one online CPU still has to be targeted. | ||
58 | */ | ||
59 | cpus_and(tmp, new_value, cpu_online_map); | ||
60 | if (cpus_empty(tmp)) | ||
61 | return -EINVAL; | ||
62 | |||
63 | proc_set_irq_affinity(irq, new_value); | ||
64 | |||
65 | return full_count; | ||
66 | } | ||
67 | |||
68 | #endif | ||
69 | |||
70 | #define MAX_NAMELEN 128 | ||
71 | |||
72 | static int name_unique(unsigned int irq, struct irqaction *new_action) | ||
73 | { | ||
74 | struct irq_desc *desc = irq_desc + irq; | ||
75 | struct irqaction *action; | ||
76 | |||
77 | for (action = desc->action ; action; action = action->next) | ||
78 | if ((action != new_action) && action->name && | ||
79 | !strcmp(new_action->name, action->name)) | ||
80 | return 0; | ||
81 | return 1; | ||
82 | } | ||
83 | |||
84 | void register_handler_proc(unsigned int irq, struct irqaction *action) | ||
85 | { | ||
86 | char name [MAX_NAMELEN]; | ||
87 | |||
88 | if (!irq_dir[irq] || action->dir || !action->name || | ||
89 | !name_unique(irq, action)) | ||
90 | return; | ||
91 | |||
92 | memset(name, 0, MAX_NAMELEN); | ||
93 | snprintf(name, MAX_NAMELEN, "%s", action->name); | ||
94 | |||
95 | /* create /proc/irq/1234/handler/ */ | ||
96 | action->dir = proc_mkdir(name, irq_dir[irq]); | ||
97 | } | ||
98 | |||
99 | #undef MAX_NAMELEN | ||
100 | |||
101 | #define MAX_NAMELEN 10 | ||
102 | |||
103 | void register_irq_proc(unsigned int irq) | ||
104 | { | ||
105 | char name [MAX_NAMELEN]; | ||
106 | |||
107 | if (!root_irq_dir || | ||
108 | (irq_desc[irq].handler == &no_irq_type) || | ||
109 | irq_dir[irq]) | ||
110 | return; | ||
111 | |||
112 | memset(name, 0, MAX_NAMELEN); | ||
113 | sprintf(name, "%d", irq); | ||
114 | |||
115 | /* create /proc/irq/1234 */ | ||
116 | irq_dir[irq] = proc_mkdir(name, root_irq_dir); | ||
117 | |||
118 | #ifdef CONFIG_SMP | ||
119 | { | ||
120 | struct proc_dir_entry *entry; | ||
121 | |||
122 | /* create /proc/irq/<irq>/smp_affinity */ | ||
123 | entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); | ||
124 | |||
125 | if (entry) { | ||
126 | entry->nlink = 1; | ||
127 | entry->data = (void *)(long)irq; | ||
128 | entry->read_proc = irq_affinity_read_proc; | ||
129 | entry->write_proc = irq_affinity_write_proc; | ||
130 | } | ||
131 | smp_affinity_entry[irq] = entry; | ||
132 | } | ||
133 | #endif | ||
134 | } | ||
135 | |||
136 | #undef MAX_NAMELEN | ||
137 | |||
138 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | ||
139 | { | ||
140 | if (action->dir) | ||
141 | remove_proc_entry(action->dir->name, irq_dir[irq]); | ||
142 | } | ||
143 | |||
144 | void init_irq_proc(void) | ||
145 | { | ||
146 | int i; | ||
147 | |||
148 | /* create /proc/irq */ | ||
149 | root_irq_dir = proc_mkdir("irq", NULL); | ||
150 | if (!root_irq_dir) | ||
151 | return; | ||
152 | |||
153 | /* | ||
154 | * Create entries for all existing IRQs. | ||
155 | */ | ||
156 | for (i = 0; i < NR_IRQS; i++) | ||
157 | register_irq_proc(i); | ||
158 | } | ||
159 | |||
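Because irq_affinity_write_proc() parses a hex CPU bitmap, steering an interrupt from userspace is a single write to /proc/irq/<irq>/smp_affinity. A small illustrative userspace snippet follows; IRQ 19 and mask 0x3 (CPUs 0-1) are arbitrary, and the write needs root plus an SMP kernel whose handler supports set_affinity.

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

            if (!f) {
                    perror("smp_affinity");
                    return 1;
            }
            fprintf(f, "3\n");      /* cpumask_parse() above reads this as a hex CPU bitmap */
            fclose(f);
            return 0;
    }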
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c new file mode 100644 index 000000000000..f6297c306905 --- /dev/null +++ b/kernel/irq/spurious.c | |||
@@ -0,0 +1,96 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/spurious.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains spurious interrupt handling. | ||
7 | */ | ||
8 | |||
9 | #include <linux/irq.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/kallsyms.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | |||
14 | /* | ||
15 | * If 99,900 of the previous 100,000 interrupts have not been handled | ||
16 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic | ||
17 | * and try to turn the IRQ off. | ||
18 | * | ||
19 | * (The other 100 of 100,000 interrupts may have come from a correctly | ||
20 | * functioning device sharing an IRQ with the failing one.) | ||
21 | * | ||
22 | * Called under desc->lock | ||
23 | */ | ||
24 | |||
25 | static void | ||
26 | __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | ||
27 | { | ||
28 | struct irqaction *action; | ||
29 | |||
30 | if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { | ||
31 | printk(KERN_ERR "irq event %d: bogus return value %x\n", | ||
32 | irq, action_ret); | ||
33 | } else { | ||
34 | printk(KERN_ERR "irq %d: nobody cared!\n", irq); | ||
35 | } | ||
36 | dump_stack(); | ||
37 | printk(KERN_ERR "handlers:\n"); | ||
38 | action = desc->action; | ||
39 | while (action) { | ||
40 | printk(KERN_ERR "[<%p>]", action->handler); | ||
41 | print_symbol(" (%s)", | ||
42 | (unsigned long)action->handler); | ||
43 | printk("\n"); | ||
44 | action = action->next; | ||
45 | } | ||
46 | } | ||
47 | |||
48 | void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | ||
49 | { | ||
50 | static int count = 100; | ||
51 | |||
52 | if (count > 0) { | ||
53 | count--; | ||
54 | __report_bad_irq(irq, desc, action_ret); | ||
55 | } | ||
56 | } | ||
57 | |||
58 | void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | ||
59 | { | ||
60 | if (action_ret != IRQ_HANDLED) { | ||
61 | desc->irqs_unhandled++; | ||
62 | if (action_ret != IRQ_NONE) | ||
63 | report_bad_irq(irq, desc, action_ret); | ||
64 | } | ||
65 | |||
66 | desc->irq_count++; | ||
67 | if (desc->irq_count < 100000) | ||
68 | return; | ||
69 | |||
70 | desc->irq_count = 0; | ||
71 | if (desc->irqs_unhandled > 99900) { | ||
72 | /* | ||
73 | * The interrupt is stuck | ||
74 | */ | ||
75 | __report_bad_irq(irq, desc, action_ret); | ||
76 | /* | ||
77 | * Now kill the IRQ | ||
78 | */ | ||
79 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | ||
80 | desc->status |= IRQ_DISABLED; | ||
81 | desc->handler->disable(irq); | ||
82 | } | ||
83 | desc->irqs_unhandled = 0; | ||
84 | } | ||
85 | |||
86 | int noirqdebug; | ||
87 | |||
88 | int __init noirqdebug_setup(char *str) | ||
89 | { | ||
90 | noirqdebug = 1; | ||
91 | printk(KERN_INFO "IRQ lockup detection disabled\n"); | ||
92 | return 1; | ||
93 | } | ||
94 | |||
95 | __setup("noirqdebug", noirqdebug_setup); | ||
96 | |||
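To make the cutoff above concrete: the counters reset every 100,000 interrupts and the line is disabled only when more than 99,900 of them (99.9%) went unhandled, so a shared line whose healthy device services even one interrupt in every 500 accumulates just 99,800 unhandled events per window and stays enabled.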
diff --git a/kernel/itimer.c b/kernel/itimer.c new file mode 100644 index 000000000000..e9a40e947e07 --- /dev/null +++ b/kernel/itimer.c | |||
@@ -0,0 +1,241 @@ | |||
1 | /* | ||
2 | * linux/kernel/itimer.c | ||
3 | * | ||
4 | * Copyright (C) 1992 Darren Senn | ||
5 | */ | ||
6 | |||
7 | /* These are all the functions necessary to implement itimers */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/smp_lock.h> | ||
11 | #include <linux/interrupt.h> | ||
12 | #include <linux/syscalls.h> | ||
13 | #include <linux/time.h> | ||
14 | #include <linux/posix-timers.h> | ||
15 | |||
16 | #include <asm/uaccess.h> | ||
17 | |||
18 | static unsigned long it_real_value(struct signal_struct *sig) | ||
19 | { | ||
20 | unsigned long val = 0; | ||
21 | if (timer_pending(&sig->real_timer)) { | ||
22 | val = sig->real_timer.expires - jiffies; | ||
23 | |||
24 | /* look out for negative/zero itimer.. */ | ||
25 | if ((long) val <= 0) | ||
26 | val = 1; | ||
27 | } | ||
28 | return val; | ||
29 | } | ||
30 | |||
31 | int do_getitimer(int which, struct itimerval *value) | ||
32 | { | ||
33 | struct task_struct *tsk = current; | ||
34 | unsigned long interval, val; | ||
35 | cputime_t cinterval, cval; | ||
36 | |||
37 | switch (which) { | ||
38 | case ITIMER_REAL: | ||
39 | spin_lock_irq(&tsk->sighand->siglock); | ||
40 | interval = tsk->signal->it_real_incr; | ||
41 | val = it_real_value(tsk->signal); | ||
42 | spin_unlock_irq(&tsk->sighand->siglock); | ||
43 | jiffies_to_timeval(val, &value->it_value); | ||
44 | jiffies_to_timeval(interval, &value->it_interval); | ||
45 | break; | ||
46 | case ITIMER_VIRTUAL: | ||
47 | read_lock(&tasklist_lock); | ||
48 | spin_lock_irq(&tsk->sighand->siglock); | ||
49 | cval = tsk->signal->it_virt_expires; | ||
50 | cinterval = tsk->signal->it_virt_incr; | ||
51 | if (!cputime_eq(cval, cputime_zero)) { | ||
52 | struct task_struct *t = tsk; | ||
53 | cputime_t utime = tsk->signal->utime; | ||
54 | do { | ||
55 | utime = cputime_add(utime, t->utime); | ||
56 | t = next_thread(t); | ||
57 | } while (t != tsk); | ||
58 | if (cputime_le(cval, utime)) { /* about to fire */ | ||
59 | cval = jiffies_to_cputime(1); | ||
60 | } else { | ||
61 | cval = cputime_sub(cval, utime); | ||
62 | } | ||
63 | } | ||
64 | spin_unlock_irq(&tsk->sighand->siglock); | ||
65 | read_unlock(&tasklist_lock); | ||
66 | cputime_to_timeval(cval, &value->it_value); | ||
67 | cputime_to_timeval(cinterval, &value->it_interval); | ||
68 | break; | ||
69 | case ITIMER_PROF: | ||
70 | read_lock(&tasklist_lock); | ||
71 | spin_lock_irq(&tsk->sighand->siglock); | ||
72 | cval = tsk->signal->it_prof_expires; | ||
73 | cinterval = tsk->signal->it_prof_incr; | ||
74 | if (!cputime_eq(cval, cputime_zero)) { | ||
75 | struct task_struct *t = tsk; | ||
76 | cputime_t ptime = cputime_add(tsk->signal->utime, | ||
77 | tsk->signal->stime); | ||
78 | do { | ||
79 | ptime = cputime_add(ptime, | ||
80 | cputime_add(t->utime, | ||
81 | t->stime)); | ||
82 | t = next_thread(t); | ||
83 | } while (t != tsk); | ||
84 | if (cputime_le(cval, ptime)) { /* about to fire */ | ||
85 | cval = jiffies_to_cputime(1); | ||
86 | } else { | ||
87 | cval = cputime_sub(cval, ptime); | ||
88 | } | ||
89 | } | ||
90 | spin_unlock_irq(&tsk->sighand->siglock); | ||
91 | read_unlock(&tasklist_lock); | ||
92 | cputime_to_timeval(cval, &value->it_value); | ||
93 | cputime_to_timeval(cinterval, &value->it_interval); | ||
94 | break; | ||
95 | default: | ||
96 | return(-EINVAL); | ||
97 | } | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | asmlinkage long sys_getitimer(int which, struct itimerval __user *value) | ||
102 | { | ||
103 | int error = -EFAULT; | ||
104 | struct itimerval get_buffer; | ||
105 | |||
106 | if (value) { | ||
107 | error = do_getitimer(which, &get_buffer); | ||
108 | if (!error && | ||
109 | copy_to_user(value, &get_buffer, sizeof(get_buffer))) | ||
110 | error = -EFAULT; | ||
111 | } | ||
112 | return error; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Called with P->sighand->siglock held and P->signal->real_timer inactive. | ||
117 | * If interval is nonzero, arm the timer for interval ticks from now. | ||
118 | */ | ||
119 | static inline void it_real_arm(struct task_struct *p, unsigned long interval) | ||
120 | { | ||
121 | p->signal->it_real_value = interval; /* XXX unnecessary field?? */ | ||
122 | if (interval == 0) | ||
123 | return; | ||
124 | if (interval > (unsigned long) LONG_MAX) | ||
125 | interval = LONG_MAX; | ||
126 | p->signal->real_timer.expires = jiffies + interval; | ||
127 | add_timer(&p->signal->real_timer); | ||
128 | } | ||
129 | |||
130 | void it_real_fn(unsigned long __data) | ||
131 | { | ||
132 | struct task_struct * p = (struct task_struct *) __data; | ||
133 | |||
134 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); | ||
135 | |||
136 | /* | ||
137 | * Now restart the timer if necessary. We don't need any locking | ||
138 | * here because do_setitimer makes sure we have finished running | ||
139 | * before it touches anything. | ||
140 | */ | ||
141 | it_real_arm(p, p->signal->it_real_incr); | ||
142 | } | ||
143 | |||
144 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | ||
145 | { | ||
146 | struct task_struct *tsk = current; | ||
147 | unsigned long val, interval; | ||
148 | cputime_t cval, cinterval, nval, ninterval; | ||
149 | |||
150 | switch (which) { | ||
151 | case ITIMER_REAL: | ||
152 | spin_lock_irq(&tsk->sighand->siglock); | ||
153 | interval = tsk->signal->it_real_incr; | ||
154 | val = it_real_value(tsk->signal); | ||
155 | if (val) | ||
156 | del_timer_sync(&tsk->signal->real_timer); | ||
157 | tsk->signal->it_real_incr = | ||
158 | timeval_to_jiffies(&value->it_interval); | ||
159 | it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); | ||
160 | spin_unlock_irq(&tsk->sighand->siglock); | ||
161 | if (ovalue) { | ||
162 | jiffies_to_timeval(val, &ovalue->it_value); | ||
163 | jiffies_to_timeval(interval, | ||
164 | &ovalue->it_interval); | ||
165 | } | ||
166 | break; | ||
167 | case ITIMER_VIRTUAL: | ||
168 | nval = timeval_to_cputime(&value->it_value); | ||
169 | ninterval = timeval_to_cputime(&value->it_interval); | ||
170 | read_lock(&tasklist_lock); | ||
171 | spin_lock_irq(&tsk->sighand->siglock); | ||
172 | cval = tsk->signal->it_virt_expires; | ||
173 | cinterval = tsk->signal->it_virt_incr; | ||
174 | if (!cputime_eq(cval, cputime_zero) || | ||
175 | !cputime_eq(nval, cputime_zero)) { | ||
176 | if (cputime_gt(nval, cputime_zero)) | ||
177 | nval = cputime_add(nval, | ||
178 | jiffies_to_cputime(1)); | ||
179 | set_process_cpu_timer(tsk, CPUCLOCK_VIRT, | ||
180 | &nval, &cval); | ||
181 | } | ||
182 | tsk->signal->it_virt_expires = nval; | ||
183 | tsk->signal->it_virt_incr = ninterval; | ||
184 | spin_unlock_irq(&tsk->sighand->siglock); | ||
185 | read_unlock(&tasklist_lock); | ||
186 | if (ovalue) { | ||
187 | cputime_to_timeval(cval, &ovalue->it_value); | ||
188 | cputime_to_timeval(cinterval, &ovalue->it_interval); | ||
189 | } | ||
190 | break; | ||
191 | case ITIMER_PROF: | ||
192 | nval = timeval_to_cputime(&value->it_value); | ||
193 | ninterval = timeval_to_cputime(&value->it_interval); | ||
194 | read_lock(&tasklist_lock); | ||
195 | spin_lock_irq(&tsk->sighand->siglock); | ||
196 | cval = tsk->signal->it_prof_expires; | ||
197 | cinterval = tsk->signal->it_prof_incr; | ||
198 | if (!cputime_eq(cval, cputime_zero) || | ||
199 | !cputime_eq(nval, cputime_zero)) { | ||
200 | if (cputime_gt(nval, cputime_zero)) | ||
201 | nval = cputime_add(nval, | ||
202 | jiffies_to_cputime(1)); | ||
203 | set_process_cpu_timer(tsk, CPUCLOCK_PROF, | ||
204 | &nval, &cval); | ||
205 | } | ||
206 | tsk->signal->it_prof_expires = nval; | ||
207 | tsk->signal->it_prof_incr = ninterval; | ||
208 | spin_unlock_irq(&tsk->sighand->siglock); | ||
209 | read_unlock(&tasklist_lock); | ||
210 | if (ovalue) { | ||
211 | cputime_to_timeval(cval, &ovalue->it_value); | ||
212 | cputime_to_timeval(cinterval, &ovalue->it_interval); | ||
213 | } | ||
214 | break; | ||
215 | default: | ||
216 | return -EINVAL; | ||
217 | } | ||
218 | return 0; | ||
219 | } | ||
220 | |||
221 | asmlinkage long sys_setitimer(int which, | ||
222 | struct itimerval __user *value, | ||
223 | struct itimerval __user *ovalue) | ||
224 | { | ||
225 | struct itimerval set_buffer, get_buffer; | ||
226 | int error; | ||
227 | |||
228 | if (value) { | ||
229 | if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) | ||
230 | return -EFAULT; | ||
231 | } else | ||
232 | memset((char *) &set_buffer, 0, sizeof(set_buffer)); | ||
233 | |||
234 | error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); | ||
235 | if (error || !ovalue) | ||
236 | return error; | ||
237 | |||
238 | if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) | ||
239 | return -EFAULT; | ||
240 | return 0; | ||
241 | } | ||
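From userspace these syscalls are reached through the usual libc wrappers. A minimal sketch that arms a repeating 250 ms ITIMER_REAL and catches the SIGALRM delivered by it_real_fn() above; the period is arbitrary.

    #include <signal.h>
    #include <stdio.h>
    #include <sys/time.h>
    #include <unistd.h>

    static void on_alarm(int sig)
    {
            write(1, "tick\n", 5);          /* async-signal-safe progress marker */
    }

    int main(void)
    {
            struct itimerval itv = {
                    .it_interval = { .tv_sec = 0, .tv_usec = 250000 },  /* re-arm period */
                    .it_value    = { .tv_sec = 0, .tv_usec = 250000 },  /* first expiry */
            };

            signal(SIGALRM, on_alarm);
            if (setitimer(ITIMER_REAL, &itv, NULL) < 0) {
                    perror("setitimer");
                    return 1;
            }
            for (;;)
                    pause();                /* wait for each SIGALRM */
    }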
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c new file mode 100644 index 000000000000..1627f8d6e0cd --- /dev/null +++ b/kernel/kallsyms.c | |||
@@ -0,0 +1,411 @@ | |||
1 | /* | ||
2 | * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. | ||
3 | * | ||
4 | * Rewritten and vastly simplified by Rusty Russell for in-kernel | ||
5 | * module loader: | ||
6 | * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation | ||
7 | * | ||
8 | * ChangeLog: | ||
9 | * | ||
10 | * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> | ||
11 | * Changed the compression method from stem compression to "table lookup" | ||
12 | * compression (see scripts/kallsyms.c for a more complete description) | ||
13 | */ | ||
14 | #include <linux/kallsyms.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/seq_file.h> | ||
18 | #include <linux/fs.h> | ||
19 | #include <linux/err.h> | ||
20 | #include <linux/proc_fs.h> | ||
21 | #include <linux/mm.h> | ||
22 | |||
23 | #include <asm/sections.h> | ||
24 | |||
25 | #ifdef CONFIG_KALLSYMS_ALL | ||
26 | #define all_var 1 | ||
27 | #else | ||
28 | #define all_var 0 | ||
29 | #endif | ||
30 | |||
31 | /* These will be re-linked against their real values during the second link stage */ | ||
32 | extern unsigned long kallsyms_addresses[] __attribute__((weak)); | ||
33 | extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); | ||
34 | extern u8 kallsyms_names[] __attribute__((weak)); | ||
35 | |||
36 | extern u8 kallsyms_token_table[] __attribute__((weak)); | ||
37 | extern u16 kallsyms_token_index[] __attribute__((weak)); | ||
38 | |||
39 | extern unsigned long kallsyms_markers[] __attribute__((weak)); | ||
40 | |||
41 | static inline int is_kernel_inittext(unsigned long addr) | ||
42 | { | ||
43 | if (addr >= (unsigned long)_sinittext | ||
44 | && addr <= (unsigned long)_einittext) | ||
45 | return 1; | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | static inline int is_kernel_text(unsigned long addr) | ||
50 | { | ||
51 | if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) | ||
52 | return 1; | ||
53 | return in_gate_area_no_task(addr); | ||
54 | } | ||
55 | |||
56 | static inline int is_kernel(unsigned long addr) | ||
57 | { | ||
58 | if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) | ||
59 | return 1; | ||
60 | return in_gate_area_no_task(addr); | ||
61 | } | ||
62 | |||
63 | /* Expand compressed symbol data into the resulting uncompressed string, | ||
64 | given the offset to where the symbol is in the compressed stream */ | ||
65 | static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) | ||
66 | { | ||
67 | int len, skipped_first = 0; | ||
68 | u8 *tptr, *data; | ||
69 | |||
70 | /* get the compressed symbol length from the first symbol byte */ | ||
71 | data = &kallsyms_names[off]; | ||
72 | len = *data; | ||
73 | data++; | ||
74 | |||
75 | /* update the offset to return the offset for the next symbol on | ||
76 | * the compressed stream */ | ||
77 | off += len + 1; | ||
78 | |||
79 | /* for every byte on the compressed symbol data, copy the table | ||
80 | entry for that byte */ | ||
81 | while(len) { | ||
82 | tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; | ||
83 | data++; | ||
84 | len--; | ||
85 | |||
86 | while (*tptr) { | ||
87 | if(skipped_first) { | ||
88 | *result = *tptr; | ||
89 | result++; | ||
90 | } else | ||
91 | skipped_first = 1; | ||
92 | tptr++; | ||
93 | } | ||
94 | } | ||
95 | |||
96 | *result = '\0'; | ||
97 | |||
98 | /* return the offset of the next symbol */ | ||
99 | return off; | ||
100 | } | ||
101 | |||
102 | /* get symbol type information. This is encoded as a single char at the | ||
103 | * beginning of the symbol name */ | ||
104 | static char kallsyms_get_symbol_type(unsigned int off) | ||
105 | { | ||
106 | /* get just the first code, look it up in the token table, and return the | ||
107 | * first char from this token */ | ||
108 | return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; | ||
109 | } | ||
110 | |||
111 | |||
112 | /* find the offset in the compressed stream given an index in the | ||
113 | * kallsyms array */ | ||
114 | static unsigned int get_symbol_offset(unsigned long pos) | ||
115 | { | ||
116 | u8 *name; | ||
117 | int i; | ||
118 | |||
119 | /* use the closest marker we have. We have markers every 256 positions, | ||
120 | * so that should be close enough */ | ||
121 | name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; | ||
122 | |||
123 | /* sequentially scan all the symbols up to the point we're searching for. | ||
124 | * Every symbol is stored in a [<len>][<len> bytes of data] format, so we | ||
125 | * just need to add the len to the current pointer for every symbol we | ||
126 | * wish to skip */ | ||
127 | for(i = 0; i < (pos&0xFF); i++) | ||
128 | name = name + (*name) + 1; | ||
129 | |||
130 | return name - kallsyms_names; | ||
131 | } | ||
132 | |||
133 | /* Lookup the address for this symbol. Returns 0 if not found. */ | ||
134 | unsigned long kallsyms_lookup_name(const char *name) | ||
135 | { | ||
136 | char namebuf[KSYM_NAME_LEN+1]; | ||
137 | unsigned long i; | ||
138 | unsigned int off; | ||
139 | |||
140 | for (i = 0, off = 0; i < kallsyms_num_syms; i++) { | ||
141 | off = kallsyms_expand_symbol(off, namebuf); | ||
142 | |||
143 | if (strcmp(namebuf, name) == 0) | ||
144 | return kallsyms_addresses[i]; | ||
145 | } | ||
146 | return module_kallsyms_lookup_name(name); | ||
147 | } | ||
148 | EXPORT_SYMBOL_GPL(kallsyms_lookup_name); | ||
149 | |||
150 | /* | ||
151 | * Lookup an address | ||
152 | * - modname is set to NULL if it's in the kernel | ||
153 | * - we guarantee that the returned name is valid until we reschedule even if | ||
154 | * it resides in a module | ||
155 | * - we also guarantee that modname will be valid until rescheduled | ||
156 | */ | ||
157 | const char *kallsyms_lookup(unsigned long addr, | ||
158 | unsigned long *symbolsize, | ||
159 | unsigned long *offset, | ||
160 | char **modname, char *namebuf) | ||
161 | { | ||
162 | unsigned long i, low, high, mid; | ||
163 | const char *msym; | ||
164 | |||
165 | /* This kernel should never have been booted. */ | ||
166 | BUG_ON(!kallsyms_addresses); | ||
167 | |||
168 | namebuf[KSYM_NAME_LEN] = 0; | ||
169 | namebuf[0] = 0; | ||
170 | |||
171 | if ((all_var && is_kernel(addr)) || | ||
172 | (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) { | ||
173 | unsigned long symbol_end=0; | ||
174 | |||
175 | /* do a binary search on the sorted kallsyms_addresses array */ | ||
176 | low = 0; | ||
177 | high = kallsyms_num_syms; | ||
178 | |||
179 | while (high-low > 1) { | ||
180 | mid = (low + high) / 2; | ||
181 | if (kallsyms_addresses[mid] <= addr) low = mid; | ||
182 | else high = mid; | ||
183 | } | ||
184 | |||
185 | /* search for the first aliased symbol. Aliased symbols are | ||
186 | symbols with the same address */ | ||
187 | while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low]) | ||
188 | --low; | ||
189 | |||
190 | /* Grab name */ | ||
191 | kallsyms_expand_symbol(get_symbol_offset(low), namebuf); | ||
192 | |||
193 | /* Search for next non-aliased symbol */ | ||
194 | for (i = low + 1; i < kallsyms_num_syms; i++) { | ||
195 | if (kallsyms_addresses[i] > kallsyms_addresses[low]) { | ||
196 | symbol_end = kallsyms_addresses[i]; | ||
197 | break; | ||
198 | } | ||
199 | } | ||
200 | |||
201 | /* if we found no next symbol, we use the end of the section */ | ||
202 | if (!symbol_end) { | ||
203 | if (is_kernel_inittext(addr)) | ||
204 | symbol_end = (unsigned long)_einittext; | ||
205 | else | ||
206 | symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext; | ||
207 | } | ||
208 | |||
209 | *symbolsize = symbol_end - kallsyms_addresses[low]; | ||
210 | *modname = NULL; | ||
211 | *offset = addr - kallsyms_addresses[low]; | ||
212 | return namebuf; | ||
213 | } | ||
214 | |||
215 | /* see if it's in a module */ | ||
216 | msym = module_address_lookup(addr, symbolsize, offset, modname); | ||
217 | if (msym) | ||
218 | return strncpy(namebuf, msym, KSYM_NAME_LEN); | ||
219 | |||
220 | return NULL; | ||
221 | } | ||
222 | |||
223 | /* Replace "%s" in the format string with the symbolic form of the address. */ | ||
224 | void __print_symbol(const char *fmt, unsigned long address) | ||
225 | { | ||
226 | char *modname; | ||
227 | const char *name; | ||
228 | unsigned long offset, size; | ||
229 | char namebuf[KSYM_NAME_LEN+1]; | ||
230 | char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN + | ||
231 | 2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1]; | ||
232 | |||
233 | name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); | ||
234 | |||
235 | if (!name) | ||
236 | sprintf(buffer, "0x%lx", address); | ||
237 | else { | ||
238 | if (modname) | ||
239 | sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, | ||
240 | size, modname); | ||
241 | else | ||
242 | sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); | ||
243 | } | ||
244 | printk(fmt, buffer); | ||
245 | } | ||
246 | |||
247 | /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ | ||
248 | struct kallsym_iter | ||
249 | { | ||
250 | loff_t pos; | ||
251 | struct module *owner; | ||
252 | unsigned long value; | ||
253 | unsigned int nameoff; /* If iterating in core kernel symbols */ | ||
254 | char type; | ||
255 | char name[KSYM_NAME_LEN+1]; | ||
256 | }; | ||
257 | |||
258 | /* Only label it "global" if it is exported. */ | ||
259 | static void upcase_if_global(struct kallsym_iter *iter) | ||
260 | { | ||
261 | if (is_exported(iter->name, iter->owner)) | ||
262 | iter->type += 'A' - 'a'; | ||
263 | } | ||
264 | |||
265 | static int get_ksymbol_mod(struct kallsym_iter *iter) | ||
266 | { | ||
267 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, | ||
268 | &iter->value, | ||
269 | &iter->type, iter->name); | ||
270 | if (iter->owner == NULL) | ||
271 | return 0; | ||
272 | |||
273 | upcase_if_global(iter); | ||
274 | return 1; | ||
275 | } | ||
276 | |||
277 | /* Returns space to next name. */ | ||
278 | static unsigned long get_ksymbol_core(struct kallsym_iter *iter) | ||
279 | { | ||
280 | unsigned off = iter->nameoff; | ||
281 | |||
282 | iter->owner = NULL; | ||
283 | iter->value = kallsyms_addresses[iter->pos]; | ||
284 | |||
285 | iter->type = kallsyms_get_symbol_type(off); | ||
286 | |||
287 | off = kallsyms_expand_symbol(off, iter->name); | ||
288 | |||
289 | return off - iter->nameoff; | ||
290 | } | ||
291 | |||
292 | static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) | ||
293 | { | ||
294 | iter->name[0] = '\0'; | ||
295 | iter->nameoff = get_symbol_offset(new_pos); | ||
296 | iter->pos = new_pos; | ||
297 | } | ||
298 | |||
299 | /* Returns false if pos at or past end of file. */ | ||
300 | static int update_iter(struct kallsym_iter *iter, loff_t pos) | ||
301 | { | ||
302 | /* Module symbols can be accessed randomly. */ | ||
303 | if (pos >= kallsyms_num_syms) { | ||
304 | iter->pos = pos; | ||
305 | return get_ksymbol_mod(iter); | ||
306 | } | ||
307 | |||
308 | /* If we're not on the desired position, reset to new position. */ | ||
309 | if (pos != iter->pos) | ||
310 | reset_iter(iter, pos); | ||
311 | |||
312 | iter->nameoff += get_ksymbol_core(iter); | ||
313 | iter->pos++; | ||
314 | |||
315 | return 1; | ||
316 | } | ||
317 | |||
318 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
319 | { | ||
320 | (*pos)++; | ||
321 | |||
322 | if (!update_iter(m->private, *pos)) | ||
323 | return NULL; | ||
324 | return p; | ||
325 | } | ||
326 | |||
327 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
328 | { | ||
329 | if (!update_iter(m->private, *pos)) | ||
330 | return NULL; | ||
331 | return m->private; | ||
332 | } | ||
333 | |||
334 | static void s_stop(struct seq_file *m, void *p) | ||
335 | { | ||
336 | } | ||
337 | |||
338 | static int s_show(struct seq_file *m, void *p) | ||
339 | { | ||
340 | struct kallsym_iter *iter = m->private; | ||
341 | |||
342 | /* Some debugging symbols have no name. Ignore them. */ | ||
343 | if (!iter->name[0]) | ||
344 | return 0; | ||
345 | |||
346 | if (iter->owner) | ||
347 | seq_printf(m, "%0*lx %c %s\t[%s]\n", | ||
348 | (int)(2*sizeof(void*)), | ||
349 | iter->value, iter->type, iter->name, | ||
350 | module_name(iter->owner)); | ||
351 | else | ||
352 | seq_printf(m, "%0*lx %c %s\n", | ||
353 | (int)(2*sizeof(void*)), | ||
354 | iter->value, iter->type, iter->name); | ||
355 | return 0; | ||
356 | } | ||
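
The seq_printf() calls above are what give /proc/kallsyms its familiar layout: a zero-padded address (2 * sizeof(void *) hex digits), a one-character type (uppercased by upcase_if_global() for exported symbols), the symbol name, and a tab-separated bracketed module name for module symbols. Illustrative output on a 32-bit machine (addresses made up):

    c0114d50 T printk
    c02f1a40 d log_wait
    f88a2040 t helper_fn	[somemodule]
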
357 | |||
358 | static struct seq_operations kallsyms_op = { | ||
359 | .start = s_start, | ||
360 | .next = s_next, | ||
361 | .stop = s_stop, | ||
362 | .show = s_show | ||
363 | }; | ||
364 | |||
365 | static int kallsyms_open(struct inode *inode, struct file *file) | ||
366 | { | ||
367 | /* We keep iterator in m->private, since normal case is to | ||
368 | * s_start from where we left off, so we avoid | ||
369 | * using get_symbol_offset for every symbol. */ | ||
370 | struct kallsym_iter *iter; | ||
371 | int ret; | ||
372 | |||
373 | iter = kmalloc(sizeof(*iter), GFP_KERNEL); | ||
374 | if (!iter) | ||
375 | return -ENOMEM; | ||
376 | reset_iter(iter, 0); | ||
377 | |||
378 | ret = seq_open(file, &kallsyms_op); | ||
379 | if (ret == 0) | ||
380 | ((struct seq_file *)file->private_data)->private = iter; | ||
381 | else | ||
382 | kfree(iter); | ||
383 | return ret; | ||
384 | } | ||
385 | |||
386 | static int kallsyms_release(struct inode *inode, struct file *file) | ||
387 | { | ||
388 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
389 | kfree(m->private); | ||
390 | return seq_release(inode, file); | ||
391 | } | ||
392 | |||
393 | static struct file_operations kallsyms_operations = { | ||
394 | .open = kallsyms_open, | ||
395 | .read = seq_read, | ||
396 | .llseek = seq_lseek, | ||
397 | .release = kallsyms_release, | ||
398 | }; | ||
399 | |||
400 | static int __init kallsyms_init(void) | ||
401 | { | ||
402 | struct proc_dir_entry *entry; | ||
403 | |||
404 | entry = create_proc_entry("kallsyms", 0444, NULL); | ||
405 | if (entry) | ||
406 | entry->proc_fops = &kallsyms_operations; | ||
407 | return 0; | ||
408 | } | ||
409 | __initcall(kallsyms_init); | ||
410 | |||
411 | EXPORT_SYMBOL(__print_symbol); | ||
diff --git a/kernel/kfifo.c b/kernel/kfifo.c new file mode 100644 index 000000000000..179baafcdd96 --- /dev/null +++ b/kernel/kfifo.c | |||
@@ -0,0 +1,168 @@ | |||
1 | /* | ||
2 | * A simple kernel FIFO implementation. | ||
3 | * | ||
4 | * Copyright (C) 2004 Stelian Pop <stelian@popies.net> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/err.h> | ||
26 | #include <linux/kfifo.h> | ||
27 | |||
28 | /** | ||
29 | * kfifo_init - allocates a new FIFO using a preallocated buffer | ||
30 | * @buffer: the preallocated buffer to be used. | ||
31 | * @size: the size of the internal buffer; this has to be a power of 2. | ||
32 | * @gfp_mask: get_free_pages mask, passed to kmalloc() | ||
33 | * @lock: the lock to be used to protect the fifo buffer | ||
34 | * | ||
35 | * Do NOT pass the kfifo to kfifo_free() after use! Simply free the | ||
36 | * struct kfifo with kfree(). | ||
37 | */ | ||
38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, | ||
39 | unsigned int __nocast gfp_mask, spinlock_t *lock) | ||
40 | { | ||
41 | struct kfifo *fifo; | ||
42 | |||
43 | /* size must be a power of 2 */ | ||
44 | BUG_ON(size & (size - 1)); | ||
45 | |||
46 | fifo = kmalloc(sizeof(struct kfifo), gfp_mask); | ||
47 | if (!fifo) | ||
48 | return ERR_PTR(-ENOMEM); | ||
49 | |||
50 | fifo->buffer = buffer; | ||
51 | fifo->size = size; | ||
52 | fifo->in = fifo->out = 0; | ||
53 | fifo->lock = lock; | ||
54 | |||
55 | return fifo; | ||
56 | } | ||
57 | EXPORT_SYMBOL(kfifo_init); | ||
58 | |||
59 | /** | ||
60 | * kfifo_alloc - allocates a new FIFO and its internal buffer | ||
61 | * @size: the size of the internal buffer to be allocated. | ||
62 | * @gfp_mask: get_free_pages mask, passed to kmalloc() | ||
63 | * @lock: the lock to be used to protect the fifo buffer | ||
64 | * | ||
65 | * The size will be rounded-up to a power of 2. | ||
66 | */ | ||
67 | struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) | ||
68 | { | ||
69 | unsigned char *buffer; | ||
70 | struct kfifo *ret; | ||
71 | |||
72 | /* | ||
73 | * round up to the next power of 2, since our 'let the indices | ||
74 | * wrap' technique works only in this case. | ||
75 | */ | ||
76 | if (size & (size - 1)) { | ||
77 | BUG_ON(size > 0x80000000); | ||
78 | size = roundup_pow_of_two(size); | ||
79 | } | ||
80 | |||
81 | buffer = kmalloc(size, gfp_mask); | ||
82 | if (!buffer) | ||
83 | return ERR_PTR(-ENOMEM); | ||
84 | |||
85 | ret = kfifo_init(buffer, size, gfp_mask, lock); | ||
86 | |||
87 | if (IS_ERR(ret)) | ||
88 | kfree(buffer); | ||
89 | |||
90 | return ret; | ||
91 | } | ||
92 | EXPORT_SYMBOL(kfifo_alloc); | ||
93 | |||
94 | /** | ||
95 | * kfifo_free - frees the FIFO | ||
96 | * @fifo: the fifo to be freed. | ||
97 | */ | ||
98 | void kfifo_free(struct kfifo *fifo) | ||
99 | { | ||
100 | kfree(fifo->buffer); | ||
101 | kfree(fifo); | ||
102 | } | ||
103 | EXPORT_SYMBOL(kfifo_free); | ||
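
Putting the pieces together, a single-caller life cycle looks roughly like the sketch below. This is a hedged illustration: demo_lock and the sizes are arbitrary, and with only one thread touching the fifo the unlocked __kfifo_put()/__kfifo_get() are sufficient (the locked kfifo_put()/kfifo_get() wrappers live in <linux/kfifo.h>).

    #include <linux/kfifo.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>
    #include <linux/err.h>

    static DEFINE_SPINLOCK(demo_lock);               /* illustrative lock */

    static int kfifo_demo(void)
    {
            struct kfifo *fifo;
            unsigned char msg[] = "hello";
            unsigned char out[8];
            unsigned int n;

            fifo = kfifo_alloc(128, GFP_KERNEL, &demo_lock);
            if (IS_ERR(fifo))
                    return PTR_ERR(fifo);

            __kfifo_put(fifo, msg, 5);               /* enqueue 5 bytes */
            n = __kfifo_get(fifo, out, sizeof(out)); /* n == 5 here */

            kfifo_free(fifo);
            return 0;
    }
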
104 | |||
105 | /** | ||
106 | * __kfifo_put - puts some data into the FIFO, no locking version | ||
107 | * @fifo: the fifo to be used. | ||
108 | * @buffer: the data to be added. | ||
109 | * @len: the length of the data to be added. | ||
110 | * | ||
111 | * This function copies at most 'len' bytes from the 'buffer' into | ||
112 | * the FIFO depending on the free space, and returns the number of | ||
113 | * bytes copied. | ||
114 | * | ||
115 | * Note that with only one concurrent reader and one concurrent | ||
116 | * writer, you don't need extra locking to use these functions. | ||
117 | */ | ||
118 | unsigned int __kfifo_put(struct kfifo *fifo, | ||
119 | unsigned char *buffer, unsigned int len) | ||
120 | { | ||
121 | unsigned int l; | ||
122 | |||
123 | len = min(len, fifo->size - fifo->in + fifo->out); | ||
124 | |||
125 | /* first put the data starting from fifo->in to buffer end */ | ||
126 | l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); | ||
127 | memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); | ||
128 | |||
129 | /* then put the rest (if any) at the beginning of the buffer */ | ||
130 | memcpy(fifo->buffer, buffer + l, len - l); | ||
131 | |||
132 | fifo->in += len; | ||
133 | |||
134 | return len; | ||
135 | } | ||
136 | EXPORT_SYMBOL(__kfifo_put); | ||
137 | |||
138 | /** | ||
139 | * __kfifo_get - gets some data from the FIFO, no locking version | ||
140 | * @fifo: the fifo to be used. | ||
141 | * @buffer: where the data must be copied. | ||
142 | * @len: the size of the destination buffer. | ||
143 | * | ||
144 | * This function copies at most 'len' bytes from the FIFO into the | ||
145 | * 'buffer' and returns the number of copied bytes. | ||
146 | * | ||
147 | * Note that with only one concurrent reader and one concurrent | ||
148 | * writer, you don't need extra locking to use these functions. | ||
149 | */ | ||
150 | unsigned int __kfifo_get(struct kfifo *fifo, | ||
151 | unsigned char *buffer, unsigned int len) | ||
152 | { | ||
153 | unsigned int l; | ||
154 | |||
155 | len = min(len, fifo->in - fifo->out); | ||
156 | |||
157 | /* first get the data from fifo->out until the end of the buffer */ | ||
158 | l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); | ||
159 | memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); | ||
160 | |||
161 | /* then get the rest (if any) from the beginning of the buffer */ | ||
162 | memcpy(buffer + l, fifo->buffer, len - l); | ||
163 | |||
164 | fifo->out += len; | ||
165 | |||
166 | return len; | ||
167 | } | ||
168 | EXPORT_SYMBOL(__kfifo_get); | ||
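
Note that the put/get pair never reduces fifo->in and fifo->out modulo the size: the counters only grow, the buffer offset is taken as index & (size - 1), and in - out is always the number of queued bytes. Unsigned arithmetic keeps this correct even after the counters wrap past UINT_MAX, because size is a power of two. A worked example with size = 8:

    in = 13, out = 10
    bytes queued      : in - out          = 3
    free space        : size - in + out   = 5
    next write offset : in  & (size - 1)  = 13 & 7 = 5
    next read offset  : out & (size - 1)  = 10 & 7 = 2
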
diff --git a/kernel/kmod.c b/kernel/kmod.c new file mode 100644 index 000000000000..eed53d4f5230 --- /dev/null +++ b/kernel/kmod.c | |||
@@ -0,0 +1,256 @@ | |||
1 | /* | ||
2 | kmod, the new module loader (replaces kerneld) | ||
3 | Kirk Petersen | ||
4 | |||
5 | Reorganized not to be a daemon by Adam Richter, with guidance | ||
6 | from Greg Zornetzer. | ||
7 | |||
8 | Modified to avoid chroot and file sharing problems. | ||
9 | Mikael Pettersson | ||
10 | |||
11 | Limit the concurrent number of kmod modprobes to catch loops from | ||
12 | "modprobe needs a service that is in a module". | ||
13 | Keith Owens <kaos@ocs.com.au> December 1999 | ||
14 | |||
15 | Unblock all signals when we exec a usermode process. | ||
16 | Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000 | ||
17 | |||
18 | call_usermodehelper wait flag, and remove exec_usermodehelper. | ||
19 | Rusty Russell <rusty@rustcorp.com.au> Jan 2003 | ||
20 | */ | ||
21 | #define __KERNEL_SYSCALLS__ | ||
22 | |||
23 | #include <linux/config.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/sched.h> | ||
26 | #include <linux/syscalls.h> | ||
27 | #include <linux/unistd.h> | ||
28 | #include <linux/kmod.h> | ||
29 | #include <linux/smp_lock.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/namespace.h> | ||
32 | #include <linux/completion.h> | ||
33 | #include <linux/file.h> | ||
34 | #include <linux/workqueue.h> | ||
35 | #include <linux/security.h> | ||
36 | #include <linux/mount.h> | ||
37 | #include <linux/kernel.h> | ||
38 | #include <linux/init.h> | ||
39 | #include <asm/uaccess.h> | ||
40 | |||
41 | extern int max_threads; | ||
42 | |||
43 | static struct workqueue_struct *khelper_wq; | ||
44 | |||
45 | #ifdef CONFIG_KMOD | ||
46 | |||
47 | /* | ||
48 | modprobe_path is set via /proc/sys. | ||
49 | */ | ||
50 | char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; | ||
51 | |||
52 | /** | ||
53 | * request_module - try to load a kernel module | ||
54 | * @fmt: printf style format string for the name of the module | ||
55 | * @varargs: arguments as specified in the format string | ||
56 | * | ||
57 | * Load a module using the user mode module loader. The function returns | ||
58 | * zero on success or a negative errno code on failure. Note that a | ||
59 | * successful module load does not mean the module did not then unload | ||
60 | * and exit on an error of its own. Callers must check that the service | ||
61 | * they requested is now available, rather than blindly invoking it. | ||
62 | * | ||
63 | * If module auto-loading support is disabled then this function | ||
64 | * becomes a no-operation. | ||
65 | */ | ||
66 | int request_module(const char *fmt, ...) | ||
67 | { | ||
68 | va_list args; | ||
69 | char module_name[MODULE_NAME_LEN]; | ||
70 | unsigned int max_modprobes; | ||
71 | int ret; | ||
72 | char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; | ||
73 | static char *envp[] = { "HOME=/", | ||
74 | "TERM=linux", | ||
75 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
76 | NULL }; | ||
77 | static atomic_t kmod_concurrent = ATOMIC_INIT(0); | ||
78 | #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ | ||
79 | static int kmod_loop_msg; | ||
80 | |||
81 | va_start(args, fmt); | ||
82 | ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); | ||
83 | va_end(args); | ||
84 | if (ret >= MODULE_NAME_LEN) | ||
85 | return -ENAMETOOLONG; | ||
86 | |||
87 | /* If modprobe needs a service that is in a module, we get a recursive | ||
88 | * loop. Limit the number of running kmod threads to max_threads/2 or | ||
89 | * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method | ||
90 | * would be to run the parents of this process, counting how many times | ||
91 | * kmod was invoked. That would mean accessing the internals of the | ||
92 | * process tables to get the command line, proc_pid_cmdline is static | ||
93 | * and it is not worth changing the proc code just to handle this case. | ||
94 | * KAO. | ||
95 | * | ||
96 | * "trace the ppid" is simple, but will fail if someone's | ||
97 | * parent exits. I think this is as good as it gets. --RR | ||
98 | */ | ||
99 | max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); | ||
100 | atomic_inc(&kmod_concurrent); | ||
101 | if (atomic_read(&kmod_concurrent) > max_modprobes) { | ||
102 | /* We may be blaming an innocent here, but unlikely */ | ||
103 | if (kmod_loop_msg++ < 5) | ||
104 | printk(KERN_ERR | ||
105 | "request_module: runaway loop modprobe %s\n", | ||
106 | module_name); | ||
107 | atomic_dec(&kmod_concurrent); | ||
108 | return -ENOMEM; | ||
109 | } | ||
110 | |||
111 | ret = call_usermodehelper(modprobe_path, argv, envp, 1); | ||
112 | atomic_dec(&kmod_concurrent); | ||
113 | return ret; | ||
114 | } | ||
115 | EXPORT_SYMBOL(request_module); | ||
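
A typical caller formats a name or alias, asks for it, and then re-checks for the facility it actually wanted, exactly as the comment above requires. A hedged sketch ("char-major-%d" is a conventional module alias; find_my_device() is a hypothetical stand-in for the caller's own availability check):

    static int ensure_driver(int major)
    {
            request_module("char-major-%d", major);  /* may legitimately fail */

            if (!find_my_device(major))              /* hypothetical re-check */
                    return -ENODEV;
            return 0;
    }
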
116 | #endif /* CONFIG_KMOD */ | ||
117 | |||
118 | struct subprocess_info { | ||
119 | struct completion *complete; | ||
120 | char *path; | ||
121 | char **argv; | ||
122 | char **envp; | ||
123 | int wait; | ||
124 | int retval; | ||
125 | }; | ||
126 | |||
127 | /* | ||
128 | * This is the task which runs the usermode application | ||
129 | */ | ||
130 | static int ____call_usermodehelper(void *data) | ||
131 | { | ||
132 | struct subprocess_info *sub_info = data; | ||
133 | int retval; | ||
134 | |||
135 | /* Unblock all signals. */ | ||
136 | flush_signals(current); | ||
137 | spin_lock_irq(¤t->sighand->siglock); | ||
138 | flush_signal_handlers(current, 1); | ||
139 | sigemptyset(¤t->blocked); | ||
140 | recalc_sigpending(); | ||
141 | spin_unlock_irq(¤t->sighand->siglock); | ||
142 | |||
143 | /* We can run anywhere, unlike our parent keventd(). */ | ||
144 | set_cpus_allowed(current, CPU_MASK_ALL); | ||
145 | |||
146 | retval = -EPERM; | ||
147 | if (current->fs->root) | ||
148 | retval = execve(sub_info->path, sub_info->argv,sub_info->envp); | ||
149 | |||
150 | /* Exec failed? */ | ||
151 | sub_info->retval = retval; | ||
152 | do_exit(0); | ||
153 | } | ||
154 | |||
155 | /* Keventd can't block, but this (a child) can. */ | ||
156 | static int wait_for_helper(void *data) | ||
157 | { | ||
158 | struct subprocess_info *sub_info = data; | ||
159 | pid_t pid; | ||
160 | struct k_sigaction sa; | ||
161 | |||
162 | /* Install a handler: if SIGCLD isn't handled sys_wait4 won't | ||
163 | * populate the status, but will return -ECHILD. */ | ||
164 | sa.sa.sa_handler = SIG_IGN; | ||
165 | sa.sa.sa_flags = 0; | ||
166 | siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); | ||
167 | do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); | ||
168 | allow_signal(SIGCHLD); | ||
169 | |||
170 | pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); | ||
171 | if (pid < 0) { | ||
172 | sub_info->retval = pid; | ||
173 | } else { | ||
174 | /* | ||
175 | * Normally it is bogus to call wait4() from in-kernel because | ||
176 | * wait4() wants to write the exit code to a userspace address. | ||
177 | * But wait_for_helper() always runs as keventd, and put_user() | ||
178 | * to a kernel address works OK for kernel threads, due to their | ||
179 | * having an mm_segment_t which spans the entire address space. | ||
180 | * | ||
181 | * Thus the __user pointer cast is valid here. | ||
182 | */ | ||
183 | sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL); | ||
184 | } | ||
185 | |||
186 | complete(sub_info->complete); | ||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | /* This is run by khelper thread */ | ||
191 | static void __call_usermodehelper(void *data) | ||
192 | { | ||
193 | struct subprocess_info *sub_info = data; | ||
194 | pid_t pid; | ||
195 | |||
196 | /* CLONE_VFORK: wait until the usermode helper has execve'd | ||
197 | * successfully. We need the data structures to stay around | ||
198 | * until that is done. */ | ||
199 | if (sub_info->wait) | ||
200 | pid = kernel_thread(wait_for_helper, sub_info, | ||
201 | CLONE_FS | CLONE_FILES | SIGCHLD); | ||
202 | else | ||
203 | pid = kernel_thread(____call_usermodehelper, sub_info, | ||
204 | CLONE_VFORK | SIGCHLD); | ||
205 | |||
206 | if (pid < 0) { | ||
207 | sub_info->retval = pid; | ||
208 | complete(sub_info->complete); | ||
209 | } else if (!sub_info->wait) | ||
210 | complete(sub_info->complete); | ||
211 | } | ||
212 | |||
213 | /** | ||
214 | * call_usermodehelper - start a usermode application | ||
215 | * @path: pathname for the application | ||
216 | * @argv: null-terminated argument list | ||
217 | * @envp: null-terminated environment list | ||
218 | * @wait: wait for the application to finish and return status. | ||
219 | * | ||
220 | * Runs a user-space application. The application is started | ||
221 | * asynchronously if wait is not set, and runs as a child of keventd. | ||
222 | * (i.e. it runs with full root capabilities). | ||
223 | * | ||
224 | * Must be called from process context. Returns a negative error code | ||
225 | * if program was not execed successfully, or 0. | ||
226 | */ | ||
227 | int call_usermodehelper(char *path, char **argv, char **envp, int wait) | ||
228 | { | ||
229 | DECLARE_COMPLETION(done); | ||
230 | struct subprocess_info sub_info = { | ||
231 | .complete = &done, | ||
232 | .path = path, | ||
233 | .argv = argv, | ||
234 | .envp = envp, | ||
235 | .wait = wait, | ||
236 | .retval = 0, | ||
237 | }; | ||
238 | DECLARE_WORK(work, __call_usermodehelper, &sub_info); | ||
239 | |||
240 | if (!khelper_wq) | ||
241 | return -EBUSY; | ||
242 | |||
243 | if (path[0] == '\0') | ||
244 | return 0; | ||
245 | |||
246 | queue_work(khelper_wq, &work); | ||
247 | wait_for_completion(&done); | ||
248 | return sub_info.retval; | ||
249 | } | ||
250 | EXPORT_SYMBOL(call_usermodehelper); | ||
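
Callers pass NULL-terminated argv/envp arrays; with wait set, the return value is the helper's exit status as collected by wait_for_helper(), so 0 means the program ran and exited cleanly. A sketch with a purely illustrative helper path:

    static int run_my_helper(void)
    {
            char *argv[] = { "/sbin/my-helper", "--probe", NULL };   /* illustrative path */
            char *envp[] = { "HOME=/",
                             "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
                             NULL };

            /* wait = 1: block until the helper exits and return its status. */
            return call_usermodehelper(argv[0], argv, envp, 1);
    }
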
251 | |||
252 | void __init usermodehelper_init(void) | ||
253 | { | ||
254 | khelper_wq = create_singlethread_workqueue("khelper"); | ||
255 | BUG_ON(!khelper_wq); | ||
256 | } | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c new file mode 100644 index 000000000000..1d5dd1337bd1 --- /dev/null +++ b/kernel/kprobes.c | |||
@@ -0,0 +1,157 @@ | |||
1 | /* | ||
2 | * Kernel Probes (KProbes) | ||
3 | * kernel/kprobes.c | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
20 | * | ||
21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | ||
22 | * Probes initial implementation (includes suggestions from | ||
23 | * Rusty Russell). | ||
24 | * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with | ||
25 | * hlists and exceptions notifier as suggested by Andi Kleen. | ||
26 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes | ||
27 | * interface to access function arguments. | ||
28 | * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes | ||
29 | * exceptions notifier to be first on the priority list. | ||
30 | */ | ||
31 | #include <linux/kprobes.h> | ||
32 | #include <linux/spinlock.h> | ||
33 | #include <linux/hash.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/module.h> | ||
36 | #include <asm/cacheflush.h> | ||
37 | #include <asm/errno.h> | ||
38 | #include <asm/kdebug.h> | ||
39 | |||
40 | #define KPROBE_HASH_BITS 6 | ||
41 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) | ||
42 | |||
43 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | ||
44 | |||
45 | unsigned int kprobe_cpu = NR_CPUS; | ||
46 | static DEFINE_SPINLOCK(kprobe_lock); | ||
47 | |||
48 | /* Locks kprobe: irqs must be disabled */ | ||
49 | void lock_kprobes(void) | ||
50 | { | ||
51 | spin_lock(&kprobe_lock); | ||
52 | kprobe_cpu = smp_processor_id(); | ||
53 | } | ||
54 | |||
55 | void unlock_kprobes(void) | ||
56 | { | ||
57 | kprobe_cpu = NR_CPUS; | ||
58 | spin_unlock(&kprobe_lock); | ||
59 | } | ||
60 | |||
61 | /* You have to be holding the kprobe_lock */ | ||
62 | struct kprobe *get_kprobe(void *addr) | ||
63 | { | ||
64 | struct hlist_head *head; | ||
65 | struct hlist_node *node; | ||
66 | |||
67 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; | ||
68 | hlist_for_each(node, head) { | ||
69 | struct kprobe *p = hlist_entry(node, struct kprobe, hlist); | ||
70 | if (p->addr == addr) | ||
71 | return p; | ||
72 | } | ||
73 | return NULL; | ||
74 | } | ||
75 | |||
76 | int register_kprobe(struct kprobe *p) | ||
77 | { | ||
78 | int ret = 0; | ||
79 | unsigned long flags = 0; | ||
80 | |||
81 | if ((ret = arch_prepare_kprobe(p)) != 0) { | ||
82 | goto rm_kprobe; | ||
83 | } | ||
84 | spin_lock_irqsave(&kprobe_lock, flags); | ||
85 | INIT_HLIST_NODE(&p->hlist); | ||
86 | if (get_kprobe(p->addr)) { | ||
87 | ret = -EEXIST; | ||
88 | goto out; | ||
89 | } | ||
90 | arch_copy_kprobe(p); | ||
91 | |||
92 | hlist_add_head(&p->hlist, | ||
93 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | ||
94 | |||
95 | p->opcode = *p->addr; | ||
96 | *p->addr = BREAKPOINT_INSTRUCTION; | ||
97 | flush_icache_range((unsigned long) p->addr, | ||
98 | (unsigned long) p->addr + sizeof(kprobe_opcode_t)); | ||
99 | out: | ||
100 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
101 | rm_kprobe: | ||
102 | if (ret == -EEXIST) | ||
103 | arch_remove_kprobe(p); | ||
104 | return ret; | ||
105 | } | ||
106 | |||
107 | void unregister_kprobe(struct kprobe *p) | ||
108 | { | ||
109 | unsigned long flags; | ||
110 | arch_remove_kprobe(p); | ||
111 | spin_lock_irqsave(&kprobe_lock, flags); | ||
112 | *p->addr = p->opcode; | ||
113 | hlist_del(&p->hlist); | ||
114 | flush_icache_range((unsigned long) p->addr, | ||
115 | (unsigned long) p->addr + sizeof(kprobe_opcode_t)); | ||
116 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
117 | } | ||
118 | |||
119 | static struct notifier_block kprobe_exceptions_nb = { | ||
120 | .notifier_call = kprobe_exceptions_notify, | ||
121 | .priority = 0x7fffffff /* we need to be notified first */ | ||
122 | }; | ||
123 | |||
124 | int register_jprobe(struct jprobe *jp) | ||
125 | { | ||
126 | /* Todo: Verify probepoint is a function entry point */ | ||
127 | jp->kp.pre_handler = setjmp_pre_handler; | ||
128 | jp->kp.break_handler = longjmp_break_handler; | ||
129 | |||
130 | return register_kprobe(&jp->kp); | ||
131 | } | ||
132 | |||
133 | void unregister_jprobe(struct jprobe *jp) | ||
134 | { | ||
135 | unregister_kprobe(&jp->kp); | ||
136 | } | ||
137 | |||
138 | static int __init init_kprobes(void) | ||
139 | { | ||
140 | int i, err = 0; | ||
141 | |||
142 | /* FIXME allocate the probe table, currently defined statically */ | ||
143 | /* initialize all list heads */ | ||
144 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) | ||
145 | INIT_HLIST_HEAD(&kprobe_table[i]); | ||
146 | |||
147 | err = register_die_notifier(&kprobe_exceptions_nb); | ||
148 | return err; | ||
149 | } | ||
150 | |||
151 | __initcall(init_kprobes); | ||
152 | |||
153 | EXPORT_SYMBOL_GPL(register_kprobe); | ||
154 | EXPORT_SYMBOL_GPL(unregister_kprobe); | ||
155 | EXPORT_SYMBOL_GPL(register_jprobe); | ||
156 | EXPORT_SYMBOL_GPL(unregister_jprobe); | ||
157 | EXPORT_SYMBOL_GPL(jprobe_return); | ||
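
A client fills in a struct kprobe and registers it; the pre_handler runs just before the probed instruction. A minimal, hedged sketch (the handler body is illustrative, and kp.addr must be pointed at a real instruction before registering):

    #include <linux/kprobes.h>

    static int my_pre(struct kprobe *p, struct pt_regs *regs)
    {
            printk(KERN_INFO "kprobe hit at %p\n", p->addr);
            return 0;                        /* continue with the original instruction */
    }

    static struct kprobe my_kp = {
            .pre_handler = my_pre,
            /* .addr must be set to the probed instruction, e.g.
             *   my_kp.addr = (kprobe_opcode_t *)some_kernel_function;  */
    };

    /* register_kprobe(&my_kp);  ...  unregister_kprobe(&my_kp); */
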
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c new file mode 100644 index 000000000000..1f064a63f8cf --- /dev/null +++ b/kernel/ksysfs.c | |||
@@ -0,0 +1,57 @@ | |||
1 | /* | ||
2 | * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which | ||
3 | * are not related to any other subsystem | ||
4 | * | ||
5 | * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> | ||
6 | * | ||
7 | * This file is released under the GPLv2 | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | #include <linux/config.h> | ||
12 | #include <linux/kobject.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/sysfs.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/init.h> | ||
17 | |||
18 | #define KERNEL_ATTR_RO(_name) \ | ||
19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | ||
20 | |||
21 | #define KERNEL_ATTR_RW(_name) \ | ||
22 | static struct subsys_attribute _name##_attr = \ | ||
23 | __ATTR(_name, 0644, _name##_show, _name##_store) | ||
24 | |||
25 | #ifdef CONFIG_HOTPLUG | ||
26 | static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) | ||
27 | { | ||
28 | return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); | ||
29 | } | ||
30 | KERNEL_ATTR_RO(hotplug_seqnum); | ||
31 | #endif | ||
32 | |||
33 | decl_subsys(kernel, NULL, NULL); | ||
34 | EXPORT_SYMBOL_GPL(kernel_subsys); | ||
35 | |||
36 | static struct attribute * kernel_attrs[] = { | ||
37 | #ifdef CONFIG_HOTPLUG | ||
38 | &hotplug_seqnum_attr.attr, | ||
39 | #endif | ||
40 | NULL | ||
41 | }; | ||
42 | |||
43 | static struct attribute_group kernel_attr_group = { | ||
44 | .attrs = kernel_attrs, | ||
45 | }; | ||
46 | |||
47 | static int __init ksysfs_init(void) | ||
48 | { | ||
49 | int error = subsystem_register(&kernel_subsys); | ||
50 | if (!error) | ||
51 | error = sysfs_create_group(&kernel_subsys.kset.kobj, | ||
52 | &kernel_attr_group); | ||
53 | |||
54 | return error; | ||
55 | } | ||
56 | |||
57 | core_initcall(ksysfs_init); | ||
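
New entries under /sys/kernel follow the hotplug_seqnum pattern above: a show routine, the KERNEL_ATTR_RO()/KERNEL_ATTR_RW() macro, and a slot in kernel_attrs[]. A hedged sketch for a hypothetical read-only file:

    static ssize_t example_show(struct subsystem *subsys, char *page)
    {
            return sprintf(page, "%d\n", 42);        /* illustrative value */
    }
    KERNEL_ATTR_RO(example);

    /* ...and &example_attr.attr would then be added to kernel_attrs[] above. */
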
diff --git a/kernel/kthread.c b/kernel/kthread.c new file mode 100644 index 000000000000..e377e2244103 --- /dev/null +++ b/kernel/kthread.c | |||
@@ -0,0 +1,202 @@ | |||
1 | /* Kernel thread helper functions. | ||
2 | * Copyright (C) 2004 IBM Corporation, Rusty Russell. | ||
3 | * | ||
4 | * Creation is done via keventd, so that we get a clean environment | ||
5 | * even if we're invoked from userspace (think modprobe, hotplug cpu, | ||
6 | * etc.). | ||
7 | */ | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/kthread.h> | ||
10 | #include <linux/completion.h> | ||
11 | #include <linux/err.h> | ||
12 | #include <linux/unistd.h> | ||
13 | #include <linux/file.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <asm/semaphore.h> | ||
16 | |||
17 | /* | ||
18 | * We don't want to execute off keventd since it might | ||
19 | * hold a semaphore our callers hold too: | ||
20 | */ | ||
21 | static struct workqueue_struct *helper_wq; | ||
22 | |||
23 | struct kthread_create_info | ||
24 | { | ||
25 | /* Information passed to kthread() from keventd. */ | ||
26 | int (*threadfn)(void *data); | ||
27 | void *data; | ||
28 | struct completion started; | ||
29 | |||
30 | /* Result passed back to kthread_create() from keventd. */ | ||
31 | struct task_struct *result; | ||
32 | struct completion done; | ||
33 | }; | ||
34 | |||
35 | struct kthread_stop_info | ||
36 | { | ||
37 | struct task_struct *k; | ||
38 | int err; | ||
39 | struct completion done; | ||
40 | }; | ||
41 | |||
42 | /* Thread stopping is done by setting this var; the lock serializes | ||
43 | * multiple kthread_stop calls. */ | ||
44 | static DECLARE_MUTEX(kthread_stop_lock); | ||
45 | static struct kthread_stop_info kthread_stop_info; | ||
46 | |||
47 | int kthread_should_stop(void) | ||
48 | { | ||
49 | return (kthread_stop_info.k == current); | ||
50 | } | ||
51 | EXPORT_SYMBOL(kthread_should_stop); | ||
52 | |||
53 | static void kthread_exit_files(void) | ||
54 | { | ||
55 | struct fs_struct *fs; | ||
56 | struct task_struct *tsk = current; | ||
57 | |||
58 | exit_fs(tsk); /* current->fs->count--; */ | ||
59 | fs = init_task.fs; | ||
60 | tsk->fs = fs; | ||
61 | atomic_inc(&fs->count); | ||
62 | exit_files(tsk); | ||
63 | current->files = init_task.files; | ||
64 | atomic_inc(&tsk->files->count); | ||
65 | } | ||
66 | |||
67 | static int kthread(void *_create) | ||
68 | { | ||
69 | struct kthread_create_info *create = _create; | ||
70 | int (*threadfn)(void *data); | ||
71 | void *data; | ||
72 | sigset_t blocked; | ||
73 | int ret = -EINTR; | ||
74 | |||
75 | kthread_exit_files(); | ||
76 | |||
77 | /* Copy data: it's on keventd's stack */ | ||
78 | threadfn = create->threadfn; | ||
79 | data = create->data; | ||
80 | |||
81 | /* Block and flush all signals (in case we're not from keventd). */ | ||
82 | sigfillset(&blocked); | ||
83 | sigprocmask(SIG_BLOCK, &blocked, NULL); | ||
84 | flush_signals(current); | ||
85 | |||
86 | /* By default we can run anywhere, unlike keventd. */ | ||
87 | set_cpus_allowed(current, CPU_MASK_ALL); | ||
88 | |||
89 | /* OK, tell user we're spawned, wait for stop or wakeup */ | ||
90 | __set_current_state(TASK_INTERRUPTIBLE); | ||
91 | complete(&create->started); | ||
92 | schedule(); | ||
93 | |||
94 | if (!kthread_should_stop()) | ||
95 | ret = threadfn(data); | ||
96 | |||
97 | /* It might have exited on its own, w/o kthread_stop. Check. */ | ||
98 | if (kthread_should_stop()) { | ||
99 | kthread_stop_info.err = ret; | ||
100 | complete(&kthread_stop_info.done); | ||
101 | } | ||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | /* We are keventd: create a thread. */ | ||
106 | static void keventd_create_kthread(void *_create) | ||
107 | { | ||
108 | struct kthread_create_info *create = _create; | ||
109 | int pid; | ||
110 | |||
111 | /* We want our own signal handler (we take no signals by default). */ | ||
112 | pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); | ||
113 | if (pid < 0) { | ||
114 | create->result = ERR_PTR(pid); | ||
115 | } else { | ||
116 | wait_for_completion(&create->started); | ||
117 | create->result = find_task_by_pid(pid); | ||
118 | } | ||
119 | complete(&create->done); | ||
120 | } | ||
121 | |||
122 | struct task_struct *kthread_create(int (*threadfn)(void *data), | ||
123 | void *data, | ||
124 | const char namefmt[], | ||
125 | ...) | ||
126 | { | ||
127 | struct kthread_create_info create; | ||
128 | DECLARE_WORK(work, keventd_create_kthread, &create); | ||
129 | |||
130 | create.threadfn = threadfn; | ||
131 | create.data = data; | ||
132 | init_completion(&create.started); | ||
133 | init_completion(&create.done); | ||
134 | |||
135 | /* | ||
136 | * The workqueue needs to start up first: | ||
137 | */ | ||
138 | if (!helper_wq) | ||
139 | work.func(work.data); | ||
140 | else { | ||
141 | queue_work(helper_wq, &work); | ||
142 | wait_for_completion(&create.done); | ||
143 | } | ||
144 | if (!IS_ERR(create.result)) { | ||
145 | va_list args; | ||
146 | va_start(args, namefmt); | ||
147 | vsnprintf(create.result->comm, sizeof(create.result->comm), | ||
148 | namefmt, args); | ||
149 | va_end(args); | ||
150 | } | ||
151 | |||
152 | return create.result; | ||
153 | } | ||
154 | EXPORT_SYMBOL(kthread_create); | ||
155 | |||
156 | void kthread_bind(struct task_struct *k, unsigned int cpu) | ||
157 | { | ||
158 | BUG_ON(k->state != TASK_INTERRUPTIBLE); | ||
159 | /* Must have done schedule() in kthread() before we set_task_cpu */ | ||
160 | wait_task_inactive(k); | ||
161 | set_task_cpu(k, cpu); | ||
162 | k->cpus_allowed = cpumask_of_cpu(cpu); | ||
163 | } | ||
164 | EXPORT_SYMBOL(kthread_bind); | ||
165 | |||
166 | int kthread_stop(struct task_struct *k) | ||
167 | { | ||
168 | int ret; | ||
169 | |||
170 | down(&kthread_stop_lock); | ||
171 | |||
172 | /* It could exit after stop_info.k set, but before wake_up_process. */ | ||
173 | get_task_struct(k); | ||
174 | |||
175 | /* Must init completion *before* thread sees kthread_stop_info.k */ | ||
176 | init_completion(&kthread_stop_info.done); | ||
177 | wmb(); | ||
178 | |||
179 | /* Now set kthread_should_stop() to true, and wake it up. */ | ||
180 | kthread_stop_info.k = k; | ||
181 | wake_up_process(k); | ||
182 | put_task_struct(k); | ||
183 | |||
184 | /* Once it dies, reset stop ptr, gather result and we're done. */ | ||
185 | wait_for_completion(&kthread_stop_info.done); | ||
186 | kthread_stop_info.k = NULL; | ||
187 | ret = kthread_stop_info.err; | ||
188 | up(&kthread_stop_lock); | ||
189 | |||
190 | return ret; | ||
191 | } | ||
192 | EXPORT_SYMBOL(kthread_stop); | ||
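
The intended usage: kthread_create() hands back the task still sleeping in TASK_INTERRUPTIBLE, the caller may pin it with kthread_bind(), wakes it with wake_up_process(), and kthread_stop() later returns whatever the thread function returned. A hedged sketch (thread body and CPU choice are illustrative):

    static int my_thread_fn(void *data)
    {
            while (!kthread_should_stop()) {
                    /* do periodic work ... */
                    set_current_state(TASK_INTERRUPTIBLE);
                    schedule_timeout(HZ);
            }
            return 0;
    }

    /* Caller side (error handling trimmed): */
    struct task_struct *t = kthread_create(my_thread_fn, NULL, "mythread/%d", 0);
    if (!IS_ERR(t)) {
            kthread_bind(t, 0);              /* optional: pin to CPU 0 */
            wake_up_process(t);
            /* ... later ... */
            kthread_stop(t);                 /* returns my_thread_fn()'s result */
    }
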
193 | |||
194 | static __init int helper_init(void) | ||
195 | { | ||
196 | helper_wq = create_singlethread_workqueue("kthread"); | ||
197 | BUG_ON(!helper_wq); | ||
198 | |||
199 | return 0; | ||
200 | } | ||
201 | core_initcall(helper_init); | ||
202 | |||
diff --git a/kernel/module.c b/kernel/module.c new file mode 100644 index 000000000000..2dbfa0773faf --- /dev/null +++ b/kernel/module.c | |||
@@ -0,0 +1,2108 @@ | |||
1 | /* Rewritten by Rusty Russell, on the backs of many others... | ||
2 | Copyright (C) 2002 Richard Henderson | ||
3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. | ||
4 | |||
5 | This program is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation; either version 2 of the License, or | ||
8 | (at your option) any later version. | ||
9 | |||
10 | This program is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | GNU General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with this program; if not, write to the Free Software | ||
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #include <linux/config.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/moduleloader.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/vmalloc.h> | ||
25 | #include <linux/elf.h> | ||
26 | #include <linux/seq_file.h> | ||
27 | #include <linux/syscalls.h> | ||
28 | #include <linux/fcntl.h> | ||
29 | #include <linux/rcupdate.h> | ||
30 | #include <linux/cpu.h> | ||
31 | #include <linux/moduleparam.h> | ||
32 | #include <linux/errno.h> | ||
33 | #include <linux/err.h> | ||
34 | #include <linux/vermagic.h> | ||
35 | #include <linux/notifier.h> | ||
36 | #include <linux/stop_machine.h> | ||
37 | #include <linux/device.h> | ||
38 | #include <asm/uaccess.h> | ||
39 | #include <asm/semaphore.h> | ||
40 | #include <asm/cacheflush.h> | ||
41 | |||
42 | #if 0 | ||
43 | #define DEBUGP printk | ||
44 | #else | ||
45 | #define DEBUGP(fmt , a...) | ||
46 | #endif | ||
47 | |||
48 | #ifndef ARCH_SHF_SMALL | ||
49 | #define ARCH_SHF_SMALL 0 | ||
50 | #endif | ||
51 | |||
52 | /* If this is set, the section belongs in the init part of the module */ | ||
53 | #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) | ||
54 | |||
55 | /* Protects module list */ | ||
56 | static DEFINE_SPINLOCK(modlist_lock); | ||
57 | |||
58 | /* List of modules, protected by module_mutex AND modlist_lock */ | ||
59 | static DECLARE_MUTEX(module_mutex); | ||
60 | static LIST_HEAD(modules); | ||
61 | |||
62 | static DECLARE_MUTEX(notify_mutex); | ||
63 | static struct notifier_block * module_notify_list; | ||
64 | |||
65 | int register_module_notifier(struct notifier_block * nb) | ||
66 | { | ||
67 | int err; | ||
68 | down(¬ify_mutex); | ||
69 | err = notifier_chain_register(&module_notify_list, nb); | ||
70 | up(¬ify_mutex); | ||
71 | return err; | ||
72 | } | ||
73 | EXPORT_SYMBOL(register_module_notifier); | ||
74 | |||
75 | int unregister_module_notifier(struct notifier_block * nb) | ||
76 | { | ||
77 | int err; | ||
78 | down(¬ify_mutex); | ||
79 | err = notifier_chain_unregister(&module_notify_list, nb); | ||
80 | up(¬ify_mutex); | ||
81 | return err; | ||
82 | } | ||
83 | EXPORT_SYMBOL(unregister_module_notifier); | ||
84 | |||
85 | /* We require a truly strong try_module_get() */ | ||
86 | static inline int strong_try_module_get(struct module *mod) | ||
87 | { | ||
88 | if (mod && mod->state == MODULE_STATE_COMING) | ||
89 | return 0; | ||
90 | return try_module_get(mod); | ||
91 | } | ||
92 | |||
93 | /* A thread that wants to hold a reference to a module only while it | ||
94 | * is running can call this to safely exit. | ||
95 | * nfsd and lockd use this. | ||
96 | */ | ||
97 | void __module_put_and_exit(struct module *mod, long code) | ||
98 | { | ||
99 | module_put(mod); | ||
100 | do_exit(code); | ||
101 | } | ||
102 | EXPORT_SYMBOL(__module_put_and_exit); | ||
103 | |||
104 | /* Find a module section: 0 means not found. */ | ||
105 | static unsigned int find_sec(Elf_Ehdr *hdr, | ||
106 | Elf_Shdr *sechdrs, | ||
107 | const char *secstrings, | ||
108 | const char *name) | ||
109 | { | ||
110 | unsigned int i; | ||
111 | |||
112 | for (i = 1; i < hdr->e_shnum; i++) | ||
113 | /* Alloc bit cleared means "ignore it." */ | ||
114 | if ((sechdrs[i].sh_flags & SHF_ALLOC) | ||
115 | && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) | ||
116 | return i; | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | /* Provided by the linker */ | ||
121 | extern const struct kernel_symbol __start___ksymtab[]; | ||
122 | extern const struct kernel_symbol __stop___ksymtab[]; | ||
123 | extern const struct kernel_symbol __start___ksymtab_gpl[]; | ||
124 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; | ||
125 | extern const unsigned long __start___kcrctab[]; | ||
126 | extern const unsigned long __start___kcrctab_gpl[]; | ||
127 | |||
128 | #ifndef CONFIG_MODVERSIONS | ||
129 | #define symversion(base, idx) NULL | ||
130 | #else | ||
131 | #define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) | ||
132 | #endif | ||
133 | |||
134 | /* Find a symbol, return value, crc and module which owns it */ | ||
135 | static unsigned long __find_symbol(const char *name, | ||
136 | struct module **owner, | ||
137 | const unsigned long **crc, | ||
138 | int gplok) | ||
139 | { | ||
140 | struct module *mod; | ||
141 | unsigned int i; | ||
142 | |||
143 | /* Core kernel first. */ | ||
144 | *owner = NULL; | ||
145 | for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) { | ||
146 | if (strcmp(__start___ksymtab[i].name, name) == 0) { | ||
147 | *crc = symversion(__start___kcrctab, i); | ||
148 | return __start___ksymtab[i].value; | ||
149 | } | ||
150 | } | ||
151 | if (gplok) { | ||
152 | for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++) | ||
153 | if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) { | ||
154 | *crc = symversion(__start___kcrctab_gpl, i); | ||
155 | return __start___ksymtab_gpl[i].value; | ||
156 | } | ||
157 | } | ||
158 | |||
159 | /* Now try modules. */ | ||
160 | list_for_each_entry(mod, &modules, list) { | ||
161 | *owner = mod; | ||
162 | for (i = 0; i < mod->num_syms; i++) | ||
163 | if (strcmp(mod->syms[i].name, name) == 0) { | ||
164 | *crc = symversion(mod->crcs, i); | ||
165 | return mod->syms[i].value; | ||
166 | } | ||
167 | |||
168 | if (gplok) { | ||
169 | for (i = 0; i < mod->num_gpl_syms; i++) { | ||
170 | if (strcmp(mod->gpl_syms[i].name, name) == 0) { | ||
171 | *crc = symversion(mod->gpl_crcs, i); | ||
172 | return mod->gpl_syms[i].value; | ||
173 | } | ||
174 | } | ||
175 | } | ||
176 | } | ||
177 | DEBUGP("Failed to find symbol %s\n", name); | ||
178 | return 0; | ||
179 | } | ||
180 | |||
181 | /* Find a symbol in this elf symbol table */ | ||
182 | static unsigned long find_local_symbol(Elf_Shdr *sechdrs, | ||
183 | unsigned int symindex, | ||
184 | const char *strtab, | ||
185 | const char *name) | ||
186 | { | ||
187 | unsigned int i; | ||
188 | Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; | ||
189 | |||
190 | /* Search (defined) internal symbols first. */ | ||
191 | for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) { | ||
192 | if (sym[i].st_shndx != SHN_UNDEF | ||
193 | && strcmp(name, strtab + sym[i].st_name) == 0) | ||
194 | return sym[i].st_value; | ||
195 | } | ||
196 | return 0; | ||
197 | } | ||
198 | |||
199 | /* Search for module by name: must hold module_mutex. */ | ||
200 | static struct module *find_module(const char *name) | ||
201 | { | ||
202 | struct module *mod; | ||
203 | |||
204 | list_for_each_entry(mod, &modules, list) { | ||
205 | if (strcmp(mod->name, name) == 0) | ||
206 | return mod; | ||
207 | } | ||
208 | return NULL; | ||
209 | } | ||
210 | |||
211 | #ifdef CONFIG_SMP | ||
212 | /* Number of blocks used and allocated. */ | ||
213 | static unsigned int pcpu_num_used, pcpu_num_allocated; | ||
214 | /* Size of each block. -ve means used. */ | ||
215 | static int *pcpu_size; | ||
216 | |||
217 | static int split_block(unsigned int i, unsigned short size) | ||
218 | { | ||
219 | /* Reallocation required? */ | ||
220 | if (pcpu_num_used + 1 > pcpu_num_allocated) { | ||
221 | int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, | ||
222 | GFP_KERNEL); | ||
223 | if (!new) | ||
224 | return 0; | ||
225 | |||
226 | memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); | ||
227 | pcpu_num_allocated *= 2; | ||
228 | kfree(pcpu_size); | ||
229 | pcpu_size = new; | ||
230 | } | ||
231 | |||
232 | /* Insert a new subblock */ | ||
233 | memmove(&pcpu_size[i+1], &pcpu_size[i], | ||
234 | sizeof(pcpu_size[0]) * (pcpu_num_used - i)); | ||
235 | pcpu_num_used++; | ||
236 | |||
237 | pcpu_size[i+1] -= size; | ||
238 | pcpu_size[i] = size; | ||
239 | return 1; | ||
240 | } | ||
241 | |||
242 | static inline unsigned int block_size(int val) | ||
243 | { | ||
244 | if (val < 0) | ||
245 | return -val; | ||
246 | return val; | ||
247 | } | ||
248 | |||
249 | /* Created by linker magic */ | ||
250 | extern char __per_cpu_start[], __per_cpu_end[]; | ||
251 | |||
252 | static void *percpu_modalloc(unsigned long size, unsigned long align) | ||
253 | { | ||
254 | unsigned long extra; | ||
255 | unsigned int i; | ||
256 | void *ptr; | ||
257 | |||
258 | BUG_ON(align > SMP_CACHE_BYTES); | ||
259 | |||
260 | ptr = __per_cpu_start; | ||
261 | for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { | ||
262 | /* Extra for alignment requirement. */ | ||
263 | extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; | ||
264 | BUG_ON(i == 0 && extra != 0); | ||
265 | |||
266 | if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size) | ||
267 | continue; | ||
268 | |||
269 | /* Transfer extra to previous block. */ | ||
270 | if (pcpu_size[i-1] < 0) | ||
271 | pcpu_size[i-1] -= extra; | ||
272 | else | ||
273 | pcpu_size[i-1] += extra; | ||
274 | pcpu_size[i] -= extra; | ||
275 | ptr += extra; | ||
276 | |||
277 | /* Split block if warranted */ | ||
278 | if (pcpu_size[i] - size > sizeof(unsigned long)) | ||
279 | if (!split_block(i, size)) | ||
280 | return NULL; | ||
281 | |||
282 | /* Mark allocated */ | ||
283 | pcpu_size[i] = -pcpu_size[i]; | ||
284 | return ptr; | ||
285 | } | ||
286 | |||
287 | printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", | ||
288 | size); | ||
289 | return NULL; | ||
290 | } | ||
291 | |||
292 | static void percpu_modfree(void *freeme) | ||
293 | { | ||
294 | unsigned int i; | ||
295 | void *ptr = __per_cpu_start + block_size(pcpu_size[0]); | ||
296 | |||
297 | /* First entry is core kernel percpu data. */ | ||
298 | for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { | ||
299 | if (ptr == freeme) { | ||
300 | pcpu_size[i] = -pcpu_size[i]; | ||
301 | goto free; | ||
302 | } | ||
303 | } | ||
304 | BUG(); | ||
305 | |||
306 | free: | ||
307 | /* Merge with previous? */ | ||
308 | if (pcpu_size[i-1] >= 0) { | ||
309 | pcpu_size[i-1] += pcpu_size[i]; | ||
310 | pcpu_num_used--; | ||
311 | memmove(&pcpu_size[i], &pcpu_size[i+1], | ||
312 | (pcpu_num_used - i) * sizeof(pcpu_size[0])); | ||
313 | i--; | ||
314 | } | ||
315 | /* Merge with next? */ | ||
316 | if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) { | ||
317 | pcpu_size[i] += pcpu_size[i+1]; | ||
318 | pcpu_num_used--; | ||
319 | memmove(&pcpu_size[i+1], &pcpu_size[i+2], | ||
320 | (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0])); | ||
321 | } | ||
322 | } | ||
323 | |||
324 | static unsigned int find_pcpusec(Elf_Ehdr *hdr, | ||
325 | Elf_Shdr *sechdrs, | ||
326 | const char *secstrings) | ||
327 | { | ||
328 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); | ||
329 | } | ||
330 | |||
331 | static int percpu_modinit(void) | ||
332 | { | ||
333 | pcpu_num_used = 2; | ||
334 | pcpu_num_allocated = 2; | ||
335 | pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, | ||
336 | GFP_KERNEL); | ||
337 | /* Static in-kernel percpu data (used). */ | ||
338 | pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); | ||
339 | /* Free room. */ | ||
340 | pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; | ||
341 | if (pcpu_size[1] < 0) { | ||
342 | printk(KERN_ERR "No per-cpu room for modules.\n"); | ||
343 | pcpu_num_used = 1; | ||
344 | } | ||
345 | |||
346 | return 0; | ||
347 | } | ||
348 | __initcall(percpu_modinit); | ||
349 | #else /* ... !CONFIG_SMP */ | ||
350 | static inline void *percpu_modalloc(unsigned long size, unsigned long align) | ||
351 | { | ||
352 | return NULL; | ||
353 | } | ||
354 | static inline void percpu_modfree(void *pcpuptr) | ||
355 | { | ||
356 | BUG(); | ||
357 | } | ||
358 | static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, | ||
359 | Elf_Shdr *sechdrs, | ||
360 | const char *secstrings) | ||
361 | { | ||
362 | return 0; | ||
363 | } | ||
364 | static inline void percpu_modcopy(void *pcpudst, const void *src, | ||
365 | unsigned long size) | ||
366 | { | ||
367 | /* pcpusec should be 0, and size of that section should be 0. */ | ||
368 | BUG_ON(size != 0); | ||
369 | } | ||
370 | #endif /* CONFIG_SMP */ | ||
371 | |||
372 | #ifdef CONFIG_MODULE_UNLOAD | ||
373 | /* Init the unload section of the module. */ | ||
374 | static void module_unload_init(struct module *mod) | ||
375 | { | ||
376 | unsigned int i; | ||
377 | |||
378 | INIT_LIST_HEAD(&mod->modules_which_use_me); | ||
379 | for (i = 0; i < NR_CPUS; i++) | ||
380 | local_set(&mod->ref[i].count, 0); | ||
381 | /* Hold reference count during initialization. */ | ||
382 | local_set(&mod->ref[_smp_processor_id()].count, 1); | ||
383 | /* Backwards compatibility macros put refcount during init. */ | ||
384 | mod->waiter = current; | ||
385 | } | ||
386 | |||
387 | /* modules using other modules */ | ||
388 | struct module_use | ||
389 | { | ||
390 | struct list_head list; | ||
391 | struct module *module_which_uses; | ||
392 | }; | ||
393 | |||
394 | /* Does a already use b? */ | ||
395 | static int already_uses(struct module *a, struct module *b) | ||
396 | { | ||
397 | struct module_use *use; | ||
398 | |||
399 | list_for_each_entry(use, &b->modules_which_use_me, list) { | ||
400 | if (use->module_which_uses == a) { | ||
401 | DEBUGP("%s uses %s!\n", a->name, b->name); | ||
402 | return 1; | ||
403 | } | ||
404 | } | ||
405 | DEBUGP("%s does not use %s!\n", a->name, b->name); | ||
406 | return 0; | ||
407 | } | ||
408 | |||
409 | /* Module a uses b */ | ||
410 | static int use_module(struct module *a, struct module *b) | ||
411 | { | ||
412 | struct module_use *use; | ||
413 | if (b == NULL || already_uses(a, b)) return 1; | ||
414 | |||
415 | if (!strong_try_module_get(b)) | ||
416 | return 0; | ||
417 | |||
418 | DEBUGP("Allocating new usage for %s.\n", a->name); | ||
419 | use = kmalloc(sizeof(*use), GFP_ATOMIC); | ||
420 | if (!use) { | ||
421 | printk("%s: out of memory loading\n", a->name); | ||
422 | module_put(b); | ||
423 | return 0; | ||
424 | } | ||
425 | |||
426 | use->module_which_uses = a; | ||
427 | list_add(&use->list, &b->modules_which_use_me); | ||
428 | return 1; | ||
429 | } | ||
430 | |||
431 | /* Clear the unload stuff of the module. */ | ||
432 | static void module_unload_free(struct module *mod) | ||
433 | { | ||
434 | struct module *i; | ||
435 | |||
436 | list_for_each_entry(i, &modules, list) { | ||
437 | struct module_use *use; | ||
438 | |||
439 | list_for_each_entry(use, &i->modules_which_use_me, list) { | ||
440 | if (use->module_which_uses == mod) { | ||
441 | DEBUGP("%s unusing %s\n", mod->name, i->name); | ||
442 | module_put(i); | ||
443 | list_del(&use->list); | ||
444 | kfree(use); | ||
445 | /* There can be at most one match. */ | ||
446 | break; | ||
447 | } | ||
448 | } | ||
449 | } | ||
450 | } | ||
451 | |||
452 | #ifdef CONFIG_MODULE_FORCE_UNLOAD | ||
453 | static inline int try_force(unsigned int flags) | ||
454 | { | ||
455 | int ret = (flags & O_TRUNC); | ||
456 | if (ret) | ||
457 | tainted |= TAINT_FORCED_MODULE; | ||
458 | return ret; | ||
459 | } | ||
460 | #else | ||
461 | static inline int try_force(unsigned int flags) | ||
462 | { | ||
463 | return 0; | ||
464 | } | ||
465 | #endif /* CONFIG_MODULE_FORCE_UNLOAD */ | ||
466 | |||
467 | struct stopref | ||
468 | { | ||
469 | struct module *mod; | ||
470 | int flags; | ||
471 | int *forced; | ||
472 | }; | ||
473 | |||
474 | /* Whole machine is stopped with interrupts off when this runs. */ | ||
475 | static int __try_stop_module(void *_sref) | ||
476 | { | ||
477 | struct stopref *sref = _sref; | ||
478 | |||
479 | /* If it's not unused, quit unless we are told to block. */ | ||
480 | if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { | ||
481 | if (!(*sref->forced = try_force(sref->flags))) | ||
482 | return -EWOULDBLOCK; | ||
483 | } | ||
484 | |||
485 | /* Mark it as dying. */ | ||
486 | sref->mod->state = MODULE_STATE_GOING; | ||
487 | return 0; | ||
488 | } | ||
489 | |||
490 | static int try_stop_module(struct module *mod, int flags, int *forced) | ||
491 | { | ||
492 | struct stopref sref = { mod, flags, forced }; | ||
493 | |||
494 | return stop_machine_run(__try_stop_module, &sref, NR_CPUS); | ||
495 | } | ||
496 | |||
497 | unsigned int module_refcount(struct module *mod) | ||
498 | { | ||
499 | unsigned int i, total = 0; | ||
500 | |||
501 | for (i = 0; i < NR_CPUS; i++) | ||
502 | total += local_read(&mod->ref[i].count); | ||
503 | return total; | ||
504 | } | ||
505 | EXPORT_SYMBOL(module_refcount); | ||
506 | |||
507 | /* This exists whether we can unload or not */ | ||
508 | static void free_module(struct module *mod); | ||
509 | |||
510 | static void wait_for_zero_refcount(struct module *mod) | ||
511 | { | ||
512 | /* Since we might sleep for some time, drop the semaphore first */ | ||
513 | up(&module_mutex); | ||
514 | for (;;) { | ||
515 | DEBUGP("Looking at refcount...\n"); | ||
516 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
517 | if (module_refcount(mod) == 0) | ||
518 | break; | ||
519 | schedule(); | ||
520 | } | ||
521 | current->state = TASK_RUNNING; | ||
522 | down(&module_mutex); | ||
523 | } | ||
524 | |||
525 | asmlinkage long | ||
526 | sys_delete_module(const char __user *name_user, unsigned int flags) | ||
527 | { | ||
528 | struct module *mod; | ||
529 | char name[MODULE_NAME_LEN]; | ||
530 | int ret, forced = 0; | ||
531 | |||
532 | if (!capable(CAP_SYS_MODULE)) | ||
533 | return -EPERM; | ||
534 | |||
535 | if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) | ||
536 | return -EFAULT; | ||
537 | name[MODULE_NAME_LEN-1] = '\0'; | ||
538 | |||
539 | if (down_interruptible(&module_mutex) != 0) | ||
540 | return -EINTR; | ||
541 | |||
542 | mod = find_module(name); | ||
543 | if (!mod) { | ||
544 | ret = -ENOENT; | ||
545 | goto out; | ||
546 | } | ||
547 | |||
548 | if (!list_empty(&mod->modules_which_use_me)) { | ||
549 | /* Other modules depend on us: get rid of them first. */ | ||
550 | ret = -EWOULDBLOCK; | ||
551 | goto out; | ||
552 | } | ||
553 | |||
554 | /* Doing init or already dying? */ | ||
555 | if (mod->state != MODULE_STATE_LIVE) { | ||
556 | /* FIXME: if (force), slam module count and wake up | ||
557 | waiter --RR */ | ||
558 | DEBUGP("%s already dying\n", mod->name); | ||
559 | ret = -EBUSY; | ||
560 | goto out; | ||
561 | } | ||
562 | |||
563 | /* If it has an init func, it must have an exit func to unload */ | ||
564 | if ((mod->init != NULL && mod->exit == NULL) | ||
565 | || mod->unsafe) { | ||
566 | forced = try_force(flags); | ||
567 | if (!forced) { | ||
568 | /* This module can't be removed */ | ||
569 | ret = -EBUSY; | ||
570 | goto out; | ||
571 | } | ||
572 | } | ||
573 | |||
574 | /* Set this up before setting mod->state */ | ||
575 | mod->waiter = current; | ||
576 | |||
577 | /* Stop the machine so refcounts can't move and disable module. */ | ||
578 | ret = try_stop_module(mod, flags, &forced); | ||
579 | if (ret != 0) | ||
580 | goto out; | ||
581 | |||
582 | /* Never wait if forced. */ | ||
583 | if (!forced && module_refcount(mod) != 0) | ||
584 | wait_for_zero_refcount(mod); | ||
585 | |||
586 | /* Final destruction, now that no one is using it. */ | ||
587 | if (mod->exit != NULL) { | ||
588 | up(&module_mutex); | ||
589 | mod->exit(); | ||
590 | down(&module_mutex); | ||
591 | } | ||
592 | free_module(mod); | ||
593 | |||
594 | out: | ||
595 | up(&module_mutex); | ||
596 | return ret; | ||
597 | } | ||
598 | |||
599 | static void print_unload_info(struct seq_file *m, struct module *mod) | ||
600 | { | ||
601 | struct module_use *use; | ||
602 | int printed_something = 0; | ||
603 | |||
604 | seq_printf(m, " %u ", module_refcount(mod)); | ||
605 | |||
606 | /* Always include a trailing ',' so userspace can differentiate | ||
607 | between this and the old multi-field proc format. */ | ||
608 | list_for_each_entry(use, &mod->modules_which_use_me, list) { | ||
609 | printed_something = 1; | ||
610 | seq_printf(m, "%s,", use->module_which_uses->name); | ||
611 | } | ||
612 | |||
613 | if (mod->unsafe) { | ||
614 | printed_something = 1; | ||
615 | seq_printf(m, "[unsafe],"); | ||
616 | } | ||
617 | |||
618 | if (mod->init != NULL && mod->exit == NULL) { | ||
619 | printed_something = 1; | ||
620 | seq_printf(m, "[permanent],"); | ||
621 | } | ||
622 | |||
623 | if (!printed_something) | ||
624 | seq_printf(m, "-"); | ||
625 | } | ||
626 | |||
627 | void __symbol_put(const char *symbol) | ||
628 | { | ||
629 | struct module *owner; | ||
630 | unsigned long flags; | ||
631 | const unsigned long *crc; | ||
632 | |||
633 | spin_lock_irqsave(&modlist_lock, flags); | ||
634 | if (!__find_symbol(symbol, &owner, &crc, 1)) | ||
635 | BUG(); | ||
636 | module_put(owner); | ||
637 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
638 | } | ||
639 | EXPORT_SYMBOL(__symbol_put); | ||
640 | |||
641 | void symbol_put_addr(void *addr) | ||
642 | { | ||
643 | unsigned long flags; | ||
644 | |||
645 | spin_lock_irqsave(&modlist_lock, flags); | ||
646 | if (!kernel_text_address((unsigned long)addr)) | ||
647 | BUG(); | ||
648 | |||
649 | module_put(module_text_address((unsigned long)addr)); | ||
650 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
651 | } | ||
652 | EXPORT_SYMBOL_GPL(symbol_put_addr); | ||
653 | |||
654 | static ssize_t show_refcnt(struct module_attribute *mattr, | ||
655 | struct module *mod, char *buffer) | ||
656 | { | ||
657 | /* sysfs holds a reference */ | ||
658 | return sprintf(buffer, "%u\n", module_refcount(mod)-1); | ||
659 | } | ||
660 | |||
661 | static struct module_attribute refcnt = { | ||
662 | .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE }, | ||
663 | .show = show_refcnt, | ||
664 | }; | ||
665 | |||
666 | #else /* !CONFIG_MODULE_UNLOAD */ | ||
667 | static void print_unload_info(struct seq_file *m, struct module *mod) | ||
668 | { | ||
669 | /* We don't know the usage count, or which modules are using it. */ | ||
670 | seq_printf(m, " - -"); | ||
671 | } | ||
672 | |||
673 | static inline void module_unload_free(struct module *mod) | ||
674 | { | ||
675 | } | ||
676 | |||
677 | static inline int use_module(struct module *a, struct module *b) | ||
678 | { | ||
679 | return strong_try_module_get(b); | ||
680 | } | ||
681 | |||
682 | static inline void module_unload_init(struct module *mod) | ||
683 | { | ||
684 | } | ||
685 | #endif /* CONFIG_MODULE_UNLOAD */ | ||
686 | |||
687 | #ifdef CONFIG_OBSOLETE_MODPARM | ||
688 | /* Bounds checking done below */ | ||
689 | static int obsparm_copy_string(const char *val, struct kernel_param *kp) | ||
690 | { | ||
691 | strcpy(kp->arg, val); | ||
692 | return 0; | ||
693 | } | ||
694 | |||
695 | int set_obsolete(const char *val, struct kernel_param *kp) | ||
696 | { | ||
697 | unsigned int min, max; | ||
698 | unsigned int size, maxsize; | ||
699 | int dummy; | ||
700 | char *endp; | ||
701 | const char *p; | ||
702 | struct obsolete_modparm *obsparm = kp->arg; | ||
703 | |||
704 | if (!val) { | ||
705 | printk(KERN_ERR "Parameter %s needs an argument\n", kp->name); | ||
706 | return -EINVAL; | ||
707 | } | ||
708 | |||
709 | /* type is: [min[-max]]{b,h,i,l,s} */ | ||
710 | p = obsparm->type; | ||
711 | min = simple_strtol(p, &endp, 10); | ||
712 | if (endp == obsparm->type) | ||
713 | min = max = 1; | ||
714 | else if (*endp == '-') { | ||
715 | p = endp+1; | ||
716 | max = simple_strtol(p, &endp, 10); | ||
717 | } else | ||
718 | max = min; | ||
719 | switch (*endp) { | ||
720 | case 'b': | ||
721 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
722 | 1, param_set_byte, &dummy); | ||
723 | case 'h': | ||
724 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
725 | sizeof(short), param_set_short, &dummy); | ||
726 | case 'i': | ||
727 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
728 | sizeof(int), param_set_int, &dummy); | ||
729 | case 'l': | ||
730 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
731 | sizeof(long), param_set_long, &dummy); | ||
732 | case 's': | ||
733 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
734 | sizeof(char *), param_set_charp, &dummy); | ||
735 | |||
736 | case 'c': | ||
737 | /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars, | ||
738 | and the decl is "char xxx[5][50];" */ | ||
739 | p = endp+1; | ||
740 | maxsize = simple_strtol(p, &endp, 10); | ||
741 | /* We check lengths here (yes, this is a hack). */ | ||
742 | p = val; | ||
743 | while (p[size = strcspn(p, ",")]) { | ||
744 | if (size >= maxsize) | ||
745 | goto oversize; | ||
746 | p += size+1; | ||
747 | } | ||
748 | if (size >= maxsize) | ||
749 | goto oversize; | ||
750 | return param_array(kp->name, val, min, max, obsparm->addr, | ||
751 | maxsize, obsparm_copy_string, &dummy); | ||
752 | } | ||
753 | printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type); | ||
754 | return -EINVAL; | ||
755 | oversize: | ||
756 | printk(KERN_ERR | ||
757 | "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize); | ||
758 | return -EINVAL; | ||
759 | } | ||
760 | |||
761 | static int obsolete_params(const char *name, | ||
762 | char *args, | ||
763 | struct obsolete_modparm obsparm[], | ||
764 | unsigned int num, | ||
765 | Elf_Shdr *sechdrs, | ||
766 | unsigned int symindex, | ||
767 | const char *strtab) | ||
768 | { | ||
769 | struct kernel_param *kp; | ||
770 | unsigned int i; | ||
771 | int ret; | ||
772 | |||
773 | kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL); | ||
774 | if (!kp) | ||
775 | return -ENOMEM; | ||
776 | |||
777 | for (i = 0; i < num; i++) { | ||
778 | char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)]; | ||
779 | |||
780 | snprintf(sym_name, sizeof(sym_name), "%s%s", | ||
781 | MODULE_SYMBOL_PREFIX, obsparm[i].name); | ||
782 | |||
783 | kp[i].name = obsparm[i].name; | ||
784 | kp[i].perm = 000; | ||
785 | kp[i].set = set_obsolete; | ||
786 | kp[i].get = NULL; | ||
787 | obsparm[i].addr | ||
788 | = (void *)find_local_symbol(sechdrs, symindex, strtab, | ||
789 | sym_name); | ||
790 | if (!obsparm[i].addr) { | ||
791 | printk("%s: falsely claims to have parameter %s\n", | ||
792 | name, obsparm[i].name); | ||
793 | ret = -EINVAL; | ||
794 | goto out; | ||
795 | } | ||
796 | kp[i].arg = &obsparm[i]; | ||
797 | } | ||
798 | |||
799 | ret = parse_args(name, args, kp, num, NULL); | ||
800 | out: | ||
801 | kfree(kp); | ||
802 | return ret; | ||
803 | } | ||
804 | #else | ||
805 | static int obsolete_params(const char *name, | ||
806 | char *args, | ||
807 | struct obsolete_modparm obsparm[], | ||
808 | unsigned int num, | ||
809 | Elf_Shdr *sechdrs, | ||
810 | unsigned int symindex, | ||
811 | const char *strtab) | ||
812 | { | ||
813 | if (num != 0) | ||
814 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", | ||
815 | name); | ||
816 | return 0; | ||
817 | } | ||
818 | #endif /* CONFIG_OBSOLETE_MODPARM */ | ||
819 | |||
820 | static const char vermagic[] = VERMAGIC_STRING; | ||
821 | |||
822 | #ifdef CONFIG_MODVERSIONS | ||
823 | static int check_version(Elf_Shdr *sechdrs, | ||
824 | unsigned int versindex, | ||
825 | const char *symname, | ||
826 | struct module *mod, | ||
827 | const unsigned long *crc) | ||
828 | { | ||
829 | unsigned int i, num_versions; | ||
830 | struct modversion_info *versions; | ||
831 | |||
832 | /* Exporting module didn't supply crcs? OK, we're already tainted. */ | ||
833 | if (!crc) | ||
834 | return 1; | ||
835 | |||
836 | versions = (void *) sechdrs[versindex].sh_addr; | ||
837 | num_versions = sechdrs[versindex].sh_size | ||
838 | / sizeof(struct modversion_info); | ||
839 | |||
840 | for (i = 0; i < num_versions; i++) { | ||
841 | if (strcmp(versions[i].name, symname) != 0) | ||
842 | continue; | ||
843 | |||
844 | if (versions[i].crc == *crc) | ||
845 | return 1; | ||
846 | printk("%s: disagrees about version of symbol %s\n", | ||
847 | mod->name, symname); | ||
848 | DEBUGP("Found checksum %lX vs module %lX\n", | ||
849 | *crc, versions[i].crc); | ||
850 | return 0; | ||
851 | } | ||
852 | /* Not in module's version table. OK, but that taints the kernel. */ | ||
853 | if (!(tainted & TAINT_FORCED_MODULE)) { | ||
854 | printk("%s: no version for \"%s\" found: kernel tainted.\n", | ||
855 | mod->name, symname); | ||
856 | tainted |= TAINT_FORCED_MODULE; | ||
857 | } | ||
858 | return 1; | ||
859 | } | ||
860 | |||
861 | static inline int check_modstruct_version(Elf_Shdr *sechdrs, | ||
862 | unsigned int versindex, | ||
863 | struct module *mod) | ||
864 | { | ||
865 | const unsigned long *crc; | ||
866 | struct module *owner; | ||
867 | |||
868 | if (!__find_symbol("struct_module", &owner, &crc, 1)) | ||
869 | BUG(); | ||
870 | return check_version(sechdrs, versindex, "struct_module", mod, | ||
871 | crc); | ||
872 | } | ||
873 | |||
874 | /* First part is kernel version, which we ignore. */ | ||
875 | static inline int same_magic(const char *amagic, const char *bmagic) | ||
876 | { | ||
877 | amagic += strcspn(amagic, " "); | ||
878 | bmagic += strcspn(bmagic, " "); | ||
879 | return strcmp(amagic, bmagic) == 0; | ||
880 | } | ||
881 | #else | ||
882 | static inline int check_version(Elf_Shdr *sechdrs, | ||
883 | unsigned int versindex, | ||
884 | const char *symname, | ||
885 | struct module *mod, | ||
886 | const unsigned long *crc) | ||
887 | { | ||
888 | return 1; | ||
889 | } | ||
890 | |||
891 | static inline int check_modstruct_version(Elf_Shdr *sechdrs, | ||
892 | unsigned int versindex, | ||
893 | struct module *mod) | ||
894 | { | ||
895 | return 1; | ||
896 | } | ||
897 | |||
898 | static inline int same_magic(const char *amagic, const char *bmagic) | ||
899 | { | ||
900 | return strcmp(amagic, bmagic) == 0; | ||
901 | } | ||
902 | #endif /* CONFIG_MODVERSIONS */ | ||
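With CONFIG_MODVERSIONS, same_magic() above ignores the leading kernel release and compares only the feature flags that follow the first space; without it, the whole string must match exactly. A minimal user-space sketch of the MODVERSIONS variant, with invented vermagic strings:

#include <stdio.h>
#include <string.h>

static int same_magic(const char *amagic, const char *bmagic)
{
        /* Skip the leading kernel release (everything up to the first space). */
        amagic += strcspn(amagic, " ");
        bmagic += strcspn(bmagic, " ");
        return strcmp(amagic, bmagic) == 0;
}

int main(void)
{
        /* Different releases, same feature flags: accepted under MODVERSIONS. */
        printf("%d\n", same_magic("2.6.12-rc2 SMP gcc-3.4",
                                  "2.6.12-rc1 SMP gcc-3.4"));   /* prints 1 */
        /* Same release, different flags: rejected. */
        printf("%d\n", same_magic("2.6.12-rc2 SMP gcc-3.4",
                                  "2.6.12-rc2 gcc-3.4"));       /* prints 0 */
        return 0;
}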
903 | |||
904 | /* Resolve a symbol for this module. I.e. if we find one, record usage. | ||
905 | Must be holding module_mutex. */ | ||
906 | static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | ||
907 | unsigned int versindex, | ||
908 | const char *name, | ||
909 | struct module *mod) | ||
910 | { | ||
911 | struct module *owner; | ||
912 | unsigned long ret; | ||
913 | const unsigned long *crc; | ||
914 | |||
915 | spin_lock_irq(&modlist_lock); | ||
916 | ret = __find_symbol(name, &owner, &crc, mod->license_gplok); | ||
917 | if (ret) { | ||
918 | /* use_module can fail due to OOM, or module unloading */ | ||
919 | if (!check_version(sechdrs, versindex, name, mod, crc) || | ||
920 | !use_module(mod, owner)) | ||
921 | ret = 0; | ||
922 | } | ||
923 | spin_unlock_irq(&modlist_lock); | ||
924 | return ret; | ||
925 | } | ||
926 | |||
927 | |||
928 | /* | ||
929 | * /sys/module/foo/sections stuff | ||
930 | * J. Corbet <corbet@lwn.net> | ||
931 | */ | ||
932 | #ifdef CONFIG_KALLSYMS | ||
933 | static ssize_t module_sect_show(struct module_attribute *mattr, | ||
934 | struct module *mod, char *buf) | ||
935 | { | ||
936 | struct module_sect_attr *sattr = | ||
937 | container_of(mattr, struct module_sect_attr, mattr); | ||
938 | return sprintf(buf, "0x%lx\n", sattr->address); | ||
939 | } | ||
940 | |||
941 | static void add_sect_attrs(struct module *mod, unsigned int nsect, | ||
942 | char *secstrings, Elf_Shdr *sechdrs) | ||
943 | { | ||
944 | unsigned int nloaded = 0, i, size[2]; | ||
945 | struct module_sect_attrs *sect_attrs; | ||
946 | struct module_sect_attr *sattr; | ||
947 | struct attribute **gattr; | ||
948 | |||
949 | /* Count loaded sections and allocate structures */ | ||
950 | for (i = 0; i < nsect; i++) | ||
951 | if (sechdrs[i].sh_flags & SHF_ALLOC) | ||
952 | nloaded++; | ||
953 | size[0] = ALIGN(sizeof(*sect_attrs) | ||
954 | + nloaded * sizeof(sect_attrs->attrs[0]), | ||
955 | sizeof(sect_attrs->grp.attrs[0])); | ||
956 | size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); | ||
957 | if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) | ||
958 | return; | ||
959 | |||
960 | /* Setup section attributes. */ | ||
961 | sect_attrs->grp.name = "sections"; | ||
962 | sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; | ||
963 | |||
964 | sattr = &sect_attrs->attrs[0]; | ||
965 | gattr = &sect_attrs->grp.attrs[0]; | ||
966 | for (i = 0; i < nsect; i++) { | ||
967 | if (! (sechdrs[i].sh_flags & SHF_ALLOC)) | ||
968 | continue; | ||
969 | sattr->address = sechdrs[i].sh_addr; | ||
970 | strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, | ||
971 | MODULE_SECT_NAME_LEN); | ||
972 | sattr->mattr.show = module_sect_show; | ||
973 | sattr->mattr.store = NULL; | ||
974 | sattr->mattr.attr.name = sattr->name; | ||
975 | sattr->mattr.attr.owner = mod; | ||
976 | sattr->mattr.attr.mode = S_IRUGO; | ||
977 | *(gattr++) = &(sattr++)->mattr.attr; | ||
978 | } | ||
979 | *gattr = NULL; | ||
980 | |||
981 | if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp)) | ||
982 | goto out; | ||
983 | |||
984 | mod->sect_attrs = sect_attrs; | ||
985 | return; | ||
986 | out: | ||
987 | kfree(sect_attrs); | ||
988 | } | ||
989 | |||
990 | static void remove_sect_attrs(struct module *mod) | ||
991 | { | ||
992 | if (mod->sect_attrs) { | ||
993 | sysfs_remove_group(&mod->mkobj.kobj, | ||
994 | &mod->sect_attrs->grp); | ||
995 | /* We are positive that no one is using any sect attrs | ||
996 | * at this point. Deallocate immediately. */ | ||
997 | kfree(mod->sect_attrs); | ||
998 | mod->sect_attrs = NULL; | ||
999 | } | ||
1000 | } | ||
1001 | |||
1002 | |||
1003 | #else | ||
1004 | static inline void add_sect_attrs(struct module *mod, unsigned int nsect, | ||
1005 | char *sectstrings, Elf_Shdr *sechdrs) | ||
1006 | { | ||
1007 | } | ||
1008 | |||
1009 | static inline void remove_sect_attrs(struct module *mod) | ||
1010 | { | ||
1011 | } | ||
1012 | #endif /* CONFIG_KALLSYMS */ | ||
1013 | |||
1014 | |||
1015 | #ifdef CONFIG_MODULE_UNLOAD | ||
1016 | static inline int module_add_refcnt_attr(struct module *mod) | ||
1017 | { | ||
1018 | return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr); | ||
1019 | } | ||
1020 | static void module_remove_refcnt_attr(struct module *mod) | ||
1021 | { | ||
1022 | return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr); | ||
1023 | } | ||
1024 | #else | ||
1025 | static inline int module_add_refcnt_attr(struct module *mod) | ||
1026 | { | ||
1027 | return 0; | ||
1028 | } | ||
1029 | static void module_remove_refcnt_attr(struct module *mod) | ||
1030 | { | ||
1031 | } | ||
1032 | #endif | ||
1033 | |||
1034 | |||
1035 | static int mod_sysfs_setup(struct module *mod, | ||
1036 | struct kernel_param *kparam, | ||
1037 | unsigned int num_params) | ||
1038 | { | ||
1039 | int err; | ||
1040 | |||
1041 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); | ||
1042 | err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); | ||
1043 | if (err) | ||
1044 | goto out; | ||
1045 | kobj_set_kset_s(&mod->mkobj, module_subsys); | ||
1046 | mod->mkobj.mod = mod; | ||
1047 | err = kobject_register(&mod->mkobj.kobj); | ||
1048 | if (err) | ||
1049 | goto out; | ||
1050 | |||
1051 | err = module_add_refcnt_attr(mod); | ||
1052 | if (err) | ||
1053 | goto out_unreg; | ||
1054 | |||
1055 | err = module_param_sysfs_setup(mod, kparam, num_params); | ||
1056 | if (err) | ||
1057 | goto out_unreg; | ||
1058 | |||
1059 | return 0; | ||
1060 | |||
1061 | out_unreg: | ||
1062 | kobject_unregister(&mod->mkobj.kobj); | ||
1063 | out: | ||
1064 | return err; | ||
1065 | } | ||
1066 | |||
1067 | static void mod_kobject_remove(struct module *mod) | ||
1068 | { | ||
1069 | module_remove_refcnt_attr(mod); | ||
1070 | module_param_sysfs_remove(mod); | ||
1071 | |||
1072 | kobject_unregister(&mod->mkobj.kobj); | ||
1073 | } | ||
1074 | |||
1075 | /* | ||
1076 | * unlink the module while the whole machine is stopped with interrupts off | ||
1077 | * - this defends against kallsyms not taking locks | ||
1078 | */ | ||
1079 | static int __unlink_module(void *_mod) | ||
1080 | { | ||
1081 | struct module *mod = _mod; | ||
1082 | list_del(&mod->list); | ||
1083 | return 0; | ||
1084 | } | ||
1085 | |||
1086 | /* Free a module, remove from lists, etc (must hold module mutex). */ | ||
1087 | static void free_module(struct module *mod) | ||
1088 | { | ||
1089 | /* Delete from various lists */ | ||
1090 | stop_machine_run(__unlink_module, mod, NR_CPUS); | ||
1091 | remove_sect_attrs(mod); | ||
1092 | mod_kobject_remove(mod); | ||
1093 | |||
1094 | /* Arch-specific cleanup. */ | ||
1095 | module_arch_cleanup(mod); | ||
1096 | |||
1097 | /* Module unload stuff */ | ||
1098 | module_unload_free(mod); | ||
1099 | |||
1100 | /* This may be NULL, but that's OK */ | ||
1101 | module_free(mod, mod->module_init); | ||
1102 | kfree(mod->args); | ||
1103 | if (mod->percpu) | ||
1104 | percpu_modfree(mod->percpu); | ||
1105 | |||
1106 | /* Finally, free the core (containing the module structure) */ | ||
1107 | module_free(mod, mod->module_core); | ||
1108 | } | ||
1109 | |||
1110 | void *__symbol_get(const char *symbol) | ||
1111 | { | ||
1112 | struct module *owner; | ||
1113 | unsigned long value, flags; | ||
1114 | const unsigned long *crc; | ||
1115 | |||
1116 | spin_lock_irqsave(&modlist_lock, flags); | ||
1117 | value = __find_symbol(symbol, &owner, &crc, 1); | ||
1118 | if (value && !strong_try_module_get(owner)) | ||
1119 | value = 0; | ||
1120 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
1121 | |||
1122 | return (void *)value; | ||
1123 | } | ||
1124 | EXPORT_SYMBOL_GPL(__symbol_get); | ||
1125 | |||
1126 | /* Change all symbols so that sh_value encodes the pointer directly. */ | ||
1127 | static int simplify_symbols(Elf_Shdr *sechdrs, | ||
1128 | unsigned int symindex, | ||
1129 | const char *strtab, | ||
1130 | unsigned int versindex, | ||
1131 | unsigned int pcpuindex, | ||
1132 | struct module *mod) | ||
1133 | { | ||
1134 | Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; | ||
1135 | unsigned long secbase; | ||
1136 | unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); | ||
1137 | int ret = 0; | ||
1138 | |||
1139 | for (i = 1; i < n; i++) { | ||
1140 | switch (sym[i].st_shndx) { | ||
1141 | case SHN_COMMON: | ||
1142 | /* We compiled with -fno-common. These are not | ||
1143 | supposed to happen. */ | ||
1144 | DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); | ||
1145 | printk("%s: please compile with -fno-common\n", | ||
1146 | mod->name); | ||
1147 | ret = -ENOEXEC; | ||
1148 | break; | ||
1149 | |||
1150 | case SHN_ABS: | ||
1151 | /* Don't need to do anything */ | ||
1152 | DEBUGP("Absolute symbol: 0x%08lx\n", | ||
1153 | (long)sym[i].st_value); | ||
1154 | break; | ||
1155 | |||
1156 | case SHN_UNDEF: | ||
1157 | sym[i].st_value | ||
1158 | = resolve_symbol(sechdrs, versindex, | ||
1159 | strtab + sym[i].st_name, mod); | ||
1160 | |||
1161 | /* Ok if resolved. */ | ||
1162 | if (sym[i].st_value != 0) | ||
1163 | break; | ||
1164 | /* Ok if weak. */ | ||
1165 | if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) | ||
1166 | break; | ||
1167 | |||
1168 | printk(KERN_WARNING "%s: Unknown symbol %s\n", | ||
1169 | mod->name, strtab + sym[i].st_name); | ||
1170 | ret = -ENOENT; | ||
1171 | break; | ||
1172 | |||
1173 | default: | ||
1174 | /* Divert to percpu allocation if a percpu var. */ | ||
1175 | if (sym[i].st_shndx == pcpuindex) | ||
1176 | secbase = (unsigned long)mod->percpu; | ||
1177 | else | ||
1178 | secbase = sechdrs[sym[i].st_shndx].sh_addr; | ||
1179 | sym[i].st_value += secbase; | ||
1180 | break; | ||
1181 | } | ||
1182 | } | ||
1183 | |||
1184 | return ret; | ||
1185 | } | ||
1186 | |||
1187 | /* Update size with this section: return offset. */ | ||
1188 | static long get_offset(unsigned long *size, Elf_Shdr *sechdr) | ||
1189 | { | ||
1190 | long ret; | ||
1191 | |||
1192 | ret = ALIGN(*size, sechdr->sh_addralign ?: 1); | ||
1193 | *size = ret + sechdr->sh_size; | ||
1194 | return ret; | ||
1195 | } | ||
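get_offset() rounds the running size up to the section's alignment and then reserves room for the section's bytes. A small stand-alone sketch of that arithmetic, using a simplified ALIGN() and invented section sizes:

#include <stdio.h>

/* Round x up to the next multiple of a (a is a power of two). */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        unsigned long size = 0;

        /* First section: 0x123 bytes, 16-byte alignment, placed at 0. */
        unsigned long off_text   = ALIGN(size, 16); size = off_text + 0x123;
        /* Next section: 0x40 bytes, 32-byte alignment, rounded up to 0x140. */
        unsigned long off_rodata = ALIGN(size, 32); size = off_rodata + 0x40;

        printf("text at 0x%lx, rodata at 0x%lx, total 0x%lx\n",
               off_text, off_rodata, size);
        /* prints: text at 0x0, rodata at 0x140, total 0x180 */
        return 0;
}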
1196 | |||
1197 | /* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld | ||
1198 | might -- code, read-only data, read-write data, small data. Tally | ||
1199 | sizes, and place the offsets into sh_entsize fields: high bit means it | ||
1200 | belongs in init. */ | ||
1201 | static void layout_sections(struct module *mod, | ||
1202 | const Elf_Ehdr *hdr, | ||
1203 | Elf_Shdr *sechdrs, | ||
1204 | const char *secstrings) | ||
1205 | { | ||
1206 | static unsigned long const masks[][2] = { | ||
1207 | /* NOTE: all executable code must be the first section | ||
1208 | * in this array; otherwise modify the text_size | ||
1209 | * finder in the two loops below */ | ||
1210 | { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, | ||
1211 | { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, | ||
1212 | { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, | ||
1213 | { ARCH_SHF_SMALL | SHF_ALLOC, 0 } | ||
1214 | }; | ||
1215 | unsigned int m, i; | ||
1216 | |||
1217 | for (i = 0; i < hdr->e_shnum; i++) | ||
1218 | sechdrs[i].sh_entsize = ~0UL; | ||
1219 | |||
1220 | DEBUGP("Core section allocation order:\n"); | ||
1221 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | ||
1222 | for (i = 0; i < hdr->e_shnum; ++i) { | ||
1223 | Elf_Shdr *s = &sechdrs[i]; | ||
1224 | |||
1225 | if ((s->sh_flags & masks[m][0]) != masks[m][0] | ||
1226 | || (s->sh_flags & masks[m][1]) | ||
1227 | || s->sh_entsize != ~0UL | ||
1228 | || strncmp(secstrings + s->sh_name, | ||
1229 | ".init", 5) == 0) | ||
1230 | continue; | ||
1231 | s->sh_entsize = get_offset(&mod->core_size, s); | ||
1232 | DEBUGP("\t%s\n", secstrings + s->sh_name); | ||
1233 | } | ||
1234 | if (m == 0) | ||
1235 | mod->core_text_size = mod->core_size; | ||
1236 | } | ||
1237 | |||
1238 | DEBUGP("Init section allocation order:\n"); | ||
1239 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | ||
1240 | for (i = 0; i < hdr->e_shnum; ++i) { | ||
1241 | Elf_Shdr *s = &sechdrs[i]; | ||
1242 | |||
1243 | if ((s->sh_flags & masks[m][0]) != masks[m][0] | ||
1244 | || (s->sh_flags & masks[m][1]) | ||
1245 | || s->sh_entsize != ~0UL | ||
1246 | || strncmp(secstrings + s->sh_name, | ||
1247 | ".init", 5) != 0) | ||
1248 | continue; | ||
1249 | s->sh_entsize = (get_offset(&mod->init_size, s) | ||
1250 | | INIT_OFFSET_MASK); | ||
1251 | DEBUGP("\t%s\n", secstrings + s->sh_name); | ||
1252 | } | ||
1253 | if (m == 0) | ||
1254 | mod->init_text_size = mod->init_size; | ||
1255 | } | ||
1256 | } | ||
1257 | |||
1258 | static inline int license_is_gpl_compatible(const char *license) | ||
1259 | { | ||
1260 | return (strcmp(license, "GPL") == 0 | ||
1261 | || strcmp(license, "GPL v2") == 0 | ||
1262 | || strcmp(license, "GPL and additional rights") == 0 | ||
1263 | || strcmp(license, "Dual BSD/GPL") == 0 | ||
1264 | || strcmp(license, "Dual MPL/GPL") == 0); | ||
1265 | } | ||
1266 | |||
1267 | static void set_license(struct module *mod, const char *license) | ||
1268 | { | ||
1269 | if (!license) | ||
1270 | license = "unspecified"; | ||
1271 | |||
1272 | mod->license_gplok = license_is_gpl_compatible(license); | ||
1273 | if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { | ||
1274 | printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", | ||
1275 | mod->name, license); | ||
1276 | tainted |= TAINT_PROPRIETARY_MODULE; | ||
1277 | } | ||
1278 | } | ||
1279 | |||
1280 | /* Parse tag=value strings from .modinfo section */ | ||
1281 | static char *next_string(char *string, unsigned long *secsize) | ||
1282 | { | ||
1283 | /* Skip non-zero chars */ | ||
1284 | while (string[0]) { | ||
1285 | string++; | ||
1286 | if ((*secsize)-- <= 1) | ||
1287 | return NULL; | ||
1288 | } | ||
1289 | |||
1290 | /* Skip any zero padding. */ | ||
1291 | while (!string[0]) { | ||
1292 | string++; | ||
1293 | if ((*secsize)-- <= 1) | ||
1294 | return NULL; | ||
1295 | } | ||
1296 | return string; | ||
1297 | } | ||
1298 | |||
1299 | static char *get_modinfo(Elf_Shdr *sechdrs, | ||
1300 | unsigned int info, | ||
1301 | const char *tag) | ||
1302 | { | ||
1303 | char *p; | ||
1304 | unsigned int taglen = strlen(tag); | ||
1305 | unsigned long size = sechdrs[info].sh_size; | ||
1306 | |||
1307 | for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { | ||
1308 | if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') | ||
1309 | return p + taglen + 1; | ||
1310 | } | ||
1311 | return NULL; | ||
1312 | } | ||
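The .modinfo section scanned by get_modinfo() is just a run of NUL-terminated "tag=value" strings, possibly separated by zero padding. A user-space sketch that walks such a buffer with the same next_string() logic as above (the sample contents are invented):

#include <stdio.h>

static char *next_string(char *string, unsigned long *secsize)
{
        while (string[0]) {              /* skip the current string */
                string++;
                if ((*secsize)-- <= 1)
                        return NULL;
        }
        while (!string[0]) {             /* skip any zero padding */
                string++;
                if ((*secsize)-- <= 1)
                        return NULL;
        }
        return string;
}

int main(void)
{
        char modinfo[] = "license=GPL\0author=Jane Doe\0vermagic=2.6.12-rc2 SMP\0";
        unsigned long size = sizeof(modinfo);
        char *p;

        for (p = modinfo; p; p = next_string(p, &size))
                printf("%s\n", p);       /* prints the three tag=value strings */
        return 0;
}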
1313 | |||
1314 | #ifdef CONFIG_KALLSYMS | ||
1315 | int is_exported(const char *name, const struct module *mod) | ||
1316 | { | ||
1317 | unsigned int i; | ||
1318 | |||
1319 | if (!mod) { | ||
1320 | for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) | ||
1321 | if (strcmp(__start___ksymtab[i].name, name) == 0) | ||
1322 | return 1; | ||
1323 | return 0; | ||
1324 | } | ||
1325 | for (i = 0; i < mod->num_syms; i++) | ||
1326 | if (strcmp(mod->syms[i].name, name) == 0) | ||
1327 | return 1; | ||
1328 | return 0; | ||
1329 | } | ||
1330 | |||
1331 | /* As per nm */ | ||
1332 | static char elf_type(const Elf_Sym *sym, | ||
1333 | Elf_Shdr *sechdrs, | ||
1334 | const char *secstrings, | ||
1335 | struct module *mod) | ||
1336 | { | ||
1337 | if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { | ||
1338 | if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) | ||
1339 | return 'v'; | ||
1340 | else | ||
1341 | return 'w'; | ||
1342 | } | ||
1343 | if (sym->st_shndx == SHN_UNDEF) | ||
1344 | return 'U'; | ||
1345 | if (sym->st_shndx == SHN_ABS) | ||
1346 | return 'a'; | ||
1347 | if (sym->st_shndx >= SHN_LORESERVE) | ||
1348 | return '?'; | ||
1349 | if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) | ||
1350 | return 't'; | ||
1351 | if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC | ||
1352 | && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { | ||
1353 | if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) | ||
1354 | return 'r'; | ||
1355 | else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) | ||
1356 | return 'g'; | ||
1357 | else | ||
1358 | return 'd'; | ||
1359 | } | ||
1360 | if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { | ||
1361 | if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) | ||
1362 | return 's'; | ||
1363 | else | ||
1364 | return 'b'; | ||
1365 | } | ||
1366 | if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, | ||
1367 | ".debug", strlen(".debug")) == 0) | ||
1368 | return 'n'; | ||
1369 | return '?'; | ||
1370 | } | ||
1371 | |||
1372 | static void add_kallsyms(struct module *mod, | ||
1373 | Elf_Shdr *sechdrs, | ||
1374 | unsigned int symindex, | ||
1375 | unsigned int strindex, | ||
1376 | const char *secstrings) | ||
1377 | { | ||
1378 | unsigned int i; | ||
1379 | |||
1380 | mod->symtab = (void *)sechdrs[symindex].sh_addr; | ||
1381 | mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); | ||
1382 | mod->strtab = (void *)sechdrs[strindex].sh_addr; | ||
1383 | |||
1384 | /* Set types up while we still have access to sections. */ | ||
1385 | for (i = 0; i < mod->num_symtab; i++) | ||
1386 | mod->symtab[i].st_info | ||
1387 | = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); | ||
1388 | } | ||
1389 | #else | ||
1390 | static inline void add_kallsyms(struct module *mod, | ||
1391 | Elf_Shdr *sechdrs, | ||
1392 | unsigned int symindex, | ||
1393 | unsigned int strindex, | ||
1394 | const char *secstrings) | ||
1395 | { | ||
1396 | } | ||
1397 | #endif /* CONFIG_KALLSYMS */ | ||
1398 | |||
1399 | /* Allocate and load the module: note that size of section 0 is always | ||
1400 | zero, and we rely on this for optional sections. */ | ||
1401 | static struct module *load_module(void __user *umod, | ||
1402 | unsigned long len, | ||
1403 | const char __user *uargs) | ||
1404 | { | ||
1405 | Elf_Ehdr *hdr; | ||
1406 | Elf_Shdr *sechdrs; | ||
1407 | char *secstrings, *args, *modmagic, *strtab = NULL; | ||
1408 | unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, | ||
1409 | exportindex, modindex, obsparmindex, infoindex, gplindex, | ||
1410 | crcindex, gplcrcindex, versindex, pcpuindex; | ||
1411 | long arglen; | ||
1412 | struct module *mod; | ||
1413 | long err = 0; | ||
1414 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | ||
1415 | struct exception_table_entry *extable; | ||
1416 | |||
1417 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | ||
1418 | umod, len, uargs); | ||
1419 | if (len < sizeof(*hdr)) | ||
1420 | return ERR_PTR(-ENOEXEC); | ||
1421 | |||
1422 | /* Suck in entire file: we'll want most of it. */ | ||
1423 | /* vmalloc barfs on "unusual" numbers. Check here */ | ||
1424 | if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) | ||
1425 | return ERR_PTR(-ENOMEM); | ||
1426 | if (copy_from_user(hdr, umod, len) != 0) { | ||
1427 | err = -EFAULT; | ||
1428 | goto free_hdr; | ||
1429 | } | ||
1430 | |||
1431 | /* Sanity checks against insmoding binaries or wrong arch, | ||
1432 | weird elf version */ | ||
1433 | if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 | ||
1434 | || hdr->e_type != ET_REL | ||
1435 | || !elf_check_arch(hdr) | ||
1436 | || hdr->e_shentsize != sizeof(*sechdrs)) { | ||
1437 | err = -ENOEXEC; | ||
1438 | goto free_hdr; | ||
1439 | } | ||
1440 | |||
1441 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) | ||
1442 | goto truncated; | ||
1443 | |||
1444 | /* Convenience variables */ | ||
1445 | sechdrs = (void *)hdr + hdr->e_shoff; | ||
1446 | secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | ||
1447 | sechdrs[0].sh_addr = 0; | ||
1448 | |||
1449 | for (i = 1; i < hdr->e_shnum; i++) { | ||
1450 | if (sechdrs[i].sh_type != SHT_NOBITS | ||
1451 | && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) | ||
1452 | goto truncated; | ||
1453 | |||
1454 | /* Mark all sections sh_addr with their address in the | ||
1455 | temporary image. */ | ||
1456 | sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; | ||
1457 | |||
1458 | /* Internal symbols and strings. */ | ||
1459 | if (sechdrs[i].sh_type == SHT_SYMTAB) { | ||
1460 | symindex = i; | ||
1461 | strindex = sechdrs[i].sh_link; | ||
1462 | strtab = (char *)hdr + sechdrs[strindex].sh_offset; | ||
1463 | } | ||
1464 | #ifndef CONFIG_MODULE_UNLOAD | ||
1465 | /* Don't load .exit sections */ | ||
1466 | if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) | ||
1467 | sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
1468 | #endif | ||
1469 | } | ||
1470 | |||
1471 | modindex = find_sec(hdr, sechdrs, secstrings, | ||
1472 | ".gnu.linkonce.this_module"); | ||
1473 | if (!modindex) { | ||
1474 | printk(KERN_WARNING "No module found in object\n"); | ||
1475 | err = -ENOEXEC; | ||
1476 | goto free_hdr; | ||
1477 | } | ||
1478 | mod = (void *)sechdrs[modindex].sh_addr; | ||
1479 | |||
1480 | if (symindex == 0) { | ||
1481 | printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", | ||
1482 | mod->name); | ||
1483 | err = -ENOEXEC; | ||
1484 | goto free_hdr; | ||
1485 | } | ||
1486 | |||
1487 | /* Optional sections */ | ||
1488 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); | ||
1489 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); | ||
1490 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); | ||
1491 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); | ||
1492 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); | ||
1493 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); | ||
1494 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); | ||
1495 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); | ||
1496 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); | ||
1497 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); | ||
1498 | |||
1499 | /* Don't keep modinfo section */ | ||
1500 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
1501 | #ifdef CONFIG_KALLSYMS | ||
1502 | /* Keep symbol and string tables for decoding later. */ | ||
1503 | sechdrs[symindex].sh_flags |= SHF_ALLOC; | ||
1504 | sechdrs[strindex].sh_flags |= SHF_ALLOC; | ||
1505 | #endif | ||
1506 | |||
1507 | /* Check module struct version now, before we try to use module. */ | ||
1508 | if (!check_modstruct_version(sechdrs, versindex, mod)) { | ||
1509 | err = -ENOEXEC; | ||
1510 | goto free_hdr; | ||
1511 | } | ||
1512 | |||
1513 | modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); | ||
1514 | /* This is allowed: modprobe --force will invalidate it. */ | ||
1515 | if (!modmagic) { | ||
1516 | tainted |= TAINT_FORCED_MODULE; | ||
1517 | printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", | ||
1518 | mod->name); | ||
1519 | } else if (!same_magic(modmagic, vermagic)) { | ||
1520 | printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", | ||
1521 | mod->name, modmagic, vermagic); | ||
1522 | err = -ENOEXEC; | ||
1523 | goto free_hdr; | ||
1524 | } | ||
1525 | |||
1526 | /* Now copy in args */ | ||
1527 | arglen = strlen_user(uargs); | ||
1528 | if (!arglen) { | ||
1529 | err = -EFAULT; | ||
1530 | goto free_hdr; | ||
1531 | } | ||
1532 | args = kmalloc(arglen, GFP_KERNEL); | ||
1533 | if (!args) { | ||
1534 | err = -ENOMEM; | ||
1535 | goto free_hdr; | ||
1536 | } | ||
1537 | if (copy_from_user(args, uargs, arglen) != 0) { | ||
1538 | err = -EFAULT; | ||
1539 | goto free_mod; | ||
1540 | } | ||
1541 | |||
1542 | if (find_module(mod->name)) { | ||
1543 | err = -EEXIST; | ||
1544 | goto free_mod; | ||
1545 | } | ||
1546 | |||
1547 | mod->state = MODULE_STATE_COMING; | ||
1548 | |||
1549 | /* Allow arches to frob section contents and sizes. */ | ||
1550 | err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); | ||
1551 | if (err < 0) | ||
1552 | goto free_mod; | ||
1553 | |||
1554 | if (pcpuindex) { | ||
1555 | /* We have a special allocation for this section. */ | ||
1556 | percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, | ||
1557 | sechdrs[pcpuindex].sh_addralign); | ||
1558 | if (!percpu) { | ||
1559 | err = -ENOMEM; | ||
1560 | goto free_mod; | ||
1561 | } | ||
1562 | sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
1563 | mod->percpu = percpu; | ||
1564 | } | ||
1565 | |||
1566 | /* Determine total sizes, and put offsets in sh_entsize. For now | ||
1567 | this is done generically; there don't appear to be any | ||
1568 | special cases for the architectures. */ | ||
1569 | layout_sections(mod, hdr, sechdrs, secstrings); | ||
1570 | |||
1571 | /* Do the allocs. */ | ||
1572 | ptr = module_alloc(mod->core_size); | ||
1573 | if (!ptr) { | ||
1574 | err = -ENOMEM; | ||
1575 | goto free_percpu; | ||
1576 | } | ||
1577 | memset(ptr, 0, mod->core_size); | ||
1578 | mod->module_core = ptr; | ||
1579 | |||
1580 | ptr = module_alloc(mod->init_size); | ||
1581 | if (!ptr && mod->init_size) { | ||
1582 | err = -ENOMEM; | ||
1583 | goto free_core; | ||
1584 | } | ||
1585 | memset(ptr, 0, mod->init_size); | ||
1586 | mod->module_init = ptr; | ||
1587 | |||
1588 | /* Transfer each section which specifies SHF_ALLOC */ | ||
1589 | DEBUGP("final section addresses:\n"); | ||
1590 | for (i = 0; i < hdr->e_shnum; i++) { | ||
1591 | void *dest; | ||
1592 | |||
1593 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
1594 | continue; | ||
1595 | |||
1596 | if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) | ||
1597 | dest = mod->module_init | ||
1598 | + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); | ||
1599 | else | ||
1600 | dest = mod->module_core + sechdrs[i].sh_entsize; | ||
1601 | |||
1602 | if (sechdrs[i].sh_type != SHT_NOBITS) | ||
1603 | memcpy(dest, (void *)sechdrs[i].sh_addr, | ||
1604 | sechdrs[i].sh_size); | ||
1605 | /* Update sh_addr to point to copy in image. */ | ||
1606 | sechdrs[i].sh_addr = (unsigned long)dest; | ||
1607 | DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); | ||
1608 | } | ||
1609 | /* Module has been moved. */ | ||
1610 | mod = (void *)sechdrs[modindex].sh_addr; | ||
1611 | |||
1612 | /* Now we've moved module, initialize linked lists, etc. */ | ||
1613 | module_unload_init(mod); | ||
1614 | |||
1615 | /* Set up license info based on the info section */ | ||
1616 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | ||
1617 | |||
1618 | /* Fix up syms, so that st_value is a pointer to location. */ | ||
1619 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, | ||
1620 | mod); | ||
1621 | if (err < 0) | ||
1622 | goto cleanup; | ||
1623 | |||
1624 | /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ | ||
1625 | mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); | ||
1626 | mod->syms = (void *)sechdrs[exportindex].sh_addr; | ||
1627 | if (crcindex) | ||
1628 | mod->crcs = (void *)sechdrs[crcindex].sh_addr; | ||
1629 | mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); | ||
1630 | mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; | ||
1631 | if (gplcrcindex) | ||
1632 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; | ||
1633 | |||
1634 | #ifdef CONFIG_MODVERSIONS | ||
1635 | if ((mod->num_syms && !crcindex) || | ||
1636 | (mod->num_gpl_syms && !gplcrcindex)) { | ||
1637 | printk(KERN_WARNING "%s: No versions for exported symbols." | ||
1638 | " Tainting kernel.\n", mod->name); | ||
1639 | tainted |= TAINT_FORCED_MODULE; | ||
1640 | } | ||
1641 | #endif | ||
1642 | |||
1643 | /* Now do relocations. */ | ||
1644 | for (i = 1; i < hdr->e_shnum; i++) { | ||
1645 | const char *strtab = (char *)sechdrs[strindex].sh_addr; | ||
1646 | unsigned int info = sechdrs[i].sh_info; | ||
1647 | |||
1648 | /* Not a valid relocation section? */ | ||
1649 | if (info >= hdr->e_shnum) | ||
1650 | continue; | ||
1651 | |||
1652 | /* Don't bother with non-allocated sections */ | ||
1653 | if (!(sechdrs[info].sh_flags & SHF_ALLOC)) | ||
1654 | continue; | ||
1655 | |||
1656 | if (sechdrs[i].sh_type == SHT_REL) | ||
1657 | err = apply_relocate(sechdrs, strtab, symindex, i,mod); | ||
1658 | else if (sechdrs[i].sh_type == SHT_RELA) | ||
1659 | err = apply_relocate_add(sechdrs, strtab, symindex, i, | ||
1660 | mod); | ||
1661 | if (err < 0) | ||
1662 | goto cleanup; | ||
1663 | } | ||
1664 | |||
1665 | /* Set up and sort exception table */ | ||
1666 | mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); | ||
1667 | mod->extable = extable = (void *)sechdrs[exindex].sh_addr; | ||
1668 | sort_extable(extable, extable + mod->num_exentries); | ||
1669 | |||
1670 | /* Finally, copy percpu area over. */ | ||
1671 | percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, | ||
1672 | sechdrs[pcpuindex].sh_size); | ||
1673 | |||
1674 | add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); | ||
1675 | |||
1676 | err = module_finalize(hdr, sechdrs, mod); | ||
1677 | if (err < 0) | ||
1678 | goto cleanup; | ||
1679 | |||
1680 | mod->args = args; | ||
1681 | if (obsparmindex) { | ||
1682 | err = obsolete_params(mod->name, mod->args, | ||
1683 | (struct obsolete_modparm *) | ||
1684 | sechdrs[obsparmindex].sh_addr, | ||
1685 | sechdrs[obsparmindex].sh_size | ||
1686 | / sizeof(struct obsolete_modparm), | ||
1687 | sechdrs, symindex, | ||
1688 | (char *)sechdrs[strindex].sh_addr); | ||
1689 | if (setupindex) | ||
1690 | printk(KERN_WARNING "%s: Ignoring new-style " | ||
1691 | "parameters in presence of obsolete ones\n", | ||
1692 | mod->name); | ||
1693 | } else { | ||
1694 | /* Size of section 0 is 0, so this works well if no params */ | ||
1695 | err = parse_args(mod->name, mod->args, | ||
1696 | (struct kernel_param *) | ||
1697 | sechdrs[setupindex].sh_addr, | ||
1698 | sechdrs[setupindex].sh_size | ||
1699 | / sizeof(struct kernel_param), | ||
1700 | NULL); | ||
1701 | } | ||
1702 | if (err < 0) | ||
1703 | goto arch_cleanup; | ||
1704 | |||
1705 | err = mod_sysfs_setup(mod, | ||
1706 | (struct kernel_param *) | ||
1707 | sechdrs[setupindex].sh_addr, | ||
1708 | sechdrs[setupindex].sh_size | ||
1709 | / sizeof(struct kernel_param)); | ||
1710 | if (err < 0) | ||
1711 | goto arch_cleanup; | ||
1712 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | ||
1713 | |||
1714 | /* Get rid of temporary copy */ | ||
1715 | vfree(hdr); | ||
1716 | |||
1717 | /* Done! */ | ||
1718 | return mod; | ||
1719 | |||
1720 | arch_cleanup: | ||
1721 | module_arch_cleanup(mod); | ||
1722 | cleanup: | ||
1723 | module_unload_free(mod); | ||
1724 | module_free(mod, mod->module_init); | ||
1725 | free_core: | ||
1726 | module_free(mod, mod->module_core); | ||
1727 | free_percpu: | ||
1728 | if (percpu) | ||
1729 | percpu_modfree(percpu); | ||
1730 | free_mod: | ||
1731 | kfree(args); | ||
1732 | free_hdr: | ||
1733 | vfree(hdr); | ||
1734 | if (err < 0) return ERR_PTR(err); | ||
1735 | else return ptr; | ||
1736 | |||
1737 | truncated: | ||
1738 | printk(KERN_ERR "Module len %lu truncated\n", len); | ||
1739 | err = -ENOEXEC; | ||
1740 | goto free_hdr; | ||
1741 | } | ||
1742 | |||
1743 | /* | ||
1744 | * link the module while the whole machine is stopped with interrupts off | ||
1745 | * - this defends against kallsyms not taking locks | ||
1746 | */ | ||
1747 | static int __link_module(void *_mod) | ||
1748 | { | ||
1749 | struct module *mod = _mod; | ||
1750 | list_add(&mod->list, &modules); | ||
1751 | return 0; | ||
1752 | } | ||
1753 | |||
1754 | /* This is where the real work happens */ | ||
1755 | asmlinkage long | ||
1756 | sys_init_module(void __user *umod, | ||
1757 | unsigned long len, | ||
1758 | const char __user *uargs) | ||
1759 | { | ||
1760 | struct module *mod; | ||
1761 | int ret = 0; | ||
1762 | |||
1763 | /* Must have permission */ | ||
1764 | if (!capable(CAP_SYS_MODULE)) | ||
1765 | return -EPERM; | ||
1766 | |||
1767 | /* Only one module load at a time, please */ | ||
1768 | if (down_interruptible(&module_mutex) != 0) | ||
1769 | return -EINTR; | ||
1770 | |||
1771 | /* Do all the hard work */ | ||
1772 | mod = load_module(umod, len, uargs); | ||
1773 | if (IS_ERR(mod)) { | ||
1774 | up(&module_mutex); | ||
1775 | return PTR_ERR(mod); | ||
1776 | } | ||
1777 | |||
1778 | /* Flush the instruction cache, since we've played with text */ | ||
1779 | if (mod->module_init) | ||
1780 | flush_icache_range((unsigned long)mod->module_init, | ||
1781 | (unsigned long)mod->module_init | ||
1782 | + mod->init_size); | ||
1783 | flush_icache_range((unsigned long)mod->module_core, | ||
1784 | (unsigned long)mod->module_core + mod->core_size); | ||
1785 | |||
1786 | /* Now sew it into the lists. They won't access us, since | ||
1787 | strong_try_module_get() will fail. */ | ||
1788 | stop_machine_run(__link_module, mod, NR_CPUS); | ||
1789 | |||
1790 | /* Drop lock so they can recurse */ | ||
1791 | up(&module_mutex); | ||
1792 | |||
1793 | down(&notify_mutex); | ||
1794 | notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); | ||
1795 | up(&notify_mutex); | ||
1796 | |||
1797 | /* Start the module */ | ||
1798 | if (mod->init != NULL) | ||
1799 | ret = mod->init(); | ||
1800 | if (ret < 0) { | ||
1801 | /* Init routine failed: abort. Try to protect us from | ||
1802 | buggy refcounters. */ | ||
1803 | mod->state = MODULE_STATE_GOING; | ||
1804 | synchronize_kernel(); | ||
1805 | if (mod->unsafe) | ||
1806 | printk(KERN_ERR "%s: module is now stuck!\n", | ||
1807 | mod->name); | ||
1808 | else { | ||
1809 | module_put(mod); | ||
1810 | down(&module_mutex); | ||
1811 | free_module(mod); | ||
1812 | up(&module_mutex); | ||
1813 | } | ||
1814 | return ret; | ||
1815 | } | ||
1816 | |||
1817 | /* Now it's a first class citizen! */ | ||
1818 | down(&module_mutex); | ||
1819 | mod->state = MODULE_STATE_LIVE; | ||
1820 | /* Drop initial reference. */ | ||
1821 | module_put(mod); | ||
1822 | module_free(mod, mod->module_init); | ||
1823 | mod->module_init = NULL; | ||
1824 | mod->init_size = 0; | ||
1825 | mod->init_text_size = 0; | ||
1826 | up(&module_mutex); | ||
1827 | |||
1828 | return 0; | ||
1829 | } | ||
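For reference, user space reaches sys_init_module() by reading the whole module image into memory and passing it to the init_module system call, which is essentially what insmod does. A minimal sketch (the file name and parameter string are placeholders; loading requires CAP_SYS_MODULE):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "hello.ko";   /* placeholder */
        const char *args = argc > 2 ? argv[2] : "";           /* "param=value ..." */
        struct stat st;
        int fd = open(path, O_RDONLY);

        if (fd < 0 || fstat(fd, &st) < 0) { perror(path); return 1; }

        void *image = malloc(st.st_size);
        if (!image || read(fd, image, st.st_size) != st.st_size) {
                perror("read"); return 1;
        }

        /* Mirrors sys_init_module(umod, len, uargs). */
        if (syscall(SYS_init_module, image, (unsigned long)st.st_size, args) != 0) {
                perror("init_module"); return 1;
        }
        puts("module loaded");
        return 0;
}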
1830 | |||
1831 | static inline int within(unsigned long addr, void *start, unsigned long size) | ||
1832 | { | ||
1833 | return ((void *)addr >= start && (void *)addr < start + size); | ||
1834 | } | ||
1835 | |||
1836 | #ifdef CONFIG_KALLSYMS | ||
1837 | /* | ||
1838 | * This ignores the intensely annoying "mapping symbols" found | ||
1839 | * in ARM ELF files: $a, $t and $d. | ||
1840 | */ | ||
1841 | static inline int is_arm_mapping_symbol(const char *str) | ||
1842 | { | ||
1843 | return str[0] == '$' && strchr("atd", str[1]) | ||
1844 | && (str[2] == '\0' || str[2] == '.'); | ||
1845 | } | ||
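ARM toolchains emit $a, $t and $d markers (optionally followed by ".<n>") into the symbol table to flag ARM code, Thumb code and literal data; they carry no useful name for backtraces. A quick sketch of which strings the helper above filters out:

#include <stdio.h>
#include <string.h>

static int is_arm_mapping_symbol(const char *str)
{
        return str[0] == '$' && strchr("atd", str[1])
               && (str[2] == '\0' || str[2] == '.');
}

int main(void)
{
        printf("%d %d %d %d\n",
               is_arm_mapping_symbol("$a"),     /* 1: ARM code marker   */
               is_arm_mapping_symbol("$d.42"),  /* 1: data marker       */
               is_arm_mapping_symbol("$t"),     /* 1: Thumb code marker */
               is_arm_mapping_symbol("$sym"));  /* 0: ordinary symbol   */
        return 0;
}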
1846 | |||
1847 | static const char *get_ksymbol(struct module *mod, | ||
1848 | unsigned long addr, | ||
1849 | unsigned long *size, | ||
1850 | unsigned long *offset) | ||
1851 | { | ||
1852 | unsigned int i, best = 0; | ||
1853 | unsigned long nextval; | ||
1854 | |||
1855 | /* At worst, next value is at end of module */ | ||
1856 | if (within(addr, mod->module_init, mod->init_size)) | ||
1857 | nextval = (unsigned long)mod->module_init+mod->init_text_size; | ||
1858 | else | ||
1859 | nextval = (unsigned long)mod->module_core+mod->core_text_size; | ||
1860 | |||
1861 | /* Scan for closest preceding symbol, and next symbol. (ELF | ||
1862 | starts real symbols at 1). */ | ||
1863 | for (i = 1; i < mod->num_symtab; i++) { | ||
1864 | if (mod->symtab[i].st_shndx == SHN_UNDEF) | ||
1865 | continue; | ||
1866 | |||
1867 | /* We ignore unnamed symbols: they're uninformative | ||
1868 | * and inserted at a whim. */ | ||
1869 | if (mod->symtab[i].st_value <= addr | ||
1870 | && mod->symtab[i].st_value > mod->symtab[best].st_value | ||
1871 | && *(mod->strtab + mod->symtab[i].st_name) != '\0' | ||
1872 | && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) | ||
1873 | best = i; | ||
1874 | if (mod->symtab[i].st_value > addr | ||
1875 | && mod->symtab[i].st_value < nextval | ||
1876 | && *(mod->strtab + mod->symtab[i].st_name) != '\0' | ||
1877 | && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) | ||
1878 | nextval = mod->symtab[i].st_value; | ||
1879 | } | ||
1880 | |||
1881 | if (!best) | ||
1882 | return NULL; | ||
1883 | |||
1884 | *size = nextval - mod->symtab[best].st_value; | ||
1885 | *offset = addr - mod->symtab[best].st_value; | ||
1886 | return mod->strtab + mod->symtab[best].st_name; | ||
1887 | } | ||
1888 | |||
1889 | /* For kallsyms to ask for address resolution. NULL means not found. | ||
1890 | We don't lock, as this is used for oops resolution and races are a | ||
1891 | lesser concern. */ | ||
1892 | const char *module_address_lookup(unsigned long addr, | ||
1893 | unsigned long *size, | ||
1894 | unsigned long *offset, | ||
1895 | char **modname) | ||
1896 | { | ||
1897 | struct module *mod; | ||
1898 | |||
1899 | list_for_each_entry(mod, &modules, list) { | ||
1900 | if (within(addr, mod->module_init, mod->init_size) | ||
1901 | || within(addr, mod->module_core, mod->core_size)) { | ||
1902 | *modname = mod->name; | ||
1903 | return get_ksymbol(mod, addr, size, offset); | ||
1904 | } | ||
1905 | } | ||
1906 | return NULL; | ||
1907 | } | ||
1908 | |||
1909 | struct module *module_get_kallsym(unsigned int symnum, | ||
1910 | unsigned long *value, | ||
1911 | char *type, | ||
1912 | char namebuf[128]) | ||
1913 | { | ||
1914 | struct module *mod; | ||
1915 | |||
1916 | down(&module_mutex); | ||
1917 | list_for_each_entry(mod, &modules, list) { | ||
1918 | if (symnum < mod->num_symtab) { | ||
1919 | *value = mod->symtab[symnum].st_value; | ||
1920 | *type = mod->symtab[symnum].st_info; | ||
1921 | strncpy(namebuf, | ||
1922 | mod->strtab + mod->symtab[symnum].st_name, | ||
1923 | 127); | ||
1924 | up(&module_mutex); | ||
1925 | return mod; | ||
1926 | } | ||
1927 | symnum -= mod->num_symtab; | ||
1928 | } | ||
1929 | up(&module_mutex); | ||
1930 | return NULL; | ||
1931 | } | ||
1932 | |||
1933 | static unsigned long mod_find_symname(struct module *mod, const char *name) | ||
1934 | { | ||
1935 | unsigned int i; | ||
1936 | |||
1937 | for (i = 0; i < mod->num_symtab; i++) | ||
1938 | if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0) | ||
1939 | return mod->symtab[i].st_value; | ||
1940 | return 0; | ||
1941 | } | ||
1942 | |||
1943 | /* Look for this name: can be of form module:name. */ | ||
1944 | unsigned long module_kallsyms_lookup_name(const char *name) | ||
1945 | { | ||
1946 | struct module *mod; | ||
1947 | char *colon; | ||
1948 | unsigned long ret = 0; | ||
1949 | |||
1950 | /* Don't lock: we're in enough trouble already. */ | ||
1951 | if ((colon = strchr(name, ':')) != NULL) { | ||
1952 | *colon = '\0'; | ||
1953 | if ((mod = find_module(name)) != NULL) | ||
1954 | ret = mod_find_symname(mod, colon+1); | ||
1955 | *colon = ':'; | ||
1956 | } else { | ||
1957 | list_for_each_entry(mod, &modules, list) | ||
1958 | if ((ret = mod_find_symname(mod, name)) != 0) | ||
1959 | break; | ||
1960 | } | ||
1961 | return ret; | ||
1962 | } | ||
1963 | #endif /* CONFIG_KALLSYMS */ | ||
1964 | |||
1965 | /* Called by the /proc file system to return a list of modules. */ | ||
1966 | static void *m_start(struct seq_file *m, loff_t *pos) | ||
1967 | { | ||
1968 | struct list_head *i; | ||
1969 | loff_t n = 0; | ||
1970 | |||
1971 | down(&module_mutex); | ||
1972 | list_for_each(i, &modules) { | ||
1973 | if (n++ == *pos) | ||
1974 | break; | ||
1975 | } | ||
1976 | if (i == &modules) | ||
1977 | return NULL; | ||
1978 | return i; | ||
1979 | } | ||
1980 | |||
1981 | static void *m_next(struct seq_file *m, void *p, loff_t *pos) | ||
1982 | { | ||
1983 | struct list_head *i = p; | ||
1984 | (*pos)++; | ||
1985 | if (i->next == &modules) | ||
1986 | return NULL; | ||
1987 | return i->next; | ||
1988 | } | ||
1989 | |||
1990 | static void m_stop(struct seq_file *m, void *p) | ||
1991 | { | ||
1992 | up(&module_mutex); | ||
1993 | } | ||
1994 | |||
1995 | static int m_show(struct seq_file *m, void *p) | ||
1996 | { | ||
1997 | struct module *mod = list_entry(p, struct module, list); | ||
1998 | seq_printf(m, "%s %lu", | ||
1999 | mod->name, mod->init_size + mod->core_size); | ||
2000 | print_unload_info(m, mod); | ||
2001 | |||
2002 | /* Informative for users. */ | ||
2003 | seq_printf(m, " %s", | ||
2004 | mod->state == MODULE_STATE_GOING ? "Unloading": | ||
2005 | mod->state == MODULE_STATE_COMING ? "Loading": | ||
2006 | "Live"); | ||
2007 | /* Used by oprofile and other similar tools. */ | ||
2008 | seq_printf(m, " 0x%p", mod->module_core); | ||
2009 | |||
2010 | seq_printf(m, "\n"); | ||
2011 | return 0; | ||
2012 | } | ||
2013 | |||
2014 | /* Format: modulename size refcount deps address | ||
2015 | |||
2016 | Where refcount is a number or -, and deps is a comma-separated list | ||
2017 | of depends or -. | ||
2018 | */ | ||
2019 | struct seq_operations modules_op = { | ||
2020 | .start = m_start, | ||
2021 | .next = m_next, | ||
2022 | .stop = m_stop, | ||
2023 | .show = m_show | ||
2024 | }; | ||
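Putting m_show() and print_unload_info() together, a /proc/modules line reads, for example (module names, sizes and addresses here are invented for illustration):

    usbcore 104276 3 usbhid,ohci_hcd, Live 0xe0875000
    psmouse 17960 0 - Live 0xe0850000

The trailing comma after each dependency is what lets userspace tell this single comma-separated field apart from the old multi-field format.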
2025 | |||
2026 | /* Given an address, look for it in the module exception tables. */ | ||
2027 | const struct exception_table_entry *search_module_extables(unsigned long addr) | ||
2028 | { | ||
2029 | unsigned long flags; | ||
2030 | const struct exception_table_entry *e = NULL; | ||
2031 | struct module *mod; | ||
2032 | |||
2033 | spin_lock_irqsave(&modlist_lock, flags); | ||
2034 | list_for_each_entry(mod, &modules, list) { | ||
2035 | if (mod->num_exentries == 0) | ||
2036 | continue; | ||
2037 | |||
2038 | e = search_extable(mod->extable, | ||
2039 | mod->extable + mod->num_exentries - 1, | ||
2040 | addr); | ||
2041 | if (e) | ||
2042 | break; | ||
2043 | } | ||
2044 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
2045 | |||
2046 | /* Now, if we found one, we are running inside it now, hence | ||
2047 | we cannot unload the module, hence no refcnt needed. */ | ||
2048 | return e; | ||
2049 | } | ||
2050 | |||
2051 | /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ | ||
2052 | struct module *__module_text_address(unsigned long addr) | ||
2053 | { | ||
2054 | struct module *mod; | ||
2055 | |||
2056 | list_for_each_entry(mod, &modules, list) | ||
2057 | if (within(addr, mod->module_init, mod->init_text_size) | ||
2058 | || within(addr, mod->module_core, mod->core_text_size)) | ||
2059 | return mod; | ||
2060 | return NULL; | ||
2061 | } | ||
2062 | |||
2063 | struct module *module_text_address(unsigned long addr) | ||
2064 | { | ||
2065 | struct module *mod; | ||
2066 | unsigned long flags; | ||
2067 | |||
2068 | spin_lock_irqsave(&modlist_lock, flags); | ||
2069 | mod = __module_text_address(addr); | ||
2070 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
2071 | |||
2072 | return mod; | ||
2073 | } | ||
2074 | |||
2075 | /* Don't grab lock, we're oopsing. */ | ||
2076 | void print_modules(void) | ||
2077 | { | ||
2078 | struct module *mod; | ||
2079 | |||
2080 | printk("Modules linked in:"); | ||
2081 | list_for_each_entry(mod, &modules, list) | ||
2082 | printk(" %s", mod->name); | ||
2083 | printk("\n"); | ||
2084 | } | ||
2085 | |||
2086 | void module_add_driver(struct module *mod, struct device_driver *drv) | ||
2087 | { | ||
2088 | if (!mod || !drv) | ||
2089 | return; | ||
2090 | |||
2091 | /* Don't check return code; this call is idempotent */ | ||
2092 | sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); | ||
2093 | } | ||
2094 | EXPORT_SYMBOL(module_add_driver); | ||
2095 | |||
2096 | void module_remove_driver(struct device_driver *drv) | ||
2097 | { | ||
2098 | if (!drv) | ||
2099 | return; | ||
2100 | sysfs_remove_link(&drv->kobj, "module"); | ||
2101 | } | ||
2102 | EXPORT_SYMBOL(module_remove_driver); | ||
2103 | |||
2104 | #ifdef CONFIG_MODVERSIONS | ||
2105 | /* Generate the signature for struct module here, too, for modversions. */ | ||
2106 | void struct_module(struct module *mod) { return; } | ||
2107 | EXPORT_SYMBOL(struct_module); | ||
2108 | #endif | ||
diff --git a/kernel/panic.c b/kernel/panic.c new file mode 100644 index 000000000000..0fa3f3a66fb6 --- /dev/null +++ b/kernel/panic.c | |||
@@ -0,0 +1,157 @@ | |||
1 | /* | ||
2 | * linux/kernel/panic.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * This function is used throughout the kernel (including mm and fs) | ||
9 | * to indicate a major problem. | ||
10 | */ | ||
11 | #include <linux/config.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/delay.h> | ||
15 | #include <linux/reboot.h> | ||
16 | #include <linux/notifier.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/sysrq.h> | ||
19 | #include <linux/interrupt.h> | ||
20 | #include <linux/nmi.h> | ||
21 | |||
22 | int panic_timeout; | ||
23 | int panic_on_oops; | ||
24 | int tainted; | ||
25 | |||
26 | EXPORT_SYMBOL(panic_timeout); | ||
27 | |||
28 | struct notifier_block *panic_notifier_list; | ||
29 | |||
30 | EXPORT_SYMBOL(panic_notifier_list); | ||
31 | |||
32 | static int __init panic_setup(char *str) | ||
33 | { | ||
34 | panic_timeout = simple_strtoul(str, NULL, 0); | ||
35 | return 1; | ||
36 | } | ||
37 | __setup("panic=", panic_setup); | ||
38 | |||
39 | static long no_blink(long time) | ||
40 | { | ||
41 | return 0; | ||
42 | } | ||
43 | |||
44 | /* Returns how long it waited in ms */ | ||
45 | long (*panic_blink)(long time); | ||
46 | EXPORT_SYMBOL(panic_blink); | ||
47 | |||
48 | /** | ||
49 | * panic - halt the system | ||
50 | * @fmt: The text string to print | ||
51 | * | ||
52 | * Display a message, then perform cleanups. | ||
53 | * | ||
54 | * This function never returns. | ||
55 | */ | ||
56 | |||
57 | NORET_TYPE void panic(const char * fmt, ...) | ||
58 | { | ||
59 | long i; | ||
60 | static char buf[1024]; | ||
61 | va_list args; | ||
62 | #if defined(CONFIG_ARCH_S390) | ||
63 | unsigned long caller = (unsigned long) __builtin_return_address(0); | ||
64 | #endif | ||
65 | |||
66 | bust_spinlocks(1); | ||
67 | va_start(args, fmt); | ||
68 | vsnprintf(buf, sizeof(buf), fmt, args); | ||
69 | va_end(args); | ||
70 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); | ||
71 | bust_spinlocks(0); | ||
72 | |||
73 | #ifdef CONFIG_SMP | ||
74 | smp_send_stop(); | ||
75 | #endif | ||
76 | |||
77 | notifier_call_chain(&panic_notifier_list, 0, buf); | ||
78 | |||
79 | if (!panic_blink) | ||
80 | panic_blink = no_blink; | ||
81 | |||
82 | if (panic_timeout > 0) | ||
83 | { | ||
84 | /* | ||
85 | * Delay timeout seconds before rebooting the machine. | ||
86 | * We can't use the "normal" timers since we just panicked.. | ||
87 | */ | ||
88 | printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); | ||
89 | for (i = 0; i < panic_timeout*1000; ) { | ||
90 | touch_nmi_watchdog(); | ||
91 | i += panic_blink(i); | ||
92 | mdelay(1); | ||
93 | i++; | ||
94 | } | ||
95 | /* | ||
96 | * Should we run the reboot notifier? For the moment I'm | ||
97 | * choosing not to. It might crash, be corrupt or do | ||
98 | * more harm than good for other reasons. | ||
99 | */ | ||
100 | machine_restart(NULL); | ||
101 | } | ||
102 | #ifdef __sparc__ | ||
103 | { | ||
104 | extern int stop_a_enabled; | ||
105 | /* Make sure the user can actually press L1-A */ | ||
106 | stop_a_enabled = 1; | ||
107 | printk(KERN_EMERG "Press L1-A to return to the boot prom\n"); | ||
108 | } | ||
109 | #endif | ||
110 | #if defined(CONFIG_ARCH_S390) | ||
111 | disabled_wait(caller); | ||
112 | #endif | ||
113 | local_irq_enable(); | ||
114 | for (i = 0;;) { | ||
115 | i += panic_blink(i); | ||
116 | mdelay(1); | ||
117 | i++; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | EXPORT_SYMBOL(panic); | ||
122 | |||
123 | /** | ||
124 | * print_tainted - return a string to represent the kernel taint state. | ||
125 | * | ||
126 | * 'P' - Proprietary module has been loaded. | ||
127 | * 'F' - Module has been forcibly loaded. | ||
128 | * 'S' - SMP with CPUs not designed for SMP. | ||
129 | * 'R' - User forced a module unload. | ||
130 | * 'M' - Machine had a machine check experience. | ||
131 | * 'B' - System has hit bad_page. | ||
132 | * | ||
133 | * The string is overwritten by the next call to print_tainted(). | ||
134 | */ | ||
135 | |||
136 | const char *print_tainted(void) | ||
137 | { | ||
138 | static char buf[20]; | ||
139 | if (tainted) { | ||
140 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", | ||
141 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', | ||
142 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', | ||
143 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', | ||
144 | tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', | ||
145 | tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', | ||
146 | tainted & TAINT_BAD_PAGE ? 'B' : ' '); | ||
147 | } | ||
148 | else | ||
149 | snprintf(buf, sizeof(buf), "Not tainted"); | ||
150 | return(buf); | ||
151 | } | ||
152 | |||
153 | void add_taint(unsigned flag) | ||
154 | { | ||
155 | tainted |= flag; | ||
156 | } | ||
157 | EXPORT_SYMBOL(add_taint); | ||
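The same taint bits that print_tainted() formats are exported to user space through /proc/sys/kernel/tainted. A sketch that decodes them with the letter scheme documented above (the bit positions are an assumption based on the TAINT_* flags this file uses: P=1, F=2, S=4, R=8, M=16, B=32):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/tainted", "r");
        unsigned int tainted = 0;

        if (!f || fscanf(f, "%u", &tainted) != 1) {
                perror("tainted");
                return 1;
        }
        fclose(f);

        printf("Tainted: %c%c%c%c%c%c\n",
               tainted & 1  ? 'P' : 'G',   /* proprietary module loaded */
               tainted & 2  ? 'F' : ' ',   /* module force-loaded       */
               tainted & 4  ? 'S' : ' ',   /* unsafe SMP                */
               tainted & 8  ? 'R' : ' ',   /* module force-unloaded     */
               tainted & 16 ? 'M' : ' ',   /* machine check             */
               tainted & 32 ? 'B' : ' ');  /* bad page                  */
        return 0;
}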
diff --git a/kernel/params.c b/kernel/params.c new file mode 100644 index 000000000000..5538608bd339 --- /dev/null +++ b/kernel/params.c | |||
@@ -0,0 +1,721 @@ | |||
1 | /* Helpers for initial module or kernel cmdline parsing | ||
2 | Copyright (C) 2001 Rusty Russell. | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #include <linux/config.h> | ||
19 | #include <linux/moduleparam.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/string.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/device.h> | ||
25 | #include <linux/err.h> | ||
26 | |||
27 | #if 0 | ||
28 | #define DEBUGP printk | ||
29 | #else | ||
30 | #define DEBUGP(fmt, a...) | ||
31 | #endif | ||
32 | |||
33 | static inline int dash2underscore(char c) | ||
34 | { | ||
35 | if (c == '-') | ||
36 | return '_'; | ||
37 | return c; | ||
38 | } | ||
39 | |||
40 | static inline int parameq(const char *input, const char *paramname) | ||
41 | { | ||
42 | unsigned int i; | ||
43 | for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) | ||
44 | if (input[i] == '\0') | ||
45 | return 1; | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | static int parse_one(char *param, | ||
50 | char *val, | ||
51 | struct kernel_param *params, | ||
52 | unsigned num_params, | ||
53 | int (*handle_unknown)(char *param, char *val)) | ||
54 | { | ||
55 | unsigned int i; | ||
56 | |||
57 | /* Find parameter */ | ||
58 | for (i = 0; i < num_params; i++) { | ||
59 | if (parameq(param, params[i].name)) { | ||
60 | DEBUGP("They are equal! Calling %p\n", | ||
61 | params[i].set); | ||
62 | return params[i].set(val, ¶ms[i]); | ||
63 | } | ||
64 | } | ||
65 | |||
66 | if (handle_unknown) { | ||
67 | DEBUGP("Unknown argument: calling %p\n", handle_unknown); | ||
68 | return handle_unknown(param, val); | ||
69 | } | ||
70 | |||
71 | DEBUGP("Unknown argument `%s'\n", param); | ||
72 | return -ENOENT; | ||
73 | } | ||
74 | |||
75 | /* You can use " around spaces, but can't escape ". */ | ||
76 | /* Hyphens and underscores equivalent in parameter names. */ | ||
77 | static char *next_arg(char *args, char **param, char **val) | ||
78 | { | ||
79 | unsigned int i, equals = 0; | ||
80 | int in_quote = 0, quoted = 0; | ||
81 | char *next; | ||
82 | |||
83 | /* Chew any extra spaces */ | ||
84 | while (*args == ' ') args++; | ||
85 | if (*args == '"') { | ||
86 | args++; | ||
87 | in_quote = 1; | ||
88 | quoted = 1; | ||
89 | } | ||
90 | |||
91 | for (i = 0; args[i]; i++) { | ||
92 | if (args[i] == ' ' && !in_quote) | ||
93 | break; | ||
94 | if (equals == 0) { | ||
95 | if (args[i] == '=') | ||
96 | equals = i; | ||
97 | } | ||
98 | if (args[i] == '"') | ||
99 | in_quote = !in_quote; | ||
100 | } | ||
101 | |||
102 | *param = args; | ||
103 | if (!equals) | ||
104 | *val = NULL; | ||
105 | else { | ||
106 | args[equals] = '\0'; | ||
107 | *val = args + equals + 1; | ||
108 | |||
109 | /* Don't include quotes in value. */ | ||
110 | if (**val == '"') { | ||
111 | (*val)++; | ||
112 | if (args[i-1] == '"') | ||
113 | args[i-1] = '\0'; | ||
114 | } | ||
115 | if (quoted && args[i-1] == '"') | ||
116 | args[i-1] = '\0'; | ||
117 | } | ||
118 | |||
119 | if (args[i]) { | ||
120 | args[i] = '\0'; | ||
121 | next = args + i + 1; | ||
122 | } else | ||
123 | next = args + i; | ||
124 | return next; | ||
125 | } | ||
126 | |||
127 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | ||
128 | int parse_args(const char *name, | ||
129 | char *args, | ||
130 | struct kernel_param *params, | ||
131 | unsigned num, | ||
132 | int (*unknown)(char *param, char *val)) | ||
133 | { | ||
134 | char *param, *val; | ||
135 | |||
136 | DEBUGP("Parsing ARGS: %s\n", args); | ||
137 | |||
138 | while (*args) { | ||
139 | int ret; | ||
140 | |||
141 | args = next_arg(args, ¶m, &val); | ||
142 | ret = parse_one(param, val, params, num, unknown); | ||
143 | switch (ret) { | ||
144 | case -ENOENT: | ||
145 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", | ||
146 | name, param); | ||
147 | return ret; | ||
148 | case -ENOSPC: | ||
149 | printk(KERN_ERR | ||
150 | "%s: `%s' too large for parameter `%s'\n", | ||
151 | name, val ?: "", param); | ||
152 | return ret; | ||
153 | case 0: | ||
154 | break; | ||
155 | default: | ||
156 | printk(KERN_ERR | ||
157 | "%s: `%s' invalid for parameter `%s'\n", | ||
158 | name, val ?: "", param); | ||
159 | return ret; | ||
160 | } | ||
161 | } | ||
162 | |||
163 | /* All parsed OK. */ | ||
164 | return 0; | ||
165 | } | ||
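As the comment above parse_args() notes, the argument string looks like "foo=bar,bar2 baz=fuz wiz": next_arg() peels off one space-separated token at a time and splits it at the first '=' (a bare token yields val == NULL), then parse_one() dispatches to the matching kernel_param setter or, failing that, to the unknown-parameter callback. A hedged usage sketch follows; only parse_args() and the __param section bounds are taken from this file, the other names are invented for illustration.

/*
 * Usage sketch (illustrative only).  Given "foo=bar,bar2 baz=fuz wiz",
 * next_arg() yields the pairs:
 *
 *	"foo" -> "bar,bar2"
 *	"baz" -> "fuz"
 *	"wiz" -> NULL		(no '=' means no value)
 *
 * Parameters not found in the table are handed to the callback.
 */
static int demo_unknown(char *param, char *val)
{
	printk("unknown option '%s'%s%s\n",
	       param, val ? "=" : "", val ? val : "");
	return 0;
}

static void demo_parse(void)
{
	/* parse_args() mangles its argument, so it must be writable */
	static char cmdline[] = "foo=bar,bar2 baz=fuz wiz";

	/* __start___param/__stop___param bound the built-in parameter table */
	parse_args("demo", cmdline, __start___param,
		   __stop___param - __start___param, demo_unknown);
}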
166 | |||
167 | /* Lazy bastard, eh? */ | ||
168 | #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ | ||
169 | int param_set_##name(const char *val, struct kernel_param *kp) \ | ||
170 | { \ | ||
171 | char *endp; \ | ||
172 | tmptype l; \ | ||
173 | \ | ||
174 | if (!val) return -EINVAL; \ | ||
175 | l = strtolfn(val, &endp, 0); \ | ||
176 | if (endp == val || ((type)l != l)) \ | ||
177 | return -EINVAL; \ | ||
178 | *((type *)kp->arg) = l; \ | ||
179 | return 0; \ | ||
180 | } \ | ||
181 | int param_get_##name(char *buffer, struct kernel_param *kp) \ | ||
182 | { \ | ||
183 | return sprintf(buffer, format, *((type *)kp->arg)); \ | ||
184 | } | ||
185 | |||
186 | STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul); | ||
187 | STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); | ||
188 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul); | ||
189 | STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); | ||
190 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul); | ||
191 | STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); | ||
192 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); | ||
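For readability, here is roughly what STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol) above expands to (whitespace added, no new behavior):

int param_set_int(const char *val, struct kernel_param *kp)
{
	char *endp;
	long l;

	if (!val)
		return -EINVAL;
	l = simple_strtol(val, &endp, 0);
	/* reject empty strings and values that don't fit in an int */
	if (endp == val || ((int)l != l))
		return -EINVAL;
	*((int *)kp->arg) = l;
	return 0;
}

int param_get_int(char *buffer, struct kernel_param *kp)
{
	return sprintf(buffer, "%i", *((int *)kp->arg));
}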
193 | |||
194 | int param_set_charp(const char *val, struct kernel_param *kp) | ||
195 | { | ||
196 | if (!val) { | ||
197 | printk(KERN_ERR "%s: string parameter expected\n", | ||
198 | kp->name); | ||
199 | return -EINVAL; | ||
200 | } | ||
201 | |||
202 | if (strlen(val) > 1024) { | ||
203 | printk(KERN_ERR "%s: string parameter too long\n", | ||
204 | kp->name); | ||
205 | return -ENOSPC; | ||
206 | } | ||
207 | |||
208 | *(char **)kp->arg = (char *)val; | ||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | int param_get_charp(char *buffer, struct kernel_param *kp) | ||
213 | { | ||
214 | return sprintf(buffer, "%s", *((char **)kp->arg)); | ||
215 | } | ||
216 | |||
217 | int param_set_bool(const char *val, struct kernel_param *kp) | ||
218 | { | ||
219 | /* No equals means "set"... */ | ||
220 | if (!val) val = "1"; | ||
221 | |||
222 | /* One of =[yYnN01] */ | ||
223 | switch (val[0]) { | ||
224 | case 'y': case 'Y': case '1': | ||
225 | *(int *)kp->arg = 1; | ||
226 | return 0; | ||
227 | case 'n': case 'N': case '0': | ||
228 | *(int *)kp->arg = 0; | ||
229 | return 0; | ||
230 | } | ||
231 | return -EINVAL; | ||
232 | } | ||
233 | |||
234 | int param_get_bool(char *buffer, struct kernel_param *kp) | ||
235 | { | ||
236 | /* Y and N chosen as being relatively non-coder friendly */ | ||
237 | return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); | ||
238 | } | ||
239 | |||
240 | int param_set_invbool(const char *val, struct kernel_param *kp) | ||
241 | { | ||
242 | int boolval, ret; | ||
243 | struct kernel_param dummy = { .arg = &boolval }; | ||
244 | |||
245 | ret = param_set_bool(val, &dummy); | ||
246 | if (ret == 0) | ||
247 | *(int *)kp->arg = !boolval; | ||
248 | return ret; | ||
249 | } | ||
250 | |||
251 | int param_get_invbool(char *buffer, struct kernel_param *kp) | ||
252 | { | ||
253 | int val; | ||
254 | struct kernel_param dummy = { .arg = &val }; | ||
255 | |||
256 | val = !*(int *)kp->arg; | ||
257 | return param_get_bool(buffer, &dummy); | ||
258 | } | ||
259 | |||
260 | /* We cheat here and temporarily mangle the string. */ | ||
261 | int param_array(const char *name, | ||
262 | const char *val, | ||
263 | unsigned int min, unsigned int max, | ||
264 | void *elem, int elemsize, | ||
265 | int (*set)(const char *, struct kernel_param *kp), | ||
266 | int *num) | ||
267 | { | ||
268 | int ret; | ||
269 | struct kernel_param kp; | ||
270 | char save; | ||
271 | |||
272 | /* Get the name right for errors. */ | ||
273 | kp.name = name; | ||
274 | kp.arg = elem; | ||
275 | |||
276 | /* No equals sign? */ | ||
277 | if (!val) { | ||
278 | printk(KERN_ERR "%s: expects arguments\n", name); | ||
279 | return -EINVAL; | ||
280 | } | ||
281 | |||
282 | *num = 0; | ||
283 | /* We expect a comma-separated list of values. */ | ||
284 | do { | ||
285 | int len; | ||
286 | |||
287 | if (*num == max) { | ||
288 | printk(KERN_ERR "%s: can only take %i arguments\n", | ||
289 | name, max); | ||
290 | return -EINVAL; | ||
291 | } | ||
292 | len = strcspn(val, ","); | ||
293 | |||
294 | /* nul-terminate and parse */ | ||
295 | save = val[len]; | ||
296 | ((char *)val)[len] = '\0'; | ||
297 | ret = set(val, &kp); | ||
298 | |||
299 | if (ret != 0) | ||
300 | return ret; | ||
301 | kp.arg += elemsize; | ||
302 | val += len+1; | ||
303 | (*num)++; | ||
304 | } while (save == ','); | ||
305 | |||
306 | if (*num < min) { | ||
307 | printk(KERN_ERR "%s: needs at least %i arguments\n", | ||
308 | name, min); | ||
309 | return -EINVAL; | ||
310 | } | ||
311 | return 0; | ||
312 | } | ||
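A standalone illustration of the splitting trick param_array() uses: each comma-separated element is nul-terminated in place, handed to the per-element setter, and the loop continues as long as the character that was overwritten was a ','. This is a userspace sketch of the technique, not kernel code.

#include <stdio.h>
#include <string.h>

static void split_demo(char *val)
{
	char save;

	do {
		int len = strcspn(val, ",");	/* length up to next ',' or end */

		save = val[len];		/* remember what we overwrite */
		val[len] = '\0';
		printf("element: '%s'\n", val);	/* stands in for set(val, &kp) */
		val += len + 1;
	} while (save == ',');
}

int main(void)
{
	char buf[] = "1,2,3";

	split_demo(buf);	/* prints '1', '2', '3' */
	return 0;
}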
313 | |||
314 | int param_array_set(const char *val, struct kernel_param *kp) | ||
315 | { | ||
316 | struct kparam_array *arr = kp->arg; | ||
317 | |||
318 | return param_array(kp->name, val, 1, arr->max, arr->elem, | ||
319 | arr->elemsize, arr->set, arr->num ?: &arr->max); | ||
320 | } | ||
321 | |||
322 | int param_array_get(char *buffer, struct kernel_param *kp) | ||
323 | { | ||
324 | int i, off, ret; | ||
325 | struct kparam_array *arr = kp->arg; | ||
326 | struct kernel_param p; | ||
327 | |||
328 | p = *kp; | ||
329 | for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { | ||
330 | if (i) | ||
331 | buffer[off++] = ','; | ||
332 | p.arg = arr->elem + arr->elemsize * i; | ||
333 | ret = arr->get(buffer + off, &p); | ||
334 | if (ret < 0) | ||
335 | return ret; | ||
336 | off += ret; | ||
337 | } | ||
338 | buffer[off] = '\0'; | ||
339 | return off; | ||
340 | } | ||
341 | |||
342 | int param_set_copystring(const char *val, struct kernel_param *kp) | ||
343 | { | ||
344 | struct kparam_string *kps = kp->arg; | ||
345 | |||
346 | if (strlen(val)+1 > kps->maxlen) { | ||
347 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | ||
348 | kp->name, kps->maxlen-1); | ||
349 | return -ENOSPC; | ||
350 | } | ||
351 | strcpy(kps->string, val); | ||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | int param_get_string(char *buffer, struct kernel_param *kp) | ||
356 | { | ||
357 | struct kparam_string *kps = kp->arg; | ||
358 | return strlcpy(buffer, kps->string, kps->maxlen); | ||
359 | } | ||
360 | |||
361 | /* sysfs output in /sys/modules/XYZ/parameters/ */ | ||
362 | |||
363 | extern struct kernel_param __start___param[], __stop___param[]; | ||
364 | |||
365 | #define MAX_KBUILD_MODNAME KOBJ_NAME_LEN | ||
366 | |||
367 | struct param_attribute | ||
368 | { | ||
369 | struct module_attribute mattr; | ||
370 | struct kernel_param *param; | ||
371 | }; | ||
372 | |||
373 | struct module_param_attrs | ||
374 | { | ||
375 | struct attribute_group grp; | ||
376 | struct param_attribute attrs[0]; | ||
377 | }; | ||
378 | |||
379 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr); | ||
380 | |||
381 | static ssize_t param_attr_show(struct module_attribute *mattr, | ||
382 | struct module *mod, char *buf) | ||
383 | { | ||
384 | int count; | ||
385 | struct param_attribute *attribute = to_param_attr(mattr); | ||
386 | |||
387 | if (!attribute->param->get) | ||
388 | return -EPERM; | ||
389 | |||
390 | count = attribute->param->get(buf, attribute->param); | ||
391 | if (count > 0) { | ||
392 | strcat(buf, "\n"); | ||
393 | ++count; | ||
394 | } | ||
395 | return count; | ||
396 | } | ||
397 | |||
398 | /* sysfs always hands a nul-terminated string in buf. We rely on that. */ | ||
399 | static ssize_t param_attr_store(struct module_attribute *mattr, | ||
400 | struct module *owner, | ||
401 | const char *buf, size_t len) | ||
402 | { | ||
403 | int err; | ||
404 | struct param_attribute *attribute = to_param_attr(mattr); | ||
405 | |||
406 | if (!attribute->param->set) | ||
407 | return -EPERM; | ||
408 | |||
409 | err = attribute->param->set(buf, attribute->param); | ||
410 | if (!err) | ||
411 | return len; | ||
412 | return err; | ||
413 | } | ||
414 | |||
415 | #ifdef CONFIG_MODULES | ||
416 | #define __modinit | ||
417 | #else | ||
418 | #define __modinit __init | ||
419 | #endif | ||
420 | |||
421 | /* | ||
422 | * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME | ||
423 | * @mk: struct module_kobject (contains parent kobject) | ||
424 | * @kparam: array of struct kernel_param, the actual parameter definitions | ||
425 | * @num_params: number of entries in array | ||
426 | * @name_skip: offset where the parameter name starts in kparam[].name. Needed for built-in "modules" | ||
427 | * | ||
428 | * Create a kobject for a (per-module) group of parameters, and create files | ||
429 | * in sysfs. A pointer to the param_kobject is returned on success, | ||
430 | * NULL if there is no parameter to export, or ERR_PTR(err) on failure. | ||
431 | */ | ||
432 | static __modinit struct module_param_attrs * | ||
433 | param_sysfs_setup(struct module_kobject *mk, | ||
434 | struct kernel_param *kparam, | ||
435 | unsigned int num_params, | ||
436 | unsigned int name_skip) | ||
437 | { | ||
438 | struct module_param_attrs *mp; | ||
439 | unsigned int valid_attrs = 0; | ||
440 | unsigned int i, size[2]; | ||
441 | struct param_attribute *pattr; | ||
442 | struct attribute **gattr; | ||
443 | int err; | ||
444 | |||
445 | for (i=0; i<num_params; i++) { | ||
446 | if (kparam[i].perm) | ||
447 | valid_attrs++; | ||
448 | } | ||
449 | |||
450 | if (!valid_attrs) | ||
451 | return NULL; | ||
452 | |||
453 | size[0] = ALIGN(sizeof(*mp) + | ||
454 | valid_attrs * sizeof(mp->attrs[0]), | ||
455 | sizeof(mp->grp.attrs[0])); | ||
456 | size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); | ||
457 | |||
458 | mp = kmalloc(size[0] + size[1], GFP_KERNEL); | ||
459 | if (!mp) | ||
460 | return ERR_PTR(-ENOMEM); | ||
461 | |||
462 | mp->grp.name = "parameters"; | ||
463 | mp->grp.attrs = (void *)mp + size[0]; | ||
464 | |||
465 | pattr = &mp->attrs[0]; | ||
466 | gattr = &mp->grp.attrs[0]; | ||
467 | for (i = 0; i < num_params; i++) { | ||
468 | struct kernel_param *kp = &kparam[i]; | ||
469 | if (kp->perm) { | ||
470 | pattr->param = kp; | ||
471 | pattr->mattr.show = param_attr_show; | ||
472 | pattr->mattr.store = param_attr_store; | ||
473 | pattr->mattr.attr.name = (char *)&kp->name[name_skip]; | ||
474 | pattr->mattr.attr.owner = mk->mod; | ||
475 | pattr->mattr.attr.mode = kp->perm; | ||
476 | *(gattr++) = &(pattr++)->mattr.attr; | ||
477 | } | ||
478 | } | ||
479 | *gattr = NULL; | ||
480 | |||
481 | if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) { | ||
482 | kfree(mp); | ||
483 | return ERR_PTR(err); | ||
484 | } | ||
485 | return mp; | ||
486 | } | ||
487 | |||
488 | |||
489 | #ifdef CONFIG_MODULES | ||
490 | |||
491 | /* | ||
492 | * module_param_sysfs_setup - setup sysfs support for one module | ||
493 | * @mod: module | ||
494 | * @kparam: module parameters (array) | ||
495 | * @num_params: number of module parameters | ||
496 | * | ||
497 | * Adds sysfs entries for module parameters, and creates a link from | ||
498 | * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/ | ||
499 | */ | ||
500 | int module_param_sysfs_setup(struct module *mod, | ||
501 | struct kernel_param *kparam, | ||
502 | unsigned int num_params) | ||
503 | { | ||
504 | struct module_param_attrs *mp; | ||
505 | |||
506 | mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); | ||
507 | if (IS_ERR(mp)) | ||
508 | return PTR_ERR(mp); | ||
509 | |||
510 | mod->param_attrs = mp; | ||
511 | return 0; | ||
512 | } | ||
513 | |||
514 | /* | ||
515 | * module_param_sysfs_remove - remove sysfs support for one module | ||
516 | * @mod: module | ||
517 | * | ||
518 | * Remove sysfs entries for module parameters and the corresponding | ||
519 | * kobject. | ||
520 | */ | ||
521 | void module_param_sysfs_remove(struct module *mod) | ||
522 | { | ||
523 | if (mod->param_attrs) { | ||
524 | sysfs_remove_group(&mod->mkobj.kobj, | ||
525 | &mod->param_attrs->grp); | ||
526 | /* We are positive that no one is using any param | ||
527 | * attrs at this point. Deallocate immediately. */ | ||
528 | kfree(mod->param_attrs); | ||
529 | mod->param_attrs = NULL; | ||
530 | } | ||
531 | } | ||
532 | #endif | ||
533 | |||
534 | /* | ||
535 | * kernel_param_sysfs_setup - wrapper for built-in params support | ||
536 | */ | ||
537 | static void __init kernel_param_sysfs_setup(const char *name, | ||
538 | struct kernel_param *kparam, | ||
539 | unsigned int num_params, | ||
540 | unsigned int name_skip) | ||
541 | { | ||
542 | struct module_kobject *mk; | ||
543 | |||
544 | mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); | ||
545 | memset(mk, 0, sizeof(struct module_kobject)); | ||
546 | |||
547 | mk->mod = THIS_MODULE; | ||
548 | kobj_set_kset_s(mk, module_subsys); | ||
549 | kobject_set_name(&mk->kobj, name); | ||
550 | kobject_register(&mk->kobj); | ||
551 | |||
552 | /* no need to keep the kobject if no parameter is exported */ | ||
553 | if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { | ||
554 | kobject_unregister(&mk->kobj); | ||
555 | kfree(mk); | ||
556 | } | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * param_sysfs_builtin - add contents in /sys/parameters for built-in modules | ||
561 | * | ||
562 | * Add module_parameters to sysfs for "modules" built into the kernel. | ||
563 | * | ||
564 | * The "module" name (KBUILD_MODNAME) is stored before the dot, and the | ||
565 | * "parameter" name after the dot, in kernel_param->name. So, extract | ||
566 | * the "module" name from each built-in kernel_param, and for every run | ||
567 | * of entries sharing the same module name, call kernel_param_sysfs_setup. | ||
568 | */ | ||
569 | static void __init param_sysfs_builtin(void) | ||
570 | { | ||
571 | struct kernel_param *kp, *kp_begin = NULL; | ||
572 | unsigned int i, name_len, count = 0; | ||
573 | char modname[MAX_KBUILD_MODNAME + 1] = ""; | ||
574 | |||
575 | for (i=0; i < __stop___param - __start___param; i++) { | ||
576 | char *dot; | ||
577 | |||
578 | kp = &__start___param[i]; | ||
579 | |||
580 | /* We do not handle args without periods. */ | ||
581 | dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME); | ||
582 | if (!dot) { | ||
583 | DEBUGP("couldn't find period in %s\n", kp->name); | ||
584 | continue; | ||
585 | } | ||
586 | name_len = dot - kp->name; | ||
587 | |||
588 | /* new kbuild_modname? */ | ||
589 | if (strlen(modname) != name_len | ||
590 | || strncmp(modname, kp->name, name_len) != 0) { | ||
591 | /* add a new kobject for previous kernel_params. */ | ||
592 | if (count) | ||
593 | kernel_param_sysfs_setup(modname, | ||
594 | kp_begin, | ||
595 | count, | ||
596 | strlen(modname)+1); | ||
597 | |||
598 | strncpy(modname, kp->name, name_len); | ||
599 | modname[name_len] = '\0'; | ||
600 | count = 0; | ||
601 | kp_begin = kp; | ||
602 | } | ||
603 | count++; | ||
604 | } | ||
605 | |||
606 | /* last kernel_params need to be registered as well */ | ||
607 | if (count) | ||
608 | kernel_param_sysfs_setup(modname, kp_begin, count, | ||
609 | strlen(modname)+1); | ||
610 | } | ||
611 | |||
612 | |||
613 | /* module-related sysfs stuff */ | ||
614 | #ifdef CONFIG_MODULES | ||
615 | |||
616 | #define to_module_attr(n) container_of(n, struct module_attribute, attr); | ||
617 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); | ||
618 | |||
619 | static ssize_t module_attr_show(struct kobject *kobj, | ||
620 | struct attribute *attr, | ||
621 | char *buf) | ||
622 | { | ||
623 | struct module_attribute *attribute; | ||
624 | struct module_kobject *mk; | ||
625 | int ret; | ||
626 | |||
627 | attribute = to_module_attr(attr); | ||
628 | mk = to_module_kobject(kobj); | ||
629 | |||
630 | if (!attribute->show) | ||
631 | return -EPERM; | ||
632 | |||
633 | if (!try_module_get(mk->mod)) | ||
634 | return -ENODEV; | ||
635 | |||
636 | ret = attribute->show(attribute, mk->mod, buf); | ||
637 | |||
638 | module_put(mk->mod); | ||
639 | |||
640 | return ret; | ||
641 | } | ||
642 | |||
643 | static ssize_t module_attr_store(struct kobject *kobj, | ||
644 | struct attribute *attr, | ||
645 | const char *buf, size_t len) | ||
646 | { | ||
647 | struct module_attribute *attribute; | ||
648 | struct module_kobject *mk; | ||
649 | int ret; | ||
650 | |||
651 | attribute = to_module_attr(attr); | ||
652 | mk = to_module_kobject(kobj); | ||
653 | |||
654 | if (!attribute->store) | ||
655 | return -EPERM; | ||
656 | |||
657 | if (!try_module_get(mk->mod)) | ||
658 | return -ENODEV; | ||
659 | |||
660 | ret = attribute->store(attribute, mk->mod, buf, len); | ||
661 | |||
662 | module_put(mk->mod); | ||
663 | |||
664 | return ret; | ||
665 | } | ||
666 | |||
667 | static struct sysfs_ops module_sysfs_ops = { | ||
668 | .show = module_attr_show, | ||
669 | .store = module_attr_store, | ||
670 | }; | ||
671 | |||
672 | #else | ||
673 | static struct sysfs_ops module_sysfs_ops = { | ||
674 | .show = NULL, | ||
675 | .store = NULL, | ||
676 | }; | ||
677 | #endif | ||
678 | |||
679 | static struct kobj_type module_ktype = { | ||
680 | .sysfs_ops = &module_sysfs_ops, | ||
681 | }; | ||
682 | |||
683 | decl_subsys(module, &module_ktype, NULL); | ||
684 | |||
685 | /* | ||
686 | * param_sysfs_init - wrapper for built-in params support | ||
687 | */ | ||
688 | static int __init param_sysfs_init(void) | ||
689 | { | ||
690 | subsystem_register(&module_subsys); | ||
691 | |||
692 | param_sysfs_builtin(); | ||
693 | |||
694 | return 0; | ||
695 | } | ||
696 | __initcall(param_sysfs_init); | ||
697 | |||
698 | EXPORT_SYMBOL(param_set_byte); | ||
699 | EXPORT_SYMBOL(param_get_byte); | ||
700 | EXPORT_SYMBOL(param_set_short); | ||
701 | EXPORT_SYMBOL(param_get_short); | ||
702 | EXPORT_SYMBOL(param_set_ushort); | ||
703 | EXPORT_SYMBOL(param_get_ushort); | ||
704 | EXPORT_SYMBOL(param_set_int); | ||
705 | EXPORT_SYMBOL(param_get_int); | ||
706 | EXPORT_SYMBOL(param_set_uint); | ||
707 | EXPORT_SYMBOL(param_get_uint); | ||
708 | EXPORT_SYMBOL(param_set_long); | ||
709 | EXPORT_SYMBOL(param_get_long); | ||
710 | EXPORT_SYMBOL(param_set_ulong); | ||
711 | EXPORT_SYMBOL(param_get_ulong); | ||
712 | EXPORT_SYMBOL(param_set_charp); | ||
713 | EXPORT_SYMBOL(param_get_charp); | ||
714 | EXPORT_SYMBOL(param_set_bool); | ||
715 | EXPORT_SYMBOL(param_get_bool); | ||
716 | EXPORT_SYMBOL(param_set_invbool); | ||
717 | EXPORT_SYMBOL(param_get_invbool); | ||
718 | EXPORT_SYMBOL(param_array_set); | ||
719 | EXPORT_SYMBOL(param_array_get); | ||
720 | EXPORT_SYMBOL(param_set_copystring); | ||
721 | EXPORT_SYMBOL(param_get_string); | ||
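For context, a hedged sketch of the module-side declarations that these helpers back; the macro names come from include/linux/moduleparam.h and their exact signatures should be checked there. A non-zero permission argument is what makes param_sysfs_setup() above create the corresponding sysfs entry.

/* Illustrative module snippet -- names and values are made up. */
#include <linux/module.h>
#include <linux/moduleparam.h>

static int debug = 0;
module_param(debug, int, 0644);		/* backed by param_set_int/param_get_int */
MODULE_PARM_DESC(debug, "enable debug output");

static char *mode = "auto";
module_param(mode, charp, 0444);	/* backed by param_set_charp/param_get_charp */

static char label[16] = "default";
module_param_string(label, label, sizeof(label), 0644);
					/* backed by param_set_copystring/param_get_string */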
diff --git a/kernel/pid.c b/kernel/pid.c new file mode 100644 index 000000000000..edba31c681ac --- /dev/null +++ b/kernel/pid.c | |||
@@ -0,0 +1,292 @@ | |||
1 | /* | ||
2 | * Generic pidhash and scalable, time-bounded PID allocator | ||
3 | * | ||
4 | * (C) 2002-2003 William Irwin, IBM | ||
5 | * (C) 2004 William Irwin, Oracle | ||
6 | * (C) 2002-2004 Ingo Molnar, Red Hat | ||
7 | * | ||
8 | * pid-structures are backing objects for tasks sharing a given ID to chain | ||
9 | * against. There is very little to them aside from hashing them and | ||
10 | * parking tasks using given IDs on a list. | ||
11 | * | ||
12 | * The hash is always changed with the tasklist_lock write-acquired, | ||
13 | * and the hash is only accessed with the tasklist_lock at least | ||
14 | * read-acquired, so there's no additional SMP locking needed here. | ||
15 | * | ||
16 | * We have a list of bitmap pages, which bitmaps represent the PID space. | ||
17 | * Allocating and freeing PIDs is completely lockless. In the worst-case | ||
18 | * allocation scenario, when all but one of the 1 million possible PIDs are | ||
19 | * already allocated, allocation scans 32 list entries and at most PAGE_SIZE | ||
20 | * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). | ||
21 | */ | ||
22 | |||
23 | #include <linux/mm.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/slab.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/hash.h> | ||
29 | |||
30 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) | ||
31 | static struct hlist_head *pid_hash[PIDTYPE_MAX]; | ||
32 | static int pidhash_shift; | ||
33 | |||
34 | int pid_max = PID_MAX_DEFAULT; | ||
35 | int last_pid; | ||
36 | |||
37 | #define RESERVED_PIDS 300 | ||
38 | |||
39 | int pid_max_min = RESERVED_PIDS + 1; | ||
40 | int pid_max_max = PID_MAX_LIMIT; | ||
41 | |||
42 | #define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) | ||
43 | #define BITS_PER_PAGE (PAGE_SIZE*8) | ||
44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) | ||
45 | #define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) | ||
46 | #define find_next_offset(map, off) \ | ||
47 | find_next_zero_bit((map)->page, BITS_PER_PAGE, off) | ||
48 | |||
49 | /* | ||
50 | * PID-map pages start out as NULL, they get allocated upon | ||
51 | * first use and are never deallocated. This way a low pid_max | ||
52 | * value does not cause lots of bitmaps to be allocated, but | ||
53 | * the scheme still scales up to 4 million PIDs at runtime. | ||
54 | */ | ||
55 | typedef struct pidmap { | ||
56 | atomic_t nr_free; | ||
57 | void *page; | ||
58 | } pidmap_t; | ||
59 | |||
60 | static pidmap_t pidmap_array[PIDMAP_ENTRIES] = | ||
61 | { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; | ||
62 | |||
63 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); | ||
64 | |||
65 | fastcall void free_pidmap(int pid) | ||
66 | { | ||
67 | pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; | ||
68 | int offset = pid & BITS_PER_PAGE_MASK; | ||
69 | |||
70 | clear_bit(offset, map->page); | ||
71 | atomic_inc(&map->nr_free); | ||
72 | } | ||
73 | |||
74 | int alloc_pidmap(void) | ||
75 | { | ||
76 | int i, offset, max_scan, pid, last = last_pid; | ||
77 | pidmap_t *map; | ||
78 | |||
79 | pid = last + 1; | ||
80 | if (pid >= pid_max) | ||
81 | pid = RESERVED_PIDS; | ||
82 | offset = pid & BITS_PER_PAGE_MASK; | ||
83 | map = &pidmap_array[pid/BITS_PER_PAGE]; | ||
84 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; | ||
85 | for (i = 0; i <= max_scan; ++i) { | ||
86 | if (unlikely(!map->page)) { | ||
87 | unsigned long page = get_zeroed_page(GFP_KERNEL); | ||
88 | /* | ||
89 | * Free the page if someone raced with us | ||
90 | * installing it: | ||
91 | */ | ||
92 | spin_lock(&pidmap_lock); | ||
93 | if (map->page) | ||
94 | free_page(page); | ||
95 | else | ||
96 | map->page = (void *)page; | ||
97 | spin_unlock(&pidmap_lock); | ||
98 | if (unlikely(!map->page)) | ||
99 | break; | ||
100 | } | ||
101 | if (likely(atomic_read(&map->nr_free))) { | ||
102 | do { | ||
103 | if (!test_and_set_bit(offset, map->page)) { | ||
104 | atomic_dec(&map->nr_free); | ||
105 | last_pid = pid; | ||
106 | return pid; | ||
107 | } | ||
108 | offset = find_next_offset(map, offset); | ||
109 | pid = mk_pid(map, offset); | ||
110 | /* | ||
111 | * find_next_offset() found a bit, the pid from it | ||
112 | * is in-bounds, and if we fell back to the last | ||
113 | * bitmap block and the final block was the same | ||
114 | * as the starting point, pid is before last_pid. | ||
115 | */ | ||
116 | } while (offset < BITS_PER_PAGE && pid < pid_max && | ||
117 | (i != max_scan || pid < last || | ||
118 | !((last+1) & BITS_PER_PAGE_MASK))); | ||
119 | } | ||
120 | if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) { | ||
121 | ++map; | ||
122 | offset = 0; | ||
123 | } else { | ||
124 | map = &pidmap_array[0]; | ||
125 | offset = RESERVED_PIDS; | ||
126 | if (unlikely(last == offset)) | ||
127 | break; | ||
128 | } | ||
129 | pid = mk_pid(map, offset); | ||
130 | } | ||
131 | return -1; | ||
132 | } | ||
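A worked example of the pidmap index arithmetic, assuming a 4096-byte PAGE_SIZE (so BITS_PER_PAGE is 32768): free_pidmap() splits a pid into a (map, offset) pair and mk_pid() is the exact inverse.

#include <stdio.h>

#define PAGE_SIZE		4096UL			/* assumption for the example */
#define BITS_PER_PAGE		(PAGE_SIZE * 8)
#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)

int main(void)
{
	unsigned long pid = 40000;
	unsigned long map = pid / BITS_PER_PAGE;	/* -> 1    */
	unsigned long off = pid & BITS_PER_PAGE_MASK;	/* -> 7232 */

	printf("pid %lu -> pidmap_array[%lu], bit %lu\n", pid, map, off);
	printf("mk_pid -> %lu\n", map * BITS_PER_PAGE + off);	/* 40000 again */
	return 0;
}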
133 | |||
134 | struct pid * fastcall find_pid(enum pid_type type, int nr) | ||
135 | { | ||
136 | struct hlist_node *elem; | ||
137 | struct pid *pid; | ||
138 | |||
139 | hlist_for_each_entry(pid, elem, | ||
140 | &pid_hash[type][pid_hashfn(nr)], pid_chain) { | ||
141 | if (pid->nr == nr) | ||
142 | return pid; | ||
143 | } | ||
144 | return NULL; | ||
145 | } | ||
146 | |||
147 | int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | ||
148 | { | ||
149 | struct pid *pid, *task_pid; | ||
150 | |||
151 | task_pid = &task->pids[type]; | ||
152 | pid = find_pid(type, nr); | ||
153 | if (pid == NULL) { | ||
154 | hlist_add_head(&task_pid->pid_chain, | ||
155 | &pid_hash[type][pid_hashfn(nr)]); | ||
156 | INIT_LIST_HEAD(&task_pid->pid_list); | ||
157 | } else { | ||
158 | INIT_HLIST_NODE(&task_pid->pid_chain); | ||
159 | list_add_tail(&task_pid->pid_list, &pid->pid_list); | ||
160 | } | ||
161 | task_pid->nr = nr; | ||
162 | |||
163 | return 0; | ||
164 | } | ||
165 | |||
166 | static fastcall int __detach_pid(task_t *task, enum pid_type type) | ||
167 | { | ||
168 | struct pid *pid, *pid_next; | ||
169 | int nr = 0; | ||
170 | |||
171 | pid = &task->pids[type]; | ||
172 | if (!hlist_unhashed(&pid->pid_chain)) { | ||
173 | hlist_del(&pid->pid_chain); | ||
174 | |||
175 | if (list_empty(&pid->pid_list)) | ||
176 | nr = pid->nr; | ||
177 | else { | ||
178 | pid_next = list_entry(pid->pid_list.next, | ||
179 | struct pid, pid_list); | ||
180 | /* insert next pid from pid_list to hash */ | ||
181 | hlist_add_head(&pid_next->pid_chain, | ||
182 | &pid_hash[type][pid_hashfn(pid_next->nr)]); | ||
183 | } | ||
184 | } | ||
185 | |||
186 | list_del(&pid->pid_list); | ||
187 | pid->nr = 0; | ||
188 | |||
189 | return nr; | ||
190 | } | ||
191 | |||
192 | void fastcall detach_pid(task_t *task, enum pid_type type) | ||
193 | { | ||
194 | int tmp, nr; | ||
195 | |||
196 | nr = __detach_pid(task, type); | ||
197 | if (!nr) | ||
198 | return; | ||
199 | |||
200 | for (tmp = PIDTYPE_MAX; --tmp >= 0; ) | ||
201 | if (tmp != type && find_pid(tmp, nr)) | ||
202 | return; | ||
203 | |||
204 | free_pidmap(nr); | ||
205 | } | ||
206 | |||
207 | task_t *find_task_by_pid_type(int type, int nr) | ||
208 | { | ||
209 | struct pid *pid; | ||
210 | |||
211 | pid = find_pid(type, nr); | ||
212 | if (!pid) | ||
213 | return NULL; | ||
214 | |||
215 | return pid_task(&pid->pid_list, type); | ||
216 | } | ||
217 | |||
218 | EXPORT_SYMBOL(find_task_by_pid_type); | ||
219 | |||
220 | /* | ||
221 | * This function switches the PIDs if a non-leader thread calls | ||
222 | * sys_execve() - this must be done without releasing the PID. | ||
223 | * (which a detach_pid() would eventually do.) | ||
224 | */ | ||
225 | void switch_exec_pids(task_t *leader, task_t *thread) | ||
226 | { | ||
227 | __detach_pid(leader, PIDTYPE_PID); | ||
228 | __detach_pid(leader, PIDTYPE_TGID); | ||
229 | __detach_pid(leader, PIDTYPE_PGID); | ||
230 | __detach_pid(leader, PIDTYPE_SID); | ||
231 | |||
232 | __detach_pid(thread, PIDTYPE_PID); | ||
233 | __detach_pid(thread, PIDTYPE_TGID); | ||
234 | |||
235 | leader->pid = leader->tgid = thread->pid; | ||
236 | thread->pid = thread->tgid; | ||
237 | |||
238 | attach_pid(thread, PIDTYPE_PID, thread->pid); | ||
239 | attach_pid(thread, PIDTYPE_TGID, thread->tgid); | ||
240 | attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); | ||
241 | attach_pid(thread, PIDTYPE_SID, thread->signal->session); | ||
242 | list_add_tail(&thread->tasks, &init_task.tasks); | ||
243 | |||
244 | attach_pid(leader, PIDTYPE_PID, leader->pid); | ||
245 | attach_pid(leader, PIDTYPE_TGID, leader->tgid); | ||
246 | attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp); | ||
247 | attach_pid(leader, PIDTYPE_SID, leader->signal->session); | ||
248 | } | ||
249 | |||
250 | /* | ||
251 | * The pid hash table is scaled according to the amount of memory in the | ||
252 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or | ||
253 | * more. | ||
254 | */ | ||
255 | void __init pidhash_init(void) | ||
256 | { | ||
257 | int i, j, pidhash_size; | ||
258 | unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); | ||
259 | |||
260 | pidhash_shift = max(4, fls(megabytes * 4)); | ||
261 | pidhash_shift = min(12, pidhash_shift); | ||
262 | pidhash_size = 1 << pidhash_shift; | ||
263 | |||
264 | printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", | ||
265 | pidhash_size, pidhash_shift, | ||
266 | PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); | ||
267 | |||
268 | for (i = 0; i < PIDTYPE_MAX; i++) { | ||
269 | pid_hash[i] = alloc_bootmem(pidhash_size * | ||
270 | sizeof(*(pid_hash[i]))); | ||
271 | if (!pid_hash[i]) | ||
272 | panic("Could not alloc pidhash!\n"); | ||
273 | for (j = 0; j < pidhash_size; j++) | ||
274 | INIT_HLIST_HEAD(&pid_hash[i][j]); | ||
275 | } | ||
276 | } | ||
277 | |||
278 | void __init pidmap_init(void) | ||
279 | { | ||
280 | int i; | ||
281 | |||
282 | pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); | ||
283 | set_bit(0, pidmap_array->page); | ||
284 | atomic_dec(&pidmap_array->nr_free); | ||
285 | |||
286 | /* | ||
287 | * Allocate PID 0, and hash it via all PID types: | ||
288 | */ | ||
289 | |||
290 | for (i = 0; i < PIDTYPE_MAX; i++) | ||
291 | attach_pid(current, i, 0); | ||
292 | } | ||
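A small userspace sketch of the sizing rule in pidhash_init() above: each PID type gets 2^shift buckets, where shift = clamp(fls(megabytes * 4), 4, 12). The fls() below is a portable stand-in for the kernel helper (position of the highest set bit, 1-based).

#include <stdio.h>

static int fls(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long mb;

	for (mb = 4; mb <= 2048; mb *= 4) {
		int shift = fls(mb * 4);

		if (shift < 4)
			shift = 4;
		if (shift > 12)
			shift = 12;
		/* 4 MB -> 32, 64 MB -> 512, 1 GB -> 4096 buckets per PID type */
		printf("%4lu MB -> %d buckets per PID type\n", mb, 1 << shift);
	}
	return 0;
}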
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c new file mode 100644 index 000000000000..ad85d3f0dcc4 --- /dev/null +++ b/kernel/posix-cpu-timers.c | |||
@@ -0,0 +1,1559 @@ | |||
1 | /* | ||
2 | * Implement CPU time clocks for the POSIX clock interface. | ||
3 | */ | ||
4 | |||
5 | #include <linux/sched.h> | ||
6 | #include <linux/posix-timers.h> | ||
7 | #include <asm/uaccess.h> | ||
8 | #include <linux/errno.h> | ||
9 | |||
10 | static int check_clock(clockid_t which_clock) | ||
11 | { | ||
12 | int error = 0; | ||
13 | struct task_struct *p; | ||
14 | const pid_t pid = CPUCLOCK_PID(which_clock); | ||
15 | |||
16 | if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) | ||
17 | return -EINVAL; | ||
18 | |||
19 | if (pid == 0) | ||
20 | return 0; | ||
21 | |||
22 | read_lock(&tasklist_lock); | ||
23 | p = find_task_by_pid(pid); | ||
24 | if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? | ||
25 | p->tgid != current->tgid : p->tgid != pid)) { | ||
26 | error = -EINVAL; | ||
27 | } | ||
28 | read_unlock(&tasklist_lock); | ||
29 | |||
30 | return error; | ||
31 | } | ||
32 | |||
33 | static inline union cpu_time_count | ||
34 | timespec_to_sample(clockid_t which_clock, const struct timespec *tp) | ||
35 | { | ||
36 | union cpu_time_count ret; | ||
37 | ret.sched = 0; /* high half always zero when .cpu used */ | ||
38 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
39 | ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; | ||
40 | } else { | ||
41 | ret.cpu = timespec_to_cputime(tp); | ||
42 | } | ||
43 | return ret; | ||
44 | } | ||
45 | |||
46 | static void sample_to_timespec(clockid_t which_clock, | ||
47 | union cpu_time_count cpu, | ||
48 | struct timespec *tp) | ||
49 | { | ||
50 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
51 | tp->tv_sec = div_long_long_rem(cpu.sched, | ||
52 | NSEC_PER_SEC, &tp->tv_nsec); | ||
53 | } else { | ||
54 | cputime_to_timespec(cpu.cpu, tp); | ||
55 | } | ||
56 | } | ||
57 | |||
58 | static inline int cpu_time_before(clockid_t which_clock, | ||
59 | union cpu_time_count now, | ||
60 | union cpu_time_count then) | ||
61 | { | ||
62 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
63 | return now.sched < then.sched; | ||
64 | } else { | ||
65 | return cputime_lt(now.cpu, then.cpu); | ||
66 | } | ||
67 | } | ||
68 | static inline void cpu_time_add(clockid_t which_clock, | ||
69 | union cpu_time_count *acc, | ||
70 | union cpu_time_count val) | ||
71 | { | ||
72 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
73 | acc->sched += val.sched; | ||
74 | } else { | ||
75 | acc->cpu = cputime_add(acc->cpu, val.cpu); | ||
76 | } | ||
77 | } | ||
78 | static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, | ||
79 | union cpu_time_count a, | ||
80 | union cpu_time_count b) | ||
81 | { | ||
82 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
83 | a.sched -= b.sched; | ||
84 | } else { | ||
85 | a.cpu = cputime_sub(a.cpu, b.cpu); | ||
86 | } | ||
87 | return a; | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * Update expiry time from increment, and increase overrun count, | ||
92 | * given the current clock sample. | ||
93 | */ | ||
94 | static inline void bump_cpu_timer(struct k_itimer *timer, | ||
95 | union cpu_time_count now) | ||
96 | { | ||
97 | int i; | ||
98 | |||
99 | if (timer->it.cpu.incr.sched == 0) | ||
100 | return; | ||
101 | |||
102 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { | ||
103 | unsigned long long delta, incr; | ||
104 | |||
105 | if (now.sched < timer->it.cpu.expires.sched) | ||
106 | return; | ||
107 | incr = timer->it.cpu.incr.sched; | ||
108 | delta = now.sched + incr - timer->it.cpu.expires.sched; | ||
109 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ | ||
110 | for (i = 0; incr < delta - incr; i++) | ||
111 | incr = incr << 1; | ||
112 | for (; i >= 0; incr >>= 1, i--) { | ||
113 | if (delta <= incr) | ||
114 | continue; | ||
115 | timer->it.cpu.expires.sched += incr; | ||
116 | timer->it_overrun += 1 << i; | ||
117 | delta -= incr; | ||
118 | } | ||
119 | } else { | ||
120 | cputime_t delta, incr; | ||
121 | |||
122 | if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) | ||
123 | return; | ||
124 | incr = timer->it.cpu.incr.cpu; | ||
125 | delta = cputime_sub(cputime_add(now.cpu, incr), | ||
126 | timer->it.cpu.expires.cpu); | ||
127 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ | ||
128 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) | ||
129 | incr = cputime_add(incr, incr); | ||
130 | for (; i >= 0; incr = cputime_halve(incr), i--) { | ||
131 | if (cputime_le(delta, incr)) | ||
132 | continue; | ||
133 | timer->it.cpu.expires.cpu = | ||
134 | cputime_add(timer->it.cpu.expires.cpu, incr); | ||
135 | timer->it_overrun += 1 << i; | ||
136 | delta = cputime_sub(delta, incr); | ||
137 | } | ||
138 | } | ||
139 | } | ||
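A standalone trace of the power-of-two stepping in bump_cpu_timer()'s CPUCLOCK_SCHED branch, using small made-up numbers (expires = 10, incr = 3, now = 25): the expiry advances by five whole increments to 25 and it_overrun grows by 5, without looping once per missed period.

#include <stdio.h>

int main(void)
{
	unsigned long long expires = 10, incr = 3, now = 25;
	unsigned long long delta = now + incr - expires;	/* 18 */
	int overrun = 0, i;

	/* Don't use (incr*2 < delta), incr*2 might overflow. */
	for (i = 0; incr < delta - incr; i++)
		incr = incr << 1;		/* incr ends at 12, i at 2 */
	for (; i >= 0; incr >>= 1, i--) {
		if (delta <= incr)
			continue;
		expires += incr;
		overrun += 1 << i;
		delta -= incr;
	}
	printf("expires=%llu overrun+=%d\n", expires, overrun);	/* 25, 5 */
	return 0;
}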
140 | |||
141 | static inline cputime_t prof_ticks(struct task_struct *p) | ||
142 | { | ||
143 | return cputime_add(p->utime, p->stime); | ||
144 | } | ||
145 | static inline cputime_t virt_ticks(struct task_struct *p) | ||
146 | { | ||
147 | return p->utime; | ||
148 | } | ||
149 | static inline unsigned long long sched_ns(struct task_struct *p) | ||
150 | { | ||
151 | return (p == current) ? current_sched_time(p) : p->sched_time; | ||
152 | } | ||
153 | |||
154 | int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | ||
155 | { | ||
156 | int error = check_clock(which_clock); | ||
157 | if (!error) { | ||
158 | tp->tv_sec = 0; | ||
159 | tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); | ||
160 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
161 | /* | ||
162 | * If sched_clock is using a cycle counter, we | ||
163 | * don't have any idea of its true resolution | ||
164 | * exported, but it is much more than 1s/HZ. | ||
165 | */ | ||
166 | tp->tv_nsec = 1; | ||
167 | } | ||
168 | } | ||
169 | return error; | ||
170 | } | ||
171 | |||
172 | int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) | ||
173 | { | ||
174 | /* | ||
175 | * You can never reset a CPU clock, but we check for other errors | ||
176 | * in the call before failing with EPERM. | ||
177 | */ | ||
178 | int error = check_clock(which_clock); | ||
179 | if (error == 0) { | ||
180 | error = -EPERM; | ||
181 | } | ||
182 | return error; | ||
183 | } | ||
184 | |||
185 | |||
186 | /* | ||
187 | * Sample a per-thread clock for the given task. | ||
188 | */ | ||
189 | static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, | ||
190 | union cpu_time_count *cpu) | ||
191 | { | ||
192 | switch (CPUCLOCK_WHICH(which_clock)) { | ||
193 | default: | ||
194 | return -EINVAL; | ||
195 | case CPUCLOCK_PROF: | ||
196 | cpu->cpu = prof_ticks(p); | ||
197 | break; | ||
198 | case CPUCLOCK_VIRT: | ||
199 | cpu->cpu = virt_ticks(p); | ||
200 | break; | ||
201 | case CPUCLOCK_SCHED: | ||
202 | cpu->sched = sched_ns(p); | ||
203 | break; | ||
204 | } | ||
205 | return 0; | ||
206 | } | ||
207 | |||
208 | /* | ||
209 | * Sample a process (thread group) clock for the given group_leader task. | ||
210 | * Must be called with tasklist_lock held for reading, and with | ||
211 | * p->sighand->siglock held. | ||
212 | */ | ||
213 | static int cpu_clock_sample_group_locked(unsigned int clock_idx, | ||
214 | struct task_struct *p, | ||
215 | union cpu_time_count *cpu) | ||
216 | { | ||
217 | struct task_struct *t = p; | ||
218 | switch (clock_idx) { | ||
219 | default: | ||
220 | return -EINVAL; | ||
221 | case CPUCLOCK_PROF: | ||
222 | cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); | ||
223 | do { | ||
224 | cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); | ||
225 | t = next_thread(t); | ||
226 | } while (t != p); | ||
227 | break; | ||
228 | case CPUCLOCK_VIRT: | ||
229 | cpu->cpu = p->signal->utime; | ||
230 | do { | ||
231 | cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); | ||
232 | t = next_thread(t); | ||
233 | } while (t != p); | ||
234 | break; | ||
235 | case CPUCLOCK_SCHED: | ||
236 | cpu->sched = p->signal->sched_time; | ||
237 | /* Add in each other live thread. */ | ||
238 | while ((t = next_thread(t)) != p) { | ||
239 | cpu->sched += t->sched_time; | ||
240 | } | ||
241 | if (p->tgid == current->tgid) { | ||
242 | /* | ||
243 | * We're sampling ourselves, so include the | ||
244 | * cycles not yet banked. We still omit | ||
245 | * other threads running on other CPUs, | ||
246 | * so the total can lag behind by as | ||
247 | * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). | ||
248 | */ | ||
249 | cpu->sched += current_sched_time(current); | ||
250 | } else { | ||
251 | cpu->sched += p->sched_time; | ||
252 | } | ||
253 | break; | ||
254 | } | ||
255 | return 0; | ||
256 | } | ||
257 | |||
258 | /* | ||
259 | * Sample a process (thread group) clock for the given group_leader task. | ||
260 | * Must be called with tasklist_lock held for reading. | ||
261 | */ | ||
262 | static int cpu_clock_sample_group(clockid_t which_clock, | ||
263 | struct task_struct *p, | ||
264 | union cpu_time_count *cpu) | ||
265 | { | ||
266 | int ret; | ||
267 | unsigned long flags; | ||
268 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
269 | ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, | ||
270 | cpu); | ||
271 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | |||
276 | int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | ||
277 | { | ||
278 | const pid_t pid = CPUCLOCK_PID(which_clock); | ||
279 | int error = -EINVAL; | ||
280 | union cpu_time_count rtn; | ||
281 | |||
282 | if (pid == 0) { | ||
283 | /* | ||
284 | * Special case constant value for our own clocks. | ||
285 | * We don't have to do any lookup to find ourselves. | ||
286 | */ | ||
287 | if (CPUCLOCK_PERTHREAD(which_clock)) { | ||
288 | /* | ||
289 | * Sampling just ourselves we can do with no locking. | ||
290 | */ | ||
291 | error = cpu_clock_sample(which_clock, | ||
292 | current, &rtn); | ||
293 | } else { | ||
294 | read_lock(&tasklist_lock); | ||
295 | error = cpu_clock_sample_group(which_clock, | ||
296 | current, &rtn); | ||
297 | read_unlock(&tasklist_lock); | ||
298 | } | ||
299 | } else { | ||
300 | /* | ||
301 | * Find the given PID, and validate that the caller | ||
302 | * should be able to see it. | ||
303 | */ | ||
304 | struct task_struct *p; | ||
305 | read_lock(&tasklist_lock); | ||
306 | p = find_task_by_pid(pid); | ||
307 | if (p) { | ||
308 | if (CPUCLOCK_PERTHREAD(which_clock)) { | ||
309 | if (p->tgid == current->tgid) { | ||
310 | error = cpu_clock_sample(which_clock, | ||
311 | p, &rtn); | ||
312 | } | ||
313 | } else if (p->tgid == pid && p->signal) { | ||
314 | error = cpu_clock_sample_group(which_clock, | ||
315 | p, &rtn); | ||
316 | } | ||
317 | } | ||
318 | read_unlock(&tasklist_lock); | ||
319 | } | ||
320 | |||
321 | if (error) | ||
322 | return error; | ||
323 | sample_to_timespec(which_clock, rtn, tp); | ||
324 | return 0; | ||
325 | } | ||
326 | |||
327 | |||
328 | /* | ||
329 | * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. | ||
330 | * This is called from sys_timer_create with the new timer already locked. | ||
331 | */ | ||
332 | int posix_cpu_timer_create(struct k_itimer *new_timer) | ||
333 | { | ||
334 | int ret = 0; | ||
335 | const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); | ||
336 | struct task_struct *p; | ||
337 | |||
338 | if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) | ||
339 | return -EINVAL; | ||
340 | |||
341 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); | ||
342 | new_timer->it.cpu.incr.sched = 0; | ||
343 | new_timer->it.cpu.expires.sched = 0; | ||
344 | |||
345 | read_lock(&tasklist_lock); | ||
346 | if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { | ||
347 | if (pid == 0) { | ||
348 | p = current; | ||
349 | } else { | ||
350 | p = find_task_by_pid(pid); | ||
351 | if (p && p->tgid != current->tgid) | ||
352 | p = NULL; | ||
353 | } | ||
354 | } else { | ||
355 | if (pid == 0) { | ||
356 | p = current->group_leader; | ||
357 | } else { | ||
358 | p = find_task_by_pid(pid); | ||
359 | if (p && p->tgid != pid) | ||
360 | p = NULL; | ||
361 | } | ||
362 | } | ||
363 | new_timer->it.cpu.task = p; | ||
364 | if (p) { | ||
365 | get_task_struct(p); | ||
366 | } else { | ||
367 | ret = -EINVAL; | ||
368 | } | ||
369 | read_unlock(&tasklist_lock); | ||
370 | |||
371 | return ret; | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * Clean up a CPU-clock timer that is about to be destroyed. | ||
376 | * This is called from timer deletion with the timer already locked. | ||
377 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | ||
378 | * and try again. (This happens when the timer is in the middle of firing.) | ||
379 | */ | ||
380 | int posix_cpu_timer_del(struct k_itimer *timer) | ||
381 | { | ||
382 | struct task_struct *p = timer->it.cpu.task; | ||
383 | |||
384 | if (timer->it.cpu.firing) | ||
385 | return TIMER_RETRY; | ||
386 | |||
387 | if (unlikely(p == NULL)) | ||
388 | return 0; | ||
389 | |||
390 | if (!list_empty(&timer->it.cpu.entry)) { | ||
391 | read_lock(&tasklist_lock); | ||
392 | if (unlikely(p->signal == NULL)) { | ||
393 | /* | ||
394 | * We raced with the reaping of the task. | ||
395 | * The deletion should have cleared us off the list. | ||
396 | */ | ||
397 | BUG_ON(!list_empty(&timer->it.cpu.entry)); | ||
398 | } else { | ||
399 | /* | ||
400 | * Take us off the task's timer list. | ||
401 | */ | ||
402 | spin_lock(&p->sighand->siglock); | ||
403 | list_del(&timer->it.cpu.entry); | ||
404 | spin_unlock(&p->sighand->siglock); | ||
405 | } | ||
406 | read_unlock(&tasklist_lock); | ||
407 | } | ||
408 | put_task_struct(p); | ||
409 | |||
410 | return 0; | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Clean out CPU timers still ticking when a thread exited. The task | ||
415 | * pointer is cleared, and the expiry time is replaced with the residual | ||
416 | * time for later timer_gettime calls to return. | ||
417 | * This must be called with the siglock held. | ||
418 | */ | ||
419 | static void cleanup_timers(struct list_head *head, | ||
420 | cputime_t utime, cputime_t stime, | ||
421 | unsigned long long sched_time) | ||
422 | { | ||
423 | struct cpu_timer_list *timer, *next; | ||
424 | cputime_t ptime = cputime_add(utime, stime); | ||
425 | |||
426 | list_for_each_entry_safe(timer, next, head, entry) { | ||
427 | timer->task = NULL; | ||
428 | list_del_init(&timer->entry); | ||
429 | if (cputime_lt(timer->expires.cpu, ptime)) { | ||
430 | timer->expires.cpu = cputime_zero; | ||
431 | } else { | ||
432 | timer->expires.cpu = cputime_sub(timer->expires.cpu, | ||
433 | ptime); | ||
434 | } | ||
435 | } | ||
436 | |||
437 | ++head; | ||
438 | list_for_each_entry_safe(timer, next, head, entry) { | ||
439 | timer->task = NULL; | ||
440 | list_del_init(&timer->entry); | ||
441 | if (cputime_lt(timer->expires.cpu, utime)) { | ||
442 | timer->expires.cpu = cputime_zero; | ||
443 | } else { | ||
444 | timer->expires.cpu = cputime_sub(timer->expires.cpu, | ||
445 | utime); | ||
446 | } | ||
447 | } | ||
448 | |||
449 | ++head; | ||
450 | list_for_each_entry_safe(timer, next, head, entry) { | ||
451 | timer->task = NULL; | ||
452 | list_del_init(&timer->entry); | ||
453 | if (timer->expires.sched < sched_time) { | ||
454 | timer->expires.sched = 0; | ||
455 | } else { | ||
456 | timer->expires.sched -= sched_time; | ||
457 | } | ||
458 | } | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * These are both called with the siglock held, when the current thread | ||
463 | * is being reaped. When the final (leader) thread in the group is reaped, | ||
464 | * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. | ||
465 | */ | ||
466 | void posix_cpu_timers_exit(struct task_struct *tsk) | ||
467 | { | ||
468 | cleanup_timers(tsk->cpu_timers, | ||
469 | tsk->utime, tsk->stime, tsk->sched_time); | ||
470 | |||
471 | } | ||
472 | void posix_cpu_timers_exit_group(struct task_struct *tsk) | ||
473 | { | ||
474 | cleanup_timers(tsk->signal->cpu_timers, | ||
475 | cputime_add(tsk->utime, tsk->signal->utime), | ||
476 | cputime_add(tsk->stime, tsk->signal->stime), | ||
477 | tsk->sched_time + tsk->signal->sched_time); | ||
478 | } | ||
479 | |||
480 | |||
481 | /* | ||
482 | * Set the expiry times of all the threads in the process so one of them | ||
483 | * will go off before the process cumulative expiry total is reached. | ||
484 | */ | ||
485 | static void process_timer_rebalance(struct task_struct *p, | ||
486 | unsigned int clock_idx, | ||
487 | union cpu_time_count expires, | ||
488 | union cpu_time_count val) | ||
489 | { | ||
490 | cputime_t ticks, left; | ||
491 | unsigned long long ns, nsleft; | ||
492 | struct task_struct *t = p; | ||
493 | unsigned int nthreads = atomic_read(&p->signal->live); | ||
494 | |||
495 | switch (clock_idx) { | ||
496 | default: | ||
497 | BUG(); | ||
498 | break; | ||
499 | case CPUCLOCK_PROF: | ||
500 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | ||
501 | nthreads); | ||
502 | do { | ||
503 | if (!unlikely(t->exit_state)) { | ||
504 | ticks = cputime_add(prof_ticks(t), left); | ||
505 | if (cputime_eq(t->it_prof_expires, | ||
506 | cputime_zero) || | ||
507 | cputime_gt(t->it_prof_expires, ticks)) { | ||
508 | t->it_prof_expires = ticks; | ||
509 | } | ||
510 | } | ||
511 | t = next_thread(t); | ||
512 | } while (t != p); | ||
513 | break; | ||
514 | case CPUCLOCK_VIRT: | ||
515 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | ||
516 | nthreads); | ||
517 | do { | ||
518 | if (!unlikely(t->exit_state)) { | ||
519 | ticks = cputime_add(virt_ticks(t), left); | ||
520 | if (cputime_eq(t->it_virt_expires, | ||
521 | cputime_zero) || | ||
522 | cputime_gt(t->it_virt_expires, ticks)) { | ||
523 | t->it_virt_expires = ticks; | ||
524 | } | ||
525 | } | ||
526 | t = next_thread(t); | ||
527 | } while (t != p); | ||
528 | break; | ||
529 | case CPUCLOCK_SCHED: | ||
530 | nsleft = expires.sched - val.sched; | ||
531 | do_div(nsleft, nthreads); | ||
532 | do { | ||
533 | if (!unlikely(t->exit_state)) { | ||
534 | ns = t->sched_time + nsleft; | ||
535 | if (t->it_sched_expires == 0 || | ||
536 | t->it_sched_expires > ns) { | ||
537 | t->it_sched_expires = ns; | ||
538 | } | ||
539 | } | ||
540 | t = next_thread(t); | ||
541 | } while (t != p); | ||
542 | break; | ||
543 | } | ||
544 | } | ||
545 | |||
546 | static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) | ||
547 | { | ||
548 | /* | ||
549 | * That's all for this thread or process. | ||
550 | * We leave our residual in expires to be reported. | ||
551 | */ | ||
552 | put_task_struct(timer->it.cpu.task); | ||
553 | timer->it.cpu.task = NULL; | ||
554 | timer->it.cpu.expires = cpu_time_sub(timer->it_clock, | ||
555 | timer->it.cpu.expires, | ||
556 | now); | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * Insert the timer on the appropriate list before any timers that | ||
561 | * expire later. This must be called with the tasklist_lock held | ||
562 | * for reading, and interrupts disabled. | ||
563 | */ | ||
564 | static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | ||
565 | { | ||
566 | struct task_struct *p = timer->it.cpu.task; | ||
567 | struct list_head *head, *listpos; | ||
568 | struct cpu_timer_list *const nt = &timer->it.cpu; | ||
569 | struct cpu_timer_list *next; | ||
570 | unsigned long i; | ||
571 | |||
572 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? | ||
573 | p->cpu_timers : p->signal->cpu_timers); | ||
574 | head += CPUCLOCK_WHICH(timer->it_clock); | ||
575 | |||
576 | BUG_ON(!irqs_disabled()); | ||
577 | spin_lock(&p->sighand->siglock); | ||
578 | |||
579 | listpos = head; | ||
580 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { | ||
581 | list_for_each_entry(next, head, entry) { | ||
582 | if (next->expires.sched > nt->expires.sched) { | ||
583 | listpos = &next->entry; | ||
584 | break; | ||
585 | } | ||
586 | } | ||
587 | } else { | ||
588 | list_for_each_entry(next, head, entry) { | ||
589 | if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { | ||
590 | listpos = &next->entry; | ||
591 | break; | ||
592 | } | ||
593 | } | ||
594 | } | ||
595 | list_add(&nt->entry, listpos); | ||
596 | |||
597 | if (listpos == head) { | ||
598 | /* | ||
599 | * We are the new earliest-expiring timer. | ||
600 | * If we are a thread timer, there can always | ||
601 | * be a process timer telling us to stop earlier. | ||
602 | */ | ||
603 | |||
604 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | ||
605 | switch (CPUCLOCK_WHICH(timer->it_clock)) { | ||
606 | default: | ||
607 | BUG(); | ||
608 | case CPUCLOCK_PROF: | ||
609 | if (cputime_eq(p->it_prof_expires, | ||
610 | cputime_zero) || | ||
611 | cputime_gt(p->it_prof_expires, | ||
612 | nt->expires.cpu)) | ||
613 | p->it_prof_expires = nt->expires.cpu; | ||
614 | break; | ||
615 | case CPUCLOCK_VIRT: | ||
616 | if (cputime_eq(p->it_virt_expires, | ||
617 | cputime_zero) || | ||
618 | cputime_gt(p->it_virt_expires, | ||
619 | nt->expires.cpu)) | ||
620 | p->it_virt_expires = nt->expires.cpu; | ||
621 | break; | ||
622 | case CPUCLOCK_SCHED: | ||
623 | if (p->it_sched_expires == 0 || | ||
624 | p->it_sched_expires > nt->expires.sched) | ||
625 | p->it_sched_expires = nt->expires.sched; | ||
626 | break; | ||
627 | } | ||
628 | } else { | ||
629 | /* | ||
630 | * For a process timer, we must balance | ||
631 | * all the live threads' expirations. | ||
632 | */ | ||
633 | switch (CPUCLOCK_WHICH(timer->it_clock)) { | ||
634 | default: | ||
635 | BUG(); | ||
636 | case CPUCLOCK_VIRT: | ||
637 | if (!cputime_eq(p->signal->it_virt_expires, | ||
638 | cputime_zero) && | ||
639 | cputime_lt(p->signal->it_virt_expires, | ||
640 | timer->it.cpu.expires.cpu)) | ||
641 | break; | ||
642 | goto rebalance; | ||
643 | case CPUCLOCK_PROF: | ||
644 | if (!cputime_eq(p->signal->it_prof_expires, | ||
645 | cputime_zero) && | ||
646 | cputime_lt(p->signal->it_prof_expires, | ||
647 | timer->it.cpu.expires.cpu)) | ||
648 | break; | ||
649 | i = p->signal->rlim[RLIMIT_CPU].rlim_cur; | ||
650 | if (i != RLIM_INFINITY && | ||
651 | i <= cputime_to_secs(timer->it.cpu.expires.cpu)) | ||
652 | break; | ||
653 | goto rebalance; | ||
654 | case CPUCLOCK_SCHED: | ||
655 | rebalance: | ||
656 | process_timer_rebalance( | ||
657 | timer->it.cpu.task, | ||
658 | CPUCLOCK_WHICH(timer->it_clock), | ||
659 | timer->it.cpu.expires, now); | ||
660 | break; | ||
661 | } | ||
662 | } | ||
663 | } | ||
664 | |||
665 | spin_unlock(&p->sighand->siglock); | ||
666 | } | ||
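The insertion loop above keeps each cpu_timers list sorted by expiry time, so only the head entry ever needs to be checked on a tick. A minimal standalone sketch of the same ordering idea, using a plain singly linked list with hypothetical types (illustrative only, not part of this file):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct cpu_timer_list: nodes ordered by expiry. */
struct sleeper {
	unsigned long long expires;
	struct sleeper *next;
};

/* Insert so the list stays sorted; the head is always the earliest expiry. */
static void insert_sorted(struct sleeper **head, struct sleeper *nt)
{
	struct sleeper **pos = head;

	while (*pos && (*pos)->expires <= nt->expires)
		pos = &(*pos)->next;
	nt->next = *pos;
	*pos = nt;
}

int main(void)
{
	struct sleeper a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };
	struct sleeper *head = NULL, *p;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);
	for (p = head; p; p = p->next)
		printf("%llu\n", p->expires);	/* prints 10 20 30 */
	return 0;
}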
667 | |||
668 | /* | ||
669 | * The timer is locked, fire it and arrange for its reload. | ||
670 | */ | ||
671 | static void cpu_timer_fire(struct k_itimer *timer) | ||
672 | { | ||
673 | if (unlikely(timer->sigq == NULL)) { | ||
674 | /* | ||
675 | * This a special case for clock_nanosleep, | ||
676 | * not a normal timer from sys_timer_create. | ||
677 | */ | ||
678 | wake_up_process(timer->it_process); | ||
679 | timer->it.cpu.expires.sched = 0; | ||
680 | } else if (timer->it.cpu.incr.sched == 0) { | ||
681 | /* | ||
682 | * One-shot timer. Clear it as soon as it's fired. | ||
683 | */ | ||
684 | posix_timer_event(timer, 0); | ||
685 | timer->it.cpu.expires.sched = 0; | ||
686 | } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { | ||
687 | /* | ||
688 | * The signal did not get queued because the signal | ||
689 | * was ignored, so we won't get any callback to | ||
690 | * reload the timer. But we need to keep it | ||
691 | * ticking in case the signal is deliverable next time. | ||
692 | */ | ||
693 | posix_cpu_timer_schedule(timer); | ||
694 | } | ||
695 | } | ||
696 | |||
697 | /* | ||
698 | * Guts of sys_timer_settime for CPU timers. | ||
699 | * This is called with the timer locked and interrupts disabled. | ||
700 | * If we return TIMER_RETRY, it's necessary to release the timer's lock | ||
701 | * and try again. (This happens when the timer is in the middle of firing.) | ||
702 | */ | ||
703 | int posix_cpu_timer_set(struct k_itimer *timer, int flags, | ||
704 | struct itimerspec *new, struct itimerspec *old) | ||
705 | { | ||
706 | struct task_struct *p = timer->it.cpu.task; | ||
707 | union cpu_time_count old_expires, new_expires, val; | ||
708 | int ret; | ||
709 | |||
710 | if (unlikely(p == NULL)) { | ||
711 | /* | ||
712 | * Timer refers to a dead task's clock. | ||
713 | */ | ||
714 | return -ESRCH; | ||
715 | } | ||
716 | |||
717 | new_expires = timespec_to_sample(timer->it_clock, &new->it_value); | ||
718 | |||
719 | read_lock(&tasklist_lock); | ||
720 | /* | ||
721 | * We need the tasklist_lock to protect against reaping that | ||
722 | * clears p->signal. If p has just been reaped, we can no | ||
723 | * longer get any information about it at all. | ||
724 | */ | ||
725 | if (unlikely(p->signal == NULL)) { | ||
726 | read_unlock(&tasklist_lock); | ||
727 | put_task_struct(p); | ||
728 | timer->it.cpu.task = NULL; | ||
729 | return -ESRCH; | ||
730 | } | ||
731 | |||
732 | /* | ||
733 | * Disarm any old timer after extracting its expiry time. | ||
734 | */ | ||
735 | BUG_ON(!irqs_disabled()); | ||
736 | spin_lock(&p->sighand->siglock); | ||
737 | old_expires = timer->it.cpu.expires; | ||
738 | list_del_init(&timer->it.cpu.entry); | ||
739 | spin_unlock(&p->sighand->siglock); | ||
740 | |||
741 | /* | ||
742 | * We need to sample the current clock value: to convert the new | ||
743 | * value from relative to absolute, and to convert the | ||
744 | * old value from absolute to relative. To set a process | ||
745 | * timer, we need a sample to balance the thread expiry | ||
746 | * times (in arm_timer). With an absolute time, we must | ||
747 | * check whether it has already passed. In short, we need a sample. | ||
748 | */ | ||
749 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | ||
750 | cpu_clock_sample(timer->it_clock, p, &val); | ||
751 | } else { | ||
752 | cpu_clock_sample_group(timer->it_clock, p, &val); | ||
753 | } | ||
754 | |||
755 | if (old) { | ||
756 | if (old_expires.sched == 0) { | ||
757 | old->it_value.tv_sec = 0; | ||
758 | old->it_value.tv_nsec = 0; | ||
759 | } else { | ||
760 | /* | ||
761 | * Update the timer in case it has | ||
762 | * already overrun. If it has, | ||
763 | * we report it as having overrun, | ||
764 | * with the next reloaded timer | ||
765 | * already ticking, though we | ||
766 | * swallow that pending | ||
767 | * notification here in order to | ||
768 | * install the new setting. | ||
769 | */ | ||
770 | bump_cpu_timer(timer, val); | ||
771 | if (cpu_time_before(timer->it_clock, val, | ||
772 | timer->it.cpu.expires)) { | ||
773 | old_expires = cpu_time_sub( | ||
774 | timer->it_clock, | ||
775 | timer->it.cpu.expires, val); | ||
776 | sample_to_timespec(timer->it_clock, | ||
777 | old_expires, | ||
778 | &old->it_value); | ||
779 | } else { | ||
780 | old->it_value.tv_nsec = 1; | ||
781 | old->it_value.tv_sec = 0; | ||
782 | } | ||
783 | } | ||
784 | } | ||
785 | |||
786 | if (unlikely(timer->it.cpu.firing)) { | ||
787 | /* | ||
788 | * We are colliding with the timer actually firing. | ||
789 | * Punt after filling in the timer's old value, and | ||
790 | * disable this firing since we are already reporting | ||
791 | * it as an overrun (thanks to bump_cpu_timer above). | ||
792 | */ | ||
793 | read_unlock(&tasklist_lock); | ||
794 | timer->it.cpu.firing = -1; | ||
795 | ret = TIMER_RETRY; | ||
796 | goto out; | ||
797 | } | ||
798 | |||
799 | if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { | ||
800 | cpu_time_add(timer->it_clock, &new_expires, val); | ||
801 | } | ||
802 | |||
803 | /* | ||
804 | * Install the new expiry time (or zero). | ||
805 | * For a timer with no notification action, we don't actually | ||
806 | * arm the timer (we'll just fake it for timer_gettime). | ||
807 | */ | ||
808 | timer->it.cpu.expires = new_expires; | ||
809 | if (new_expires.sched != 0 && | ||
810 | (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && | ||
811 | cpu_time_before(timer->it_clock, val, new_expires)) { | ||
812 | arm_timer(timer, val); | ||
813 | } | ||
814 | |||
815 | read_unlock(&tasklist_lock); | ||
816 | |||
817 | /* | ||
818 | * Install the new reload setting, and | ||
819 | * set up the signal and overrun bookkeeping. | ||
820 | */ | ||
821 | timer->it.cpu.incr = timespec_to_sample(timer->it_clock, | ||
822 | &new->it_interval); | ||
823 | |||
824 | /* | ||
825 | * This acts as a modification timestamp for the timer, | ||
826 | * so any automatic reload attempt will punt on seeing | ||
827 | * that we have reset the timer manually. | ||
828 | */ | ||
829 | timer->it_requeue_pending = (timer->it_requeue_pending + 2) & | ||
830 | ~REQUEUE_PENDING; | ||
831 | timer->it_overrun_last = 0; | ||
832 | timer->it_overrun = -1; | ||
833 | |||
834 | if (new_expires.sched != 0 && | ||
835 | (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && | ||
836 | !cpu_time_before(timer->it_clock, val, new_expires)) { | ||
837 | /* | ||
838 | * The designated time already passed, so we notify | ||
839 | * immediately, even if the thread never runs to | ||
840 | * accumulate more time on this clock. | ||
841 | */ | ||
842 | cpu_timer_fire(timer); | ||
843 | } | ||
844 | |||
845 | ret = 0; | ||
846 | out: | ||
847 | if (old) { | ||
848 | sample_to_timespec(timer->it_clock, | ||
849 | timer->it.cpu.incr, &old->it_interval); | ||
850 | } | ||
851 | return ret; | ||
852 | } | ||
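As the comment above notes, this is the guts of sys_timer_settime for CPU clocks; from user space it is reached through timer_settime() on a timer created against a CPU-time clock. A hedged example (assumes the platform supports timer_create() on CLOCK_PROCESS_CPUTIME_ID, as this file provides; typically linked with -lrt on glibc of this era):

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	timer_t tid;
	struct sigevent sev = { 0 };
	struct itimerspec its = { 0 }, old;

	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGALRM;
	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid) == -1) {
		perror("timer_create");
		return 1;
	}

	/* One second of process CPU time, relative; pass TIMER_ABSTIME in
	 * flags to interpret it_value as an absolute clock reading instead. */
	its.it_value.tv_sec = 1;
	if (timer_settime(tid, 0, &its, &old) == -1) {
		perror("timer_settime");
		return 1;
	}
	/* old now holds the previous setting, as filled in by the kernel. */
	printf("previous it_value: %ld.%09lds\n",
	       (long)old.it_value.tv_sec, old.it_value.tv_nsec);

	timer_delete(tid);
	return 0;
}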
853 | |||
854 | void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | ||
855 | { | ||
856 | union cpu_time_count now; | ||
857 | struct task_struct *p = timer->it.cpu.task; | ||
858 | int clear_dead; | ||
859 | |||
860 | /* | ||
861 | * Easy part: convert the reload time. | ||
862 | */ | ||
863 | sample_to_timespec(timer->it_clock, | ||
864 | timer->it.cpu.incr, &itp->it_interval); | ||
865 | |||
866 | if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ | ||
867 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; | ||
868 | return; | ||
869 | } | ||
870 | |||
871 | if (unlikely(p == NULL)) { | ||
872 | /* | ||
873 | * This task already died and the timer will never fire. | ||
874 | * In this case, expires is actually the dead value. | ||
875 | */ | ||
876 | dead: | ||
877 | sample_to_timespec(timer->it_clock, timer->it.cpu.expires, | ||
878 | &itp->it_value); | ||
879 | return; | ||
880 | } | ||
881 | |||
882 | /* | ||
883 | * Sample the clock to take the difference with the expiry time. | ||
884 | */ | ||
885 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | ||
886 | cpu_clock_sample(timer->it_clock, p, &now); | ||
887 | clear_dead = p->exit_state; | ||
888 | } else { | ||
889 | read_lock(&tasklist_lock); | ||
890 | if (unlikely(p->signal == NULL)) { | ||
891 | /* | ||
892 | * The process has been reaped. | ||
893 | * We can't even collect a sample any more. | ||
894 | * Call the timer disarmed, nothing else to do. | ||
895 | */ | ||
896 | put_task_struct(p); | ||
897 | timer->it.cpu.task = NULL; | ||
898 | timer->it.cpu.expires.sched = 0; | ||
899 | read_unlock(&tasklist_lock); | ||
900 | goto dead; | ||
901 | } else { | ||
902 | cpu_clock_sample_group(timer->it_clock, p, &now); | ||
903 | clear_dead = (unlikely(p->exit_state) && | ||
904 | thread_group_empty(p)); | ||
905 | } | ||
906 | read_unlock(&tasklist_lock); | ||
907 | } | ||
908 | |||
909 | if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { | ||
910 | if (timer->it.cpu.incr.sched == 0 && | ||
911 | cpu_time_before(timer->it_clock, | ||
912 | timer->it.cpu.expires, now)) { | ||
913 | /* | ||
914 | * Do-nothing timer expired and has no reload, | ||
915 | * so it's as if it was never set. | ||
916 | */ | ||
917 | timer->it.cpu.expires.sched = 0; | ||
918 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; | ||
919 | return; | ||
920 | } | ||
921 | /* | ||
922 | * Account for any expirations and reloads that should | ||
923 | * have happened. | ||
924 | */ | ||
925 | bump_cpu_timer(timer, now); | ||
926 | } | ||
927 | |||
928 | if (unlikely(clear_dead)) { | ||
929 | /* | ||
930 | * We've noticed that the thread is dead, but | ||
931 | * not yet reaped. Take this opportunity to | ||
932 | * drop our task ref. | ||
933 | */ | ||
934 | clear_dead_task(timer, now); | ||
935 | goto dead; | ||
936 | } | ||
937 | |||
938 | if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { | ||
939 | sample_to_timespec(timer->it_clock, | ||
940 | cpu_time_sub(timer->it_clock, | ||
941 | timer->it.cpu.expires, now), | ||
942 | &itp->it_value); | ||
943 | } else { | ||
944 | /* | ||
945 | * The timer should have expired already, but the firing | ||
946 | * hasn't taken place yet. Say it's just about to expire. | ||
947 | */ | ||
948 | itp->it_value.tv_nsec = 1; | ||
949 | itp->it_value.tv_sec = 0; | ||
950 | } | ||
951 | } | ||
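The read-back side above is what timer_gettime() reports: it_value is the time remaining on the clock, it_interval the reload. SIGEV_NONE timers, which are handled specially here, are only ever observed this way. A small illustrative program (error checking omitted; timer functions typically need -lrt):

#include <signal.h>
#include <stdio.h>
#include <time.h>

/* Burn a little CPU so the process CPU clock advances. */
static void spin(void)
{
	volatile unsigned long i;
	for (i = 0; i < 100000000UL; i++)
		;
}

int main(void)
{
	timer_t tid;
	struct sigevent sev = { 0 };
	struct itimerspec its = { 0 }, left;

	sev.sigev_notify = SIGEV_NONE;	/* never delivers a signal */
	timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid);

	its.it_value.tv_sec = 5;	/* 5s of CPU time, relative */
	timer_settime(tid, 0, &its, NULL);

	spin();
	timer_gettime(tid, &left);	/* remaining CPU time until expiry */
	printf("remaining: %ld.%09lds\n",
	       (long)left.it_value.tv_sec, left.it_value.tv_nsec);

	timer_delete(tid);
	return 0;
}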
952 | |||
953 | /* | ||
954 | * Check for any per-thread CPU timers that have fired and move them off | ||
955 | * the tsk->cpu_timers[N] list onto the firing list. Here we update the | ||
956 | * tsk->it_*_expires values to reflect the remaining thread CPU timers. | ||
957 | */ | ||
958 | static void check_thread_timers(struct task_struct *tsk, | ||
959 | struct list_head *firing) | ||
960 | { | ||
961 | struct list_head *timers = tsk->cpu_timers; | ||
962 | |||
963 | tsk->it_prof_expires = cputime_zero; | ||
964 | while (!list_empty(timers)) { | ||
965 | struct cpu_timer_list *t = list_entry(timers->next, | ||
966 | struct cpu_timer_list, | ||
967 | entry); | ||
968 | if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { | ||
969 | tsk->it_prof_expires = t->expires.cpu; | ||
970 | break; | ||
971 | } | ||
972 | t->firing = 1; | ||
973 | list_move_tail(&t->entry, firing); | ||
974 | } | ||
975 | |||
976 | ++timers; | ||
977 | tsk->it_virt_expires = cputime_zero; | ||
978 | while (!list_empty(timers)) { | ||
979 | struct cpu_timer_list *t = list_entry(timers->next, | ||
980 | struct cpu_timer_list, | ||
981 | entry); | ||
982 | if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { | ||
983 | tsk->it_virt_expires = t->expires.cpu; | ||
984 | break; | ||
985 | } | ||
986 | t->firing = 1; | ||
987 | list_move_tail(&t->entry, firing); | ||
988 | } | ||
989 | |||
990 | ++timers; | ||
991 | tsk->it_sched_expires = 0; | ||
992 | while (!list_empty(timers)) { | ||
993 | struct cpu_timer_list *t = list_entry(timers->next, | ||
994 | struct cpu_timer_list, | ||
995 | entry); | ||
996 | if (tsk->sched_time < t->expires.sched) { | ||
997 | tsk->it_sched_expires = t->expires.sched; | ||
998 | break; | ||
999 | } | ||
1000 | t->firing = 1; | ||
1001 | list_move_tail(&t->entry, firing); | ||
1002 | } | ||
1003 | } | ||
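The per-thread lists walked above are driven by each thread's own CPU-time clock. From user space, another thread's clock can be read via pthread_getcpuclockid(), a hedged sketch assuming NPTL/glibc support (link with -lpthread, and -lrt for clock_gettime on older glibc):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static void *worker(void *arg)
{
	volatile unsigned long i;
	for (i = 0; i < 200000000UL; i++)	/* consume some CPU time */
		;
	return NULL;
}

int main(void)
{
	pthread_t th;
	clockid_t cid;
	struct timespec ts;

	pthread_create(&th, NULL, worker, NULL);
	/* Clock that measures CPU time consumed by that one thread. */
	if (pthread_getcpuclockid(th, &cid) == 0 &&
	    clock_gettime(cid, &ts) == 0)
		printf("worker CPU time so far: %ld.%09lds\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	pthread_join(th, NULL);
	return 0;
}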
1004 | |||
1005 | /* | ||
1006 | * Check for any process-wide CPU timers that have fired and move them | ||
1007 | * off the tsk->signal->cpu_timers[N] lists onto the firing list. Per-thread | ||
1008 | * timers have already been taken off. | ||
1009 | */ | ||
1010 | static void check_process_timers(struct task_struct *tsk, | ||
1011 | struct list_head *firing) | ||
1012 | { | ||
1013 | struct signal_struct *const sig = tsk->signal; | ||
1014 | cputime_t utime, stime, ptime, virt_expires, prof_expires; | ||
1015 | unsigned long long sched_time, sched_expires; | ||
1016 | struct task_struct *t; | ||
1017 | struct list_head *timers = sig->cpu_timers; | ||
1018 | |||
1019 | /* | ||
1020 | * Don't sample the current process CPU clocks if there are no timers. | ||
1021 | */ | ||
1022 | if (list_empty(&timers[CPUCLOCK_PROF]) && | ||
1023 | cputime_eq(sig->it_prof_expires, cputime_zero) && | ||
1024 | sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && | ||
1025 | list_empty(&timers[CPUCLOCK_VIRT]) && | ||
1026 | cputime_eq(sig->it_virt_expires, cputime_zero) && | ||
1027 | list_empty(&timers[CPUCLOCK_SCHED])) | ||
1028 | return; | ||
1029 | |||
1030 | /* | ||
1031 | * Collect the current process totals. | ||
1032 | */ | ||
1033 | utime = sig->utime; | ||
1034 | stime = sig->stime; | ||
1035 | sched_time = sig->sched_time; | ||
1036 | t = tsk; | ||
1037 | do { | ||
1038 | utime = cputime_add(utime, t->utime); | ||
1039 | stime = cputime_add(stime, t->stime); | ||
1040 | sched_time += t->sched_time; | ||
1041 | t = next_thread(t); | ||
1042 | } while (t != tsk); | ||
1043 | ptime = cputime_add(utime, stime); | ||
1044 | |||
1045 | prof_expires = cputime_zero; | ||
1046 | while (!list_empty(timers)) { | ||
1047 | struct cpu_timer_list *t = list_entry(timers->next, | ||
1048 | struct cpu_timer_list, | ||
1049 | entry); | ||
1050 | if (cputime_lt(ptime, t->expires.cpu)) { | ||
1051 | prof_expires = t->expires.cpu; | ||
1052 | break; | ||
1053 | } | ||
1054 | t->firing = 1; | ||
1055 | list_move_tail(&t->entry, firing); | ||
1056 | } | ||
1057 | |||
1058 | ++timers; | ||
1059 | virt_expires = cputime_zero; | ||
1060 | while (!list_empty(timers)) { | ||
1061 | struct cpu_timer_list *t = list_entry(timers->next, | ||
1062 | struct cpu_timer_list, | ||
1063 | entry); | ||
1064 | if (cputime_lt(utime, t->expires.cpu)) { | ||
1065 | virt_expires = t->expires.cpu; | ||
1066 | break; | ||
1067 | } | ||
1068 | t->firing = 1; | ||
1069 | list_move_tail(&t->entry, firing); | ||
1070 | } | ||
1071 | |||
1072 | ++timers; | ||
1073 | sched_expires = 0; | ||
1074 | while (!list_empty(timers)) { | ||
1075 | struct cpu_timer_list *t = list_entry(timers->next, | ||
1076 | struct cpu_timer_list, | ||
1077 | entry); | ||
1078 | if (sched_time < t->expires.sched) { | ||
1079 | sched_expires = t->expires.sched; | ||
1080 | break; | ||
1081 | } | ||
1082 | t->firing = 1; | ||
1083 | list_move_tail(&t->entry, firing); | ||
1084 | } | ||
1085 | |||
1086 | /* | ||
1087 | * Check for the special case process timers. | ||
1088 | */ | ||
1089 | if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { | ||
1090 | if (cputime_ge(ptime, sig->it_prof_expires)) { | ||
1091 | /* ITIMER_PROF fires and reloads. */ | ||
1092 | sig->it_prof_expires = sig->it_prof_incr; | ||
1093 | if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { | ||
1094 | sig->it_prof_expires = cputime_add( | ||
1095 | sig->it_prof_expires, ptime); | ||
1096 | } | ||
1097 | __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); | ||
1098 | } | ||
1099 | if (!cputime_eq(sig->it_prof_expires, cputime_zero) && | ||
1100 | (cputime_eq(prof_expires, cputime_zero) || | ||
1101 | cputime_lt(sig->it_prof_expires, prof_expires))) { | ||
1102 | prof_expires = sig->it_prof_expires; | ||
1103 | } | ||
1104 | } | ||
1105 | if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { | ||
1106 | if (cputime_ge(utime, sig->it_virt_expires)) { | ||
1107 | /* ITIMER_VIRTUAL fires and reloads. */ | ||
1108 | sig->it_virt_expires = sig->it_virt_incr; | ||
1109 | if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { | ||
1110 | sig->it_virt_expires = cputime_add( | ||
1111 | sig->it_virt_expires, utime); | ||
1112 | } | ||
1113 | __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); | ||
1114 | } | ||
1115 | if (!cputime_eq(sig->it_virt_expires, cputime_zero) && | ||
1116 | (cputime_eq(virt_expires, cputime_zero) || | ||
1117 | cputime_lt(sig->it_virt_expires, virt_expires))) { | ||
1118 | virt_expires = sig->it_virt_expires; | ||
1119 | } | ||
1120 | } | ||
1121 | if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { | ||
1122 | unsigned long psecs = cputime_to_secs(ptime); | ||
1123 | cputime_t x; | ||
1124 | if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { | ||
1125 | /* | ||
1126 | * At the hard limit, we just die. | ||
1127 | * No need to calculate anything else now. | ||
1128 | */ | ||
1129 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | ||
1130 | return; | ||
1131 | } | ||
1132 | if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { | ||
1133 | /* | ||
1134 | * At the soft limit, send a SIGXCPU every second. | ||
1135 | */ | ||
1136 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | ||
1137 | if (sig->rlim[RLIMIT_CPU].rlim_cur | ||
1138 | < sig->rlim[RLIMIT_CPU].rlim_max) { | ||
1139 | sig->rlim[RLIMIT_CPU].rlim_cur++; | ||
1140 | } | ||
1141 | } | ||
1142 | x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | ||
1143 | if (cputime_eq(prof_expires, cputime_zero) || | ||
1144 | cputime_lt(x, prof_expires)) { | ||
1145 | prof_expires = x; | ||
1146 | } | ||
1147 | } | ||
1148 | |||
1149 | if (!cputime_eq(prof_expires, cputime_zero) || | ||
1150 | !cputime_eq(virt_expires, cputime_zero) || | ||
1151 | sched_expires != 0) { | ||
1152 | /* | ||
1153 | * Rebalance the threads' expiry times for the remaining | ||
1154 | * process CPU timers. | ||
1155 | */ | ||
1156 | |||
1157 | cputime_t prof_left, virt_left, ticks; | ||
1158 | unsigned long long sched_left, sched; | ||
1159 | const unsigned int nthreads = atomic_read(&sig->live); | ||
1160 | |||
1161 | prof_left = cputime_sub(prof_expires, utime); | ||
1162 | prof_left = cputime_sub(prof_left, stime); | ||
1163 | prof_left = cputime_div(prof_left, nthreads); | ||
1164 | virt_left = cputime_sub(virt_expires, utime); | ||
1165 | virt_left = cputime_div(virt_left, nthreads); | ||
1166 | if (sched_expires) { | ||
1167 | sched_left = sched_expires - sched_time; | ||
1168 | do_div(sched_left, nthreads); | ||
1169 | } else { | ||
1170 | sched_left = 0; | ||
1171 | } | ||
1172 | t = tsk; | ||
1173 | do { | ||
1174 | ticks = cputime_add(cputime_add(t->utime, t->stime), | ||
1175 | prof_left); | ||
1176 | if (!cputime_eq(prof_expires, cputime_zero) && | ||
1177 | (cputime_eq(t->it_prof_expires, cputime_zero) || | ||
1178 | cputime_gt(t->it_prof_expires, ticks))) { | ||
1179 | t->it_prof_expires = ticks; | ||
1180 | } | ||
1181 | |||
1182 | ticks = cputime_add(t->utime, virt_left); | ||
1183 | if (!cputime_eq(virt_expires, cputime_zero) && | ||
1184 | (cputime_eq(t->it_virt_expires, cputime_zero) || | ||
1185 | cputime_gt(t->it_virt_expires, ticks))) { | ||
1186 | t->it_virt_expires = ticks; | ||
1187 | } | ||
1188 | |||
1189 | sched = t->sched_time + sched_left; | ||
1190 | if (sched_expires && (t->it_sched_expires == 0 || | ||
1191 | t->it_sched_expires > sched)) { | ||
1192 | t->it_sched_expires = sched; | ||
1193 | } | ||
1194 | |||
1195 | do { | ||
1196 | t = next_thread(t); | ||
1197 | } while (unlikely(t->exit_state)); | ||
1198 | } while (t != tsk); | ||
1199 | } | ||
1200 | } | ||
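The RLIMIT_CPU branch above is observable directly from user space: once the process exceeds the soft limit it receives SIGXCPU roughly once per second, and at the hard limit it is killed with SIGKILL. An illustrative example (the 1s/3s limits are arbitrary):

#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

static void on_xcpu(int sig)
{
	/* write() is async-signal-safe; printf() is not. */
	static const char msg[] = "SIGXCPU: soft RLIMIT_CPU exceeded\n";
	write(2, msg, sizeof(msg) - 1);
}

int main(void)
{
	struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };	/* seconds */

	signal(SIGXCPU, on_xcpu);
	setrlimit(RLIMIT_CPU, &rl);

	/* Burn CPU: SIGXCPU after ~1s, repeated each second, SIGKILL at ~3s. */
	for (;;)
		;
	return 0;
}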
1201 | |||
1202 | /* | ||
1203 | * This is called from the signal code (via do_schedule_next_timer) | ||
1204 | * when the last timer signal was delivered and we have to reload the timer. | ||
1205 | */ | ||
1206 | void posix_cpu_timer_schedule(struct k_itimer *timer) | ||
1207 | { | ||
1208 | struct task_struct *p = timer->it.cpu.task; | ||
1209 | union cpu_time_count now; | ||
1210 | |||
1211 | if (unlikely(p == NULL)) | ||
1212 | /* | ||
1213 | * The task was cleaned up already, no future firings. | ||
1214 | */ | ||
1215 | return; | ||
1216 | |||
1217 | /* | ||
1218 | * Fetch the current sample and update the timer's expiry time. | ||
1219 | */ | ||
1220 | if (CPUCLOCK_PERTHREAD(timer->it_clock)) { | ||
1221 | cpu_clock_sample(timer->it_clock, p, &now); | ||
1222 | bump_cpu_timer(timer, now); | ||
1223 | if (unlikely(p->exit_state)) { | ||
1224 | clear_dead_task(timer, now); | ||
1225 | return; | ||
1226 | } | ||
1227 | read_lock(&tasklist_lock); /* arm_timer needs it. */ | ||
1228 | } else { | ||
1229 | read_lock(&tasklist_lock); | ||
1230 | if (unlikely(p->signal == NULL)) { | ||
1231 | /* | ||
1232 | * The process has been reaped. | ||
1233 | * We can't even collect a sample any more. | ||
1234 | */ | ||
1235 | put_task_struct(p); | ||
1236 | timer->it.cpu.task = p = NULL; | ||
1237 | timer->it.cpu.expires.sched = 0; | ||
1238 | read_unlock(&tasklist_lock); | ||
1239 | return; | ||
1240 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | ||
1241 | /* | ||
1242 | * We've noticed that the thread is dead, but | ||
1243 | * not yet reaped. Take this opportunity to | ||
1244 | * drop our task ref. | ||
1245 | */ | ||
1246 | clear_dead_task(timer, now); | ||
1247 | read_unlock(&tasklist_lock); | ||
1248 | return; | ||
1249 | } | ||
1250 | cpu_clock_sample_group(timer->it_clock, p, &now); | ||
1251 | bump_cpu_timer(timer, now); | ||
1252 | /* Leave the tasklist_lock locked for the call below. */ | ||
1253 | } | ||
1254 | |||
1255 | /* | ||
1256 | * Now re-arm for the new expiry time. | ||
1257 | */ | ||
1258 | arm_timer(timer, now); | ||
1259 | |||
1260 | read_unlock(&tasklist_lock); | ||
1261 | } | ||
1262 | |||
1263 | /* | ||
1264 | * This is called from the timer interrupt handler. The irq handler has | ||
1265 | * already updated our counts. We need to check if any timers fire now. | ||
1266 | * Interrupts are disabled. | ||
1267 | */ | ||
1268 | void run_posix_cpu_timers(struct task_struct *tsk) | ||
1269 | { | ||
1270 | LIST_HEAD(firing); | ||
1271 | struct k_itimer *timer, *next; | ||
1272 | |||
1273 | BUG_ON(!irqs_disabled()); | ||
1274 | |||
1275 | #define UNEXPIRED(clock) \ | ||
1276 | (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ | ||
1277 | cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) | ||
1278 | |||
1279 | if (UNEXPIRED(prof) && UNEXPIRED(virt) && | ||
1280 | (tsk->it_sched_expires == 0 || | ||
1281 | tsk->sched_time < tsk->it_sched_expires)) | ||
1282 | return; | ||
1283 | |||
1284 | #undef UNEXPIRED | ||
1285 | |||
1286 | BUG_ON(tsk->exit_state); | ||
1287 | |||
1288 | /* | ||
1289 | * Double-check with locks held. | ||
1290 | */ | ||
1291 | read_lock(&tasklist_lock); | ||
1292 | spin_lock(&tsk->sighand->siglock); | ||
1293 | |||
1294 | /* | ||
1295 | * Here we take all the timers that are firing off the tsk->cpu_timers[N] | ||
1296 | * and tsk->signal->cpu_timers[N] lists and put them on the firing list. | ||
1297 | */ | ||
1298 | check_thread_timers(tsk, &firing); | ||
1299 | check_process_timers(tsk, &firing); | ||
1300 | |||
1301 | /* | ||
1302 | * We must release these locks before taking any timer's lock. | ||
1303 | * There is a potential race with timer deletion here, as the | ||
1304 | * siglock now protects our private firing list. We have set | ||
1305 | * the firing flag in each timer, so that a deletion attempt | ||
1306 | * that gets the timer lock before we do will give it up and | ||
1307 | * spin until we've taken care of that timer below. | ||
1308 | */ | ||
1309 | spin_unlock(&tsk->sighand->siglock); | ||
1310 | read_unlock(&tasklist_lock); | ||
1311 | |||
1312 | /* | ||
1313 | * Now that all the timers on our list have the firing flag, | ||
1314 | * no one will touch their list entries but us. We'll take | ||
1315 | * each timer's lock before clearing its firing flag, so no | ||
1316 | * timer call will interfere. | ||
1317 | */ | ||
1318 | list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { | ||
1319 | int firing; | ||
1320 | spin_lock(&timer->it_lock); | ||
1321 | list_del_init(&timer->it.cpu.entry); | ||
1322 | firing = timer->it.cpu.firing; | ||
1323 | timer->it.cpu.firing = 0; | ||
1324 | /* | ||
1325 | * The firing flag is -1 if we collided with a reset | ||
1326 | * of the timer, which already reported this | ||
1327 | * almost-firing as an overrun. So don't generate an event. | ||
1328 | */ | ||
1329 | if (likely(firing >= 0)) { | ||
1330 | cpu_timer_fire(timer); | ||
1331 | } | ||
1332 | spin_unlock(&timer->it_lock); | ||
1333 | } | ||
1334 | } | ||
1335 | |||
1336 | /* | ||
1337 | * Set one of the process-wide special case CPU timers. | ||
1338 | * The tasklist_lock and tsk->sighand->siglock must be held by the caller. | ||
1339 | * The oldval argument is null for the RLIMIT_CPU timer, where *newval is | ||
1340 | * absolute; non-null for ITIMER_*, where *newval is relative and we update | ||
1341 | * it to be absolute, *oldval is absolute and we update it to be relative. | ||
1342 | */ | ||
1343 | void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | ||
1344 | cputime_t *newval, cputime_t *oldval) | ||
1345 | { | ||
1346 | union cpu_time_count now; | ||
1347 | struct list_head *head; | ||
1348 | |||
1349 | BUG_ON(clock_idx == CPUCLOCK_SCHED); | ||
1350 | cpu_clock_sample_group_locked(clock_idx, tsk, &now); | ||
1351 | |||
1352 | if (oldval) { | ||
1353 | if (!cputime_eq(*oldval, cputime_zero)) { | ||
1354 | if (cputime_le(*oldval, now.cpu)) { | ||
1355 | /* Just about to fire. */ | ||
1356 | *oldval = jiffies_to_cputime(1); | ||
1357 | } else { | ||
1358 | *oldval = cputime_sub(*oldval, now.cpu); | ||
1359 | } | ||
1360 | } | ||
1361 | |||
1362 | if (cputime_eq(*newval, cputime_zero)) | ||
1363 | return; | ||
1364 | *newval = cputime_add(*newval, now.cpu); | ||
1365 | |||
1366 | /* | ||
1367 | * If the RLIMIT_CPU timer will expire before the | ||
1368 | * ITIMER_PROF timer, we have nothing else to do. | ||
1369 | */ | ||
1370 | if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur | ||
1371 | < cputime_to_secs(*newval)) | ||
1372 | return; | ||
1373 | } | ||
1374 | |||
1375 | /* | ||
1376 | * Check whether there are any process timers already set to fire | ||
1377 | * before this one. If so, we don't have anything more to do. | ||
1378 | */ | ||
1379 | head = &tsk->signal->cpu_timers[clock_idx]; | ||
1380 | if (list_empty(head) || | ||
1381 | cputime_ge(list_entry(head->next, | ||
1382 | struct cpu_timer_list, entry)->expires.cpu, | ||
1383 | *newval)) { | ||
1384 | /* | ||
1385 | * Rejigger each thread's expiry time so that one will | ||
1386 | * notice before we hit the process-cumulative expiry time. | ||
1387 | */ | ||
1388 | union cpu_time_count expires = { .sched = 0 }; | ||
1389 | expires.cpu = *newval; | ||
1390 | process_timer_rebalance(tsk, clock_idx, expires, now); | ||
1391 | } | ||
1392 | } | ||
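As the comment above says, this helper backs both the RLIMIT_CPU timer and the classic ITIMER_PROF/ITIMER_VIRTUAL interval timers; from user space the latter are driven by setitimer(). A small example using ITIMER_PROF, which delivers SIGPROF based on the process's user+system CPU time:

#include <signal.h>
#include <stdio.h>
#include <sys/time.h>

static volatile sig_atomic_t ticks;

static void on_prof(int sig)
{
	ticks++;
}

int main(void)
{
	struct itimerval it = {
		.it_interval = { .tv_sec = 0, .tv_usec = 100000 },	/* reload: 100 ms */
		.it_value    = { .tv_sec = 0, .tv_usec = 100000 },	/* first expiry */
	};
	volatile unsigned long i;

	signal(SIGPROF, on_prof);
	setitimer(ITIMER_PROF, &it, NULL);

	for (i = 0; i < 500000000UL; i++)	/* consume CPU time */
		;
	printf("SIGPROF delivered %d times\n", (int)ticks);
	return 0;
}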
1393 | |||
1394 | static long posix_cpu_clock_nanosleep_restart(struct restart_block *); | ||
1395 | |||
1396 | int posix_cpu_nsleep(clockid_t which_clock, int flags, | ||
1397 | struct timespec *rqtp) | ||
1398 | { | ||
1399 | struct restart_block *restart_block = | ||
1400 | ¤t_thread_info()->restart_block; | ||
1401 | struct k_itimer timer; | ||
1402 | int error; | ||
1403 | |||
1404 | /* | ||
1405 | * Diagnose required errors first. | ||
1406 | */ | ||
1407 | if (CPUCLOCK_PERTHREAD(which_clock) && | ||
1408 | (CPUCLOCK_PID(which_clock) == 0 || | ||
1409 | CPUCLOCK_PID(which_clock) == current->pid)) | ||
1410 | return -EINVAL; | ||
1411 | |||
1412 | /* | ||
1413 | * Set up a temporary timer and then wait for it to go off. | ||
1414 | */ | ||
1415 | memset(&timer, 0, sizeof timer); | ||
1416 | spin_lock_init(&timer.it_lock); | ||
1417 | timer.it_clock = which_clock; | ||
1418 | timer.it_overrun = -1; | ||
1419 | error = posix_cpu_timer_create(&timer); | ||
1420 | timer.it_process = current; | ||
1421 | if (!error) { | ||
1422 | struct timespec __user *rmtp; | ||
1423 | static struct itimerspec zero_it; | ||
1424 | struct itimerspec it = { .it_value = *rqtp, | ||
1425 | .it_interval = {} }; | ||
1426 | |||
1427 | spin_lock_irq(&timer.it_lock); | ||
1428 | error = posix_cpu_timer_set(&timer, flags, &it, NULL); | ||
1429 | if (error) { | ||
1430 | spin_unlock_irq(&timer.it_lock); | ||
1431 | return error; | ||
1432 | } | ||
1433 | |||
1434 | while (!signal_pending(current)) { | ||
1435 | if (timer.it.cpu.expires.sched == 0) { | ||
1436 | /* | ||
1437 | * Our timer fired and was reset. | ||
1438 | */ | ||
1439 | spin_unlock_irq(&timer.it_lock); | ||
1440 | return 0; | ||
1441 | } | ||
1442 | |||
1443 | /* | ||
1444 | * Block until cpu_timer_fire (or a signal) wakes us. | ||
1445 | */ | ||
1446 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1447 | spin_unlock_irq(&timer.it_lock); | ||
1448 | schedule(); | ||
1449 | spin_lock_irq(&timer.it_lock); | ||
1450 | } | ||
1451 | |||
1452 | /* | ||
1453 | * We were interrupted by a signal. | ||
1454 | */ | ||
1455 | sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); | ||
1456 | posix_cpu_timer_set(&timer, 0, &zero_it, &it); | ||
1457 | spin_unlock_irq(&timer.it_lock); | ||
1458 | |||
1459 | if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { | ||
1460 | /* | ||
1461 | * It actually did fire already. | ||
1462 | */ | ||
1463 | return 0; | ||
1464 | } | ||
1465 | |||
1466 | /* | ||
1467 | * Report back to the user the time still remaining. | ||
1468 | */ | ||
1469 | rmtp = (struct timespec __user *) restart_block->arg1; | ||
1470 | if (rmtp != NULL && !(flags & TIMER_ABSTIME) && | ||
1471 | copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | ||
1472 | return -EFAULT; | ||
1473 | |||
1474 | restart_block->fn = posix_cpu_clock_nanosleep_restart; | ||
1475 | /* Caller already set restart_block->arg1 */ | ||
1476 | restart_block->arg0 = which_clock; | ||
1477 | restart_block->arg2 = rqtp->tv_sec; | ||
1478 | restart_block->arg3 = rqtp->tv_nsec; | ||
1479 | |||
1480 | error = -ERESTART_RESTARTBLOCK; | ||
1481 | } | ||
1482 | |||
1483 | return error; | ||
1484 | } | ||
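User space reaches this path through clock_nanosleep() on a CPU-time clock. Note that sleeping on the calling thread's own clock is rejected with EINVAL (thread_cpu_nsleep() further down does the same for CLOCK_THREAD_CPUTIME_ID). A hedged example in which a helper thread burns CPU so the process clock can actually advance while the main thread sleeps (link with -lpthread, and -lrt on older glibc):

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static void *burn(void *arg)
{
	for (;;)	/* consume CPU time so the process clock advances */
		;
	return NULL;
}

int main(void)
{
	pthread_t th;
	struct timespec until = { .tv_sec = 2, .tv_nsec = 0 };
	int err;

	pthread_create(&th, NULL, burn, NULL);

	/* Returns once the whole process has consumed 2s of CPU time. */
	err = clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, TIMER_ABSTIME,
			      &until, NULL);
	printf("process clock sleep -> %s\n", err ? strerror(err) : "ok");

	/* Sleeping on the calling thread's own CPU clock is rejected. */
	err = clock_nanosleep(CLOCK_THREAD_CPUTIME_ID, TIMER_ABSTIME,
			      &until, NULL);
	printf("thread clock sleep  -> %s\n", strerror(err));
	return 0;
}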
1485 | |||
1486 | static long | ||
1487 | posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) | ||
1488 | { | ||
1489 | clockid_t which_clock = restart_block->arg0; | ||
1490 | struct timespec t = { .tv_sec = restart_block->arg2, | ||
1491 | .tv_nsec = restart_block->arg3 }; | ||
1492 | restart_block->fn = do_no_restart_syscall; | ||
1493 | return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); | ||
1494 | } | ||
1495 | |||
1496 | |||
1497 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) | ||
1498 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) | ||
1499 | |||
1500 | static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | ||
1501 | { | ||
1502 | return posix_cpu_clock_getres(PROCESS_CLOCK, tp); | ||
1503 | } | ||
1504 | static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | ||
1505 | { | ||
1506 | return posix_cpu_clock_get(PROCESS_CLOCK, tp); | ||
1507 | } | ||
1508 | static int process_cpu_timer_create(struct k_itimer *timer) | ||
1509 | { | ||
1510 | timer->it_clock = PROCESS_CLOCK; | ||
1511 | return posix_cpu_timer_create(timer); | ||
1512 | } | ||
1513 | static int process_cpu_nsleep(clockid_t which_clock, int flags, | ||
1514 | struct timespec *rqtp) | ||
1515 | { | ||
1516 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); | ||
1517 | } | ||
1518 | static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | ||
1519 | { | ||
1520 | return posix_cpu_clock_getres(THREAD_CLOCK, tp); | ||
1521 | } | ||
1522 | static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | ||
1523 | { | ||
1524 | return posix_cpu_clock_get(THREAD_CLOCK, tp); | ||
1525 | } | ||
1526 | static int thread_cpu_timer_create(struct k_itimer *timer) | ||
1527 | { | ||
1528 | timer->it_clock = THREAD_CLOCK; | ||
1529 | return posix_cpu_timer_create(timer); | ||
1530 | } | ||
1531 | static int thread_cpu_nsleep(clockid_t which_clock, int flags, | ||
1532 | struct timespec *rqtp) | ||
1533 | { | ||
1534 | return -EINVAL; | ||
1535 | } | ||
1536 | |||
1537 | static __init int init_posix_cpu_timers(void) | ||
1538 | { | ||
1539 | struct k_clock process = { | ||
1540 | .clock_getres = process_cpu_clock_getres, | ||
1541 | .clock_get = process_cpu_clock_get, | ||
1542 | .clock_set = do_posix_clock_nosettime, | ||
1543 | .timer_create = process_cpu_timer_create, | ||
1544 | .nsleep = process_cpu_nsleep, | ||
1545 | }; | ||
1546 | struct k_clock thread = { | ||
1547 | .clock_getres = thread_cpu_clock_getres, | ||
1548 | .clock_get = thread_cpu_clock_get, | ||
1549 | .clock_set = do_posix_clock_nosettime, | ||
1550 | .timer_create = thread_cpu_timer_create, | ||
1551 | .nsleep = thread_cpu_nsleep, | ||
1552 | }; | ||
1553 | |||
1554 | register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); | ||
1555 | register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); | ||
1556 | |||
1557 | return 0; | ||
1558 | } | ||
1559 | __initcall(init_posix_cpu_timers); | ||
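The two k_clock structures registered here are what clock_getres() and clock_gettime() hit for CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID. A short illustrative query program (link with -lrt on older glibc):

#include <stdio.h>
#include <time.h>

static void show(const char *name, clockid_t id)
{
	struct timespec res, now;

	if (clock_getres(id, &res) == 0 && clock_gettime(id, &now) == 0)
		printf("%-26s res %ld.%09lds, value %ld.%09lds\n", name,
		       (long)res.tv_sec, res.tv_nsec,
		       (long)now.tv_sec, now.tv_nsec);
}

int main(void)
{
	show("CLOCK_PROCESS_CPUTIME_ID", CLOCK_PROCESS_CPUTIME_ID);
	show("CLOCK_THREAD_CPUTIME_ID", CLOCK_THREAD_CPUTIME_ID);
	return 0;
}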
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c new file mode 100644 index 000000000000..fd316c272260 --- /dev/null +++ b/kernel/posix-timers.c | |||
@@ -0,0 +1,1584 @@ | |||
1 | /* | ||
2 | * linux/kernel/posix_timers.c | ||
3 | * | ||
4 | * | ||
5 | * 2002-10-15 Posix Clocks & timers | ||
6 | * by George Anzinger george@mvista.com | ||
7 | * | ||
8 | * Copyright (C) 2002 2003 by MontaVista Software. | ||
9 | * | ||
10 | * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. | ||
11 | * Copyright (C) 2004 Boris Hu | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or modify | ||
14 | * it under the terms of the GNU General Public License as published by | ||
15 | * the Free Software Foundation; either version 2 of the License, or (at | ||
16 | * your option) any later version. | ||
17 | * | ||
18 | * This program is distributed in the hope that it will be useful, but | ||
19 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
21 | * General Public License for more details. | ||
22 | |||
23 | * You should have received a copy of the GNU General Public License | ||
24 | * along with this program; if not, write to the Free Software | ||
25 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
26 | * | ||
27 | * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA | ||
28 | */ | ||
29 | |||
30 | /* These are all the functions necessary to implement | ||
31 | * POSIX clocks & timers | ||
32 | */ | ||
33 | #include <linux/mm.h> | ||
34 | #include <linux/smp_lock.h> | ||
35 | #include <linux/interrupt.h> | ||
36 | #include <linux/slab.h> | ||
37 | #include <linux/time.h> | ||
38 | |||
39 | #include <asm/uaccess.h> | ||
40 | #include <asm/semaphore.h> | ||
41 | #include <linux/list.h> | ||
42 | #include <linux/init.h> | ||
43 | #include <linux/compiler.h> | ||
44 | #include <linux/idr.h> | ||
45 | #include <linux/posix-timers.h> | ||
46 | #include <linux/syscalls.h> | ||
47 | #include <linux/wait.h> | ||
48 | #include <linux/workqueue.h> | ||
49 | #include <linux/module.h> | ||
50 | |||
51 | #ifndef div_long_long_rem | ||
52 | #include <asm/div64.h> | ||
53 | |||
54 | #define div_long_long_rem(dividend,divisor,remainder) ({ \ | ||
55 | u64 result = dividend; \ | ||
56 | *remainder = do_div(result,divisor); \ | ||
57 | result; }) | ||
58 | |||
59 | #endif | ||
60 | #define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */ | ||
61 | |||
62 | static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) | ||
63 | { | ||
64 | return (u64)mpy1 * mpy2; | ||
65 | } | ||
66 | /* | ||
67 | * Management arrays for POSIX timers. Timers are kept in slab memory. | ||
68 | * Timer ids are allocated by an external routine that keeps track of the | ||
69 | * id and the timer. The external interface is: | ||
70 | * | ||
71 | * void *idr_find(struct idr *idp, int id); to find timer_id <id> | ||
72 | * int idr_get_new(struct idr *idp, void *ptr); to get a new id and | ||
73 | * relate it to <ptr> | ||
74 | * void idr_remove(struct idr *idp, int id); to release <id> | ||
75 | * void idr_init(struct idr *idp); to initialize <idp> | ||
76 | * which we supply. | ||
77 | * The idr_get_new *may* call slab for more memory so it must not be | ||
78 | * called under a spin lock. Likewise idr_remove may release memory | ||
79 | * (but it may be ok to do this under a lock...). | ||
80 | * idr_find is just a memory look up and is quite fast. A -1 return | ||
81 | * indicates that the requested id does not exist. | ||
82 | */ | ||
83 | |||
84 | /* | ||
85 | * Let's keep our timers in a slab cache :-) | ||
86 | */ | ||
87 | static kmem_cache_t *posix_timers_cache; | ||
88 | static struct idr posix_timers_id; | ||
89 | static DEFINE_SPINLOCK(idr_lock); | ||
90 | |||
91 | /* | ||
92 | * Just because the timer is not in the timer list does NOT mean it is | ||
93 | * inactive. It could be in the "fire" routine getting a new expire time. | ||
94 | */ | ||
95 | #define TIMER_INACTIVE 1 | ||
96 | |||
97 | #ifdef CONFIG_SMP | ||
98 | # define timer_active(tmr) \ | ||
99 | ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) | ||
100 | # define set_timer_inactive(tmr) \ | ||
101 | do { \ | ||
102 | (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ | ||
103 | } while (0) | ||
104 | #else | ||
105 | # define timer_active(tmr) BARFY // error to use outside of SMP | ||
106 | # define set_timer_inactive(tmr) do { } while (0) | ||
107 | #endif | ||
108 | /* | ||
109 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other | ||
110 | * SIGEV values. Here we put out an error if this assumption fails. | ||
111 | */ | ||
112 | #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ | ||
113 | ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) | ||
114 | #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" | ||
115 | #endif | ||
116 | |||
117 | |||
118 | /* | ||
119 | * The timer ID is turned into a timer address by idr_find(). | ||
120 | * Verifying a valid ID consists of: | ||
121 | * | ||
122 | * a) checking that idr_find() returns other than -1. | ||
123 | * b) checking that the timer id matches the one in the timer itself. | ||
124 | * c) checking that the timer owner is in the caller's thread group. | ||
125 | */ | ||
126 | |||
127 | /* | ||
128 | * CLOCKs: The POSIX standard calls for a couple of clocks and allows us | ||
129 | * to implement others. This structure defines the various | ||
130 | * clocks and allows the possibility of adding others. We | ||
131 | * provide an interface to add clocks to the table and expect | ||
132 | * the "arch" code to add at least one clock that is high | ||
133 | * resolution. Here we define the standard CLOCK_REALTIME as a | ||
134 | * 1/HZ resolution clock. | ||
135 | * | ||
136 | * RESOLUTION: Clock resolution is used to round up timer and interval | ||
137 | * times, NOT to report clock times, which are reported with as | ||
138 | * much resolution as the system can muster. In some cases this | ||
139 | * resolution may depend on the underlying clock hardware and | ||
140 | * may not be quantifiable until run time, and only then can the | ||
141 | * necessary code be written. The standard says we should say | ||
142 | * something about this issue in the documentation... | ||
143 | * | ||
144 | * FUNCTIONS: The CLOCKs structure defines possible functions to handle | ||
145 | * various clock functions. For clocks that use the standard | ||
146 | * system timer code these entries should be NULL. This will | ||
147 | * allow dispatch without the overhead of indirect function | ||
148 | * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) | ||
149 | * must supply functions here, even if the function just returns | ||
150 | * ENOSYS. The standard POSIX timer management code assumes the | ||
151 | * following: 1.) The k_itimer struct (sched.h) is used for the | ||
152 | * timer. 2.) The list, it_lock, it_clock, it_id and it_process | ||
153 | * fields are not modified by timer code. | ||
154 | * | ||
155 | * At this time all functions EXCEPT clock_nanosleep can be | ||
156 | * redirected by the CLOCKS structure. Clock_nanosleep is in | ||
157 | * there, but the code ignores it. | ||
158 | * | ||
159 | * Permissions: It is assumed that the clock_settime() function defined | ||
160 | * for each clock will take care of permission checks. Some | ||
161 | * clocks may be settable by any user (i.e. local process | ||
162 | * clocks); others may not. Currently the only settable clock we | ||
163 | * have is CLOCK_REALTIME and its high-resolution counterpart, both of | ||
164 | * which we beg off on and pass to do_sys_settimeofday(). | ||
165 | */ | ||
166 | |||
167 | static struct k_clock posix_clocks[MAX_CLOCKS]; | ||
168 | /* | ||
169 | * We only have one real clock that can be set so we need only one abs list, | ||
170 | * even if we should want to have several clocks with differing resolutions. | ||
171 | */ | ||
172 | static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), | ||
173 | .lock = SPIN_LOCK_UNLOCKED}; | ||
174 | |||
175 | static void posix_timer_fn(unsigned long); | ||
176 | static u64 do_posix_clock_monotonic_gettime_parts( | ||
177 | struct timespec *tp, struct timespec *mo); | ||
178 | int do_posix_clock_monotonic_gettime(struct timespec *tp); | ||
179 | static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp); | ||
180 | |||
181 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | ||
182 | |||
183 | static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | ||
184 | { | ||
185 | spin_unlock_irqrestore(&timr->it_lock, flags); | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * Call the k_clock hook function if non-null, or the default function. | ||
190 | */ | ||
191 | #define CLOCK_DISPATCH(clock, call, arglist) \ | ||
192 | ((clock) < 0 ? posix_cpu_##call arglist : \ | ||
193 | (posix_clocks[clock].call != NULL \ | ||
194 | ? (*posix_clocks[clock].call) arglist : common_##call arglist)) | ||
195 | |||
196 | /* | ||
197 | * Default clock hook functions when the struct k_clock passed | ||
198 | * to register_posix_clock leaves a function pointer null. | ||
199 | * | ||
200 | * The function common_CALL is the default implementation for | ||
201 | * the function pointer CALL in struct k_clock. | ||
202 | */ | ||
203 | |||
204 | static inline int common_clock_getres(clockid_t which_clock, | ||
205 | struct timespec *tp) | ||
206 | { | ||
207 | tp->tv_sec = 0; | ||
208 | tp->tv_nsec = posix_clocks[which_clock].res; | ||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | static inline int common_clock_get(clockid_t which_clock, struct timespec *tp) | ||
213 | { | ||
214 | getnstimeofday(tp); | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | static inline int common_clock_set(clockid_t which_clock, struct timespec *tp) | ||
219 | { | ||
220 | return do_sys_settimeofday(tp, NULL); | ||
221 | } | ||
222 | |||
223 | static inline int common_timer_create(struct k_itimer *new_timer) | ||
224 | { | ||
225 | INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); | ||
226 | init_timer(&new_timer->it.real.timer); | ||
227 | new_timer->it.real.timer.data = (unsigned long) new_timer; | ||
228 | new_timer->it.real.timer.function = posix_timer_fn; | ||
229 | set_timer_inactive(new_timer); | ||
230 | return 0; | ||
231 | } | ||
232 | |||
233 | /* | ||
234 | * These ones are defined below. | ||
235 | */ | ||
236 | static int common_nsleep(clockid_t, int flags, struct timespec *t); | ||
237 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | ||
238 | static int common_timer_set(struct k_itimer *, int, | ||
239 | struct itimerspec *, struct itimerspec *); | ||
240 | static int common_timer_del(struct k_itimer *timer); | ||
241 | |||
242 | /* | ||
243 | * Return nonzero iff we know a priori this clockid_t value is bogus. | ||
244 | */ | ||
245 | static inline int invalid_clockid(clockid_t which_clock) | ||
246 | { | ||
247 | if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ | ||
248 | return 0; | ||
249 | if ((unsigned) which_clock >= MAX_CLOCKS) | ||
250 | return 1; | ||
251 | if (posix_clocks[which_clock].clock_getres != NULL) | ||
252 | return 0; | ||
253 | #ifndef CLOCK_DISPATCH_DIRECT | ||
254 | if (posix_clocks[which_clock].res != 0) | ||
255 | return 0; | ||
256 | #endif | ||
257 | return 1; | ||
258 | } | ||
259 | |||
260 | |||
261 | /* | ||
262 | * Initialize everything, well, just everything in Posix clocks/timers ;) | ||
263 | */ | ||
264 | static __init int init_posix_timers(void) | ||
265 | { | ||
266 | struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES, | ||
267 | .abs_struct = &abs_list | ||
268 | }; | ||
269 | struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, | ||
270 | .abs_struct = NULL, | ||
271 | .clock_get = do_posix_clock_monotonic_get, | ||
272 | .clock_set = do_posix_clock_nosettime | ||
273 | }; | ||
274 | |||
275 | register_posix_clock(CLOCK_REALTIME, &clock_realtime); | ||
276 | register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); | ||
277 | |||
278 | posix_timers_cache = kmem_cache_create("posix_timers_cache", | ||
279 | sizeof (struct k_itimer), 0, 0, NULL, NULL); | ||
280 | idr_init(&posix_timers_id); | ||
281 | return 0; | ||
282 | } | ||
283 | |||
284 | __initcall(init_posix_timers); | ||
285 | |||
286 | static void tstojiffie(struct timespec *tp, int res, u64 *jiff) | ||
287 | { | ||
288 | long sec = tp->tv_sec; | ||
289 | long nsec = tp->tv_nsec + res - 1; | ||
290 | |||
291 | if (nsec > NSEC_PER_SEC) { | ||
292 | sec++; | ||
293 | nsec -= NSEC_PER_SEC; | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | * The scaling constants are defined in <linux/time.h>. | ||
298 | * The difference between there and here is that we do the | ||
299 | * res rounding and compute a 64-bit result (the version there | ||
300 | * also computes 64 bits, but then throws away the high bits). | ||
301 | */ | ||
302 | *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) + | ||
303 | (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> | ||
304 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
305 | } | ||
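The fixed-point constants obscure what is going on: round the request up to the clock resolution, then convert to jiffies with a floor. A simplified standalone sketch of the same conversion, assuming a hypothetical HZ of 1000 and ordinary integer math instead of the 64-bit fixed-point constants:

#include <stdio.h>

#define NSEC_PER_SEC	1000000000L
#define HZ		1000			/* assumed tick rate */
#define TICK_NSEC	(NSEC_PER_SEC / HZ)	/* 1 ms per jiffy */

/* Round (sec, nsec) up to the clock resolution 'res' (ns), then floor-convert
 * to jiffies, mirroring the rounding behaviour of tstojiffie() above. */
static unsigned long long ts_to_jiffies(long sec, long nsec, long res)
{
	nsec += res - 1;			/* round up to clock resolution */
	if (nsec >= NSEC_PER_SEC) {
		sec++;
		nsec -= NSEC_PER_SEC;
	}
	return (unsigned long long)sec * HZ + nsec / TICK_NSEC;
}

int main(void)
{
	/* 1.000000500s with 1 ms resolution -> 1001 jiffies at HZ=1000 */
	printf("%llu\n", ts_to_jiffies(1, 500, TICK_NSEC));
	return 0;
}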
306 | |||
307 | /* | ||
308 | * This function adjusts the timer as needed as a result of the clock | ||
309 | * being set. It should only be called for absolute timers, and then | ||
310 | * under the abs_list lock. It computes the time difference and sets | ||
311 | * the new jiffies value in the timer. It also updates the timer's | ||
312 | * reference wall_to_monotonic value. It is complicated by the fact | ||
313 | * that tstojiffie() only handles positive times and it needs to work | ||
314 | * with both positive and negative times. Also, for negative offsets, | ||
315 | * we need to defeat the res round up. | ||
316 | * | ||
317 | * Return is true if there is a new time, else false. | ||
318 | */ | ||
319 | static long add_clockset_delta(struct k_itimer *timr, | ||
320 | struct timespec *new_wall_to) | ||
321 | { | ||
322 | struct timespec delta; | ||
323 | int sign = 0; | ||
324 | u64 exp; | ||
325 | |||
326 | set_normalized_timespec(&delta, | ||
327 | new_wall_to->tv_sec - | ||
328 | timr->it.real.wall_to_prev.tv_sec, | ||
329 | new_wall_to->tv_nsec - | ||
330 | timr->it.real.wall_to_prev.tv_nsec); | ||
331 | if (likely(!(delta.tv_sec | delta.tv_nsec))) | ||
332 | return 0; | ||
333 | if (delta.tv_sec < 0) { | ||
334 | set_normalized_timespec(&delta, | ||
335 | -delta.tv_sec, | ||
336 | 1 - delta.tv_nsec - | ||
337 | posix_clocks[timr->it_clock].res); | ||
338 | sign++; | ||
339 | } | ||
340 | tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp); | ||
341 | timr->it.real.wall_to_prev = *new_wall_to; | ||
342 | timr->it.real.timer.expires += (sign ? -exp : exp); | ||
343 | return 1; | ||
344 | } | ||
345 | |||
346 | static void remove_from_abslist(struct k_itimer *timr) | ||
347 | { | ||
348 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
349 | spin_lock(&abs_list.lock); | ||
350 | list_del_init(&timr->it.real.abs_timer_entry); | ||
351 | spin_unlock(&abs_list.lock); | ||
352 | } | ||
353 | } | ||
354 | |||
355 | static void schedule_next_timer(struct k_itimer *timr) | ||
356 | { | ||
357 | struct timespec new_wall_to; | ||
358 | struct now_struct now; | ||
359 | unsigned long seq; | ||
360 | |||
361 | /* | ||
362 | * Set up the timer for the next interval (if there is one). | ||
363 | * Note: this code uses the abs_timer_lock to protect | ||
364 | * it.real.wall_to_prev and must hold it until exp is set, not exactly | ||
365 | * obvious... | ||
366 | |||
367 | * This function is used for CLOCK_REALTIME* and | ||
368 | * CLOCK_MONOTONIC* timers. If we ever want to handle other | ||
369 | * CLOCKs, the calling code (do_schedule_next_timer) would need | ||
370 | * to pull the "clock" info from the timer and dispatch the | ||
371 | * "other" CLOCKs "next timer" code (which, I suppose should | ||
372 | * also be added to the k_clock structure). | ||
373 | */ | ||
374 | if (!timr->it.real.incr) | ||
375 | return; | ||
376 | |||
377 | do { | ||
378 | seq = read_seqbegin(&xtime_lock); | ||
379 | new_wall_to = wall_to_monotonic; | ||
380 | posix_get_now(&now); | ||
381 | } while (read_seqretry(&xtime_lock, seq)); | ||
382 | |||
383 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
384 | spin_lock(&abs_list.lock); | ||
385 | add_clockset_delta(timr, &new_wall_to); | ||
386 | |||
387 | posix_bump_timer(timr, now); | ||
388 | |||
389 | spin_unlock(&abs_list.lock); | ||
390 | } else { | ||
391 | posix_bump_timer(timr, now); | ||
392 | } | ||
393 | timr->it_overrun_last = timr->it_overrun; | ||
394 | timr->it_overrun = -1; | ||
395 | ++timr->it_requeue_pending; | ||
396 | add_timer(&timr->it.real.timer); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * This function is exported for use by the signal deliver code. It is | ||
401 | * called just prior to the info block being released and passes that | ||
402 | * block to us. Its function is to update the overrun entry AND to | ||
403 | * restart the timer. It should only be called if the timer is to be | ||
404 | * restarted (i.e. we have flagged this in the sys_private entry of the | ||
405 | * info block). | ||
406 | * | ||
407 | * To protect against the timer going away while the interrupt is queued, | ||
408 | * we require that the it_requeue_pending flag be set. | ||
409 | */ | ||
410 | void do_schedule_next_timer(struct siginfo *info) | ||
411 | { | ||
412 | struct k_itimer *timr; | ||
413 | unsigned long flags; | ||
414 | |||
415 | timr = lock_timer(info->si_tid, &flags); | ||
416 | |||
417 | if (!timr || timr->it_requeue_pending != info->si_sys_private) | ||
418 | goto exit; | ||
419 | |||
420 | if (timr->it_clock < 0) /* CPU clock */ | ||
421 | posix_cpu_timer_schedule(timr); | ||
422 | else | ||
423 | schedule_next_timer(timr); | ||
424 | info->si_overrun = timr->it_overrun_last; | ||
425 | exit: | ||
426 | if (timr) | ||
427 | unlock_timer(timr, flags); | ||
428 | } | ||
429 | |||
430 | int posix_timer_event(struct k_itimer *timr,int si_private) | ||
431 | { | ||
432 | memset(&timr->sigq->info, 0, sizeof(siginfo_t)); | ||
433 | timr->sigq->info.si_sys_private = si_private; | ||
434 | /* | ||
435 | * Send signal to the process that owns this timer. | ||
436 | |||
437 | * This code assumes that all the possible abs_lists share the | ||
438 | * same lock (there is only one list at this time). If this is | ||
439 | * not the case, the CLOCK info would need to be used to find | ||
440 | * the proper abs list lock. | ||
441 | */ | ||
442 | |||
443 | timr->sigq->info.si_signo = timr->it_sigev_signo; | ||
444 | timr->sigq->info.si_errno = 0; | ||
445 | timr->sigq->info.si_code = SI_TIMER; | ||
446 | timr->sigq->info.si_tid = timr->it_id; | ||
447 | timr->sigq->info.si_value = timr->it_sigev_value; | ||
448 | if (timr->it_sigev_notify & SIGEV_THREAD_ID) { | ||
449 | if (unlikely(timr->it_process->flags & PF_EXITING)) { | ||
450 | timr->it_sigev_notify = SIGEV_SIGNAL; | ||
451 | put_task_struct(timr->it_process); | ||
452 | timr->it_process = timr->it_process->group_leader; | ||
453 | goto group; | ||
454 | } | ||
455 | return send_sigqueue(timr->it_sigev_signo, timr->sigq, | ||
456 | timr->it_process); | ||
457 | } | ||
458 | else { | ||
459 | group: | ||
460 | return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, | ||
461 | timr->it_process); | ||
462 | } | ||
463 | } | ||
464 | EXPORT_SYMBOL_GPL(posix_timer_event); | ||
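When expirations outrun signal delivery, the extra expirations are accounted as overruns (it_overrun/it_overrun_last above) rather than queueing additional signals; user space reads the count with timer_getoverrun() after each delivery. A hedged illustration that blocks the signal so overruns pile up (error checking omitted, link with -lrt):

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void on_timer(int sig)
{
	fired = 1;
}

int main(void)
{
	timer_t tid;
	sigset_t block;
	struct sigevent sev = { 0 };
	struct itimerspec its = { 0 };

	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGRTMIN;
	timer_create(CLOCK_REALTIME, &sev, &tid);
	signal(SIGRTMIN, on_timer);

	sigemptyset(&block);
	sigaddset(&block, SIGRTMIN);
	sigprocmask(SIG_BLOCK, &block, NULL);	/* hold deliveries back */

	its.it_value.tv_nsec = 1000000;		/* first expiry: 1 ms */
	its.it_interval.tv_nsec = 1000000;	/* reload every 1 ms */
	timer_settime(tid, 0, &its, NULL);

	sleep(1);				/* ~1000 expirations accumulate */
	sigprocmask(SIG_UNBLOCK, &block, NULL);	/* exactly one signal arrives */
	while (!fired)
		;
	printf("overruns reported for that delivery: %d\n",
	       timer_getoverrun(tid));
	timer_delete(tid);
	return 0;
}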
465 | |||
466 | /* | ||
467 | * This function gets called when a POSIX.1b interval timer expires. It | ||
468 | * is used as a callback from the kernel internal timer. The | ||
469 | * run_timer_list code ALWAYS calls with interrupts on. | ||
470 | |||
471 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. | ||
472 | */ | ||
473 | static void posix_timer_fn(unsigned long __data) | ||
474 | { | ||
475 | struct k_itimer *timr = (struct k_itimer *) __data; | ||
476 | unsigned long flags; | ||
477 | unsigned long seq; | ||
478 | struct timespec delta, new_wall_to; | ||
479 | u64 exp = 0; | ||
480 | int do_notify = 1; | ||
481 | |||
482 | spin_lock_irqsave(&timr->it_lock, flags); | ||
483 | set_timer_inactive(timr); | ||
484 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
485 | spin_lock(&abs_list.lock); | ||
486 | do { | ||
487 | seq = read_seqbegin(&xtime_lock); | ||
488 | new_wall_to = wall_to_monotonic; | ||
489 | } while (read_seqretry(&xtime_lock, seq)); | ||
490 | set_normalized_timespec(&delta, | ||
491 | new_wall_to.tv_sec - | ||
492 | timr->it.real.wall_to_prev.tv_sec, | ||
493 | new_wall_to.tv_nsec - | ||
494 | timr->it.real.wall_to_prev.tv_nsec); | ||
495 | if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) { | ||
496 | /* do nothing, timer is on time */ | ||
497 | } else if (delta.tv_sec < 0) { | ||
498 | /* do nothing, timer is already late */ | ||
499 | } else { | ||
500 | /* timer is early due to a clock set */ | ||
501 | tstojiffie(&delta, | ||
502 | posix_clocks[timr->it_clock].res, | ||
503 | &exp); | ||
504 | timr->it.real.wall_to_prev = new_wall_to; | ||
505 | timr->it.real.timer.expires += exp; | ||
506 | add_timer(&timr->it.real.timer); | ||
507 | do_notify = 0; | ||
508 | } | ||
509 | spin_unlock(&abs_list.lock); | ||
510 | |||
511 | } | ||
512 | if (do_notify) { | ||
513 | int si_private=0; | ||
514 | |||
515 | if (timr->it.real.incr) | ||
516 | si_private = ++timr->it_requeue_pending; | ||
517 | else { | ||
518 | remove_from_abslist(timr); | ||
519 | } | ||
520 | |||
521 | if (posix_timer_event(timr, si_private)) | ||
522 | /* | ||
523 | * The signal was not sent because it is being ignored; | ||
524 | * we will not get a callback to restart it, AND | ||
525 | * it should be restarted. | ||
526 | */ | ||
527 | schedule_next_timer(timr); | ||
528 | } | ||
529 | unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */ | ||
530 | } | ||
531 | |||
532 | |||
533 | static inline struct task_struct * good_sigevent(sigevent_t * event) | ||
534 | { | ||
535 | struct task_struct *rtn = current->group_leader; | ||
536 | |||
537 | if ((event->sigev_notify & SIGEV_THREAD_ID ) && | ||
538 | (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || | ||
539 | rtn->tgid != current->tgid || | ||
540 | (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) | ||
541 | return NULL; | ||
542 | |||
543 | if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && | ||
544 | ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) | ||
545 | return NULL; | ||
546 | |||
547 | return rtn; | ||
548 | } | ||
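good_sigevent() rejects out-of-range signal numbers and SIGEV_THREAD_ID requests that do not name a thread in the caller's group; from user space the corresponding failure shows up as EINVAL from timer_create(). An illustrative example (error codes per POSIX; link with -lrt):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	timer_t tid;
	struct sigevent sev = { 0 };

	/* Invalid: SIGEV_SIGNAL with a signal number above SIGRTMAX. */
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGRTMAX + 1;
	if (timer_create(CLOCK_REALTIME, &sev, &tid) == -1)
		printf("bad sigev_signo: %s\n", strerror(errno));

	/* Valid: SIGEV_NONE needs no signal number at all. */
	sev.sigev_notify = SIGEV_NONE;
	sev.sigev_signo = 0;
	if (timer_create(CLOCK_REALTIME, &sev, &tid) == 0) {
		printf("SIGEV_NONE timer created\n");
		timer_delete(tid);
	}
	return 0;
}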
549 | |||
550 | void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) | ||
551 | { | ||
552 | if ((unsigned) clock_id >= MAX_CLOCKS) { | ||
553 | printk("POSIX clock register failed for clock_id %d\n", | ||
554 | clock_id); | ||
555 | return; | ||
556 | } | ||
557 | |||
558 | posix_clocks[clock_id] = *new_clock; | ||
559 | } | ||
560 | EXPORT_SYMBOL_GPL(register_posix_clock); | ||
561 | |||
562 | static struct k_itimer * alloc_posix_timer(void) | ||
563 | { | ||
564 | struct k_itimer *tmr; | ||
565 | tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL); | ||
566 | if (!tmr) | ||
567 | return tmr; | ||
568 | memset(tmr, 0, sizeof (struct k_itimer)); | ||
569 | if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { | ||
570 | kmem_cache_free(posix_timers_cache, tmr); | ||
571 | tmr = NULL; | ||
572 | } | ||
573 | return tmr; | ||
574 | } | ||
575 | |||
576 | #define IT_ID_SET 1 | ||
577 | #define IT_ID_NOT_SET 0 | ||
578 | static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | ||
579 | { | ||
580 | if (it_id_set) { | ||
581 | unsigned long flags; | ||
582 | spin_lock_irqsave(&idr_lock, flags); | ||
583 | idr_remove(&posix_timers_id, tmr->it_id); | ||
584 | spin_unlock_irqrestore(&idr_lock, flags); | ||
585 | } | ||
586 | sigqueue_free(tmr->sigq); | ||
587 | if (unlikely(tmr->it_process) && | ||
588 | tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) | ||
589 | put_task_struct(tmr->it_process); | ||
590 | kmem_cache_free(posix_timers_cache, tmr); | ||
591 | } | ||
592 | |||
593 | /* Create a POSIX.1b interval timer. */ | ||
594 | |||
595 | asmlinkage long | ||
596 | sys_timer_create(clockid_t which_clock, | ||
597 | struct sigevent __user *timer_event_spec, | ||
598 | timer_t __user * created_timer_id) | ||
599 | { | ||
600 | int error = 0; | ||
601 | struct k_itimer *new_timer = NULL; | ||
602 | int new_timer_id; | ||
603 | struct task_struct *process = NULL; | ||
604 | unsigned long flags; | ||
605 | sigevent_t event; | ||
606 | int it_id_set = IT_ID_NOT_SET; | ||
607 | |||
608 | if (invalid_clockid(which_clock)) | ||
609 | return -EINVAL; | ||
610 | |||
611 | new_timer = alloc_posix_timer(); | ||
612 | if (unlikely(!new_timer)) | ||
613 | return -EAGAIN; | ||
614 | |||
615 | spin_lock_init(&new_timer->it_lock); | ||
616 | retry: | ||
617 | if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { | ||
618 | error = -EAGAIN; | ||
619 | goto out; | ||
620 | } | ||
621 | spin_lock_irq(&idr_lock); | ||
622 | error = idr_get_new(&posix_timers_id, | ||
623 | (void *) new_timer, | ||
624 | &new_timer_id); | ||
625 | spin_unlock_irq(&idr_lock); | ||
626 | if (error == -EAGAIN) | ||
627 | goto retry; | ||
628 | else if (error) { | ||
629 | /* | ||
630 | * Weird looking, but we return EAGAIN if the IDR is | ||
631 | * full (the proper POSIX return value for this). | ||
632 | */ | ||
633 | error = -EAGAIN; | ||
634 | goto out; | ||
635 | } | ||
636 | |||
637 | it_id_set = IT_ID_SET; | ||
638 | new_timer->it_id = (timer_t) new_timer_id; | ||
639 | new_timer->it_clock = which_clock; | ||
640 | new_timer->it_overrun = -1; | ||
641 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); | ||
642 | if (error) | ||
643 | goto out; | ||
644 | |||
645 | /* | ||
646 | * return the timer_id now. The next step is hard to | ||
647 | * back out if there is an error. | ||
648 | */ | ||
649 | if (copy_to_user(created_timer_id, | ||
650 | &new_timer_id, sizeof (new_timer_id))) { | ||
651 | error = -EFAULT; | ||
652 | goto out; | ||
653 | } | ||
654 | if (timer_event_spec) { | ||
655 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { | ||
656 | error = -EFAULT; | ||
657 | goto out; | ||
658 | } | ||
659 | new_timer->it_sigev_notify = event.sigev_notify; | ||
660 | new_timer->it_sigev_signo = event.sigev_signo; | ||
661 | new_timer->it_sigev_value = event.sigev_value; | ||
662 | |||
663 | read_lock(&tasklist_lock); | ||
664 | if ((process = good_sigevent(&event))) { | ||
665 | /* | ||
666 | * We may be setting up this process for another | ||
667 | * thread. It may be exiting. To catch this | ||
668 | * case we check the PF_EXITING flag. If | ||
669 | * the flag is not set, the siglock will catch | ||
670 | * it before it is too late (in exit_itimers). | ||
671 | * | ||
672 | * The exec case is a bit more involved but easy | ||
673 | * to code. If the process is in our thread | ||
674 | * group (and it must be or we would not allow | ||
675 | * it here) and is doing an exec, it will cause | ||
676 | * us to be killed. In this case it will wait | ||
677 | * for us to die which means we can finish this | ||
678 | * linkage with our last gasp. I.e. no code :) | ||
679 | */ | ||
680 | spin_lock_irqsave(&process->sighand->siglock, flags); | ||
681 | if (!(process->flags & PF_EXITING)) { | ||
682 | new_timer->it_process = process; | ||
683 | list_add(&new_timer->list, | ||
684 | &process->signal->posix_timers); | ||
685 | spin_unlock_irqrestore(&process->sighand->siglock, flags); | ||
686 | if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) | ||
687 | get_task_struct(process); | ||
688 | } else { | ||
689 | spin_unlock_irqrestore(&process->sighand->siglock, flags); | ||
690 | process = NULL; | ||
691 | } | ||
692 | } | ||
693 | read_unlock(&tasklist_lock); | ||
694 | if (!process) { | ||
695 | error = -EINVAL; | ||
696 | goto out; | ||
697 | } | ||
698 | } else { | ||
699 | new_timer->it_sigev_notify = SIGEV_SIGNAL; | ||
700 | new_timer->it_sigev_signo = SIGALRM; | ||
701 | new_timer->it_sigev_value.sival_int = new_timer->it_id; | ||
702 | process = current->group_leader; | ||
703 | spin_lock_irqsave(&process->sighand->siglock, flags); | ||
704 | new_timer->it_process = process; | ||
705 | list_add(&new_timer->list, &process->signal->posix_timers); | ||
706 | spin_unlock_irqrestore(&process->sighand->siglock, flags); | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * In the case of the timer belonging to another task, after | ||
711 | * the task is unlocked, the timer is owned by the other task | ||
712 | * and may cease to exist at any time. Don't use or modify | ||
713 | * new_timer after the unlock call. | ||
714 | */ | ||
715 | |||
716 | out: | ||
717 | if (error) | ||
718 | release_posix_timer(new_timer, it_id_set); | ||
719 | |||
720 | return error; | ||
721 | } | ||
722 | |||
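For illustration, a minimal userspace sketch (not part of this file) of how sys_timer_create() above is normally reached through the POSIX timer_create(3) wrapper; link with -lrt on older glibc, error handling is abbreviated and the signal choice is arbitrary.

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	timer_t tid;
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,	/* validated by good_sigevent() above */
		.sigev_signo  = SIGRTMIN,	/* must be 0 < signo <= SIGRTMAX */
		.sigev_value.sival_int = 1,
	};

	if (timer_create(CLOCK_REALTIME, &sev, &tid) == -1) {
		perror("timer_create");
		return 1;
	}
	/* tid now refers to the k_itimer allocated by the syscall above. */
	return 0;
}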
723 | /* | ||
724 | * good_timespec | ||
725 | * | ||
726 | * This function checks the elements of a timespec structure. | ||
727 | * | ||
728 | * Arguments: | ||
729 | * ts : Pointer to the timespec structure to check | ||
730 | * | ||
731 | * Return value: | ||
732 | * If a NULL pointer was passed in, or the tv_nsec field was less than 0 | ||
733 | * or greater than or equal to NSEC_PER_SEC, or the tv_sec field was less than 0, | ||
734 | * this function returns 0. Otherwise it returns 1. | ||
735 | */ | ||
736 | static int good_timespec(const struct timespec *ts) | ||
737 | { | ||
738 | if ((!ts) || (ts->tv_sec < 0) || | ||
739 | ((unsigned) ts->tv_nsec >= NSEC_PER_SEC)) | ||
740 | return 0; | ||
741 | return 1; | ||
742 | } | ||
743 | |||
744 | /* | ||
745 | * Locking issues: We need to protect the result of the id look up until | ||
746 | * we get the timer locked down so it is not deleted under us. The | ||
747 | * removal is done under the idr spinlock so we use that here to bridge | ||
748 | * the find to the timer lock. To avoid a deadlock, the timer id MUST | ||
749 | * be released without holding the timer lock. | ||
750 | */ | ||
751 | static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) | ||
752 | { | ||
753 | struct k_itimer *timr; | ||
754 | /* | ||
755 | * Watch out here. We do an irqsave on the idr_lock and pass the | ||
756 | * flags part over to the timer lock. Must not let interrupts in | ||
757 | * while we are moving the lock. | ||
758 | */ | ||
759 | |||
760 | spin_lock_irqsave(&idr_lock, *flags); | ||
761 | timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); | ||
762 | if (timr) { | ||
763 | spin_lock(&timr->it_lock); | ||
764 | spin_unlock(&idr_lock); | ||
765 | |||
766 | if ((timr->it_id != timer_id) || !(timr->it_process) || | ||
767 | timr->it_process->tgid != current->tgid) { | ||
768 | unlock_timer(timr, *flags); | ||
769 | timr = NULL; | ||
770 | } | ||
771 | } else | ||
772 | spin_unlock_irqrestore(&idr_lock, *flags); | ||
773 | |||
774 | return timr; | ||
775 | } | ||
776 | |||
777 | /* | ||
778 | * Get the time remaining on a POSIX.1b interval timer. This function | ||
779 | * is ALWAYS called with spin_lock_irq on the timer, thus it must not | ||
780 | * mess with irq. | ||
781 | * | ||
782 | * We have a couple of messes to clean up here. First there is the case | ||
783 | * of a timer that has a requeue pending. These timers should appear to | ||
784 | * be in the timer list with an expiry as if we were to requeue them | ||
785 | * now. | ||
786 | * | ||
787 | * The second issue is the SIGEV_NONE timer which may be active but is | ||
788 | * not really ever put in the timer list (to save system resources). | ||
789 | * This timer may be expired, and if so, we will do it here. Otherwise | ||
790 | * it is the same as a requeue pending timer with respect to what we should | ||
791 | * report. | ||
792 | */ | ||
793 | static void | ||
794 | common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | ||
795 | { | ||
796 | unsigned long expires; | ||
797 | struct now_struct now; | ||
798 | |||
799 | do | ||
800 | expires = timr->it.real.timer.expires; | ||
801 | while ((volatile long) (timr->it.real.timer.expires) != expires); | ||
802 | |||
803 | posix_get_now(&now); | ||
804 | |||
805 | if (expires && | ||
806 | ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) && | ||
807 | !timr->it.real.incr && | ||
808 | posix_time_before(&timr->it.real.timer, &now)) | ||
809 | timr->it.real.timer.expires = expires = 0; | ||
810 | if (expires) { | ||
811 | if (timr->it_requeue_pending & REQUEUE_PENDING || | ||
812 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { | ||
813 | posix_bump_timer(timr, now); | ||
814 | expires = timr->it.real.timer.expires; | ||
815 | } | ||
816 | else | ||
817 | if (!timer_pending(&timr->it.real.timer)) | ||
818 | expires = 0; | ||
819 | if (expires) | ||
820 | expires -= now.jiffies; | ||
821 | } | ||
822 | jiffies_to_timespec(expires, &cur_setting->it_value); | ||
823 | jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval); | ||
824 | |||
825 | if (cur_setting->it_value.tv_sec < 0) { | ||
826 | cur_setting->it_value.tv_nsec = 1; | ||
827 | cur_setting->it_value.tv_sec = 0; | ||
828 | } | ||
829 | } | ||
830 | |||
831 | /* Get the time remaining on a POSIX.1b interval timer. */ | ||
832 | asmlinkage long | ||
833 | sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) | ||
834 | { | ||
835 | struct k_itimer *timr; | ||
836 | struct itimerspec cur_setting; | ||
837 | unsigned long flags; | ||
838 | |||
839 | timr = lock_timer(timer_id, &flags); | ||
840 | if (!timr) | ||
841 | return -EINVAL; | ||
842 | |||
843 | CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); | ||
844 | |||
845 | unlock_timer(timr, flags); | ||
846 | |||
847 | if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) | ||
848 | return -EFAULT; | ||
849 | |||
850 | return 0; | ||
851 | } | ||
852 | /* | ||
853 | * Get the number of overruns of a POSIX.1b interval timer. This is | ||
854 | * the overrun count of the timer signal last delivered. At the same | ||
855 | * time we are accumulating overruns on the next timer. The overrun is | ||
856 | * frozen when the signal is delivered, either at the notify time (if | ||
857 | * the info block is not queued) or at the actual delivery time (as we | ||
858 | * are informed by the callback to do_schedule_next_timer()). So all | ||
859 | * we need to do is pick up the frozen overrun. | ||
860 | */ | ||
861 | |||
862 | asmlinkage long | ||
863 | sys_timer_getoverrun(timer_t timer_id) | ||
864 | { | ||
865 | struct k_itimer *timr; | ||
866 | int overrun; | ||
867 | unsigned long flags; | ||
868 | |||
869 | timr = lock_timer(timer_id, &flags); | ||
870 | if (!timr) | ||
871 | return -EINVAL; | ||
872 | |||
873 | overrun = timr->it_overrun_last; | ||
874 | unlock_timer(timr, flags); | ||
875 | |||
876 | return overrun; | ||
877 | } | ||
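A hypothetical userspace fragment (not part of this file) showing how the frozen overrun described above is read back; tid is assumed to come from an earlier timer_create() call and error handling is abbreviated.

#include <stdio.h>
#include <time.h>

static void report_overrun(timer_t tid)
{
	/* timer_getoverrun(3) ends up in sys_timer_getoverrun() above and
	 * returns the overrun count frozen at signal delivery time. */
	int overrun = timer_getoverrun(tid);

	if (overrun > 0)
		printf("missed %d expirations since the last timer signal\n", overrun);
	else if (overrun < 0)
		perror("timer_getoverrun");
}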
878 | /* | ||
879 | * Adjust for absolute time | ||
880 | * | ||
881 | * If absolute time is given and it is not CLOCK_MONOTONIC, we need to | ||
882 | * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and | ||
883 | * whatever clock the caller is using. | ||
884 | * | ||
885 | * If it is relative time, we need to add the current (CLOCK_MONOTONIC) | ||
886 | * time to it to get the proper time for the timer. | ||
887 | */ | ||
888 | static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, | ||
889 | int abs, u64 *exp, struct timespec *wall_to) | ||
890 | { | ||
891 | struct timespec now; | ||
892 | struct timespec oc = *tp; | ||
893 | u64 jiffies_64_f; | ||
894 | int rtn = 0; | ||
895 | |||
896 | if (abs) { | ||
897 | /* | ||
898 | * The mask picks up the 4 basic clocks. | ||
899 | */ | ||
900 | if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) { | ||
901 | jiffies_64_f = do_posix_clock_monotonic_gettime_parts( | ||
902 | &now, wall_to); | ||
903 | /* | ||
904 | * If we are doing a MONOTONIC clock | ||
905 | */ | ||
906 | if ((clock - &posix_clocks[0]) & CLOCKS_MONO) { | ||
907 | now.tv_sec += wall_to->tv_sec; | ||
908 | now.tv_nsec += wall_to->tv_nsec; | ||
909 | } | ||
910 | } else { | ||
911 | /* | ||
912 | * Not one of the basic clocks | ||
913 | */ | ||
914 | clock->clock_get(clock - posix_clocks, &now); | ||
915 | jiffies_64_f = get_jiffies_64(); | ||
916 | } | ||
917 | /* | ||
918 | * Take away now to get delta | ||
919 | */ | ||
920 | oc.tv_sec -= now.tv_sec; | ||
921 | oc.tv_nsec -= now.tv_nsec; | ||
922 | /* | ||
923 | * Normalize... | ||
924 | */ | ||
925 | while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) { | ||
926 | oc.tv_nsec -= NSEC_PER_SEC; | ||
927 | oc.tv_sec++; | ||
928 | } | ||
929 | while ((oc.tv_nsec) < 0) { | ||
930 | oc.tv_nsec += NSEC_PER_SEC; | ||
931 | oc.tv_sec--; | ||
932 | } | ||
933 | } else { | ||
934 | jiffies_64_f = get_jiffies_64(); | ||
935 | } | ||
936 | /* | ||
937 | * Check if the requested time is prior to now (if so set now) | ||
938 | */ | ||
939 | if (oc.tv_sec < 0) | ||
940 | oc.tv_sec = oc.tv_nsec = 0; | ||
941 | |||
942 | if (oc.tv_sec | oc.tv_nsec) | ||
943 | set_normalized_timespec(&oc, oc.tv_sec, | ||
944 | oc.tv_nsec + clock->res); | ||
945 | tstojiffie(&oc, clock->res, exp); | ||
946 | |||
947 | /* | ||
948 | * Check if the requested time is more than the timer code | ||
949 | * can handle (if so we error out but return the value too). | ||
950 | */ | ||
951 | if (*exp > ((u64)MAX_JIFFY_OFFSET)) | ||
952 | /* | ||
953 | * This is a considered response, not exactly in | ||
954 | * line with the standard (in fact it is silent on | ||
955 | * possible overflows). We assume such a large | ||
956 | * value is ALMOST always a programming error and | ||
957 | * try not to compound it by setting a really dumb | ||
958 | * value. | ||
959 | */ | ||
960 | rtn = -EINVAL; | ||
961 | /* | ||
962 | * return the actual jiffies expire time, full 64 bits | ||
963 | */ | ||
964 | *exp += jiffies_64_f; | ||
965 | return rtn; | ||
966 | } | ||
967 | |||
968 | /* Set a POSIX.1b interval timer. */ | ||
969 | /* timr->it_lock is taken. */ | ||
970 | static inline int | ||
971 | common_timer_set(struct k_itimer *timr, int flags, | ||
972 | struct itimerspec *new_setting, struct itimerspec *old_setting) | ||
973 | { | ||
974 | struct k_clock *clock = &posix_clocks[timr->it_clock]; | ||
975 | u64 expire_64; | ||
976 | |||
977 | if (old_setting) | ||
978 | common_timer_get(timr, old_setting); | ||
979 | |||
980 | /* disable the timer */ | ||
981 | timr->it.real.incr = 0; | ||
982 | /* | ||
983 | * careful here. If smp we could be in the "fire" routine which will | ||
984 | * be spinning as we hold the lock. But this is ONLY an SMP issue. | ||
985 | */ | ||
986 | #ifdef CONFIG_SMP | ||
987 | if (timer_active(timr) && !del_timer(&timr->it.real.timer)) | ||
988 | /* | ||
989 | * It can only be active if on another cpu. Since | ||
990 | * we have cleared the interval stuff above, it should | ||
991 | * clear once we release the spin lock. Of course once | ||
992 | * we do that anything could happen, including the | ||
993 | * complete melt down of the timer. So return with | ||
994 | * a "retry" exit status. | ||
995 | */ | ||
996 | return TIMER_RETRY; | ||
997 | |||
998 | set_timer_inactive(timr); | ||
999 | #else | ||
1000 | del_timer(&timr->it.real.timer); | ||
1001 | #endif | ||
1002 | remove_from_abslist(timr); | ||
1003 | |||
1004 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & | ||
1005 | ~REQUEUE_PENDING; | ||
1006 | timr->it_overrun_last = 0; | ||
1007 | timr->it_overrun = -1; | ||
1008 | /* | ||
1009 | * Switch off the timer when it_value is zero. | ||
1010 | */ | ||
1011 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) { | ||
1012 | timr->it.real.timer.expires = 0; | ||
1013 | return 0; | ||
1014 | } | ||
1015 | |||
1016 | if (adjust_abs_time(clock, | ||
1017 | &new_setting->it_value, flags & TIMER_ABSTIME, | ||
1018 | &expire_64, &(timr->it.real.wall_to_prev))) { | ||
1019 | return -EINVAL; | ||
1020 | } | ||
1021 | timr->it.real.timer.expires = (unsigned long)expire_64; | ||
1022 | tstojiffie(&new_setting->it_interval, clock->res, &expire_64); | ||
1023 | timr->it.real.incr = (unsigned long)expire_64; | ||
1024 | |||
1025 | /* | ||
1026 | * We do not even queue SIGEV_NONE timers! But we do put them | ||
1027 | * in the abs list so we can do that right. | ||
1028 | */ | ||
1029 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) | ||
1030 | add_timer(&timr->it.real.timer); | ||
1031 | |||
1032 | if (flags & TIMER_ABSTIME && clock->abs_struct) { | ||
1033 | spin_lock(&clock->abs_struct->lock); | ||
1034 | list_add_tail(&(timr->it.real.abs_timer_entry), | ||
1035 | &(clock->abs_struct->list)); | ||
1036 | spin_unlock(&clock->abs_struct->lock); | ||
1037 | } | ||
1038 | return 0; | ||
1039 | } | ||
1040 | |||
1041 | /* Set a POSIX.1b interval timer */ | ||
1042 | asmlinkage long | ||
1043 | sys_timer_settime(timer_t timer_id, int flags, | ||
1044 | const struct itimerspec __user *new_setting, | ||
1045 | struct itimerspec __user *old_setting) | ||
1046 | { | ||
1047 | struct k_itimer *timr; | ||
1048 | struct itimerspec new_spec, old_spec; | ||
1049 | int error = 0; | ||
1050 | unsigned long flag; | ||
1051 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; | ||
1052 | |||
1053 | if (!new_setting) | ||
1054 | return -EINVAL; | ||
1055 | |||
1056 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) | ||
1057 | return -EFAULT; | ||
1058 | |||
1059 | if ((!good_timespec(&new_spec.it_interval)) || | ||
1060 | (!good_timespec(&new_spec.it_value))) | ||
1061 | return -EINVAL; | ||
1062 | retry: | ||
1063 | timr = lock_timer(timer_id, &flag); | ||
1064 | if (!timr) | ||
1065 | return -EINVAL; | ||
1066 | |||
1067 | error = CLOCK_DISPATCH(timr->it_clock, timer_set, | ||
1068 | (timr, flags, &new_spec, rtn)); | ||
1069 | |||
1070 | unlock_timer(timr, flag); | ||
1071 | if (error == TIMER_RETRY) { | ||
1072 | rtn = NULL;	/* We already got the old time... */ | ||
1073 | goto retry; | ||
1074 | } | ||
1075 | |||
1076 | if (old_setting && !error && copy_to_user(old_setting, | ||
1077 | &old_spec, sizeof (old_spec))) | ||
1078 | error = -EFAULT; | ||
1079 | |||
1080 | return error; | ||
1081 | } | ||
1082 | |||
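To show the settime path in use, a small userspace sketch (not part of this file) that arms a periodic timer; tid is assumed to come from timer_create() and the interval values are arbitrary.

#include <stdio.h>
#include <time.h>

static int arm_periodic(timer_t tid)
{
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1, .tv_nsec = 0 },		/* first expiry in 1s */
		.it_interval = { .tv_sec = 0, .tv_nsec = 250000000 },	/* then every 250ms */
	};

	/* Relative arming; passing TIMER_ABSTIME as the flags argument would
	 * instead take the adjust_abs_time() path above. */
	if (timer_settime(tid, 0, &its, NULL) == -1) {
		perror("timer_settime");
		return -1;
	}
	return 0;
}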
1083 | static inline int common_timer_del(struct k_itimer *timer) | ||
1084 | { | ||
1085 | timer->it.real.incr = 0; | ||
1086 | #ifdef CONFIG_SMP | ||
1087 | if (timer_active(timer) && !del_timer(&timer->it.real.timer)) | ||
1088 | /* | ||
1089 | * It can only be active if on another cpu. Since | ||
1090 | * we have cleared the interval stuff above, it should | ||
1091 | * clear once we release the spin lock. Of course once | ||
1092 | * we do that anything could happen, including the | ||
1093 | * complete melt down of the timer. So return with | ||
1094 | * a "retry" exit status. | ||
1095 | */ | ||
1096 | return TIMER_RETRY; | ||
1097 | #else | ||
1098 | del_timer(&timer->it.real.timer); | ||
1099 | #endif | ||
1100 | remove_from_abslist(timer); | ||
1101 | |||
1102 | return 0; | ||
1103 | } | ||
1104 | |||
1105 | static inline int timer_delete_hook(struct k_itimer *timer) | ||
1106 | { | ||
1107 | return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); | ||
1108 | } | ||
1109 | |||
1110 | /* Delete a POSIX.1b interval timer. */ | ||
1111 | asmlinkage long | ||
1112 | sys_timer_delete(timer_t timer_id) | ||
1113 | { | ||
1114 | struct k_itimer *timer; | ||
1115 | unsigned long flags; | ||
1116 | |||
1117 | #ifdef CONFIG_SMP | ||
1118 | int error; | ||
1119 | retry_delete: | ||
1120 | #endif | ||
1121 | timer = lock_timer(timer_id, &flags); | ||
1122 | if (!timer) | ||
1123 | return -EINVAL; | ||
1124 | |||
1125 | #ifdef CONFIG_SMP | ||
1126 | error = timer_delete_hook(timer); | ||
1127 | |||
1128 | if (error == TIMER_RETRY) { | ||
1129 | unlock_timer(timer, flags); | ||
1130 | goto retry_delete; | ||
1131 | } | ||
1132 | #else | ||
1133 | timer_delete_hook(timer); | ||
1134 | #endif | ||
1135 | spin_lock(¤t->sighand->siglock); | ||
1136 | list_del(&timer->list); | ||
1137 | spin_unlock(¤t->sighand->siglock); | ||
1138 | /* | ||
1139 | * This keeps any tasks waiting on the spin lock from thinking | ||
1140 | * they got something (see the lock code above). | ||
1141 | */ | ||
1142 | if (timer->it_process) { | ||
1143 | if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) | ||
1144 | put_task_struct(timer->it_process); | ||
1145 | timer->it_process = NULL; | ||
1146 | } | ||
1147 | unlock_timer(timer, flags); | ||
1148 | release_posix_timer(timer, IT_ID_SET); | ||
1149 | return 0; | ||
1150 | } | ||
1151 | /* | ||
1152 | * Release a timer owned by the process; used by exit_itimers. | ||
1153 | */ | ||
1154 | static inline void itimer_delete(struct k_itimer *timer) | ||
1155 | { | ||
1156 | unsigned long flags; | ||
1157 | |||
1158 | #ifdef CONFIG_SMP | ||
1159 | int error; | ||
1160 | retry_delete: | ||
1161 | #endif | ||
1162 | spin_lock_irqsave(&timer->it_lock, flags); | ||
1163 | |||
1164 | #ifdef CONFIG_SMP | ||
1165 | error = timer_delete_hook(timer); | ||
1166 | |||
1167 | if (error == TIMER_RETRY) { | ||
1168 | unlock_timer(timer, flags); | ||
1169 | goto retry_delete; | ||
1170 | } | ||
1171 | #else | ||
1172 | timer_delete_hook(timer); | ||
1173 | #endif | ||
1174 | list_del(&timer->list); | ||
1175 | /* | ||
1176 | * This keeps any tasks waiting on the spin lock from thinking | ||
1177 | * they got something (see the lock code above). | ||
1178 | */ | ||
1179 | if (timer->it_process) { | ||
1180 | if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) | ||
1181 | put_task_struct(timer->it_process); | ||
1182 | timer->it_process = NULL; | ||
1183 | } | ||
1184 | unlock_timer(timer, flags); | ||
1185 | release_posix_timer(timer, IT_ID_SET); | ||
1186 | } | ||
1187 | |||
1188 | /* | ||
1189 | * This is called by __exit_signal, only when there are no more | ||
1190 | * references to the shared signal_struct. | ||
1191 | */ | ||
1192 | void exit_itimers(struct signal_struct *sig) | ||
1193 | { | ||
1194 | struct k_itimer *tmr; | ||
1195 | |||
1196 | while (!list_empty(&sig->posix_timers)) { | ||
1197 | tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); | ||
1198 | itimer_delete(tmr); | ||
1199 | } | ||
1200 | } | ||
1201 | |||
1202 | /* | ||
1203 | * And now for the "clock" calls | ||
1204 | * | ||
1205 | * These functions are called both from timer functions (with the timer | ||
1206 | * spin_lock_irq() held) and from clock calls with no locking. They must | ||
1207 | * use the save flags versions of locks. | ||
1208 | */ | ||
1209 | |||
1210 | /* | ||
1211 | * We do ticks here to avoid the irq lock (they take so long). | ||
1212 | * The seqlock is great here. Since we are a reader, we don't really care | ||
1213 | * if we are interrupted since we don't take a lock that will stall us or | ||
1214 | * any other cpu. Voila, no irq lock is needed. | ||
1215 | * | ||
1216 | */ | ||
1217 | |||
1218 | static u64 do_posix_clock_monotonic_gettime_parts( | ||
1219 | struct timespec *tp, struct timespec *mo) | ||
1220 | { | ||
1221 | u64 jiff; | ||
1222 | unsigned int seq; | ||
1223 | |||
1224 | do { | ||
1225 | seq = read_seqbegin(&xtime_lock); | ||
1226 | getnstimeofday(tp); | ||
1227 | *mo = wall_to_monotonic; | ||
1228 | jiff = jiffies_64; | ||
1229 | |||
1230 | } while(read_seqretry(&xtime_lock, seq)); | ||
1231 | |||
1232 | return jiff; | ||
1233 | } | ||
1234 | |||
1235 | static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) | ||
1236 | { | ||
1237 | struct timespec wall_to_mono; | ||
1238 | |||
1239 | do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); | ||
1240 | |||
1241 | tp->tv_sec += wall_to_mono.tv_sec; | ||
1242 | tp->tv_nsec += wall_to_mono.tv_nsec; | ||
1243 | |||
1244 | if ((tp->tv_nsec - NSEC_PER_SEC) >= 0) { | ||
1245 | tp->tv_nsec -= NSEC_PER_SEC; | ||
1246 | tp->tv_sec++; | ||
1247 | } | ||
1248 | return 0; | ||
1249 | } | ||
1250 | |||
1251 | int do_posix_clock_monotonic_gettime(struct timespec *tp) | ||
1252 | { | ||
1253 | return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); | ||
1254 | } | ||
1255 | |||
1256 | int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) | ||
1257 | { | ||
1258 | return -EINVAL; | ||
1259 | } | ||
1260 | EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); | ||
1261 | |||
1262 | int do_posix_clock_notimer_create(struct k_itimer *timer) | ||
1263 | { | ||
1264 | return -EINVAL; | ||
1265 | } | ||
1266 | EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); | ||
1267 | |||
1268 | int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) | ||
1269 | { | ||
1270 | #ifndef ENOTSUP | ||
1271 | return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ | ||
1272 | #else /* parisc does define it separately. */ | ||
1273 | return -ENOTSUP; | ||
1274 | #endif | ||
1275 | } | ||
1276 | EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); | ||
1277 | |||
1278 | asmlinkage long | ||
1279 | sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) | ||
1280 | { | ||
1281 | struct timespec new_tp; | ||
1282 | |||
1283 | if (invalid_clockid(which_clock)) | ||
1284 | return -EINVAL; | ||
1285 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | ||
1286 | return -EFAULT; | ||
1287 | |||
1288 | return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); | ||
1289 | } | ||
1290 | |||
1291 | asmlinkage long | ||
1292 | sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) | ||
1293 | { | ||
1294 | struct timespec kernel_tp; | ||
1295 | int error; | ||
1296 | |||
1297 | if (invalid_clockid(which_clock)) | ||
1298 | return -EINVAL; | ||
1299 | error = CLOCK_DISPATCH(which_clock, clock_get, | ||
1300 | (which_clock, &kernel_tp)); | ||
1301 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | ||
1302 | error = -EFAULT; | ||
1303 | |||
1304 | return error; | ||
1305 | |||
1306 | } | ||
1307 | |||
1308 | asmlinkage long | ||
1309 | sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | ||
1310 | { | ||
1311 | struct timespec rtn_tp; | ||
1312 | int error; | ||
1313 | |||
1314 | if (invalid_clockid(which_clock)) | ||
1315 | return -EINVAL; | ||
1316 | |||
1317 | error = CLOCK_DISPATCH(which_clock, clock_getres, | ||
1318 | (which_clock, &rtn_tp)); | ||
1319 | |||
1320 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { | ||
1321 | error = -EFAULT; | ||
1322 | } | ||
1323 | |||
1324 | return error; | ||
1325 | } | ||
1326 | |||
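A brief userspace sketch (not part of this file) exercising the two clock syscalls above for CLOCK_MONOTONIC; output formatting is arbitrary.

#include <stdio.h>
#include <time.h>

static void show_monotonic(void)
{
	struct timespec res, now;

	/* These calls land in sys_clock_getres() and sys_clock_gettime() above. */
	if (clock_getres(CLOCK_MONOTONIC, &res) == 0)
		printf("resolution: %ld.%09ld s\n", (long) res.tv_sec, res.tv_nsec);
	if (clock_gettime(CLOCK_MONOTONIC, &now) == 0)
		printf("monotonic:  %ld.%09ld s\n", (long) now.tv_sec, now.tv_nsec);
}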
1327 | static void nanosleep_wake_up(unsigned long __data) | ||
1328 | { | ||
1329 | struct task_struct *p = (struct task_struct *) __data; | ||
1330 | |||
1331 | wake_up_process(p); | ||
1332 | } | ||
1333 | |||
1334 | /* | ||
1335 | * The standard says that an absolute nanosleep call MUST wake up at | ||
1336 | * the requested time in spite of clock settings. Here is what we do: | ||
1337 | * For each nanosleep call that needs it (only absolute and not on | ||
1338 | * CLOCK_MONOTONIC* (as it cannot be set)) we thread a little structure | ||
1339 | * into the "nanosleep_abs_list". All we need is the task_struct pointer. | ||
1340 | * Whenever the clock is set we just wake up all those tasks. The rest | ||
1341 | * is done by the while loop in clock_nanosleep(). | ||
1342 | * | ||
1343 | * On locking, clock_was_set() is called from update_wall_clock which | ||
1344 | * holds (or has held for it) a write_lock_irq( xtime_lock) and is | ||
1345 | * called from the timer bh code. Thus we need the irq save locks. | ||
1346 | * | ||
1347 | * Also, on the call from update_wall_clock, that is done as part of a | ||
1348 | * softirq thing. We don't want to delay the system that much (possibly | ||
1349 | * long list of timers to fix), so we defer that work to keventd. | ||
1350 | */ | ||
1351 | |||
1352 | static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue); | ||
1353 | static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL); | ||
1354 | |||
1355 | static DECLARE_MUTEX(clock_was_set_lock); | ||
1356 | |||
1357 | void clock_was_set(void) | ||
1358 | { | ||
1359 | struct k_itimer *timr; | ||
1360 | struct timespec new_wall_to; | ||
1361 | LIST_HEAD(cws_list); | ||
1362 | unsigned long seq; | ||
1363 | |||
1364 | |||
1365 | if (unlikely(in_interrupt())) { | ||
1366 | schedule_work(&clock_was_set_work); | ||
1367 | return; | ||
1368 | } | ||
1369 | wake_up_all(&nanosleep_abs_wqueue); | ||
1370 | |||
1371 | /* | ||
1372 | * Check if there exist TIMER_ABSTIME timers to correct. | ||
1373 | * | ||
1374 | * Notes on locking: This code is run in task context with irq | ||
1375 | * on. We CAN be interrupted! All other usage of the abs list | ||
1376 | * lock is under the timer lock which holds the irq lock as | ||
1377 | * well. We REALLY don't want to scan the whole list with the | ||
1378 | * interrupt system off, AND we would like a sequence lock on | ||
1379 | * this code as well. Since we assume that the clock will not | ||
1380 | * be set often, it seems ok to take and release the irq lock | ||
1381 | * for each timer. In fact add_timer will do this, so this is | ||
1382 | * not an issue. So we know when we are done, we will move the | ||
1383 | * whole list to a new location. Then as we process each entry, | ||
1384 | * we will move it to the actual list again. This way, when our | ||
1385 | * copy is empty, we are done. We are not all that concerned | ||
1386 | * about preemption so we will use a semaphore lock to protect | ||
1387 | * against reentry. This way we will not stall another | ||
1388 | * processor. It is possible that this may delay some timers | ||
1389 | * that should have expired, given the new clock, but even this | ||
1390 | * will be minimal as we will always update to the current time, | ||
1391 | * even if it was set by a task that is waiting for entry to | ||
1392 | * this code. Timers that expire too early will be caught by | ||
1393 | * the expire code and restarted. | ||
1394 | * | ||
1395 | * Absolute timers that repeat are left in the abs list while | ||
1396 | * waiting for the task to pick up the signal. This means we | ||
1397 | * may find timers that are not in the "add_timer" list, but are | ||
1398 | * in the abs list. We do the same thing for these, save | ||
1399 | * putting them back in the "add_timer" list. (Note, these are | ||
1400 | * left in the abs list mainly to indicate that they are | ||
1401 | * ABSOLUTE timers, a fact that is used by the re-arm code, and | ||
1402 | * for which we have no other flag.) | ||
1403 | * | ||
1404 | */ | ||
1405 | |||
1406 | down(&clock_was_set_lock); | ||
1407 | spin_lock_irq(&abs_list.lock); | ||
1408 | list_splice_init(&abs_list.list, &cws_list); | ||
1409 | spin_unlock_irq(&abs_list.lock); | ||
1410 | do { | ||
1411 | do { | ||
1412 | seq = read_seqbegin(&xtime_lock); | ||
1413 | new_wall_to = wall_to_monotonic; | ||
1414 | } while (read_seqretry(&xtime_lock, seq)); | ||
1415 | |||
1416 | spin_lock_irq(&abs_list.lock); | ||
1417 | if (list_empty(&cws_list)) { | ||
1418 | spin_unlock_irq(&abs_list.lock); | ||
1419 | break; | ||
1420 | } | ||
1421 | timr = list_entry(cws_list.next, struct k_itimer, | ||
1422 | it.real.abs_timer_entry); | ||
1423 | |||
1424 | list_del_init(&timr->it.real.abs_timer_entry); | ||
1425 | if (add_clockset_delta(timr, &new_wall_to) && | ||
1426 | del_timer(&timr->it.real.timer)) /* timer run yet? */ | ||
1427 | add_timer(&timr->it.real.timer); | ||
1428 | list_add(&timr->it.real.abs_timer_entry, &abs_list.list); | ||
1429 | spin_unlock_irq(&abs_list.lock); | ||
1430 | } while (1); | ||
1431 | |||
1432 | up(&clock_was_set_lock); | ||
1433 | } | ||
1434 | |||
1435 | long clock_nanosleep_restart(struct restart_block *restart_block); | ||
1436 | |||
1437 | asmlinkage long | ||
1438 | sys_clock_nanosleep(clockid_t which_clock, int flags, | ||
1439 | const struct timespec __user *rqtp, | ||
1440 | struct timespec __user *rmtp) | ||
1441 | { | ||
1442 | struct timespec t; | ||
1443 | struct restart_block *restart_block = | ||
1444 | &(current_thread_info()->restart_block); | ||
1445 | int ret; | ||
1446 | |||
1447 | if (invalid_clockid(which_clock)) | ||
1448 | return -EINVAL; | ||
1449 | |||
1450 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | ||
1451 | return -EFAULT; | ||
1452 | |||
1453 | if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) | ||
1454 | return -EINVAL; | ||
1455 | |||
1456 | /* | ||
1457 | * Do this here as the nsleep function does not have the real address. | ||
1458 | */ | ||
1459 | restart_block->arg1 = (unsigned long)rmtp; | ||
1460 | |||
1461 | ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); | ||
1462 | |||
1463 | if ((ret == -ERESTART_RESTARTBLOCK) && rmtp && | ||
1464 | copy_to_user(rmtp, &t, sizeof (t))) | ||
1465 | return -EFAULT; | ||
1466 | return ret; | ||
1467 | } | ||
1468 | |||
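To illustrate the absolute-sleep behaviour described above (a clock_settime() by another task wakes the sleeper so the deadline can be re-evaluated), a userspace sketch follows; it is not part of this file and the 5 second deadline is arbitrary.

#include <errno.h>
#include <time.h>

/* Sleep until an absolute CLOCK_REALTIME deadline. An interrupting signal
 * makes clock_nanosleep() return EINTR; an absolute request can simply be
 * retried with the same deadline, matching the restart behaviour above. */
static int sleep_until_deadline(void)
{
	struct timespec deadline;
	int err;

	if (clock_gettime(CLOCK_REALTIME, &deadline) == -1)
		return -1;
	deadline.tv_sec += 5;

	do {
		err = clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME,
				      &deadline, NULL);
	} while (err == EINTR);

	return err ? -1 : 0;
}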
1469 | |||
1470 | static int common_nsleep(clockid_t which_clock, | ||
1471 | int flags, struct timespec *tsave) | ||
1472 | { | ||
1473 | struct timespec t, dum; | ||
1474 | struct timer_list new_timer; | ||
1475 | DECLARE_WAITQUEUE(abs_wqueue, current); | ||
1476 | u64 rq_time = (u64)0; | ||
1477 | s64 left; | ||
1478 | int abs; | ||
1479 | struct restart_block *restart_block = | ||
1480 | ¤t_thread_info()->restart_block; | ||
1481 | |||
1482 | abs_wqueue.flags = 0; | ||
1483 | init_timer(&new_timer); | ||
1484 | new_timer.expires = 0; | ||
1485 | new_timer.data = (unsigned long) current; | ||
1486 | new_timer.function = nanosleep_wake_up; | ||
1487 | abs = flags & TIMER_ABSTIME; | ||
1488 | |||
1489 | if (restart_block->fn == clock_nanosleep_restart) { | ||
1490 | /* | ||
1491 | * Interrupted by a non-delivered signal, pick up remaining | ||
1492 | * time and continue. Remaining time is in arg2 & 3. | ||
1493 | */ | ||
1494 | restart_block->fn = do_no_restart_syscall; | ||
1495 | |||
1496 | rq_time = restart_block->arg3; | ||
1497 | rq_time = (rq_time << 32) + restart_block->arg2; | ||
1498 | if (!rq_time) | ||
1499 | return -EINTR; | ||
1500 | left = rq_time - get_jiffies_64(); | ||
1501 | if (left <= (s64)0) | ||
1502 | return 0; /* Already passed */ | ||
1503 | } | ||
1504 | |||
1505 | if (abs && (posix_clocks[which_clock].clock_get != | ||
1506 | posix_clocks[CLOCK_MONOTONIC].clock_get)) | ||
1507 | add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue); | ||
1508 | |||
1509 | do { | ||
1510 | t = *tsave; | ||
1511 | if (abs || !rq_time) { | ||
1512 | adjust_abs_time(&posix_clocks[which_clock], &t, abs, | ||
1513 | &rq_time, &dum); | ||
1514 | } | ||
1515 | |||
1516 | left = rq_time - get_jiffies_64(); | ||
1517 | if (left >= (s64)MAX_JIFFY_OFFSET) | ||
1518 | left = (s64)MAX_JIFFY_OFFSET; | ||
1519 | if (left < (s64)0) | ||
1520 | break; | ||
1521 | |||
1522 | new_timer.expires = jiffies + left; | ||
1523 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1524 | add_timer(&new_timer); | ||
1525 | |||
1526 | schedule(); | ||
1527 | |||
1528 | del_timer_sync(&new_timer); | ||
1529 | left = rq_time - get_jiffies_64(); | ||
1530 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); | ||
1531 | |||
1532 | if (abs_wqueue.task_list.next) | ||
1533 | finish_wait(&nanosleep_abs_wqueue, &abs_wqueue); | ||
1534 | |||
1535 | if (left > (s64)0) { | ||
1536 | |||
1537 | /* | ||
1538 | * Always restart abs calls from scratch to pick up any | ||
1539 | * clock shifting that happened while we are away. | ||
1540 | */ | ||
1541 | if (abs) | ||
1542 | return -ERESTARTNOHAND; | ||
1543 | |||
1544 | left *= TICK_NSEC; | ||
1545 | tsave->tv_sec = div_long_long_rem(left, | ||
1546 | NSEC_PER_SEC, | ||
1547 | &tsave->tv_nsec); | ||
1548 | /* | ||
1549 | * Restart works by saving the time remaining in | ||
1550 | * arg2 & 3 (it is 64 bits of jiffies). The other | ||
1551 | * info we need is the clock_id (saved in arg0). | ||
1552 | * The sys_call interface needs the user's | ||
1553 | * timespec return address which _it_ saves in arg1. | ||
1554 | * Since we have cast the nanosleep call to a clock_nanosleep | ||
1555 | * both can be restarted with the same code. | ||
1556 | */ | ||
1557 | restart_block->fn = clock_nanosleep_restart; | ||
1558 | restart_block->arg0 = which_clock; | ||
1559 | /* | ||
1560 | * Caller sets arg1 | ||
1561 | */ | ||
1562 | restart_block->arg2 = rq_time & 0xffffffffLL; | ||
1563 | restart_block->arg3 = rq_time >> 32; | ||
1564 | |||
1565 | return -ERESTART_RESTARTBLOCK; | ||
1566 | } | ||
1567 | |||
1568 | return 0; | ||
1569 | } | ||
1570 | /* | ||
1571 | * This will restart clock_nanosleep. | ||
1572 | */ | ||
1573 | long | ||
1574 | clock_nanosleep_restart(struct restart_block *restart_block) | ||
1575 | { | ||
1576 | struct timespec t; | ||
1577 | int ret = common_nsleep(restart_block->arg0, 0, &t); | ||
1578 | |||
1579 | if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 && | ||
1580 | copy_to_user((struct timespec __user *)(restart_block->arg1), &t, | ||
1581 | sizeof (t))) | ||
1582 | return -EFAULT; | ||
1583 | return ret; | ||
1584 | } | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig new file mode 100644 index 000000000000..696387ffe49c --- /dev/null +++ b/kernel/power/Kconfig | |||
@@ -0,0 +1,74 @@ | |||
1 | config PM | ||
2 | bool "Power Management support" | ||
3 | ---help--- | ||
4 | "Power Management" means that parts of your computer are shut | ||
5 | off or put into a power conserving "sleep" mode if they are not | ||
6 | being used. There are two competing standards for doing this: APM | ||
7 | and ACPI. If you want to use either one, say Y here and then also | ||
8 | to the requisite support below. | ||
9 | |||
10 | Power Management is most important for battery powered laptop | ||
11 | computers; if you have a laptop, check out the Linux Laptop home | ||
12 | page on the WWW at <http://www.linux-on-laptops.com/> or | ||
13 | Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> | ||
14 | and the Battery Powered Linux mini-HOWTO, available from | ||
15 | <http://www.tldp.org/docs.html#howto>. | ||
16 | |||
17 | Note that, even if you say N here, Linux on the x86 architecture | ||
18 | will issue the hlt instruction if nothing is to be done, thereby | ||
19 | sending the processor to sleep and saving power. | ||
20 | |||
21 | config PM_DEBUG | ||
22 | bool "Power Management Debug Support" | ||
23 | depends on PM | ||
24 | ---help--- | ||
25 | This option enables verbose debugging support in the Power Management | ||
26 | code. This is helpful when debugging and reporting various PM bugs, | ||
27 | like suspend support. | ||
28 | |||
29 | config SOFTWARE_SUSPEND | ||
30 | bool "Software Suspend (EXPERIMENTAL)" | ||
31 | depends on EXPERIMENTAL && PM && SWAP | ||
32 | ---help--- | ||
33 | Enable the possibility of suspending the machine. | ||
34 | It doesn't need APM. | ||
35 | You may suspend your machine by 'swsusp' or 'shutdown -z <time>' | ||
36 | (patch for sysvinit needed). | ||
37 | |||
38 | It creates an image which is saved in your active swap. Upon next | ||
39 | boot, pass the 'resume=/dev/swappartition' argument to the kernel to | ||
40 | have it detect the saved image, restore memory state from it, and | ||
41 | continue to run as before. If you do not want the previous state to | ||
42 | be reloaded, then use the 'noresume' kernel argument. However, note | ||
43 | that your partitions will be fsck'd and you must re-mkswap your swap | ||
44 | partitions. It does not work with swap files. | ||
45 | |||
46 | Right now you may boot without resuming and then later resume, but | ||
47 | in the meantime you cannot use those swap partitions/files which were | ||
48 | involved in suspending. Also, in this case there is a risk that buffers | ||
49 | on disk won't match the saved ones. | ||
50 | |||
51 | For more information take a look at <file:Documentation/power/swsusp.txt>. | ||
52 | |||
53 | config PM_STD_PARTITION | ||
54 | string "Default resume partition" | ||
55 | depends on SOFTWARE_SUSPEND | ||
56 | default "" | ||
57 | ---help--- | ||
58 | The default resume partition is the partition in which the suspend- | ||
59 | to-disk implementation will look for a suspended disk image. | ||
60 | |||
61 | The partition specified here will be different for almost every user. | ||
62 | It should be a valid swap partition (at least for now) that is turned | ||
63 | on before suspending. | ||
64 | |||
65 | The partition specified can be overridden by specifying: | ||
66 | |||
67 | resume=/dev/<other device> | ||
68 | |||
69 | which will set the resume partition to the device specified. | ||
70 | |||
71 | Note that there is currently no way to specify which device to save the | ||
72 | suspended image to. It will simply pick the first available swap | ||
73 | device. | ||
74 | |||
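A concrete illustration of the boot options described in this file; the device name is only an example and depends on the local setup. For a machine whose swap partition is /dev/hda2, the kernel command line for resuming would include:

    resume=/dev/hda2

and adding "noresume" for a single boot makes the kernel skip any saved image, as noted in the SOFTWARE_SUSPEND help above.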
diff --git a/kernel/power/Makefile b/kernel/power/Makefile new file mode 100644 index 000000000000..fbdc634135a7 --- /dev/null +++ b/kernel/power/Makefile | |||
@@ -0,0 +1,11 @@ | |||
1 | |||
2 | ifeq ($(CONFIG_PM_DEBUG),y) | ||
3 | EXTRA_CFLAGS += -DDEBUG | ||
4 | endif | ||
5 | |||
6 | swsusp-smp-$(CONFIG_SMP) += smp.o | ||
7 | |||
8 | obj-y := main.o process.o console.o pm.o | ||
9 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o | ||
10 | |||
11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | ||
diff --git a/kernel/power/console.c b/kernel/power/console.c new file mode 100644 index 000000000000..7ff375e7c95f --- /dev/null +++ b/kernel/power/console.c | |||
@@ -0,0 +1,58 @@ | |||
1 | /* | ||
2 | * kernel/power/console.c - Functions for saving/restoring the console. | ||
3 | * | ||
4 | * Originally from swsusp. | ||
5 | */ | ||
6 | |||
7 | #include <linux/vt_kern.h> | ||
8 | #include <linux/kbd_kern.h> | ||
9 | #include <linux/console.h> | ||
10 | #include "power.h" | ||
11 | |||
12 | static int new_loglevel = 10; | ||
13 | static int orig_loglevel; | ||
14 | #ifdef SUSPEND_CONSOLE | ||
15 | static int orig_fgconsole, orig_kmsg; | ||
16 | #endif | ||
17 | |||
18 | int pm_prepare_console(void) | ||
19 | { | ||
20 | orig_loglevel = console_loglevel; | ||
21 | console_loglevel = new_loglevel; | ||
22 | |||
23 | #ifdef SUSPEND_CONSOLE | ||
24 | acquire_console_sem(); | ||
25 | |||
26 | orig_fgconsole = fg_console; | ||
27 | |||
28 | if (vc_allocate(SUSPEND_CONSOLE)) { | ||
29 | /* We can't get a free VC right now. Too bad, | ||
30 | * we don't want to mess up the screen for now. */ | ||
31 | release_console_sem(); | ||
32 | return 1; | ||
33 | } | ||
34 | |||
35 | set_console(SUSPEND_CONSOLE); | ||
36 | release_console_sem(); | ||
37 | |||
38 | if (vt_waitactive(SUSPEND_CONSOLE)) { | ||
39 | pr_debug("Suspend: Can't switch VCs.\n"); | ||
40 | return 1; | ||
41 | } | ||
42 | orig_kmsg = kmsg_redirect; | ||
43 | kmsg_redirect = SUSPEND_CONSOLE; | ||
44 | #endif | ||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | void pm_restore_console(void) | ||
49 | { | ||
50 | console_loglevel = orig_loglevel; | ||
51 | #ifdef SUSPEND_CONSOLE | ||
52 | acquire_console_sem(); | ||
53 | set_console(orig_fgconsole); | ||
54 | release_console_sem(); | ||
55 | kmsg_redirect = orig_kmsg; | ||
56 | #endif | ||
57 | return; | ||
58 | } | ||
diff --git a/kernel/power/disk.c b/kernel/power/disk.c new file mode 100644 index 000000000000..02b6764034dc --- /dev/null +++ b/kernel/power/disk.c | |||
@@ -0,0 +1,431 @@ | |||
1 | /* | ||
2 | * kernel/power/disk.c - Suspend-to-disk support. | ||
3 | * | ||
4 | * Copyright (c) 2003 Patrick Mochel | ||
5 | * Copyright (c) 2003 Open Source Development Lab | ||
6 | * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> | ||
7 | * | ||
8 | * This file is released under the GPLv2. | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | #include <linux/suspend.h> | ||
13 | #include <linux/syscalls.h> | ||
14 | #include <linux/reboot.h> | ||
15 | #include <linux/string.h> | ||
16 | #include <linux/device.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/fs.h> | ||
19 | #include "power.h" | ||
20 | |||
21 | |||
22 | extern suspend_disk_method_t pm_disk_mode; | ||
23 | extern struct pm_ops * pm_ops; | ||
24 | |||
25 | extern int swsusp_suspend(void); | ||
26 | extern int swsusp_write(void); | ||
27 | extern int swsusp_check(void); | ||
28 | extern int swsusp_read(void); | ||
29 | extern void swsusp_close(void); | ||
30 | extern int swsusp_resume(void); | ||
31 | extern int swsusp_free(void); | ||
32 | |||
33 | |||
34 | static int noresume = 0; | ||
35 | char resume_file[256] = CONFIG_PM_STD_PARTITION; | ||
36 | dev_t swsusp_resume_device; | ||
37 | |||
38 | /** | ||
39 | * power_down - Shut machine down for hibernate. | ||
40 | * @mode: Suspend-to-disk mode | ||
41 | * | ||
42 | * Use the platform driver, if so configured, and return gracefully if it | ||
43 | * fails. | ||
44 | * Otherwise, try to power off and reboot. If they fail, halt the machine; | ||
45 | * there is no turning back. | ||
46 | */ | ||
47 | |||
48 | static void power_down(suspend_disk_method_t mode) | ||
49 | { | ||
50 | unsigned long flags; | ||
51 | int error = 0; | ||
52 | |||
53 | local_irq_save(flags); | ||
54 | switch(mode) { | ||
55 | case PM_DISK_PLATFORM: | ||
56 | device_shutdown(); | ||
57 | error = pm_ops->enter(PM_SUSPEND_DISK); | ||
58 | break; | ||
59 | case PM_DISK_SHUTDOWN: | ||
60 | printk("Powering off system\n"); | ||
61 | device_shutdown(); | ||
62 | machine_power_off(); | ||
63 | break; | ||
64 | case PM_DISK_REBOOT: | ||
65 | device_shutdown(); | ||
66 | machine_restart(NULL); | ||
67 | break; | ||
68 | } | ||
69 | machine_halt(); | ||
70 | /* A valid image is on the disk; if we continue we risk serious data | ||
71 | corruption after resume. */ | ||
72 | printk(KERN_CRIT "Please power me down manually\n"); | ||
73 | while(1); | ||
74 | } | ||
75 | |||
76 | |||
77 | static int in_suspend __nosavedata = 0; | ||
78 | |||
79 | |||
80 | /** | ||
81 | * free_some_memory - Try to free as much memory as possible | ||
82 | * | ||
83 | * ... but do not OOM-kill anyone | ||
84 | * | ||
85 | * Notice: all userland should be stopped at this point, or | ||
86 | * livelock is possible. | ||
87 | */ | ||
88 | |||
89 | static void free_some_memory(void) | ||
90 | { | ||
91 | unsigned int i = 0; | ||
92 | unsigned int tmp; | ||
93 | unsigned long pages = 0; | ||
94 | char *p = "-\\|/"; | ||
95 | |||
96 | printk("Freeing memory... "); | ||
97 | while ((tmp = shrink_all_memory(10000))) { | ||
98 | pages += tmp; | ||
99 | printk("\b%c", p[i]); | ||
100 | i++; | ||
101 | if (i > 3) | ||
102 | i = 0; | ||
103 | } | ||
104 | printk("\bdone (%lu pages freed)\n", pages); | ||
105 | } | ||
106 | |||
107 | |||
108 | static inline void platform_finish(void) | ||
109 | { | ||
110 | if (pm_disk_mode == PM_DISK_PLATFORM) { | ||
111 | if (pm_ops && pm_ops->finish) | ||
112 | pm_ops->finish(PM_SUSPEND_DISK); | ||
113 | } | ||
114 | } | ||
115 | |||
116 | static void finish(void) | ||
117 | { | ||
118 | device_resume(); | ||
119 | platform_finish(); | ||
120 | enable_nonboot_cpus(); | ||
121 | thaw_processes(); | ||
122 | pm_restore_console(); | ||
123 | } | ||
124 | |||
125 | |||
126 | static int prepare_processes(void) | ||
127 | { | ||
128 | int error; | ||
129 | |||
130 | pm_prepare_console(); | ||
131 | |||
132 | sys_sync(); | ||
133 | |||
134 | if (freeze_processes()) { | ||
135 | error = -EBUSY; | ||
136 | return error; | ||
137 | } | ||
138 | |||
139 | if (pm_disk_mode == PM_DISK_PLATFORM) { | ||
140 | if (pm_ops && pm_ops->prepare) { | ||
141 | if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) | ||
142 | return error; | ||
143 | } | ||
144 | } | ||
145 | |||
146 | /* Free memory before shutting down devices. */ | ||
147 | free_some_memory(); | ||
148 | |||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | static void unprepare_processes(void) | ||
153 | { | ||
154 | enable_nonboot_cpus(); | ||
155 | thaw_processes(); | ||
156 | pm_restore_console(); | ||
157 | } | ||
158 | |||
159 | static int prepare_devices(void) | ||
160 | { | ||
161 | int error; | ||
162 | |||
163 | disable_nonboot_cpus(); | ||
164 | if ((error = device_suspend(PMSG_FREEZE))) { | ||
165 | printk("Some devices failed to suspend\n"); | ||
166 | platform_finish(); | ||
167 | enable_nonboot_cpus(); | ||
168 | return error; | ||
169 | } | ||
170 | |||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | /** | ||
175 | * pm_suspend_disk - The grandpappy of power management. | ||
176 | * | ||
177 | * If we're going through the firmware, then get it over with quickly. | ||
178 | * | ||
179 | * If not, then call swsusp to do its thing, then figure out how | ||
180 | * to power down the system. | ||
181 | */ | ||
182 | |||
183 | int pm_suspend_disk(void) | ||
184 | { | ||
185 | int error; | ||
186 | |||
187 | error = prepare_processes(); | ||
188 | if (!error) { | ||
189 | error = prepare_devices(); | ||
190 | } | ||
191 | |||
192 | if (error) { | ||
193 | unprepare_processes(); | ||
194 | return error; | ||
195 | } | ||
196 | |||
197 | pr_debug("PM: Attempting to suspend to disk.\n"); | ||
198 | if (pm_disk_mode == PM_DISK_FIRMWARE) | ||
199 | return pm_ops->enter(PM_SUSPEND_DISK); | ||
200 | |||
201 | pr_debug("PM: snapshotting memory.\n"); | ||
202 | in_suspend = 1; | ||
203 | if ((error = swsusp_suspend())) | ||
204 | goto Done; | ||
205 | |||
206 | if (in_suspend) { | ||
207 | pr_debug("PM: writing image.\n"); | ||
208 | error = swsusp_write(); | ||
209 | if (!error) | ||
210 | power_down(pm_disk_mode); | ||
211 | } else | ||
212 | pr_debug("PM: Image restored successfully.\n"); | ||
213 | swsusp_free(); | ||
214 | Done: | ||
215 | finish(); | ||
216 | return error; | ||
217 | } | ||
218 | |||
219 | |||
220 | /** | ||
221 | * software_resume - Resume from a saved image. | ||
222 | * | ||
223 | * Called as a late_initcall (so all devices are discovered and | ||
224 | * initialized), we call swsusp to see if we have a saved image or not. | ||
225 | * If so, we quiesce devices, then restore the saved image. We will | ||
226 | * return above (in pm_suspend_disk()) if everything goes well. | ||
227 | * Otherwise, we fail gracefully and return to the normally | ||
228 | * scheduled program. | ||
229 | * | ||
230 | */ | ||
231 | |||
232 | static int software_resume(void) | ||
233 | { | ||
234 | int error; | ||
235 | |||
236 | if (noresume) { | ||
237 | /** | ||
238 | * FIXME: If noresume is specified, we need to find the partition | ||
239 | * and reset it back to normal swap space. | ||
240 | */ | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | pr_debug("PM: Checking swsusp image.\n"); | ||
245 | |||
246 | if ((error = swsusp_check())) | ||
247 | goto Done; | ||
248 | |||
249 | pr_debug("PM: Preparing processes for restore.\n"); | ||
250 | |||
251 | if ((error = prepare_processes())) { | ||
252 | swsusp_close(); | ||
253 | goto Cleanup; | ||
254 | } | ||
255 | |||
256 | pr_debug("PM: Reading swsusp image.\n"); | ||
257 | |||
258 | if ((error = swsusp_read())) | ||
259 | goto Cleanup; | ||
260 | |||
261 | pr_debug("PM: Preparing devices for restore.\n"); | ||
262 | |||
263 | if ((error = prepare_devices())) | ||
264 | goto Free; | ||
265 | |||
266 | mb(); | ||
267 | |||
268 | pr_debug("PM: Restoring saved image.\n"); | ||
269 | swsusp_resume(); | ||
270 | pr_debug("PM: Restore failed, recovering.\n"); | ||
271 | finish(); | ||
272 | Free: | ||
273 | swsusp_free(); | ||
274 | Cleanup: | ||
275 | unprepare_processes(); | ||
276 | Done: | ||
277 | pr_debug("PM: Resume from disk failed.\n"); | ||
278 | return 0; | ||
279 | } | ||
280 | |||
281 | late_initcall(software_resume); | ||
282 | |||
283 | |||
284 | static char * pm_disk_modes[] = { | ||
285 | [PM_DISK_FIRMWARE] = "firmware", | ||
286 | [PM_DISK_PLATFORM] = "platform", | ||
287 | [PM_DISK_SHUTDOWN] = "shutdown", | ||
288 | [PM_DISK_REBOOT] = "reboot", | ||
289 | }; | ||
290 | |||
291 | /** | ||
292 | * disk - Control suspend-to-disk mode | ||
293 | * | ||
294 | * Suspend-to-disk can be handled in several ways. The greatest | ||
295 | * distinction is who writes memory to disk - the firmware or the OS. | ||
296 | * If the firmware does it, we assume that it also handles suspending | ||
297 | * the system. | ||
298 | * If the OS does it, then we have three options for putting the system | ||
299 | * to sleep - using the platform driver (e.g. ACPI or other PM registers), | ||
300 | * powering off the system or rebooting the system (for testing). | ||
301 | * | ||
302 | * The system will support either 'firmware' or 'platform', and that is | ||
303 | * known a priori (and encoded in pm_ops). But, the user may choose | ||
304 | * 'shutdown' or 'reboot' as alternatives. | ||
305 | * | ||
306 | * show() will display what the mode is currently set to. | ||
307 | * store() will accept one of | ||
308 | * | ||
309 | * 'firmware' | ||
310 | * 'platform' | ||
311 | * 'shutdown' | ||
312 | * 'reboot' | ||
313 | * | ||
314 | * It will only change to 'firmware' or 'platform' if the system | ||
315 | * supports it (as determined from pm_ops->pm_disk_mode). | ||
316 | */ | ||
317 | |||
318 | static ssize_t disk_show(struct subsystem * subsys, char * buf) | ||
319 | { | ||
320 | return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]); | ||
321 | } | ||
322 | |||
323 | |||
324 | static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) | ||
325 | { | ||
326 | int error = 0; | ||
327 | int i; | ||
328 | int len; | ||
329 | char *p; | ||
330 | suspend_disk_method_t mode = 0; | ||
331 | |||
332 | p = memchr(buf, '\n', n); | ||
333 | len = p ? p - buf : n; | ||
334 | |||
335 | down(&pm_sem); | ||
336 | for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { | ||
337 | if (!strncmp(buf, pm_disk_modes[i], len)) { | ||
338 | mode = i; | ||
339 | break; | ||
340 | } | ||
341 | } | ||
342 | if (mode) { | ||
343 | if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) | ||
344 | pm_disk_mode = mode; | ||
345 | else { | ||
346 | if (pm_ops && pm_ops->enter && | ||
347 | (mode == pm_ops->pm_disk_mode)) | ||
348 | pm_disk_mode = mode; | ||
349 | else | ||
350 | error = -EINVAL; | ||
351 | } | ||
352 | } else | ||
353 | error = -EINVAL; | ||
354 | |||
355 | pr_debug("PM: suspend-to-disk mode set to '%s'\n", | ||
356 | pm_disk_modes[mode]); | ||
357 | up(&pm_sem); | ||
358 | return error ? error : n; | ||
359 | } | ||
360 | |||
361 | power_attr(disk); | ||
362 | |||
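For illustration only (not part of this file): once the attribute above is registered, userspace selects the suspend-to-disk mode by writing one of the listed strings to the 'disk' attribute, conventionally visible as /sys/power/disk (that path is assumed here). A minimal sketch with abbreviated error handling:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* e.g. set_disk_mode("shutdown") or set_disk_mode("reboot"); the string
 * must match one of the pm_disk_modes[] entries accepted by disk_store(). */
static int set_disk_mode(const char *mode)
{
	int fd = open("/sys/power/disk", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, mode, strlen(mode));	/* disk_store() matches up to '\n' */
	close(fd);
	return n < 0 ? -1 : 0;
}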
363 | static ssize_t resume_show(struct subsystem * subsys, char *buf) | ||
364 | { | ||
365 | return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), | ||
366 | MINOR(swsusp_resume_device)); | ||
367 | } | ||
368 | |||
369 | static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) | ||
370 | { | ||
371 | int len; | ||
372 | char *p; | ||
373 | unsigned int maj, min; | ||
374 | int error = -EINVAL; | ||
375 | dev_t res; | ||
376 | |||
377 | p = memchr(buf, '\n', n); | ||
378 | len = p ? p - buf : n; | ||
379 | |||
380 | if (sscanf(buf, "%u:%u", &maj, &min) == 2) { | ||
381 | res = MKDEV(maj,min); | ||
382 | if (maj == MAJOR(res) && min == MINOR(res)) { | ||
383 | swsusp_resume_device = res; | ||
384 | printk("Attempting manual resume\n"); | ||
385 | noresume = 0; | ||
386 | software_resume(); | ||
387 | error = software_resume(); | ||
388 | } | ||
389 | |||
390 | return error >= 0 ? n : error; | ||
391 | } | ||
392 | |||
393 | power_attr(resume); | ||
394 | |||
395 | static struct attribute * g[] = { | ||
396 | &disk_attr.attr, | ||
397 | &resume_attr.attr, | ||
398 | NULL, | ||
399 | }; | ||
400 | |||
401 | |||
402 | static struct attribute_group attr_group = { | ||
403 | .attrs = g, | ||
404 | }; | ||
405 | |||
406 | |||
407 | static int __init pm_disk_init(void) | ||
408 | { | ||
409 | return sysfs_create_group(&power_subsys.kset.kobj,&attr_group); | ||
410 | } | ||
411 | |||
412 | core_initcall(pm_disk_init); | ||
413 | |||
414 | |||
415 | static int __init resume_setup(char *str) | ||
416 | { | ||
417 | if (noresume) | ||
418 | return 1; | ||
419 | |||
420 | strncpy(resume_file, str, 255); | ||
421 | return 1; | ||
422 | } | ||
423 | |||
424 | static int __init noresume_setup(char *str) | ||
425 | { | ||
426 | noresume = 1; | ||
427 | return 1; | ||
428 | } | ||
429 | |||
430 | __setup("noresume", noresume_setup); | ||
431 | __setup("resume=", resume_setup); | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c new file mode 100644 index 000000000000..7960ddf04a57 --- /dev/null +++ b/kernel/power/main.c | |||
@@ -0,0 +1,269 @@ | |||
1 | /* | ||
2 | * kernel/power/main.c - PM subsystem core functionality. | ||
3 | * | ||
4 | * Copyright (c) 2003 Patrick Mochel | ||
5 | * Copyright (c) 2003 Open Source Development Lab | ||
6 | * | ||
7 | * This file is released under the GPLv2 | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | #include <linux/suspend.h> | ||
12 | #include <linux/kobject.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/delay.h> | ||
15 | #include <linux/errno.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/pm.h> | ||
18 | |||
19 | |||
20 | #include "power.h" | ||
21 | |||
22 | DECLARE_MUTEX(pm_sem); | ||
23 | |||
24 | struct pm_ops * pm_ops = NULL; | ||
25 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; | ||
26 | |||
27 | /** | ||
28 | * pm_set_ops - Set the global power method table. | ||
29 | * @ops: Pointer to ops structure. | ||
30 | */ | ||
31 | |||
32 | void pm_set_ops(struct pm_ops * ops) | ||
33 | { | ||
34 | down(&pm_sem); | ||
35 | pm_ops = ops; | ||
36 | up(&pm_sem); | ||
37 | } | ||
38 | |||
39 | |||
40 | /** | ||
41 | * suspend_prepare - Do prep work before entering low-power state. | ||
42 | * @state: State we're entering. | ||
43 | * | ||
44 | * This is common code that is called for each state that we're | ||
45 | * entering. Allocate a console, stop all processes, then make sure | ||
46 | * the platform can enter the requested state. | ||
47 | */ | ||
48 | |||
49 | static int suspend_prepare(suspend_state_t state) | ||
50 | { | ||
51 | int error = 0; | ||
52 | |||
53 | if (!pm_ops || !pm_ops->enter) | ||
54 | return -EPERM; | ||
55 | |||
56 | pm_prepare_console(); | ||
57 | |||
58 | if (freeze_processes()) { | ||
59 | error = -EAGAIN; | ||
60 | goto Thaw; | ||
61 | } | ||
62 | |||
63 | if (pm_ops->prepare) { | ||
64 | if ((error = pm_ops->prepare(state))) | ||
65 | goto Thaw; | ||
66 | } | ||
67 | |||
68 | if ((error = device_suspend(PMSG_SUSPEND))) { | ||
69 | printk(KERN_ERR "Some devices failed to suspend\n"); | ||
70 | goto Finish; | ||
71 | } | ||
72 | return 0; | ||
73 | Finish: | ||
74 | if (pm_ops->finish) | ||
75 | pm_ops->finish(state); | ||
76 | Thaw: | ||
77 | thaw_processes(); | ||
78 | pm_restore_console(); | ||
79 | return error; | ||
80 | } | ||
81 | |||
82 | |||
83 | static int suspend_enter(suspend_state_t state) | ||
84 | { | ||
85 | int error = 0; | ||
86 | unsigned long flags; | ||
87 | |||
88 | local_irq_save(flags); | ||
89 | |||
90 | if ((error = device_power_down(PMSG_SUSPEND))) { | ||
91 | printk(KERN_ERR "Some devices failed to power down\n"); | ||
92 | goto Done; | ||
93 | } | ||
94 | error = pm_ops->enter(state); | ||
95 | device_power_up(); | ||
96 | Done: | ||
97 | local_irq_restore(flags); | ||
98 | return error; | ||
99 | } | ||
100 | |||
101 | |||
102 | /** | ||
103 | * suspend_finish - Do final work before exiting suspend sequence. | ||
104 | * @state: State we're coming out of. | ||
105 | * | ||
106 | * Call platform code to clean up, restart processes, and free the | ||
107 | * console that we've allocated. This is not called for suspend-to-disk. | ||
108 | */ | ||
109 | |||
110 | static void suspend_finish(suspend_state_t state) | ||
111 | { | ||
112 | device_resume(); | ||
113 | if (pm_ops && pm_ops->finish) | ||
114 | pm_ops->finish(state); | ||
115 | thaw_processes(); | ||
116 | pm_restore_console(); | ||
117 | } | ||
118 | |||
119 | |||
120 | |||
121 | |||
122 | static char * pm_states[] = { | ||
123 | [PM_SUSPEND_STANDBY] = "standby", | ||
124 | [PM_SUSPEND_MEM] = "mem", | ||
125 | [PM_SUSPEND_DISK] = "disk", | ||
126 | NULL, | ||
127 | }; | ||
128 | |||
129 | |||
130 | /** | ||
131 | * enter_state - Do common work of entering low-power state. | ||
132 | * @state: pm_state structure for state we're entering. | ||
133 | * | ||
134 | * Make sure we're the only ones trying to enter a sleep state. Fail | ||
135 | * if someone has beaten us to it, since we don't want anything weird to | ||
136 | * happen when we wake up. | ||
137 | * Then, do the setup for suspend, enter the state, and clean up (after | ||
138 | * we've woken up). | ||
139 | */ | ||
140 | |||
141 | static int enter_state(suspend_state_t state) | ||
142 | { | ||
143 | int error; | ||
144 | |||
145 | if (down_trylock(&pm_sem)) | ||
146 | return -EBUSY; | ||
147 | |||
148 | if (state == PM_SUSPEND_DISK) { | ||
149 | error = pm_suspend_disk(); | ||
150 | goto Unlock; | ||
151 | } | ||
152 | |||
153 | /* Suspend is hard to get right on SMP. */ | ||
154 | if (num_online_cpus() != 1) { | ||
155 | error = -EPERM; | ||
156 | goto Unlock; | ||
157 | } | ||
158 | |||
159 | pr_debug("PM: Preparing system for suspend\n"); | ||
160 | if ((error = suspend_prepare(state))) | ||
161 | goto Unlock; | ||
162 | |||
163 | pr_debug("PM: Entering state.\n"); | ||
164 | error = suspend_enter(state); | ||
165 | |||
166 | pr_debug("PM: Finishing up.\n"); | ||
167 | suspend_finish(state); | ||
168 | Unlock: | ||
169 | up(&pm_sem); | ||
170 | return error; | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * This is the main interface to the outside world. It needs to be | ||
175 | * called from process context. | ||
176 | */ | ||
177 | int software_suspend(void) | ||
178 | { | ||
179 | return enter_state(PM_SUSPEND_DISK); | ||
180 | } | ||
181 | |||
182 | |||
183 | /** | ||
184 | * pm_suspend - Externally visible function for suspending system. | ||
185 | * @state: Enumerated value of the state to enter. | ||
186 | * | ||
187 | * Determine whether or not value is within range, get state | ||
188 | * structure, and enter (above). | ||
189 | */ | ||
190 | |||
191 | int pm_suspend(suspend_state_t state) | ||
192 | { | ||
193 | if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) | ||
194 | return enter_state(state); | ||
195 | return -EINVAL; | ||
196 | } | ||
197 | |||
198 | |||
199 | |||
200 | decl_subsys(power,NULL,NULL); | ||
201 | |||
202 | |||
203 | /** | ||
204 | * state - control system power state. | ||
205 | * | ||
206 | * show() returns what states are supported, which is hard-coded to | ||
207 | * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and | ||
208 | * 'disk' (Suspend-to-Disk). | ||
209 | * | ||
210 | * store() accepts one of those strings, translates it into the | ||
211 | * proper enumerated value, and initiates a suspend transition. | ||
212 | */ | ||
213 | |||
214 | static ssize_t state_show(struct subsystem * subsys, char * buf) | ||
215 | { | ||
216 | int i; | ||
217 | char * s = buf; | ||
218 | |||
219 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | ||
220 | if (pm_states[i]) | ||
221 | s += sprintf(s,"%s ",pm_states[i]); | ||
222 | } | ||
223 | s += sprintf(s,"\n"); | ||
224 | return (s - buf); | ||
225 | } | ||
226 | |||
227 | static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) | ||
228 | { | ||
229 | suspend_state_t state = PM_SUSPEND_STANDBY; | ||
230 | char ** s; | ||
231 | char *p; | ||
232 | int error; | ||
233 | int len; | ||
234 | |||
235 | p = memchr(buf, '\n', n); | ||
236 | len = p ? p - buf : n; | ||
237 | |||
238 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { | ||
239 | if (*s && !strncmp(buf, *s, len)) | ||
240 | break; | ||
241 | } | ||
242 | if (*s) | ||
243 | error = enter_state(state); | ||
244 | else | ||
245 | error = -EINVAL; | ||
246 | return error ? error : n; | ||
247 | } | ||
248 | |||
249 | power_attr(state); | ||
250 | |||
251 | static struct attribute * g[] = { | ||
252 | &state_attr.attr, | ||
253 | NULL, | ||
254 | }; | ||
255 | |||
256 | static struct attribute_group attr_group = { | ||
257 | .attrs = g, | ||
258 | }; | ||
259 | |||
260 | |||
261 | static int __init pm_init(void) | ||
262 | { | ||
263 | int error = subsystem_register(&power_subsys); | ||
264 | if (!error) | ||
265 | error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group); | ||
266 | return error; | ||
267 | } | ||
268 | |||
269 | core_initcall(pm_init); | ||
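pm_set_ops() at the top of this file is how a platform plugs its low-power entry points into the core above. A minimal sketch of such a registration follows (illustrative only): it fills in only the pm_ops members that main.c actually dereferences (prepare, enter, finish, pm_disk_mode), the callback bodies are placeholders, and the authoritative structure definition lives in <linux/pm.h>. Once registered, a write of "mem" or "standby" to the /sys/power/state file created by pm_init() reaches enter_state(), which wraps these hooks around device_suspend() and device_power_down().

/* Illustrative platform glue; not part of this commit. Only the
 * pm_ops members used by main.c above are filled in. */
#include <linux/init.h>
#include <linux/pm.h>

static int example_pm_prepare(suspend_state_t state)
{
	/* Set up platform firmware for the requested state (placeholder). */
	return 0;
}

static int example_pm_enter(suspend_state_t state)
{
	/* Actually enter the low-power state (placeholder). */
	return 0;
}

static int example_pm_finish(suspend_state_t state)
{
	/* Undo whatever prepare() did (placeholder). */
	return 0;
}

static struct pm_ops example_pm_ops = {
	.prepare      = example_pm_prepare,
	.enter        = example_pm_enter,
	.finish       = example_pm_finish,
	.pm_disk_mode = PM_DISK_SHUTDOWN,
};

static int __init example_pm_init(void)
{
	pm_set_ops(&example_pm_ops);
	return 0;
}
__initcall(example_pm_init);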
diff --git a/kernel/power/pm.c b/kernel/power/pm.c new file mode 100644 index 000000000000..61deda04e39e --- /dev/null +++ b/kernel/power/pm.c | |||
@@ -0,0 +1,265 @@ | |||
1 | /* | ||
2 | * pm.c - Power management interface | ||
3 | * | ||
4 | * Copyright (C) 2000 Andrew Henroid | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/spinlock.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/pm.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | |||
28 | int pm_active; | ||
29 | |||
30 | /* | ||
31 | * Locking notes: | ||
32 | * pm_devs_lock can be a semaphore providing pm ops are not called | ||
33 | * from an interrupt handler (already a bad idea so no change here). Each | ||
34 | * change must be protected so that an unlink of an entry doesn't clash | ||
35 | * with a pm send - which is permitted to sleep in the current architecture | ||
36 | * | ||
37 | * Module unloads clashing with pm events now work out safely, the module | ||
38 | * unload path will block until the event has been sent. It may well block | ||
39 | * until a resume but that will be fine. | ||
40 | */ | ||
41 | |||
42 | static DECLARE_MUTEX(pm_devs_lock); | ||
43 | static LIST_HEAD(pm_devs); | ||
44 | |||
45 | /** | ||
46 | * pm_register - register a device with power management | ||
47 | * @type: device type | ||
48 | * @id: device ID | ||
49 | * @callback: callback function | ||
50 | * | ||
51 | * Add a device to the list of devices that wish to be notified about | ||
52 | * power management events. A &pm_dev structure is returned on success, | ||
53 | * on failure the return is %NULL. | ||
54 | * | ||
55 | * The callback function will be called in process context and | ||
56 | * it may sleep. | ||
57 | */ | ||
58 | |||
59 | struct pm_dev *pm_register(pm_dev_t type, | ||
60 | unsigned long id, | ||
61 | pm_callback callback) | ||
62 | { | ||
63 | struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); | ||
64 | if (dev) { | ||
65 | memset(dev, 0, sizeof(*dev)); | ||
66 | dev->type = type; | ||
67 | dev->id = id; | ||
68 | dev->callback = callback; | ||
69 | |||
70 | down(&pm_devs_lock); | ||
71 | list_add(&dev->entry, &pm_devs); | ||
72 | up(&pm_devs_lock); | ||
73 | } | ||
74 | return dev; | ||
75 | } | ||
76 | |||
77 | /** | ||
78 | * pm_unregister - unregister a device with power management | ||
79 | * @dev: device to unregister | ||
80 | * | ||
81 | * Remove a device from the power management notification lists. The | ||
82 | * dev passed must be a handle previously returned by pm_register. | ||
83 | */ | ||
84 | |||
85 | void pm_unregister(struct pm_dev *dev) | ||
86 | { | ||
87 | if (dev) { | ||
88 | down(&pm_devs_lock); | ||
89 | list_del(&dev->entry); | ||
90 | up(&pm_devs_lock); | ||
91 | |||
92 | kfree(dev); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | static void __pm_unregister(struct pm_dev *dev) | ||
97 | { | ||
98 | if (dev) { | ||
99 | list_del(&dev->entry); | ||
100 | kfree(dev); | ||
101 | } | ||
102 | } | ||
103 | |||
104 | /** | ||
105 | * pm_unregister_all - unregister all devices with matching callback | ||
106 | * @callback: callback function pointer | ||
107 | * | ||
108 | * Unregister every device that would call the callback passed. This | ||
109 | * is primarily meant as a helper function for loadable modules. It | ||
110 | * enables a module to give up all its managed devices without keeping | ||
111 | * its own private list. | ||
112 | */ | ||
113 | |||
114 | void pm_unregister_all(pm_callback callback) | ||
115 | { | ||
116 | struct list_head *entry; | ||
117 | |||
118 | if (!callback) | ||
119 | return; | ||
120 | |||
121 | down(&pm_devs_lock); | ||
122 | entry = pm_devs.next; | ||
123 | while (entry != &pm_devs) { | ||
124 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
125 | entry = entry->next; | ||
126 | if (dev->callback == callback) | ||
127 | __pm_unregister(dev); | ||
128 | } | ||
129 | up(&pm_devs_lock); | ||
130 | } | ||
131 | |||
132 | /** | ||
133 | * pm_send - send request to a single device | ||
134 | * @dev: device to send to | ||
135 | * @rqst: power management request | ||
136 | * @data: data for the callback | ||
137 | * | ||
138 | * Issue a power management request to a given device. The | ||
139 | * %PM_SUSPEND and %PM_RESUME events are handled specially. The | ||
140 | * data field must hold the intended next state. No call is made | ||
141 | * if the state matches. | ||
142 | * | ||
143 | * BUGS: what stops two power management requests from occurring in | ||
144 | * parallel and conflicting? | ||
145 | * | ||
146 | * WARNING: Calling pm_send directly is not generally recommended, in | ||
147 | * particular there is no locking against the pm_dev going away. The | ||
148 | * caller must maintain all needed locking or have 'inside knowledge' | ||
149 | * on the safety. Also remember that this function is not locked against | ||
150 | * pm_unregister. This means that you must handle SMP races on callback | ||
151 | * execution and unload yourself. | ||
152 | */ | ||
153 | |||
154 | static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) | ||
155 | { | ||
156 | int status = 0; | ||
157 | unsigned long prev_state, next_state; | ||
158 | |||
159 | if (in_interrupt()) | ||
160 | BUG(); | ||
161 | |||
162 | switch (rqst) { | ||
163 | case PM_SUSPEND: | ||
164 | case PM_RESUME: | ||
165 | prev_state = dev->state; | ||
166 | next_state = (unsigned long) data; | ||
167 | if (prev_state != next_state) { | ||
168 | if (dev->callback) | ||
169 | status = (*dev->callback)(dev, rqst, data); | ||
170 | if (!status) { | ||
171 | dev->state = next_state; | ||
172 | dev->prev_state = prev_state; | ||
173 | } | ||
174 | } | ||
175 | else { | ||
176 | dev->prev_state = prev_state; | ||
177 | } | ||
178 | break; | ||
179 | default: | ||
180 | if (dev->callback) | ||
181 | status = (*dev->callback)(dev, rqst, data); | ||
182 | break; | ||
183 | } | ||
184 | return status; | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * Undo incomplete request | ||
189 | */ | ||
190 | static void pm_undo_all(struct pm_dev *last) | ||
191 | { | ||
192 | struct list_head *entry = last->entry.prev; | ||
193 | while (entry != &pm_devs) { | ||
194 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
195 | if (dev->state != dev->prev_state) { | ||
196 | /* previous state was zero (running) resume or | ||
197 | * previous state was non-zero (suspended) suspend | ||
198 | */ | ||
199 | pm_request_t undo = (dev->prev_state | ||
200 | ? PM_SUSPEND:PM_RESUME); | ||
201 | pm_send(dev, undo, (void*) dev->prev_state); | ||
202 | } | ||
203 | entry = entry->prev; | ||
204 | } | ||
205 | } | ||
206 | |||
207 | /** | ||
208 | * pm_send_all - send request to all managed devices | ||
209 | * @rqst: power management request | ||
210 | * @data: data for the callback | ||
211 | * | ||
212 | * Issue a power management request to all devices. The | ||
213 | * %PM_SUSPEND events are handled specially. Any device is | ||
214 | * permitted to fail a suspend by returning a non-zero (error) | ||
215 | * value from its callback function. If any device vetoes a | ||
216 | * suspend request then all other devices that have suspended | ||
217 | * during the processing of this request are restored to their | ||
218 | * previous state. | ||
219 | * | ||
220 | * WARNING: This function takes the pm_devs_lock. The lock is not dropped until | ||
221 | * the callbacks have completed. This prevents races against pm locking | ||
222 | * functions, races against module unload pm_unregister code. It does | ||
223 | * mean however that you must not issue pm_ functions within the callback | ||
224 | * or you will deadlock and users will hate you. | ||
225 | * | ||
226 | * Zero is returned on success. If a suspend fails then the status | ||
227 | * from the device that vetoes the suspend is returned. | ||
228 | * | ||
229 | * BUGS: what stops two power management requests from occurring in | ||
230 | * parallel and conflicting? | ||
231 | */ | ||
232 | |||
233 | int pm_send_all(pm_request_t rqst, void *data) | ||
234 | { | ||
235 | struct list_head *entry; | ||
236 | |||
237 | down(&pm_devs_lock); | ||
238 | entry = pm_devs.next; | ||
239 | while (entry != &pm_devs) { | ||
240 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
241 | if (dev->callback) { | ||
242 | int status = pm_send(dev, rqst, data); | ||
243 | if (status) { | ||
244 | /* return devices to previous state on | ||
245 | * failed suspend request | ||
246 | */ | ||
247 | if (rqst == PM_SUSPEND) | ||
248 | pm_undo_all(dev); | ||
249 | up(&pm_devs_lock); | ||
250 | return status; | ||
251 | } | ||
252 | } | ||
253 | entry = entry->next; | ||
254 | } | ||
255 | up(&pm_devs_lock); | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | EXPORT_SYMBOL(pm_register); | ||
260 | EXPORT_SYMBOL(pm_unregister); | ||
261 | EXPORT_SYMBOL(pm_unregister_all); | ||
262 | EXPORT_SYMBOL(pm_send_all); | ||
263 | EXPORT_SYMBOL(pm_active); | ||
264 | |||
265 | |||
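pm_register(), pm_unregister() and pm_send_all() above form the older list-based notification API. A hedged driver-side sketch follows (illustrative only): PM_SYS_DEV is assumed to be one of the pm_dev_t constants from <linux/pm.h>, the id of 0 is arbitrary, and the callback body is a placeholder. Returning non-zero from the PM_SUSPEND case is how a driver vetoes the transition, as described for pm_send_all() above.

/* Illustrative legacy-API user; not part of this commit. */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/pm.h>

static struct pm_dev *example_pm_dev;

static int example_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
{
	switch (rqst) {
	case PM_SUSPEND:
		/* Quiesce the hardware; returning non-zero vetoes the suspend. */
		break;
	case PM_RESUME:
		/* Reprogram the hardware after wakeup. */
		break;
	default:
		break;
	}
	return 0;
}

static int __init example_init(void)
{
	example_pm_dev = pm_register(PM_SYS_DEV, 0, example_pm_callback);
	return example_pm_dev ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	pm_unregister(example_pm_dev);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");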
diff --git a/kernel/power/power.h b/kernel/power/power.h new file mode 100644 index 000000000000..cd6a3493cc0d --- /dev/null +++ b/kernel/power/power.h | |||
@@ -0,0 +1,52 @@ | |||
1 | #include <linux/suspend.h> | ||
2 | #include <linux/utsname.h> | ||
3 | |||
4 | /* With SUSPEND_CONSOLE defined, suspend looks *really* cool, but | ||
5 | we probably do not take enough locks for switching consoles, etc, | ||
6 | so bad things might happen. | ||
7 | */ | ||
8 | #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) | ||
9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | ||
10 | #endif | ||
11 | |||
12 | |||
13 | struct swsusp_info { | ||
14 | struct new_utsname uts; | ||
15 | u32 version_code; | ||
16 | unsigned long num_physpages; | ||
17 | int cpus; | ||
18 | unsigned long image_pages; | ||
19 | unsigned long pagedir_pages; | ||
20 | suspend_pagedir_t * suspend_pagedir; | ||
21 | swp_entry_t pagedir[768]; | ||
22 | } __attribute__((aligned(PAGE_SIZE))); | ||
23 | |||
24 | |||
25 | |||
26 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
27 | extern int pm_suspend_disk(void); | ||
28 | |||
29 | #else | ||
30 | static inline int pm_suspend_disk(void) | ||
31 | { | ||
32 | return -EPERM; | ||
33 | } | ||
34 | #endif | ||
35 | extern struct semaphore pm_sem; | ||
36 | #define power_attr(_name) \ | ||
37 | static struct subsys_attribute _name##_attr = { \ | ||
38 | .attr = { \ | ||
39 | .name = __stringify(_name), \ | ||
40 | .mode = 0644, \ | ||
41 | }, \ | ||
42 | .show = _name##_show, \ | ||
43 | .store = _name##_store, \ | ||
44 | } | ||
45 | |||
46 | extern struct subsystem power_subsys; | ||
47 | |||
48 | extern int freeze_processes(void); | ||
49 | extern void thaw_processes(void); | ||
50 | |||
51 | extern int pm_prepare_console(void); | ||
52 | extern void pm_restore_console(void); | ||
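For reference, the power_attr() macro above expands its argument into a subsys_attribute wired to the matching _show/_store pair; power_attr(state) in main.c, for instance, comes out roughly as follows (hand-expanded, for illustration only):

/* Hand-expanded form of power_attr(state); illustrative only. */
static struct subsys_attribute state_attr = {
	.attr = {
		.name = "state",
		.mode = 0644,
	},
	.show  = state_show,
	.store = state_store,
};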
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c new file mode 100644 index 000000000000..715081b2d829 --- /dev/null +++ b/kernel/power/poweroff.c | |||
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * poweroff.c - sysrq handler to gracefully power down machine. | ||
3 | * | ||
4 | * This file is released under the GPL v2 | ||
5 | */ | ||
6 | |||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/sysrq.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/pm.h> | ||
11 | #include <linux/workqueue.h> | ||
12 | |||
13 | /* | ||
14 | * When the user hits Sys-Rq o to power down the machine this is the | ||
15 | * callback we use. | ||
16 | */ | ||
17 | |||
18 | static void do_poweroff(void *dummy) | ||
19 | { | ||
20 | if (pm_power_off) | ||
21 | pm_power_off(); | ||
22 | } | ||
23 | |||
24 | static DECLARE_WORK(poweroff_work, do_poweroff, NULL); | ||
25 | |||
26 | static void handle_poweroff(int key, struct pt_regs *pt_regs, | ||
27 | struct tty_struct *tty) | ||
28 | { | ||
29 | schedule_work(&poweroff_work); | ||
30 | } | ||
31 | |||
32 | static struct sysrq_key_op sysrq_poweroff_op = { | ||
33 | .handler = handle_poweroff, | ||
34 | .help_msg = "powerOff", | ||
35 | .action_msg = "Power Off", | ||
36 | .enable_mask = SYSRQ_ENABLE_BOOT, | ||
37 | }; | ||
38 | |||
39 | static int pm_sysrq_init(void) | ||
40 | { | ||
41 | register_sysrq_key('o', &sysrq_poweroff_op); | ||
42 | return 0; | ||
43 | } | ||
44 | |||
45 | subsys_initcall(pm_sysrq_init); | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c new file mode 100644 index 000000000000..78d92dc6a1ed --- /dev/null +++ b/kernel/power/process.c | |||
@@ -0,0 +1,121 @@ | |||
1 | /* | ||
2 | * kernel/power/process.c - Functions for starting/stopping processes on | ||
3 | * suspend transitions. | ||
4 | * | ||
5 | * Originally from swsusp. | ||
6 | */ | ||
7 | |||
8 | |||
9 | #undef DEBUG | ||
10 | |||
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/suspend.h> | ||
14 | #include <linux/module.h> | ||
15 | |||
16 | /* | ||
17 | * Timeout for stopping processes | ||
18 | */ | ||
19 | #define TIMEOUT (6 * HZ) | ||
20 | |||
21 | |||
22 | static inline int freezeable(struct task_struct * p) | ||
23 | { | ||
24 | if ((p == current) || | ||
25 | (p->flags & PF_NOFREEZE) || | ||
26 | (p->exit_state == EXIT_ZOMBIE) || | ||
27 | (p->exit_state == EXIT_DEAD) || | ||
28 | (p->state == TASK_STOPPED) || | ||
29 | (p->state == TASK_TRACED)) | ||
30 | return 0; | ||
31 | return 1; | ||
32 | } | ||
33 | |||
34 | /* Refrigerator is the place where frozen processes are stored :-). */ | ||
35 | void refrigerator(unsigned long flag) | ||
36 | { | ||
37 | /* Hmm, should we be allowed to suspend when there are realtime | ||
38 | processes around? */ | ||
39 | long save; | ||
40 | save = current->state; | ||
41 | current->state = TASK_UNINTERRUPTIBLE; | ||
42 | pr_debug("%s entered refrigerator\n", current->comm); | ||
43 | printk("="); | ||
44 | current->flags &= ~PF_FREEZE; | ||
45 | |||
46 | spin_lock_irq(¤t->sighand->siglock); | ||
47 | recalc_sigpending(); /* We sent fake signal, clean it up */ | ||
48 | spin_unlock_irq(¤t->sighand->siglock); | ||
49 | |||
50 | current->flags |= PF_FROZEN; | ||
51 | while (current->flags & PF_FROZEN) | ||
52 | schedule(); | ||
53 | pr_debug("%s left refrigerator\n", current->comm); | ||
54 | current->state = save; | ||
55 | } | ||
56 | |||
57 | /* 0 = success, else # of processes that we failed to stop */ | ||
58 | int freeze_processes(void) | ||
59 | { | ||
60 | int todo; | ||
61 | unsigned long start_time; | ||
62 | struct task_struct *g, *p; | ||
63 | |||
64 | printk( "Stopping tasks: " ); | ||
65 | start_time = jiffies; | ||
66 | do { | ||
67 | todo = 0; | ||
68 | read_lock(&tasklist_lock); | ||
69 | do_each_thread(g, p) { | ||
70 | unsigned long flags; | ||
71 | if (!freezeable(p)) | ||
72 | continue; | ||
73 | if ((p->flags & PF_FROZEN) || | ||
74 | (p->state == TASK_TRACED) || | ||
75 | (p->state == TASK_STOPPED)) | ||
76 | continue; | ||
77 | |||
78 | /* FIXME: smp problem here: we may not access other process' flags | ||
79 | without locking */ | ||
80 | p->flags |= PF_FREEZE; | ||
81 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
82 | signal_wake_up(p, 0); | ||
83 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
84 | todo++; | ||
85 | } while_each_thread(g, p); | ||
86 | read_unlock(&tasklist_lock); | ||
87 | yield(); /* Yield is okay here */ | ||
88 | if (time_after(jiffies, start_time + TIMEOUT)) { | ||
89 | printk( "\n" ); | ||
90 | printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); | ||
91 | return todo; | ||
92 | } | ||
93 | } while(todo); | ||
94 | |||
95 | printk( "|\n" ); | ||
96 | BUG_ON(in_atomic()); | ||
97 | return 0; | ||
98 | } | ||
99 | |||
100 | void thaw_processes(void) | ||
101 | { | ||
102 | struct task_struct *g, *p; | ||
103 | |||
104 | printk( "Restarting tasks..." ); | ||
105 | read_lock(&tasklist_lock); | ||
106 | do_each_thread(g, p) { | ||
107 | if (!freezeable(p)) | ||
108 | continue; | ||
109 | if (p->flags & PF_FROZEN) { | ||
110 | p->flags &= ~PF_FROZEN; | ||
111 | wake_up_process(p); | ||
112 | } else | ||
113 | printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); | ||
114 | } while_each_thread(g, p); | ||
115 | |||
116 | read_unlock(&tasklist_lock); | ||
117 | schedule(); | ||
118 | printk( " done\n" ); | ||
119 | } | ||
120 | |||
121 | EXPORT_SYMBOL(refrigerator); | ||
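freeze_processes() above only raises PF_FREEZE and sends a fake signal; each task is expected to notice the flag and park itself in refrigerator(), which is why that function is exported. A minimal sketch of the cooperating side of the contract, in the style a kernel-thread loop of this era might use, is shown below (illustrative only; the thread's real work is a placeholder).

/* Illustrative freezable kernel-thread loop; not part of this commit. */
#include <linux/sched.h>
#include <linux/suspend.h>

static int example_thread(void *unused)
{
	for (;;) {
		/* Cooperate with freeze_processes(): it sets PF_FREEZE on us
		 * and wakes us with a fake signal, so park in the refrigerator
		 * until thaw_processes() clears PF_FROZEN. */
		if (current->flags & PF_FREEZE)
			refrigerator(PF_FREEZE);

		/* ... the thread's real work would go here (placeholder) ... */

		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);
	}
	return 0;
}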
diff --git a/kernel/power/smp.c b/kernel/power/smp.c new file mode 100644 index 000000000000..7fa7f6e2b7fb --- /dev/null +++ b/kernel/power/smp.c | |||
@@ -0,0 +1,85 @@ | |||
1 | /* | ||
2 | * kernel/power/smp.c - Functions for stopping other CPUs. | ||
3 | * | ||
4 | * Copyright 2004 Pavel Machek <pavel@suse.cz> | ||
5 | * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz> | ||
6 | * | ||
7 | * This file is released under the GPLv2. | ||
8 | */ | ||
9 | |||
10 | #undef DEBUG | ||
11 | |||
12 | #include <linux/smp_lock.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/suspend.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <asm/atomic.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | |||
19 | static atomic_t cpu_counter, freeze; | ||
20 | |||
21 | |||
22 | static void smp_pause(void * data) | ||
23 | { | ||
24 | struct saved_context ctxt; | ||
25 | __save_processor_state(&ctxt); | ||
26 | printk("Sleeping in:\n"); | ||
27 | dump_stack(); | ||
28 | atomic_inc(&cpu_counter); | ||
29 | while (atomic_read(&freeze)) { | ||
30 | /* FIXME: restore takes place at a random point inside this loop. | ||
31 | This should probably be written in assembly, and | ||
32 | preserve general-purpose registers, too. | ||
33 | |||
34 | What about the stack? We may need to move to a new stack here. | ||
35 | |||
36 | This had better be run with interrupts disabled. | ||
37 | */ | ||
38 | cpu_relax(); | ||
39 | barrier(); | ||
40 | } | ||
41 | atomic_dec(&cpu_counter); | ||
42 | __restore_processor_state(&ctxt); | ||
43 | } | ||
44 | |||
45 | static cpumask_t oldmask; | ||
46 | |||
47 | void disable_nonboot_cpus(void) | ||
48 | { | ||
49 | printk("Freezing CPUs (at %d)", smp_processor_id()); | ||
50 | oldmask = current->cpus_allowed; | ||
51 | set_cpus_allowed(current, cpumask_of_cpu(0)); | ||
52 | current->state = TASK_INTERRUPTIBLE; | ||
53 | schedule_timeout(HZ); | ||
54 | printk("..."); | ||
55 | BUG_ON(smp_processor_id() != 0); | ||
56 | |||
57 | /* FIXME: for this to work, all the CPUs must be running | ||
58 | * "idle" thread (or we deadlock). Is that guaranteed? */ | ||
59 | |||
60 | atomic_set(&cpu_counter, 0); | ||
61 | atomic_set(&freeze, 1); | ||
62 | smp_call_function(smp_pause, NULL, 0, 0); | ||
63 | while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { | ||
64 | cpu_relax(); | ||
65 | barrier(); | ||
66 | } | ||
67 | printk("ok\n"); | ||
68 | } | ||
69 | |||
70 | void enable_nonboot_cpus(void) | ||
71 | { | ||
72 | printk("Restarting CPUs"); | ||
73 | atomic_set(&freeze, 0); | ||
74 | while (atomic_read(&cpu_counter)) { | ||
75 | cpu_relax(); | ||
76 | barrier(); | ||
77 | } | ||
78 | printk("..."); | ||
79 | set_cpus_allowed(current, oldmask); | ||
80 | schedule(); | ||
81 | printk("ok\n"); | ||
82 | |||
83 | } | ||
84 | |||
85 | |||
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c new file mode 100644 index 000000000000..ae5bebc3b18f --- /dev/null +++ b/kernel/power/swsusp.c | |||
@@ -0,0 +1,1433 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/swsusp.c | ||
3 | * | ||
4 | * This file implements the architecture-independent | ||
5 | * machine suspend feature using almost exclusively high-level routines | ||
6 | * | ||
7 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> | ||
8 | * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> | ||
9 | * | ||
10 | * This file is released under the GPLv2. | ||
11 | * | ||
12 | * I'd like to thank the following people for their work: | ||
13 | * | ||
14 | * Pavel Machek <pavel@ucw.cz>: | ||
15 | * Modifications, pointing out defects, being with me at the very beginning, | ||
16 | * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. | ||
17 | * | ||
18 | * Steve Doddi <dirk@loth.demon.co.uk>: | ||
19 | * Support the possibility of hardware state restoring. | ||
20 | * | ||
21 | * Raph <grey.havens@earthling.net>: | ||
22 | * Support for preserving states of network devices and virtual console | ||
23 | * (including X and svgatextmode) | ||
24 | * | ||
25 | * Kurt Garloff <garloff@suse.de>: | ||
26 | * Straightened the critical function in order to prevent compilers from | ||
27 | * playing tricks with local variables. | ||
28 | * | ||
29 | * Andreas Mohr <a.mohr@mailto.de> | ||
30 | * | ||
31 | * Alex Badea <vampire@go.ro>: | ||
32 | * Fixed runaway init | ||
33 | * | ||
34 | * More state savers are welcome. Especially for the scsi layer... | ||
35 | * | ||
36 | * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt | ||
37 | */ | ||
38 | |||
39 | #include <linux/module.h> | ||
40 | #include <linux/mm.h> | ||
41 | #include <linux/suspend.h> | ||
42 | #include <linux/smp_lock.h> | ||
43 | #include <linux/file.h> | ||
44 | #include <linux/utsname.h> | ||
45 | #include <linux/version.h> | ||
46 | #include <linux/delay.h> | ||
47 | #include <linux/reboot.h> | ||
48 | #include <linux/bitops.h> | ||
49 | #include <linux/vt_kern.h> | ||
50 | #include <linux/kbd_kern.h> | ||
51 | #include <linux/keyboard.h> | ||
52 | #include <linux/spinlock.h> | ||
53 | #include <linux/genhd.h> | ||
54 | #include <linux/kernel.h> | ||
55 | #include <linux/major.h> | ||
56 | #include <linux/swap.h> | ||
57 | #include <linux/pm.h> | ||
58 | #include <linux/device.h> | ||
59 | #include <linux/buffer_head.h> | ||
60 | #include <linux/swapops.h> | ||
61 | #include <linux/bootmem.h> | ||
62 | #include <linux/syscalls.h> | ||
63 | #include <linux/console.h> | ||
64 | #include <linux/highmem.h> | ||
65 | #include <linux/bio.h> | ||
66 | |||
67 | #include <asm/uaccess.h> | ||
68 | #include <asm/mmu_context.h> | ||
69 | #include <asm/pgtable.h> | ||
70 | #include <asm/tlbflush.h> | ||
71 | #include <asm/io.h> | ||
72 | |||
73 | #include "power.h" | ||
74 | |||
75 | /* References to section boundaries */ | ||
76 | extern const void __nosave_begin, __nosave_end; | ||
77 | |||
78 | /* Variables to be preserved over suspend */ | ||
79 | static int nr_copy_pages_check; | ||
80 | |||
81 | extern char resume_file[]; | ||
82 | |||
83 | /* Local variables that should not be affected by save */ | ||
84 | unsigned int nr_copy_pages __nosavedata = 0; | ||
85 | |||
86 | /* Suspend pagedir is allocated before final copy, therefore it | ||
87 | must be freed after resume | ||
88 | |||
89 | Warning: this is evil. There are actually two pagedirs at time of | ||
90 | resume. One is "pagedir_save", which is empty frame allocated at | ||
91 | time of suspend, that must be freed. Second is "pagedir_nosave", | ||
92 | allocated at time of resume, that travels through memory not to | ||
93 | collide with anything. | ||
94 | |||
95 | Warning: this is even more evil than it seems. Pagedirs this file | ||
96 | talks about are completely different from page directories used by | ||
97 | MMU hardware. | ||
98 | */ | ||
99 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; | ||
100 | static suspend_pagedir_t *pagedir_save; | ||
101 | |||
102 | #define SWSUSP_SIG "S1SUSPEND" | ||
103 | |||
104 | static struct swsusp_header { | ||
105 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; | ||
106 | swp_entry_t swsusp_info; | ||
107 | char orig_sig[10]; | ||
108 | char sig[10]; | ||
109 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | ||
110 | |||
111 | static struct swsusp_info swsusp_info; | ||
112 | |||
113 | /* | ||
114 | * XXX: We try to keep some more pages free so that I/O operations succeed | ||
115 | * without paging. Might this be more? | ||
116 | */ | ||
117 | #define PAGES_FOR_IO 512 | ||
118 | |||
119 | /* | ||
120 | * Saving part... | ||
121 | */ | ||
122 | |||
123 | /* We memorize in swapfile_used what swap devices are used for suspension */ | ||
124 | #define SWAPFILE_UNUSED 0 | ||
125 | #define SWAPFILE_SUSPEND 1 /* This is the suspending device */ | ||
126 | #define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */ | ||
127 | |||
128 | static unsigned short swapfile_used[MAX_SWAPFILES]; | ||
129 | static unsigned short root_swap; | ||
130 | |||
131 | static int mark_swapfiles(swp_entry_t prev) | ||
132 | { | ||
133 | int error; | ||
134 | |||
135 | rw_swap_page_sync(READ, | ||
136 | swp_entry(root_swap, 0), | ||
137 | virt_to_page((unsigned long)&swsusp_header)); | ||
138 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | ||
139 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | ||
140 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | ||
141 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | ||
142 | swsusp_header.swsusp_info = prev; | ||
143 | error = rw_swap_page_sync(WRITE, | ||
144 | swp_entry(root_swap, 0), | ||
145 | virt_to_page((unsigned long) | ||
146 | &swsusp_header)); | ||
147 | } else { | ||
148 | pr_debug("swsusp: Partition is not swap space.\n"); | ||
149 | error = -ENODEV; | ||
150 | } | ||
151 | return error; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Check whether the swap device is the specified resume | ||
156 | * device, irrespective of whether they are specified by | ||
157 | * identical names. | ||
158 | * | ||
159 | * (Thus, device inode aliasing is allowed. You can say /dev/hda4 | ||
160 | * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs] | ||
161 | * and they'll be considered the same device. This is *necessary* for | ||
162 | * devfs, since the resume code can only recognize the form /dev/hda4, | ||
163 | * but the suspend code would see the long name.) | ||
164 | */ | ||
165 | static int is_resume_device(const struct swap_info_struct *swap_info) | ||
166 | { | ||
167 | struct file *file = swap_info->swap_file; | ||
168 | struct inode *inode = file->f_dentry->d_inode; | ||
169 | |||
170 | return S_ISBLK(inode->i_mode) && | ||
171 | swsusp_resume_device == MKDEV(imajor(inode), iminor(inode)); | ||
172 | } | ||
173 | |||
174 | static int swsusp_swap_check(void) /* This is called before saving image */ | ||
175 | { | ||
176 | int i, len; | ||
177 | |||
178 | len=strlen(resume_file); | ||
179 | root_swap = 0xFFFF; | ||
180 | |||
181 | swap_list_lock(); | ||
182 | for(i=0; i<MAX_SWAPFILES; i++) { | ||
183 | if (swap_info[i].flags == 0) { | ||
184 | swapfile_used[i]=SWAPFILE_UNUSED; | ||
185 | } else { | ||
186 | if(!len) { | ||
187 | printk(KERN_WARNING "resume= option should be used to set suspend device" ); | ||
188 | if(root_swap == 0xFFFF) { | ||
189 | swapfile_used[i] = SWAPFILE_SUSPEND; | ||
190 | root_swap = i; | ||
191 | } else | ||
192 | swapfile_used[i] = SWAPFILE_IGNORED; | ||
193 | } else { | ||
194 | /* we ignore all swap devices that are not the resume_file */ | ||
195 | if (is_resume_device(&swap_info[i])) { | ||
196 | swapfile_used[i] = SWAPFILE_SUSPEND; | ||
197 | root_swap = i; | ||
198 | } else { | ||
199 | swapfile_used[i] = SWAPFILE_IGNORED; | ||
200 | } | ||
201 | } | ||
202 | } | ||
203 | } | ||
204 | swap_list_unlock(); | ||
205 | return (root_swap != 0xffff) ? 0 : -ENODEV; | ||
206 | } | ||
207 | |||
208 | /** | ||
209 | * This is called after saving the image, so modifications | ||
210 | * will be lost after resume... and that's what we want: | ||
211 | * we make the devices unusable. A new call to | ||
212 | * lock_swapdevices can unlock the devices. | ||
213 | */ | ||
214 | static void lock_swapdevices(void) | ||
215 | { | ||
216 | int i; | ||
217 | |||
218 | swap_list_lock(); | ||
219 | for(i = 0; i< MAX_SWAPFILES; i++) | ||
220 | if(swapfile_used[i] == SWAPFILE_IGNORED) { | ||
221 | swap_info[i].flags ^= 0xFF; | ||
222 | } | ||
223 | swap_list_unlock(); | ||
224 | } | ||
225 | |||
226 | /** | ||
227 | * write_swap_page - Write one page to a fresh swap location. | ||
228 | * @addr: Address we're writing. | ||
229 | * @loc: Place to store the entry we used. | ||
230 | * | ||
231 | * Allocate a new swap entry and 'sync' it. Note we discard -EIO | ||
232 | * errors. That is an artifact left over from swsusp. It did not | ||
233 | * check the return of rw_swap_page_sync() at all, since most pages | ||
234 | * written back to swap would return -EIO. | ||
235 | * This is a partial improvement, since we will at least return other | ||
236 | * errors, though we need to eventually fix the damn code. | ||
237 | */ | ||
238 | static int write_page(unsigned long addr, swp_entry_t * loc) | ||
239 | { | ||
240 | swp_entry_t entry; | ||
241 | int error = 0; | ||
242 | |||
243 | entry = get_swap_page(); | ||
244 | if (swp_offset(entry) && | ||
245 | swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { | ||
246 | error = rw_swap_page_sync(WRITE, entry, | ||
247 | virt_to_page(addr)); | ||
248 | if (error == -EIO) | ||
249 | error = 0; | ||
250 | if (!error) | ||
251 | *loc = entry; | ||
252 | } else | ||
253 | error = -ENOSPC; | ||
254 | return error; | ||
255 | } | ||
256 | |||
257 | /** | ||
258 | * data_free - Free the swap entries used by the saved image. | ||
259 | * | ||
260 | * Walk the list of used swap entries and free each one. | ||
261 | * This is only used for cleanup when suspend fails. | ||
262 | */ | ||
263 | static void data_free(void) | ||
264 | { | ||
265 | swp_entry_t entry; | ||
266 | int i; | ||
267 | |||
268 | for (i = 0; i < nr_copy_pages; i++) { | ||
269 | entry = (pagedir_nosave + i)->swap_address; | ||
270 | if (entry.val) | ||
271 | swap_free(entry); | ||
272 | else | ||
273 | break; | ||
274 | (pagedir_nosave + i)->swap_address = (swp_entry_t){0}; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | /** | ||
279 | * data_write - Write saved image to swap. | ||
280 | * | ||
281 | * Walk the list of pages in the image and sync each one to swap. | ||
282 | */ | ||
283 | static int data_write(void) | ||
284 | { | ||
285 | int error = 0, i = 0; | ||
286 | unsigned int mod = nr_copy_pages / 100; | ||
287 | struct pbe *p; | ||
288 | |||
289 | if (!mod) | ||
290 | mod = 1; | ||
291 | |||
292 | printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); | ||
293 | for_each_pbe(p, pagedir_nosave) { | ||
294 | if (!(i%mod)) | ||
295 | printk( "\b\b\b\b%3d%%", i / mod ); | ||
296 | if ((error = write_page(p->address, &(p->swap_address)))) | ||
297 | return error; | ||
298 | i++; | ||
299 | } | ||
300 | printk("\b\b\b\bdone\n"); | ||
301 | return error; | ||
302 | } | ||
303 | |||
304 | static void dump_info(void) | ||
305 | { | ||
306 | pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code); | ||
307 | pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages); | ||
308 | pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname); | ||
309 | pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename); | ||
310 | pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release); | ||
311 | pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version); | ||
312 | pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine); | ||
313 | pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); | ||
314 | pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); | ||
315 | pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); | ||
316 | pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); | ||
317 | } | ||
318 | |||
319 | static void init_header(void) | ||
320 | { | ||
321 | memset(&swsusp_info, 0, sizeof(swsusp_info)); | ||
322 | swsusp_info.version_code = LINUX_VERSION_CODE; | ||
323 | swsusp_info.num_physpages = num_physpages; | ||
324 | memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); | ||
325 | |||
326 | swsusp_info.suspend_pagedir = pagedir_nosave; | ||
327 | swsusp_info.cpus = num_online_cpus(); | ||
328 | swsusp_info.image_pages = nr_copy_pages; | ||
329 | } | ||
330 | |||
331 | static int close_swap(void) | ||
332 | { | ||
333 | swp_entry_t entry; | ||
334 | int error; | ||
335 | |||
336 | dump_info(); | ||
337 | error = write_page((unsigned long)&swsusp_info, &entry); | ||
338 | if (!error) { | ||
339 | printk( "S" ); | ||
340 | error = mark_swapfiles(entry); | ||
341 | printk( "|\n" ); | ||
342 | } | ||
343 | return error; | ||
344 | } | ||
345 | |||
346 | /** | ||
347 | * free_pagedir_entries - Free pages used by the page directory. | ||
348 | * | ||
349 | * This is used during suspend for error recovery. | ||
350 | */ | ||
351 | |||
352 | static void free_pagedir_entries(void) | ||
353 | { | ||
354 | int i; | ||
355 | |||
356 | for (i = 0; i < swsusp_info.pagedir_pages; i++) | ||
357 | swap_free(swsusp_info.pagedir[i]); | ||
358 | } | ||
359 | |||
360 | |||
361 | /** | ||
362 | * write_pagedir - Write the array of pages holding the page directory. | ||
363 | * The swap entries used are recorded in swsusp_info.pagedir[] for the header. | ||
364 | */ | ||
365 | |||
366 | static int write_pagedir(void) | ||
367 | { | ||
368 | int error = 0; | ||
369 | unsigned n = 0; | ||
370 | struct pbe * pbe; | ||
371 | |||
372 | printk( "Writing pagedir..."); | ||
373 | for_each_pb_page(pbe, pagedir_nosave) { | ||
374 | if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) | ||
375 | return error; | ||
376 | } | ||
377 | |||
378 | swsusp_info.pagedir_pages = n; | ||
379 | printk("done (%u pages)\n", n); | ||
380 | return error; | ||
381 | } | ||
382 | |||
383 | /** | ||
384 | * write_suspend_image - Write entire image and metadata. | ||
385 | * | ||
386 | */ | ||
387 | |||
388 | static int write_suspend_image(void) | ||
389 | { | ||
390 | int error; | ||
391 | |||
392 | init_header(); | ||
393 | if ((error = data_write())) | ||
394 | goto FreeData; | ||
395 | |||
396 | if ((error = write_pagedir())) | ||
397 | goto FreePagedir; | ||
398 | |||
399 | if ((error = close_swap())) | ||
400 | goto FreePagedir; | ||
401 | Done: | ||
402 | return error; | ||
403 | FreePagedir: | ||
404 | free_pagedir_entries(); | ||
405 | FreeData: | ||
406 | data_free(); | ||
407 | goto Done; | ||
408 | } | ||
409 | |||
410 | |||
411 | #ifdef CONFIG_HIGHMEM | ||
412 | struct highmem_page { | ||
413 | char *data; | ||
414 | struct page *page; | ||
415 | struct highmem_page *next; | ||
416 | }; | ||
417 | |||
418 | static struct highmem_page *highmem_copy; | ||
419 | |||
420 | static int save_highmem_zone(struct zone *zone) | ||
421 | { | ||
422 | unsigned long zone_pfn; | ||
423 | mark_free_pages(zone); | ||
424 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
425 | struct page *page; | ||
426 | struct highmem_page *save; | ||
427 | void *kaddr; | ||
428 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
429 | |||
430 | if (!(pfn%1000)) | ||
431 | printk("."); | ||
432 | if (!pfn_valid(pfn)) | ||
433 | continue; | ||
434 | page = pfn_to_page(pfn); | ||
435 | /* | ||
436 | * This condition results from rvmalloc() sans vmalloc_32() | ||
437 | * and architectural memory reservations. This should be | ||
438 | * corrected eventually when the cases giving rise to this | ||
439 | * are better understood. | ||
440 | */ | ||
441 | if (PageReserved(page)) { | ||
442 | printk("highmem reserved page?!\n"); | ||
443 | continue; | ||
444 | } | ||
445 | BUG_ON(PageNosave(page)); | ||
446 | if (PageNosaveFree(page)) | ||
447 | continue; | ||
448 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
449 | if (!save) | ||
450 | return -ENOMEM; | ||
451 | save->next = highmem_copy; | ||
452 | save->page = page; | ||
453 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
454 | if (!save->data) { | ||
455 | kfree(save); | ||
456 | return -ENOMEM; | ||
457 | } | ||
458 | kaddr = kmap_atomic(page, KM_USER0); | ||
459 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
460 | kunmap_atomic(kaddr, KM_USER0); | ||
461 | highmem_copy = save; | ||
462 | } | ||
463 | return 0; | ||
464 | } | ||
465 | #endif /* CONFIG_HIGHMEM */ | ||
466 | |||
467 | |||
468 | static int save_highmem(void) | ||
469 | { | ||
470 | #ifdef CONFIG_HIGHMEM | ||
471 | struct zone *zone; | ||
472 | int res = 0; | ||
473 | |||
474 | pr_debug("swsusp: Saving Highmem\n"); | ||
475 | for_each_zone(zone) { | ||
476 | if (is_highmem(zone)) | ||
477 | res = save_highmem_zone(zone); | ||
478 | if (res) | ||
479 | return res; | ||
480 | } | ||
481 | #endif | ||
482 | return 0; | ||
483 | } | ||
484 | |||
485 | static int restore_highmem(void) | ||
486 | { | ||
487 | #ifdef CONFIG_HIGHMEM | ||
488 | printk("swsusp: Restoring Highmem\n"); | ||
489 | while (highmem_copy) { | ||
490 | struct highmem_page *save = highmem_copy; | ||
491 | void *kaddr; | ||
492 | highmem_copy = save->next; | ||
493 | |||
494 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
495 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
496 | kunmap_atomic(kaddr, KM_USER0); | ||
497 | free_page((long) save->data); | ||
498 | kfree(save); | ||
499 | } | ||
500 | #endif | ||
501 | return 0; | ||
502 | } | ||
503 | |||
504 | |||
505 | static int pfn_is_nosave(unsigned long pfn) | ||
506 | { | ||
507 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
508 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
509 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
510 | } | ||
511 | |||
512 | /** | ||
513 | * saveable - Determine whether a page should be cloned or not. | ||
514 | * @pfn: The page | ||
515 | * | ||
516 | * We save a page if it's Reserved, and not in the range of pages | ||
517 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
518 | * isn't part of a free chunk of pages. | ||
519 | */ | ||
520 | |||
521 | static int saveable(struct zone * zone, unsigned long * zone_pfn) | ||
522 | { | ||
523 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
524 | struct page * page; | ||
525 | |||
526 | if (!pfn_valid(pfn)) | ||
527 | return 0; | ||
528 | |||
529 | page = pfn_to_page(pfn); | ||
530 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
531 | if (PageNosave(page)) | ||
532 | return 0; | ||
533 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | ||
534 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
535 | return 0; | ||
536 | } | ||
537 | if (PageNosaveFree(page)) | ||
538 | return 0; | ||
539 | |||
540 | return 1; | ||
541 | } | ||
542 | |||
543 | static void count_data_pages(void) | ||
544 | { | ||
545 | struct zone *zone; | ||
546 | unsigned long zone_pfn; | ||
547 | |||
548 | nr_copy_pages = 0; | ||
549 | |||
550 | for_each_zone(zone) { | ||
551 | if (is_highmem(zone)) | ||
552 | continue; | ||
553 | mark_free_pages(zone); | ||
554 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
555 | nr_copy_pages += saveable(zone, &zone_pfn); | ||
556 | } | ||
557 | } | ||
558 | |||
559 | |||
560 | static void copy_data_pages(void) | ||
561 | { | ||
562 | struct zone *zone; | ||
563 | unsigned long zone_pfn; | ||
564 | struct pbe * pbe = pagedir_nosave; | ||
565 | |||
566 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); | ||
567 | for_each_zone(zone) { | ||
568 | if (is_highmem(zone)) | ||
569 | continue; | ||
570 | mark_free_pages(zone); | ||
571 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
572 | if (saveable(zone, &zone_pfn)) { | ||
573 | struct page * page; | ||
574 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
575 | BUG_ON(!pbe); | ||
576 | pbe->orig_address = (long) page_address(page); | ||
577 | /* copy_page is not usable for copying task structs. */ | ||
578 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
579 | pbe = pbe->next; | ||
580 | } | ||
581 | } | ||
582 | } | ||
583 | BUG_ON(pbe); | ||
584 | } | ||
585 | |||
586 | |||
587 | /** | ||
588 | * calc_nr - Determine the number of pages needed for a pbe list. | ||
589 | */ | ||
590 | |||
591 | static int calc_nr(int nr_copy) | ||
592 | { | ||
593 | int extra = 0; | ||
594 | int mod = !!(nr_copy % PBES_PER_PAGE); | ||
595 | int diff = (nr_copy / PBES_PER_PAGE) + mod; | ||
596 | |||
597 | do { | ||
598 | extra += diff; | ||
599 | nr_copy += diff; | ||
600 | mod = !!(nr_copy % PBES_PER_PAGE); | ||
601 | diff = (nr_copy / PBES_PER_PAGE) + mod - extra; | ||
602 | } while (diff > 0); | ||
603 | |||
604 | return nr_copy; | ||
605 | } | ||
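calc_nr() converges on a count in which the pbe pages describing the image are themselves included: each round adds the directory pages needed for the current total, then re-checks whether the enlarged total needs more. A standalone demonstration follows (illustrative only; PBES_PER_PAGE is assumed to be 128 here, the real value comes from <linux/suspend.h>). With 1000 data pages it settles at 1008: 1000 pages need 8 pbe pages, and 1008 still fits in 8.

/* Standalone demo of the calc_nr() fixed point above (userspace).
 * PBES_PER_PAGE = 128 is an assumed value for illustration; the real
 * one is derived from PAGE_SIZE and struct pbe in <linux/suspend.h>. */
#include <stdio.h>

#define PBES_PER_PAGE 128

static int calc_nr_demo(int nr_copy)
{
	int extra = 0;
	int mod = !!(nr_copy % PBES_PER_PAGE);
	int diff = (nr_copy / PBES_PER_PAGE) + mod;

	do {
		extra += diff;		/* pagedir pages accounted for so far */
		nr_copy += diff;	/* they must be part of the image too */
		mod = !!(nr_copy % PBES_PER_PAGE);
		diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
	} while (diff > 0);

	return nr_copy;
}

int main(void)
{
	printf("1000 -> %d\n", calc_nr_demo(1000));	/* prints 1000 -> 1008 */
	return 0;
}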
606 | |||
607 | /** | ||
608 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
609 | */ | ||
610 | |||
611 | static inline void free_pagedir(struct pbe *pblist) | ||
612 | { | ||
613 | struct pbe *pbe; | ||
614 | |||
615 | while (pblist) { | ||
616 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
617 | free_page((unsigned long)pblist); | ||
618 | pblist = pbe; | ||
619 | } | ||
620 | } | ||
621 | |||
622 | /** | ||
623 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
624 | */ | ||
625 | |||
626 | static inline void fill_pb_page(struct pbe *pbpage) | ||
627 | { | ||
628 | struct pbe *p; | ||
629 | |||
630 | p = pbpage; | ||
631 | pbpage += PB_PAGE_SKIP; | ||
632 | do | ||
633 | p->next = p + 1; | ||
634 | while (++p < pbpage); | ||
635 | } | ||
636 | |||
637 | /** | ||
638 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
639 | * of memory pages allocated with alloc_pagedir() | ||
640 | */ | ||
641 | |||
642 | static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) | ||
643 | { | ||
644 | struct pbe *pbpage, *p; | ||
645 | unsigned num = PBES_PER_PAGE; | ||
646 | |||
647 | for_each_pb_page (pbpage, pblist) { | ||
648 | if (num >= nr_pages) | ||
649 | break; | ||
650 | |||
651 | fill_pb_page(pbpage); | ||
652 | num += PBES_PER_PAGE; | ||
653 | } | ||
654 | if (pbpage) { | ||
655 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
656 | p->next = p + 1; | ||
657 | p->next = NULL; | ||
658 | } | ||
659 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
660 | } | ||
661 | |||
662 | /** | ||
663 | * alloc_pagedir - Allocate the page directory. | ||
664 | * | ||
665 | * First, determine exactly how many pages we need and | ||
666 | * allocate them. | ||
667 | * | ||
668 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
669 | * struct pbe elements (pbes) and the last element in the page points | ||
670 | * to the next page. | ||
671 | * | ||
672 | * On each page we set up a list of struct pbe elements. | ||
673 | */ | ||
674 | |||
675 | static struct pbe * alloc_pagedir(unsigned nr_pages) | ||
676 | { | ||
677 | unsigned num; | ||
678 | struct pbe *pblist, *pbe; | ||
679 | |||
680 | if (!nr_pages) | ||
681 | return NULL; | ||
682 | |||
683 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
684 | pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
685 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
686 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
687 | pbe += PB_PAGE_SKIP; | ||
688 | pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
689 | } | ||
690 | if (!pbe) { /* get_zeroed_page() failed */ | ||
691 | free_pagedir(pblist); | ||
692 | pblist = NULL; | ||
693 | } | ||
694 | return pblist; | ||
695 | } | ||
696 | |||
697 | /** | ||
698 | * free_image_pages - Free pages allocated for snapshot | ||
699 | */ | ||
700 | |||
701 | static void free_image_pages(void) | ||
702 | { | ||
703 | struct pbe * p; | ||
704 | |||
705 | for_each_pbe(p, pagedir_save) { | ||
706 | if (p->address) { | ||
707 | ClearPageNosave(virt_to_page(p->address)); | ||
708 | free_page(p->address); | ||
709 | p->address = 0; | ||
710 | } | ||
711 | } | ||
712 | } | ||
713 | |||
714 | /** | ||
715 | * alloc_image_pages - Allocate pages for the snapshot. | ||
716 | */ | ||
717 | |||
718 | static int alloc_image_pages(void) | ||
719 | { | ||
720 | struct pbe * p; | ||
721 | |||
722 | for_each_pbe(p, pagedir_save) { | ||
723 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
724 | if (!p->address) | ||
725 | return -ENOMEM; | ||
726 | SetPageNosave(virt_to_page(p->address)); | ||
727 | } | ||
728 | return 0; | ||
729 | } | ||
730 | |||
731 | void swsusp_free(void) | ||
732 | { | ||
733 | BUG_ON(PageNosave(virt_to_page(pagedir_save))); | ||
734 | BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); | ||
735 | free_image_pages(); | ||
736 | free_pagedir(pagedir_save); | ||
737 | } | ||
738 | |||
739 | |||
740 | /** | ||
741 | * enough_free_mem - Make sure we have enough free memory to snapshot. | ||
742 | * | ||
743 | * Returns TRUE or FALSE after checking the number of available | ||
744 | * free pages. | ||
745 | */ | ||
746 | |||
747 | static int enough_free_mem(void) | ||
748 | { | ||
749 | if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { | ||
750 | pr_debug("swsusp: Not enough free pages: Have %d\n", | ||
751 | nr_free_pages()); | ||
752 | return 0; | ||
753 | } | ||
754 | return 1; | ||
755 | } | ||
756 | |||
757 | |||
758 | /** | ||
759 | * enough_swap - Make sure we have enough swap to save the image. | ||
760 | * | ||
761 | * Returns TRUE or FALSE after checking the total amount of swap | ||
762 | * space available. | ||
763 | * | ||
764 | * FIXME: si_swapinfo(&i) returns all swap devices information. | ||
765 | * We should only consider resume_device. | ||
766 | */ | ||
767 | |||
768 | static int enough_swap(void) | ||
769 | { | ||
770 | struct sysinfo i; | ||
771 | |||
772 | si_swapinfo(&i); | ||
773 | if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { | ||
774 | pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); | ||
775 | return 0; | ||
776 | } | ||
777 | return 1; | ||
778 | } | ||
779 | |||
780 | static int swsusp_alloc(void) | ||
781 | { | ||
782 | int error; | ||
783 | |||
784 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", | ||
785 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); | ||
786 | |||
787 | pagedir_nosave = NULL; | ||
788 | if (!enough_free_mem()) | ||
789 | return -ENOMEM; | ||
790 | |||
791 | if (!enough_swap()) | ||
792 | return -ENOSPC; | ||
793 | |||
794 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
795 | |||
796 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { | ||
797 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
798 | return -ENOMEM; | ||
799 | } | ||
800 | create_pbe_list(pagedir_save, nr_copy_pages); | ||
801 | pagedir_nosave = pagedir_save; | ||
802 | if ((error = alloc_image_pages())) { | ||
803 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
804 | swsusp_free(); | ||
805 | return error; | ||
806 | } | ||
807 | |||
808 | nr_copy_pages_check = nr_copy_pages; | ||
809 | return 0; | ||
810 | } | ||
811 | |||
812 | static int suspend_prepare_image(void) | ||
813 | { | ||
814 | int error; | ||
815 | |||
816 | pr_debug("swsusp: critical section: \n"); | ||
817 | if (save_highmem()) { | ||
818 | printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); | ||
819 | restore_highmem(); | ||
820 | return -ENOMEM; | ||
821 | } | ||
822 | |||
823 | drain_local_pages(); | ||
824 | count_data_pages(); | ||
825 | printk("swsusp: Need to copy %u pages\n", nr_copy_pages); | ||
826 | |||
827 | error = swsusp_alloc(); | ||
828 | if (error) | ||
829 | return error; | ||
830 | |||
831 | /* During allocation of the suspend pagedir, new cold pages may appear. | ||
832 | * Kill them. | ||
833 | */ | ||
834 | drain_local_pages(); | ||
835 | copy_data_pages(); | ||
836 | |||
837 | /* | ||
838 | * End of critical section. From now on, we can write to memory, | ||
839 | * but we should not touch disk. This specially means we must _not_ | ||
840 | * touch swap space! Except we must write out our image of course. | ||
841 | */ | ||
842 | |||
843 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); | ||
844 | return 0; | ||
845 | } | ||
846 | |||
847 | |||
848 | /* It is important _NOT_ to umount filesystems at this point. We want | ||
849 | * them synced (in case something goes wrong) but we DO NOT want to mark | ||
850 | * the filesystems clean: they are not. (And it does not matter; if we resume | ||
851 | * correctly, we'll mark the system clean anyway.) | ||
852 | */ | ||
853 | int swsusp_write(void) | ||
854 | { | ||
855 | int error; | ||
856 | device_resume(); | ||
857 | lock_swapdevices(); | ||
858 | error = write_suspend_image(); | ||
859 | /* This will unlock ignored swap devices since writing is finished */ | ||
860 | lock_swapdevices(); | ||
861 | return error; | ||
862 | |||
863 | } | ||
864 | |||
865 | |||
866 | extern asmlinkage int swsusp_arch_suspend(void); | ||
867 | extern asmlinkage int swsusp_arch_resume(void); | ||
868 | |||
869 | |||
870 | asmlinkage int swsusp_save(void) | ||
871 | { | ||
872 | int error = 0; | ||
873 | |||
874 | if ((error = swsusp_swap_check())) { | ||
875 | printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " | ||
876 | "swapon -a!\n"); | ||
877 | return error; | ||
878 | } | ||
879 | return suspend_prepare_image(); | ||
880 | } | ||
881 | |||
882 | int swsusp_suspend(void) | ||
883 | { | ||
884 | int error; | ||
885 | if ((error = arch_prepare_suspend())) | ||
886 | return error; | ||
887 | local_irq_disable(); | ||
888 | /* At this point, device_suspend() has been called, but *not* | ||
889 | * device_power_down(). We *must* device_power_down() now. | ||
890 | * Otherwise, drivers for some devices (e.g. interrupt controllers) | ||
891 | * become desynchronized with the actual state of the hardware | ||
892 | * at resume time, and evil weirdness ensues. | ||
893 | */ | ||
894 | if ((error = device_power_down(PMSG_FREEZE))) { | ||
895 | printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); | ||
896 | local_irq_enable(); | ||
897 | swsusp_free(); | ||
898 | return error; | ||
899 | } | ||
900 | save_processor_state(); | ||
901 | if ((error = swsusp_arch_suspend())) | ||
902 | swsusp_free(); | ||
903 | /* Restore control flow magically appears here */ | ||
904 | restore_processor_state(); | ||
905 | BUG_ON (nr_copy_pages_check != nr_copy_pages); | ||
906 | restore_highmem(); | ||
907 | device_power_up(); | ||
908 | local_irq_enable(); | ||
909 | return error; | ||
910 | } | ||
911 | |||
912 | int swsusp_resume(void) | ||
913 | { | ||
914 | int error; | ||
915 | local_irq_disable(); | ||
916 | if (device_power_down(PMSG_FREEZE)) | ||
917 | printk(KERN_ERR "Some devices failed to power down, very bad\n"); | ||
918 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | ||
919 | save_processor_state(); | ||
920 | error = swsusp_arch_resume(); | ||
921 | /* Code below is only ever reached in case of failure. Otherwise | ||
922 | * execution continues at place where swsusp_arch_suspend was called | ||
923 | */ | ||
924 | BUG_ON(!error); | ||
925 | restore_processor_state(); | ||
926 | restore_highmem(); | ||
927 | device_power_up(); | ||
928 | local_irq_enable(); | ||
929 | return error; | ||
930 | } | ||
931 | |||
932 | /* More restore stuff */ | ||
933 | |||
934 | /* | ||
935 | * Returns true if given address/order collides with any orig_address | ||
936 | */ | ||
937 | static int does_collide_order(unsigned long addr, int order) | ||
938 | { | ||
939 | int i; | ||
940 | |||
941 | for (i=0; i < (1<<order); i++) | ||
942 | if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE))) | ||
943 | return 1; | ||
944 | return 0; | ||
945 | } | ||
946 | |||
947 | /** | ||
948 | * On resume, for storing the PBE list and the image, | ||
949 | * we can only use memory pages that do not conflict with the pages | ||
950 | * which had been used before suspend. | ||
951 | * | ||
952 | * We don't know which pages are usable until we allocate them. | ||
953 | * | ||
954 | * Allocated but unusable (i.e. eaten) memory pages are linked together | ||
955 | * to create a list, so that we can free them easily | ||
956 | * | ||
957 | * We could have used a type other than (void *) | ||
958 | * for this purpose, but ... | ||
959 | */ | ||
960 | static void **eaten_memory = NULL; | ||
961 | |||
962 | static inline void eat_page(void *page) | ||
963 | { | ||
964 | void **c; | ||
965 | |||
966 | c = eaten_memory; | ||
967 | eaten_memory = page; | ||
968 | *eaten_memory = c; | ||
969 | } | ||
970 | |||
971 | static unsigned long get_usable_page(unsigned gfp_mask) | ||
972 | { | ||
973 | unsigned long m; | ||
974 | |||
975 | m = get_zeroed_page(gfp_mask); | ||
976 | while (does_collide_order(m, 0)) { | ||
977 | eat_page((void *)m); | ||
978 | m = get_zeroed_page(gfp_mask); | ||
979 | if (!m) | ||
980 | break; | ||
981 | } | ||
982 | return m; | ||
983 | } | ||
984 | |||
985 | static void free_eaten_memory(void) | ||
986 | { | ||
987 | unsigned long m; | ||
988 | void **c; | ||
989 | int i = 0; | ||
990 | |||
991 | c = eaten_memory; | ||
992 | while (c) { | ||
993 | m = (unsigned long)c; | ||
994 | c = *c; | ||
995 | free_page(m); | ||
996 | i++; | ||
997 | } | ||
998 | eaten_memory = NULL; | ||
999 | pr_debug("swsusp: %d unused pages freed\n", i); | ||
1000 | } | ||
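
The eat_page()/free_eaten_memory() pair above needs no extra bookkeeping memory: each unusable page stores, in its own first word, a pointer to the previously eaten page. The following stand-alone user-space sketch is an illustration only, with malloc() standing in for get_zeroed_page(); the names eat() and free_eaten() are made up for the example.

#include <stdio.h>
#include <stdlib.h>

static void **eaten;			/* head of the chain of eaten pages */

static void eat(void *page)
{
	void **old = eaten;

	eaten = page;			/* the new page becomes the head ...          */
	*eaten = old;			/* ... and its first word stores the old head */
}

static int free_eaten(void)
{
	void **c = eaten;
	int n = 0;

	while (c) {
		void **next = *c;	/* read the link before freeing the page */
		free(c);
		c = next;
		n++;
	}
	eaten = NULL;
	return n;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		eat(malloc(4096));	/* stand-in for get_zeroed_page() */
	printf("freed %d pages\n", free_eaten());
	return 0;
}
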
1001 | |||
1002 | /** | ||
1003 | * check_pagedir - We make sure here that the pages the PBEs point to | ||
1004 | * won't collide with the pages we are going to restore to from the | ||
1005 | * loaded data later. | ||
1006 | */ | ||
1007 | |||
1008 | static int check_pagedir(struct pbe *pblist) | ||
1009 | { | ||
1010 | struct pbe *p; | ||
1011 | |||
1012 | /* This is necessary, so that we can free allocated pages | ||
1013 | * in case of failure | ||
1014 | */ | ||
1015 | for_each_pbe (p, pblist) | ||
1016 | p->address = 0UL; | ||
1017 | |||
1018 | for_each_pbe (p, pblist) { | ||
1019 | p->address = get_usable_page(GFP_ATOMIC); | ||
1020 | if (!p->address) | ||
1021 | return -ENOMEM; | ||
1022 | } | ||
1023 | return 0; | ||
1024 | } | ||
1025 | |||
1026 | /** | ||
1027 | * swsusp_pagedir_relocate - It is possible that some memory pages | ||
1028 | * occupied by the list of PBEs collide with pages that we are going to | ||
1029 | * restore to from the loaded pages later. We relocate them here. | ||
1030 | */ | ||
1031 | |||
1032 | static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | ||
1033 | { | ||
1034 | struct zone *zone; | ||
1035 | unsigned long zone_pfn; | ||
1036 | struct pbe *pbpage, *tail, *p; | ||
1037 | void *m; | ||
1038 | int rel = 0, error = 0; | ||
1039 | |||
1040 | if (!pblist) /* a sanity check */ | ||
1041 | return NULL; | ||
1042 | |||
1043 | pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", | ||
1044 | swsusp_info.pagedir_pages); | ||
1045 | |||
1046 | /* Set page flags */ | ||
1047 | |||
1048 | for_each_zone(zone) { | ||
1049 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
1050 | SetPageNosaveFree(pfn_to_page(zone_pfn + | ||
1051 | zone->zone_start_pfn)); | ||
1052 | } | ||
1053 | |||
1054 | /* Clear orig addresses */ | ||
1055 | |||
1056 | for_each_pbe (p, pblist) | ||
1057 | ClearPageNosaveFree(virt_to_page(p->orig_address)); | ||
1058 | |||
1059 | tail = pblist + PB_PAGE_SKIP; | ||
1060 | |||
1061 | /* Relocate colliding pages */ | ||
1062 | |||
1063 | for_each_pb_page (pbpage, pblist) { | ||
1064 | if (does_collide_order((unsigned long)pbpage, 0)) { | ||
1065 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); | ||
1066 | if (!m) { | ||
1067 | error = -ENOMEM; | ||
1068 | break; | ||
1069 | } | ||
1070 | memcpy(m, (void *)pbpage, PAGE_SIZE); | ||
1071 | if (pbpage == pblist) | ||
1072 | pblist = (struct pbe *)m; | ||
1073 | else | ||
1074 | tail->next = (struct pbe *)m; | ||
1075 | |||
1076 | eat_page((void *)pbpage); | ||
1077 | pbpage = (struct pbe *)m; | ||
1078 | |||
1079 | /* We have to link the PBEs again */ | ||
1080 | |||
1081 | for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) | ||
1082 | if (p->next) /* needed to save the end */ | ||
1083 | p->next = p + 1; | ||
1084 | |||
1085 | rel++; | ||
1086 | } | ||
1087 | tail = pbpage + PB_PAGE_SKIP; | ||
1088 | } | ||
1089 | |||
1090 | if (error) { | ||
1091 | printk("\nswsusp: Out of memory\n\n"); | ||
1092 | free_pagedir(pblist); | ||
1093 | free_eaten_memory(); | ||
1094 | pblist = NULL; | ||
1095 | } | ||
1096 | else | ||
1097 | printk("swsusp: Relocated %d pages\n", rel); | ||
1098 | |||
1099 | return pblist; | ||
1100 | } | ||
1101 | |||
1102 | /** | ||
1103 | * Using bio to read from swap. | ||
1104 | * This code requires a bit more work than just using buffer heads | ||
1105 | * but it is the recommended way for 2.5/2.6. | ||
1106 | * The following are used to signal the beginning and end of I/O. Bios | ||
1107 | * finish asynchronously, while we want them to happen synchronously. | ||
1108 | * A simple atomic_t and a wait loop take care of this problem. | ||
1109 | */ | ||
1110 | |||
1111 | static atomic_t io_done = ATOMIC_INIT(0); | ||
1112 | |||
1113 | static int end_io(struct bio * bio, unsigned int num, int err) | ||
1114 | { | ||
1115 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
1116 | panic("I/O error reading memory image"); | ||
1117 | atomic_set(&io_done, 0); | ||
1118 | return 0; | ||
1119 | } | ||
1120 | |||
1121 | static struct block_device * resume_bdev; | ||
1122 | |||
1123 | /** | ||
1124 | * submit - submit BIO request. | ||
1125 | * @rw: READ or WRITE. | ||
1126 | * @page_off: physical offset of the page. | ||
1127 | * @page: page we're reading or writing. | ||
1128 | * | ||
1129 | * Straight from the textbook - allocate and initialize the bio. | ||
1130 | * If we're writing, make sure the page is marked as dirty. | ||
1131 | * Then submit it and wait. | ||
1132 | */ | ||
1133 | |||
1134 | static int submit(int rw, pgoff_t page_off, void * page) | ||
1135 | { | ||
1136 | int error = 0; | ||
1137 | struct bio * bio; | ||
1138 | |||
1139 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
1140 | if (!bio) | ||
1141 | return -ENOMEM; | ||
1142 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
1143 | bio_get(bio); | ||
1144 | bio->bi_bdev = resume_bdev; | ||
1145 | bio->bi_end_io = end_io; | ||
1146 | |||
1147 | if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { | ||
1148 | printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); | ||
1149 | error = -EFAULT; | ||
1150 | goto Done; | ||
1151 | } | ||
1152 | |||
1153 | if (rw == WRITE) | ||
1154 | bio_set_pages_dirty(bio); | ||
1155 | |||
1156 | atomic_set(&io_done, 1); | ||
1157 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
1158 | while (atomic_read(&io_done)) | ||
1159 | yield(); | ||
1160 | |||
1161 | Done: | ||
1162 | bio_put(bio); | ||
1163 | return error; | ||
1164 | } | ||
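
The handshake in submit() is what turns an asynchronous BIO into a synchronous call: end_io() clears the io_done flag from the block layer's completion path, while submit() spins (yielding the CPU) until that happens. Below is a stand-alone user-space sketch of the same flag-and-yield pattern, using C11 atomics and a thread purely for illustration; fake_submit() and fake_end_io() are made-up names (build with -pthread).

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int io_done;

/* Completion side: stands in for end_io(), which the block layer calls
 * when the BIO finishes. */
static void *fake_end_io(void *arg)
{
	usleep(1000);			/* pretend the device takes a while */
	atomic_store(&io_done, 0);
	return NULL;
}

/* Submission side: stands in for submit() -- raise the flag, start the
 * asynchronous work, then yield until the completion side clears it. */
static void fake_submit(void)
{
	pthread_t t;

	atomic_store(&io_done, 1);
	pthread_create(&t, NULL, fake_end_io, NULL);
	while (atomic_load(&io_done))
		sched_yield();		/* mirrors the yield() loop above */
	pthread_join(t, NULL);
}

int main(void)
{
	fake_submit();
	puts("I/O complete");
	return 0;
}
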
1165 | |||
1166 | static int bio_read_page(pgoff_t page_off, void * page) | ||
1167 | { | ||
1168 | return submit(READ, page_off, page); | ||
1169 | } | ||
1170 | |||
1171 | static int bio_write_page(pgoff_t page_off, void * page) | ||
1172 | { | ||
1173 | return submit(WRITE, page_off, page); | ||
1174 | } | ||
1175 | |||
1176 | /* | ||
1177 | * Sanity check whether this image makes sense with this kernel/swap context. | ||
1178 | * I really don't think that it's foolproof, but it's better than nothing. | ||
1179 | */ | ||
1180 | |||
1181 | static const char * sanity_check(void) | ||
1182 | { | ||
1183 | dump_info(); | ||
1184 | if(swsusp_info.version_code != LINUX_VERSION_CODE) | ||
1185 | return "kernel version"; | ||
1186 | if(swsusp_info.num_physpages != num_physpages) | ||
1187 | return "memory size"; | ||
1188 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) | ||
1189 | return "system type"; | ||
1190 | if (strcmp(swsusp_info.uts.release,system_utsname.release)) | ||
1191 | return "kernel release"; | ||
1192 | if (strcmp(swsusp_info.uts.version,system_utsname.version)) | ||
1193 | return "version"; | ||
1194 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | ||
1195 | return "machine"; | ||
1196 | if(swsusp_info.cpus != num_online_cpus()) | ||
1197 | return "number of cpus"; | ||
1198 | return NULL; | ||
1199 | } | ||
1200 | |||
1201 | |||
1202 | static int check_header(void) | ||
1203 | { | ||
1204 | const char * reason = NULL; | ||
1205 | int error; | ||
1206 | |||
1207 | if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) | ||
1208 | return error; | ||
1209 | |||
1210 | /* Is this same machine? */ | ||
1211 | if ((reason = sanity_check())) { | ||
1212 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); | ||
1213 | return -EPERM; | ||
1214 | } | ||
1215 | nr_copy_pages = swsusp_info.image_pages; | ||
1216 | return error; | ||
1217 | } | ||
1218 | |||
1219 | static int check_sig(void) | ||
1220 | { | ||
1221 | int error; | ||
1222 | |||
1223 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | ||
1224 | if ((error = bio_read_page(0, &swsusp_header))) | ||
1225 | return error; | ||
1226 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | ||
1227 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
1228 | |||
1229 | /* | ||
1230 | * Reset swap signature now. | ||
1231 | */ | ||
1232 | error = bio_write_page(0, &swsusp_header); | ||
1233 | } else { | ||
1234 | printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); | ||
1235 | return -EINVAL; | ||
1236 | } | ||
1237 | if (!error) | ||
1238 | pr_debug("swsusp: Signature found, resuming\n"); | ||
1239 | return error; | ||
1240 | } | ||
1241 | |||
1242 | /** | ||
1243 | * data_read - Read image pages from swap. | ||
1244 | * | ||
1245 | * You do not need to check for overlaps; check_pagedir() | ||
1246 | * already did that. | ||
1247 | */ | ||
1248 | |||
1249 | static int data_read(struct pbe *pblist) | ||
1250 | { | ||
1251 | struct pbe * p; | ||
1252 | int error = 0; | ||
1253 | int i = 0; | ||
1254 | int mod = swsusp_info.image_pages / 100; | ||
1255 | |||
1256 | if (!mod) | ||
1257 | mod = 1; | ||
1258 | |||
1259 | printk("swsusp: Reading image data (%lu pages): ", | ||
1260 | swsusp_info.image_pages); | ||
1261 | |||
1262 | for_each_pbe (p, pblist) { | ||
1263 | if (!(i % mod)) | ||
1264 | printk("\b\b\b\b%3d%%", i / mod); | ||
1265 | |||
1266 | error = bio_read_page(swp_offset(p->swap_address), | ||
1267 | (void *)p->address); | ||
1268 | if (error) | ||
1269 | return error; | ||
1270 | |||
1271 | i++; | ||
1272 | } | ||
1273 | printk("\b\b\b\bdone\n"); | ||
1274 | return error; | ||
1275 | } | ||
1276 | |||
1277 | extern dev_t name_to_dev_t(const char *line); | ||
1278 | |||
1279 | /** | ||
1280 | * read_pagedir - Read page backup list pages from swap | ||
1281 | */ | ||
1282 | |||
1283 | static int read_pagedir(struct pbe *pblist) | ||
1284 | { | ||
1285 | struct pbe *pbpage, *p; | ||
1286 | unsigned i = 0; | ||
1287 | int error; | ||
1288 | |||
1289 | if (!pblist) | ||
1290 | return -EFAULT; | ||
1291 | |||
1292 | printk("swsusp: Reading pagedir (%lu pages)\n", | ||
1293 | swsusp_info.pagedir_pages); | ||
1294 | |||
1295 | for_each_pb_page (pbpage, pblist) { | ||
1296 | unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); | ||
1297 | |||
1298 | error = -EFAULT; | ||
1299 | if (offset) { | ||
1300 | p = (pbpage + PB_PAGE_SKIP)->next; | ||
1301 | error = bio_read_page(offset, (void *)pbpage); | ||
1302 | (pbpage + PB_PAGE_SKIP)->next = p; | ||
1303 | } | ||
1304 | if (error) | ||
1305 | break; | ||
1306 | } | ||
1307 | |||
1308 | if (error) | ||
1309 | free_page((unsigned long)pblist); | ||
1310 | |||
1311 | BUG_ON(i != swsusp_info.pagedir_pages); | ||
1312 | |||
1313 | return error; | ||
1314 | } | ||
1315 | |||
1316 | |||
1317 | static int check_suspend_image(void) | ||
1318 | { | ||
1319 | int error = 0; | ||
1320 | |||
1321 | if ((error = check_sig())) | ||
1322 | return error; | ||
1323 | |||
1324 | if ((error = check_header())) | ||
1325 | return error; | ||
1326 | |||
1327 | return 0; | ||
1328 | } | ||
1329 | |||
1330 | static int read_suspend_image(void) | ||
1331 | { | ||
1332 | int error = 0; | ||
1333 | struct pbe *p; | ||
1334 | |||
1335 | if (!(p = alloc_pagedir(nr_copy_pages))) | ||
1336 | return -ENOMEM; | ||
1337 | |||
1338 | if ((error = read_pagedir(p))) | ||
1339 | return error; | ||
1340 | |||
1341 | create_pbe_list(p, nr_copy_pages); | ||
1342 | |||
1343 | if (!(pagedir_nosave = swsusp_pagedir_relocate(p))) | ||
1344 | return -ENOMEM; | ||
1345 | |||
1346 | /* Allocate memory for the image and read the data from swap */ | ||
1347 | |||
1348 | error = check_pagedir(pagedir_nosave); | ||
1349 | free_eaten_memory(); | ||
1350 | if (!error) | ||
1351 | error = data_read(pagedir_nosave); | ||
1352 | |||
1353 | if (error) { /* We fail cleanly */ | ||
1354 | for_each_pbe (p, pagedir_nosave) | ||
1355 | if (p->address) { | ||
1356 | free_page(p->address); | ||
1357 | p->address = 0UL; | ||
1358 | } | ||
1359 | free_pagedir(pagedir_nosave); | ||
1360 | } | ||
1361 | return error; | ||
1362 | } | ||
1363 | |||
1364 | /** | ||
1365 | * swsusp_check - Check for saved image in swap | ||
1366 | */ | ||
1367 | |||
1368 | int swsusp_check(void) | ||
1369 | { | ||
1370 | int error; | ||
1371 | |||
1372 | if (!swsusp_resume_device) { | ||
1373 | if (!strlen(resume_file)) | ||
1374 | return -ENOENT; | ||
1375 | swsusp_resume_device = name_to_dev_t(resume_file); | ||
1376 | pr_debug("swsusp: Resume From Partition %s\n", resume_file); | ||
1377 | } else { | ||
1378 | pr_debug("swsusp: Resume From Partition %d:%d\n", | ||
1379 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); | ||
1380 | } | ||
1381 | |||
1382 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | ||
1383 | if (!IS_ERR(resume_bdev)) { | ||
1384 | set_blocksize(resume_bdev, PAGE_SIZE); | ||
1385 | error = check_suspend_image(); | ||
1386 | if (error) | ||
1387 | blkdev_put(resume_bdev); | ||
1388 | } else | ||
1389 | error = PTR_ERR(resume_bdev); | ||
1390 | |||
1391 | if (!error) | ||
1392 | pr_debug("swsusp: resume file found\n"); | ||
1393 | else | ||
1394 | pr_debug("swsusp: Error %d check for resume file\n", error); | ||
1395 | return error; | ||
1396 | } | ||
1397 | |||
1398 | /** | ||
1399 | * swsusp_read - Read saved image from swap. | ||
1400 | */ | ||
1401 | |||
1402 | int swsusp_read(void) | ||
1403 | { | ||
1404 | int error; | ||
1405 | |||
1406 | if (IS_ERR(resume_bdev)) { | ||
1407 | pr_debug("swsusp: block device not initialised\n"); | ||
1408 | return PTR_ERR(resume_bdev); | ||
1409 | } | ||
1410 | |||
1411 | error = read_suspend_image(); | ||
1412 | blkdev_put(resume_bdev); | ||
1413 | |||
1414 | if (!error) | ||
1415 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
1416 | else | ||
1417 | pr_debug("swsusp: Error %d resuming\n", error); | ||
1418 | return error; | ||
1419 | } | ||
1420 | |||
1421 | /** | ||
1422 | * swsusp_close - close swap device. | ||
1423 | */ | ||
1424 | |||
1425 | void swsusp_close(void) | ||
1426 | { | ||
1427 | if (IS_ERR(resume_bdev)) { | ||
1428 | pr_debug("swsusp: block device not initialised\n"); | ||
1429 | return; | ||
1430 | } | ||
1431 | |||
1432 | blkdev_put(resume_bdev); | ||
1433 | } | ||
diff --git a/kernel/printk.c b/kernel/printk.c new file mode 100644 index 000000000000..1498689548d1 --- /dev/null +++ b/kernel/printk.c | |||
@@ -0,0 +1,996 @@ | |||
1 | /* | ||
2 | * linux/kernel/printk.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * | ||
6 | * Modified to make sys_syslog() more flexible: added commands to | ||
7 | * return the last 4k of kernel messages, regardless of whether | ||
8 | * they've been read or not. Added option to suppress kernel printk's | ||
9 | * to the console. Added hook for sending the console messages | ||
10 | * elsewhere, in preparation for a serial line console (someday). | ||
11 | * Ted Ts'o, 2/11/93. | ||
12 | * Modified for sysctl support, 1/8/97, Chris Horn. | ||
13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul | ||
14 | * manfreds@colorfullife.com | ||
15 | * Rewrote bits to get rid of console_lock | ||
16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/tty.h> | ||
22 | #include <linux/tty_driver.h> | ||
23 | #include <linux/smp_lock.h> | ||
24 | #include <linux/console.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/interrupt.h> /* For in_interrupt() */ | ||
28 | #include <linux/config.h> | ||
29 | #include <linux/delay.h> | ||
30 | #include <linux/smp.h> | ||
31 | #include <linux/security.h> | ||
32 | #include <linux/bootmem.h> | ||
33 | #include <linux/syscalls.h> | ||
34 | |||
35 | #include <asm/uaccess.h> | ||
36 | |||
37 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
38 | |||
39 | /* printk's without a loglevel use this.. */ | ||
40 | #define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ | ||
41 | |||
42 | /* We show everything that is MORE important than this.. */ | ||
43 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ | ||
44 | #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ | ||
45 | |||
46 | DECLARE_WAIT_QUEUE_HEAD(log_wait); | ||
47 | |||
48 | int console_printk[4] = { | ||
49 | DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ | ||
50 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ | ||
51 | MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ | ||
52 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | ||
53 | }; | ||
54 | |||
55 | EXPORT_SYMBOL(console_printk); | ||
56 | |||
57 | /* | ||
58 | * Low level drivers may need this to know whether they can schedule in | ||
59 | * their unblank() callback or not. So let's export it. | ||
60 | */ | ||
61 | int oops_in_progress; | ||
62 | EXPORT_SYMBOL(oops_in_progress); | ||
63 | |||
64 | /* | ||
65 | * console_sem protects the console_drivers list, and also | ||
66 | * provides serialisation for access to the entire console | ||
67 | * driver system. | ||
68 | */ | ||
69 | static DECLARE_MUTEX(console_sem); | ||
70 | struct console *console_drivers; | ||
71 | /* | ||
72 | * This is used for debugging the mess that is the VT code by | ||
73 | * keeping track of whether we have the console semaphore held. It's | ||
74 | * definitely not the perfect debug tool (we don't know if _WE_ | ||
75 | * hold it and are racing), but it helps track down those weird code | ||
76 | * paths in the console code where we end up in places I want | ||
77 | * locked without the console semaphore held. | ||
79 | static int console_locked; | ||
80 | |||
81 | /* | ||
82 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | ||
83 | * It is also used in interesting ways to provide interlocking in | ||
84 | * release_console_sem(). | ||
85 | */ | ||
86 | static DEFINE_SPINLOCK(logbuf_lock); | ||
87 | |||
88 | static char __log_buf[__LOG_BUF_LEN]; | ||
89 | static char *log_buf = __log_buf; | ||
90 | static int log_buf_len = __LOG_BUF_LEN; | ||
91 | |||
92 | #define LOG_BUF_MASK (log_buf_len-1) | ||
93 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | ||
94 | |||
95 | /* | ||
96 | * The indices into log_buf are not constrained to log_buf_len - they | ||
97 | * must be masked before subscripting | ||
98 | */ | ||
99 | static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */ | ||
100 | static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */ | ||
101 | static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */ | ||
102 | static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ | ||
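
Because log_buf_len is always a power of two (the static buffer is 1 << CONFIG_LOG_BUF_SHIFT, and log_buf_len_setup() below rounds any requested size up), the indices can keep counting upwards forever and LOG_BUF() reduces them with a cheap AND instead of a modulo. A tiny user-space sketch of the same trick, for illustration only:

#include <stdio.h>

#define LEN	16			/* must be a power of two */
#define MASK	(LEN - 1)

static char buf[LEN];

int main(void)
{
	unsigned long end = 0;

	for (char c = 'a'; c <= 'z'; c++)
		buf[end++ & MASK] = c;	/* same idea as LOG_BUF(log_end++) */

	/* only the last LEN characters survive; older ones were overwritten */
	for (unsigned long i = end - LEN; i != end; i++)
		putchar(buf[i & MASK]);
	putchar('\n');
	return 0;
}
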
103 | |||
104 | /* | ||
105 | * Array of consoles built from command line options (console=) | ||
106 | */ | ||
107 | struct console_cmdline | ||
108 | { | ||
109 | char name[8]; /* Name of the driver */ | ||
110 | int index; /* Minor dev. to use */ | ||
111 | char *options; /* Options for the driver */ | ||
112 | }; | ||
113 | |||
114 | #define MAX_CMDLINECONSOLES 8 | ||
115 | |||
116 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; | ||
117 | static int selected_console = -1; | ||
118 | static int preferred_console = -1; | ||
119 | |||
120 | /* Flag: console code may call schedule() */ | ||
121 | static int console_may_schedule; | ||
122 | |||
123 | /* | ||
124 | * Setup a list of consoles. Called from init/main.c | ||
125 | */ | ||
126 | static int __init console_setup(char *str) | ||
127 | { | ||
128 | char name[sizeof(console_cmdline[0].name)]; | ||
129 | char *s, *options; | ||
130 | int idx; | ||
131 | |||
132 | /* | ||
133 | * Decode str into name, index, options. | ||
134 | */ | ||
135 | if (str[0] >= '0' && str[0] <= '9') { | ||
136 | strcpy(name, "ttyS"); | ||
137 | strncpy(name + 4, str, sizeof(name) - 5); | ||
138 | } else | ||
139 | strncpy(name, str, sizeof(name) - 1); | ||
140 | name[sizeof(name) - 1] = 0; | ||
141 | if ((options = strchr(str, ',')) != NULL) | ||
142 | *(options++) = 0; | ||
143 | #ifdef __sparc__ | ||
144 | if (!strcmp(str, "ttya")) | ||
145 | strcpy(name, "ttyS0"); | ||
146 | if (!strcmp(str, "ttyb")) | ||
147 | strcpy(name, "ttyS1"); | ||
148 | #endif | ||
149 | for(s = name; *s; s++) | ||
150 | if ((*s >= '0' && *s <= '9') || *s == ',') | ||
151 | break; | ||
152 | idx = simple_strtoul(s, NULL, 10); | ||
153 | *s = 0; | ||
154 | |||
155 | add_preferred_console(name, idx, options); | ||
156 | return 1; | ||
157 | } | ||
158 | |||
159 | __setup("console=", console_setup); | ||
160 | |||
161 | /** | ||
162 | * add_preferred_console - add a device to the list of preferred consoles. | ||
163 | * | ||
164 | * The last preferred console added will be used for kernel messages | ||
165 | * and stdin/out/err for init. Normally this is used by console_setup | ||
166 | * above to handle user-supplied console arguments; however it can also | ||
167 | * be used by arch-specific code either to override the user or more | ||
168 | * commonly to provide a default console (i.e. from PROM variables) when | ||
169 | * the user has not supplied one. | ||
170 | */ | ||
171 | int __init add_preferred_console(char *name, int idx, char *options) | ||
172 | { | ||
173 | struct console_cmdline *c; | ||
174 | int i; | ||
175 | |||
176 | /* | ||
177 | * See if this tty is not yet registered, and | ||
178 | * if we have a slot free. | ||
179 | */ | ||
180 | for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) | ||
181 | if (strcmp(console_cmdline[i].name, name) == 0 && | ||
182 | console_cmdline[i].index == idx) { | ||
183 | selected_console = i; | ||
184 | return 0; | ||
185 | } | ||
186 | if (i == MAX_CMDLINECONSOLES) | ||
187 | return -E2BIG; | ||
188 | selected_console = i; | ||
189 | c = &console_cmdline[i]; | ||
190 | memcpy(c->name, name, sizeof(c->name)); | ||
191 | c->name[sizeof(c->name) - 1] = 0; | ||
192 | c->options = options; | ||
193 | c->index = idx; | ||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static int __init log_buf_len_setup(char *str) | ||
198 | { | ||
199 | unsigned long size = memparse(str, &str); | ||
200 | unsigned long flags; | ||
201 | |||
202 | if (size) | ||
203 | size = roundup_pow_of_two(size); | ||
204 | if (size > log_buf_len) { | ||
205 | unsigned long start, dest_idx, offset; | ||
206 | char * new_log_buf; | ||
207 | |||
208 | new_log_buf = alloc_bootmem(size); | ||
209 | if (!new_log_buf) { | ||
210 | printk("log_buf_len: allocation failed\n"); | ||
211 | goto out; | ||
212 | } | ||
213 | |||
214 | spin_lock_irqsave(&logbuf_lock, flags); | ||
215 | log_buf_len = size; | ||
216 | log_buf = new_log_buf; | ||
217 | |||
218 | offset = start = min(con_start, log_start); | ||
219 | dest_idx = 0; | ||
220 | while (start != log_end) { | ||
221 | log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; | ||
222 | start++; | ||
223 | dest_idx++; | ||
224 | } | ||
225 | log_start -= offset; | ||
226 | con_start -= offset; | ||
227 | log_end -= offset; | ||
228 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
229 | |||
230 | printk("log_buf_len: %d\n", log_buf_len); | ||
231 | } | ||
232 | out: | ||
233 | |||
234 | return 1; | ||
235 | } | ||
236 | |||
237 | __setup("log_buf_len=", log_buf_len_setup); | ||
238 | |||
239 | /* | ||
240 | * Commands to do_syslog: | ||
241 | * | ||
242 | * 0 -- Close the log. Currently a NOP. | ||
243 | * 1 -- Open the log. Currently a NOP. | ||
244 | * 2 -- Read from the log. | ||
245 | * 3 -- Read all messages remaining in the ring buffer. | ||
246 | * 4 -- Read and clear all messages remaining in the ring buffer. | ||
247 | * 5 -- Clear ring buffer. | ||
248 | * 6 -- Disable printk's to console. | ||
249 | * 7 -- Enable printk's to console. | ||
250 | * 8 -- Set level of messages printed to console. | ||
251 | * 9 -- Return number of unread characters in the log buffer. | ||
252 | * 10 -- Return size of the log buffer. | ||
253 | */ | ||
254 | int do_syslog(int type, char __user * buf, int len) | ||
255 | { | ||
256 | unsigned long i, j, limit, count; | ||
257 | int do_clear = 0; | ||
258 | char c; | ||
259 | int error = 0; | ||
260 | |||
261 | error = security_syslog(type); | ||
262 | if (error) | ||
263 | return error; | ||
264 | |||
265 | switch (type) { | ||
266 | case 0: /* Close log */ | ||
267 | break; | ||
268 | case 1: /* Open log */ | ||
269 | break; | ||
270 | case 2: /* Read from log */ | ||
271 | error = -EINVAL; | ||
272 | if (!buf || len < 0) | ||
273 | goto out; | ||
274 | error = 0; | ||
275 | if (!len) | ||
276 | goto out; | ||
277 | if (!access_ok(VERIFY_WRITE, buf, len)) { | ||
278 | error = -EFAULT; | ||
279 | goto out; | ||
280 | } | ||
281 | error = wait_event_interruptible(log_wait, (log_start - log_end)); | ||
282 | if (error) | ||
283 | goto out; | ||
284 | i = 0; | ||
285 | spin_lock_irq(&logbuf_lock); | ||
286 | while (!error && (log_start != log_end) && i < len) { | ||
287 | c = LOG_BUF(log_start); | ||
288 | log_start++; | ||
289 | spin_unlock_irq(&logbuf_lock); | ||
290 | error = __put_user(c,buf); | ||
291 | buf++; | ||
292 | i++; | ||
293 | cond_resched(); | ||
294 | spin_lock_irq(&logbuf_lock); | ||
295 | } | ||
296 | spin_unlock_irq(&logbuf_lock); | ||
297 | if (!error) | ||
298 | error = i; | ||
299 | break; | ||
300 | case 4: /* Read/clear last kernel messages */ | ||
301 | do_clear = 1; | ||
302 | /* FALL THRU */ | ||
303 | case 3: /* Read last kernel messages */ | ||
304 | error = -EINVAL; | ||
305 | if (!buf || len < 0) | ||
306 | goto out; | ||
307 | error = 0; | ||
308 | if (!len) | ||
309 | goto out; | ||
310 | if (!access_ok(VERIFY_WRITE, buf, len)) { | ||
311 | error = -EFAULT; | ||
312 | goto out; | ||
313 | } | ||
314 | count = len; | ||
315 | if (count > log_buf_len) | ||
316 | count = log_buf_len; | ||
317 | spin_lock_irq(&logbuf_lock); | ||
318 | if (count > logged_chars) | ||
319 | count = logged_chars; | ||
320 | if (do_clear) | ||
321 | logged_chars = 0; | ||
322 | limit = log_end; | ||
323 | /* | ||
324 | * __put_user() could sleep, and while we sleep | ||
325 | * printk() could overwrite the messages | ||
326 | * we try to copy to user space. Therefore | ||
327 | * the messages are copied in reverse. <manfreds> | ||
328 | */ | ||
329 | for(i = 0; i < count && !error; i++) { | ||
330 | j = limit-1-i; | ||
331 | if (j + log_buf_len < log_end) | ||
332 | break; | ||
333 | c = LOG_BUF(j); | ||
334 | spin_unlock_irq(&logbuf_lock); | ||
335 | error = __put_user(c,&buf[count-1-i]); | ||
336 | cond_resched(); | ||
337 | spin_lock_irq(&logbuf_lock); | ||
338 | } | ||
339 | spin_unlock_irq(&logbuf_lock); | ||
340 | if (error) | ||
341 | break; | ||
342 | error = i; | ||
343 | if(i != count) { | ||
344 | int offset = count-error; | ||
345 | /* buffer overflow during copy, correct user buffer. */ | ||
346 | for(i=0;i<error;i++) { | ||
347 | if (__get_user(c,&buf[i+offset]) || | ||
348 | __put_user(c,&buf[i])) { | ||
349 | error = -EFAULT; | ||
350 | break; | ||
351 | } | ||
352 | cond_resched(); | ||
353 | } | ||
354 | } | ||
355 | break; | ||
356 | case 5: /* Clear ring buffer */ | ||
357 | logged_chars = 0; | ||
358 | break; | ||
359 | case 6: /* Disable logging to console */ | ||
360 | console_loglevel = minimum_console_loglevel; | ||
361 | break; | ||
362 | case 7: /* Enable logging to console */ | ||
363 | console_loglevel = default_console_loglevel; | ||
364 | break; | ||
365 | case 8: /* Set level of messages printed to console */ | ||
366 | error = -EINVAL; | ||
367 | if (len < 1 || len > 8) | ||
368 | goto out; | ||
369 | if (len < minimum_console_loglevel) | ||
370 | len = minimum_console_loglevel; | ||
371 | console_loglevel = len; | ||
372 | error = 0; | ||
373 | break; | ||
374 | case 9: /* Number of chars in the log buffer */ | ||
375 | error = log_end - log_start; | ||
376 | break; | ||
377 | case 10: /* Size of the log buffer */ | ||
378 | error = log_buf_len; | ||
379 | break; | ||
380 | default: | ||
381 | error = -EINVAL; | ||
382 | break; | ||
383 | } | ||
384 | out: | ||
385 | return error; | ||
386 | } | ||
387 | |||
388 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | ||
389 | { | ||
390 | return do_syslog(type, buf, len); | ||
391 | } | ||
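
From user space these commands are reached through the syslog(2) system call, which glibc exposes as klogctl(). A small example, for illustration only (reading the log may require root, depending on the security module consulted by security_syslog() above):

#include <stdio.h>
#include <sys/klog.h>

int main(void)
{
	char buf[4096];
	int unread, total, n;

	unread = klogctl(9, NULL, 0);		/* 9: unread characters in the log buffer */
	total  = klogctl(10, NULL, 0);		/* 10: size of the log buffer */
	n = klogctl(3, buf, sizeof(buf));	/* 3: read the last kernel messages */
	if (n > 0)
		fwrite(buf, 1, n, stdout);
	printf("\n%d unread characters, %d byte buffer\n", unread, total);
	return 0;
}
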
392 | |||
393 | /* | ||
394 | * Call the console drivers on a range of log_buf | ||
395 | */ | ||
396 | static void __call_console_drivers(unsigned long start, unsigned long end) | ||
397 | { | ||
398 | struct console *con; | ||
399 | |||
400 | for (con = console_drivers; con; con = con->next) { | ||
401 | if ((con->flags & CON_ENABLED) && con->write) | ||
402 | con->write(con, &LOG_BUF(start), end - start); | ||
403 | } | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * Write out chars from start to end - 1 inclusive | ||
408 | */ | ||
409 | static void _call_console_drivers(unsigned long start, | ||
410 | unsigned long end, int msg_log_level) | ||
411 | { | ||
412 | if (msg_log_level < console_loglevel && | ||
413 | console_drivers && start != end) { | ||
414 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | ||
415 | /* wrapped write */ | ||
416 | __call_console_drivers(start & LOG_BUF_MASK, | ||
417 | log_buf_len); | ||
418 | __call_console_drivers(0, end & LOG_BUF_MASK); | ||
419 | } else { | ||
420 | __call_console_drivers(start, end); | ||
421 | } | ||
422 | } | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * Call the console drivers, asking them to write out | ||
427 | * log_buf[start] to log_buf[end - 1]. | ||
428 | * The console_sem must be held. | ||
429 | */ | ||
430 | static void call_console_drivers(unsigned long start, unsigned long end) | ||
431 | { | ||
432 | unsigned long cur_index, start_print; | ||
433 | static int msg_level = -1; | ||
434 | |||
435 | if (((long)(start - end)) > 0) | ||
436 | BUG(); | ||
437 | |||
438 | cur_index = start; | ||
439 | start_print = start; | ||
440 | while (cur_index != end) { | ||
441 | if ( msg_level < 0 && | ||
442 | ((end - cur_index) > 2) && | ||
443 | LOG_BUF(cur_index + 0) == '<' && | ||
444 | LOG_BUF(cur_index + 1) >= '0' && | ||
445 | LOG_BUF(cur_index + 1) <= '7' && | ||
446 | LOG_BUF(cur_index + 2) == '>') | ||
447 | { | ||
448 | msg_level = LOG_BUF(cur_index + 1) - '0'; | ||
449 | cur_index += 3; | ||
450 | start_print = cur_index; | ||
451 | } | ||
452 | while (cur_index != end) { | ||
453 | char c = LOG_BUF(cur_index); | ||
454 | cur_index++; | ||
455 | |||
456 | if (c == '\n') { | ||
457 | if (msg_level < 0) { | ||
458 | /* | ||
459 | * printk() has already given us loglevel tags in | ||
460 | * the buffer. This code is here in case the | ||
461 | * log buffer has wrapped right round and scribbled | ||
462 | * on those tags | ||
463 | */ | ||
464 | msg_level = default_message_loglevel; | ||
465 | } | ||
466 | _call_console_drivers(start_print, cur_index, msg_level); | ||
467 | msg_level = -1; | ||
468 | start_print = cur_index; | ||
469 | break; | ||
470 | } | ||
471 | } | ||
472 | } | ||
473 | _call_console_drivers(start_print, end, msg_level); | ||
474 | } | ||
475 | |||
476 | static void emit_log_char(char c) | ||
477 | { | ||
478 | LOG_BUF(log_end) = c; | ||
479 | log_end++; | ||
480 | if (log_end - log_start > log_buf_len) | ||
481 | log_start = log_end - log_buf_len; | ||
482 | if (log_end - con_start > log_buf_len) | ||
483 | con_start = log_end - log_buf_len; | ||
484 | if (logged_chars < log_buf_len) | ||
485 | logged_chars++; | ||
486 | } | ||
487 | |||
488 | /* | ||
489 | * Zap console-related locks when oopsing. Only zap at most once | ||
490 | * every 30 seconds, to leave time for slow consoles to print a | ||
491 | * full oops. | ||
492 | */ | ||
493 | static void zap_locks(void) | ||
494 | { | ||
495 | static unsigned long oops_timestamp; | ||
496 | |||
497 | if (time_after_eq(jiffies, oops_timestamp) && | ||
498 | !time_after(jiffies, oops_timestamp + 30*HZ)) | ||
499 | return; | ||
500 | |||
501 | oops_timestamp = jiffies; | ||
502 | |||
503 | /* If a crash is occurring, make sure we can't deadlock */ | ||
504 | spin_lock_init(&logbuf_lock); | ||
505 | /* And make sure that we print immediately */ | ||
506 | init_MUTEX(&console_sem); | ||
507 | } | ||
508 | |||
509 | #if defined(CONFIG_PRINTK_TIME) | ||
510 | static int printk_time = 1; | ||
511 | #else | ||
512 | static int printk_time = 0; | ||
513 | #endif | ||
514 | |||
515 | static int __init printk_time_setup(char *str) | ||
516 | { | ||
517 | if (*str) | ||
518 | return 0; | ||
519 | printk_time = 1; | ||
520 | return 1; | ||
521 | } | ||
522 | |||
523 | __setup("time", printk_time_setup); | ||
524 | |||
525 | /* | ||
526 | * This is printk. It can be called from any context. We want it to work. | ||
527 | * | ||
528 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | ||
529 | * call the console drivers. If we fail to get the semaphore we place the output | ||
530 | * into the log buffer and return. The current holder of the console_sem will | ||
531 | * notice the new output in release_console_sem() and will send it to the | ||
532 | * consoles before releasing the semaphore. | ||
533 | * | ||
534 | * One effect of this deferred printing is that code which calls printk() and | ||
535 | * then changes console_loglevel may break. This is because console_loglevel | ||
536 | * is inspected when the actual printing occurs. | ||
537 | */ | ||
538 | asmlinkage int printk(const char *fmt, ...) | ||
539 | { | ||
540 | va_list args; | ||
541 | int r; | ||
542 | |||
543 | va_start(args, fmt); | ||
544 | r = vprintk(fmt, args); | ||
545 | va_end(args); | ||
546 | |||
547 | return r; | ||
548 | } | ||
549 | |||
550 | asmlinkage int vprintk(const char *fmt, va_list args) | ||
551 | { | ||
552 | unsigned long flags; | ||
553 | int printed_len; | ||
554 | char *p; | ||
555 | static char printk_buf[1024]; | ||
556 | static int log_level_unknown = 1; | ||
557 | |||
558 | if (unlikely(oops_in_progress)) | ||
559 | zap_locks(); | ||
560 | |||
561 | /* This stops the holder of console_sem just where we want him */ | ||
562 | spin_lock_irqsave(&logbuf_lock, flags); | ||
563 | |||
564 | /* Emit the output into the temporary buffer */ | ||
565 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); | ||
566 | |||
567 | /* | ||
568 | * Copy the output into log_buf. If the caller didn't provide | ||
569 | * appropriate log level tags, we insert them here | ||
570 | */ | ||
571 | for (p = printk_buf; *p; p++) { | ||
572 | if (log_level_unknown) { | ||
573 | /* log_level_unknown signals the start of a new line */ | ||
574 | if (printk_time) { | ||
575 | int loglev_char; | ||
576 | char tbuf[50], *tp; | ||
577 | unsigned tlen; | ||
578 | unsigned long long t; | ||
579 | unsigned long nanosec_rem; | ||
580 | |||
581 | /* | ||
582 | * force the log level token to be | ||
583 | * before the time output. | ||
584 | */ | ||
585 | if (p[0] == '<' && p[1] >='0' && | ||
586 | p[1] <= '7' && p[2] == '>') { | ||
587 | loglev_char = p[1]; | ||
588 | p += 3; | ||
589 | printed_len += 3; | ||
590 | } else { | ||
591 | loglev_char = default_message_loglevel | ||
592 | + '0'; | ||
593 | } | ||
594 | t = sched_clock(); | ||
595 | nanosec_rem = do_div(t, 1000000000); | ||
596 | tlen = sprintf(tbuf, | ||
597 | "<%c>[%5lu.%06lu] ", | ||
598 | loglev_char, | ||
599 | (unsigned long)t, | ||
600 | nanosec_rem/1000); | ||
601 | |||
602 | for (tp = tbuf; tp < tbuf + tlen; tp++) | ||
603 | emit_log_char(*tp); | ||
604 | printed_len += tlen - 3; | ||
605 | } else { | ||
606 | if (p[0] != '<' || p[1] < '0' || | ||
607 | p[1] > '7' || p[2] != '>') { | ||
608 | emit_log_char('<'); | ||
609 | emit_log_char(default_message_loglevel | ||
610 | + '0'); | ||
611 | emit_log_char('>'); | ||
612 | } | ||
613 | printed_len += 3; | ||
614 | } | ||
615 | log_level_unknown = 0; | ||
616 | if (!*p) | ||
617 | break; | ||
618 | } | ||
619 | emit_log_char(*p); | ||
620 | if (*p == '\n') | ||
621 | log_level_unknown = 1; | ||
622 | } | ||
623 | |||
624 | if (!cpu_online(smp_processor_id()) && | ||
625 | system_state != SYSTEM_RUNNING) { | ||
626 | /* | ||
627 | * Some console drivers may assume that per-cpu resources have | ||
628 | * been allocated. So don't allow them to be called by this | ||
629 | * CPU until it is officially up. We shouldn't be calling into | ||
630 | * random console drivers on a CPU which doesn't exist yet.. | ||
631 | */ | ||
632 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
633 | goto out; | ||
634 | } | ||
635 | if (!down_trylock(&console_sem)) { | ||
636 | console_locked = 1; | ||
637 | /* | ||
638 | * We own the drivers. We can drop the spinlock and let | ||
639 | * release_console_sem() print the text | ||
640 | */ | ||
641 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
642 | console_may_schedule = 0; | ||
643 | release_console_sem(); | ||
644 | } else { | ||
645 | /* | ||
646 | * Someone else owns the drivers. We drop the spinlock, which | ||
647 | * allows the semaphore holder to proceed and to call the | ||
648 | * console drivers with the output which we just produced. | ||
649 | */ | ||
650 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
651 | } | ||
652 | out: | ||
653 | return printed_len; | ||
654 | } | ||
655 | EXPORT_SYMBOL(printk); | ||
656 | EXPORT_SYMBOL(vprintk); | ||
657 | |||
658 | /** | ||
659 | * acquire_console_sem - lock the console system for exclusive use. | ||
660 | * | ||
661 | * Acquires a semaphore which guarantees that the caller has | ||
662 | * exclusive access to the console system and the console_drivers list. | ||
663 | * | ||
664 | * Can sleep, returns nothing. | ||
665 | */ | ||
666 | void acquire_console_sem(void) | ||
667 | { | ||
668 | if (in_interrupt()) | ||
669 | BUG(); | ||
670 | down(&console_sem); | ||
671 | console_locked = 1; | ||
672 | console_may_schedule = 1; | ||
673 | } | ||
674 | EXPORT_SYMBOL(acquire_console_sem); | ||
675 | |||
676 | int try_acquire_console_sem(void) | ||
677 | { | ||
678 | if (down_trylock(&console_sem)) | ||
679 | return -1; | ||
680 | console_locked = 1; | ||
681 | console_may_schedule = 0; | ||
682 | return 0; | ||
683 | } | ||
684 | EXPORT_SYMBOL(try_acquire_console_sem); | ||
685 | |||
686 | int is_console_locked(void) | ||
687 | { | ||
688 | return console_locked; | ||
689 | } | ||
690 | EXPORT_SYMBOL(is_console_locked); | ||
691 | |||
692 | /** | ||
693 | * release_console_sem - unlock the console system | ||
694 | * | ||
695 | * Releases the semaphore which the caller holds on the console system | ||
696 | * and the console driver list. | ||
697 | * | ||
698 | * While the semaphore was held, console output may have been buffered | ||
699 | * by printk(). If this is the case, release_console_sem() emits | ||
700 | * the output prior to releasing the semaphore. | ||
701 | * | ||
702 | * If there is output waiting for klogd, we wake it up. | ||
703 | * | ||
704 | * release_console_sem() may be called from any context. | ||
705 | */ | ||
706 | void release_console_sem(void) | ||
707 | { | ||
708 | unsigned long flags; | ||
709 | unsigned long _con_start, _log_end; | ||
710 | unsigned long wake_klogd = 0; | ||
711 | |||
712 | for ( ; ; ) { | ||
713 | spin_lock_irqsave(&logbuf_lock, flags); | ||
714 | wake_klogd |= log_start - log_end; | ||
715 | if (con_start == log_end) | ||
716 | break; /* Nothing to print */ | ||
717 | _con_start = con_start; | ||
718 | _log_end = log_end; | ||
719 | con_start = log_end; /* Flush */ | ||
720 | spin_unlock(&logbuf_lock); | ||
721 | call_console_drivers(_con_start, _log_end); | ||
722 | local_irq_restore(flags); | ||
723 | } | ||
724 | console_locked = 0; | ||
725 | console_may_schedule = 0; | ||
726 | up(&console_sem); | ||
727 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
728 | if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) | ||
729 | wake_up_interruptible(&log_wait); | ||
730 | } | ||
731 | EXPORT_SYMBOL(release_console_sem); | ||
732 | |||
733 | /** console_conditional_schedule - yield the CPU if required | ||
734 | * | ||
735 | * If the console code is currently allowed to sleep, and | ||
736 | * if this CPU should yield the CPU to another task, do | ||
737 | * so here. | ||
738 | * | ||
739 | * Must be called within acquire_console_sem(). | ||
740 | */ | ||
741 | void __sched console_conditional_schedule(void) | ||
742 | { | ||
743 | if (console_may_schedule) | ||
744 | cond_resched(); | ||
745 | } | ||
746 | EXPORT_SYMBOL(console_conditional_schedule); | ||
747 | |||
748 | void console_print(const char *s) | ||
749 | { | ||
750 | printk(KERN_EMERG "%s", s); | ||
751 | } | ||
752 | EXPORT_SYMBOL(console_print); | ||
753 | |||
754 | void console_unblank(void) | ||
755 | { | ||
756 | struct console *c; | ||
757 | |||
758 | /* | ||
759 | * console_unblank can no longer be called in interrupt context unless | ||
760 | * oops_in_progress is set to 1.. | ||
761 | */ | ||
762 | if (oops_in_progress) { | ||
763 | if (down_trylock(&console_sem) != 0) | ||
764 | return; | ||
765 | } else | ||
766 | acquire_console_sem(); | ||
767 | |||
768 | console_locked = 1; | ||
769 | console_may_schedule = 0; | ||
770 | for (c = console_drivers; c != NULL; c = c->next) | ||
771 | if ((c->flags & CON_ENABLED) && c->unblank) | ||
772 | c->unblank(); | ||
773 | release_console_sem(); | ||
774 | } | ||
775 | EXPORT_SYMBOL(console_unblank); | ||
776 | |||
777 | /* | ||
778 | * Return the console tty driver structure and its associated index | ||
779 | */ | ||
780 | struct tty_driver *console_device(int *index) | ||
781 | { | ||
782 | struct console *c; | ||
783 | struct tty_driver *driver = NULL; | ||
784 | |||
785 | acquire_console_sem(); | ||
786 | for (c = console_drivers; c != NULL; c = c->next) { | ||
787 | if (!c->device) | ||
788 | continue; | ||
789 | driver = c->device(c, index); | ||
790 | if (driver) | ||
791 | break; | ||
792 | } | ||
793 | release_console_sem(); | ||
794 | return driver; | ||
795 | } | ||
796 | |||
797 | /* | ||
798 | * Prevent further output on the passed console device so that (for example) | ||
799 | * serial drivers can disable console output before suspending a port, and can | ||
800 | * re-enable output afterwards. | ||
801 | */ | ||
802 | void console_stop(struct console *console) | ||
803 | { | ||
804 | acquire_console_sem(); | ||
805 | console->flags &= ~CON_ENABLED; | ||
806 | release_console_sem(); | ||
807 | } | ||
808 | EXPORT_SYMBOL(console_stop); | ||
809 | |||
810 | void console_start(struct console *console) | ||
811 | { | ||
812 | acquire_console_sem(); | ||
813 | console->flags |= CON_ENABLED; | ||
814 | release_console_sem(); | ||
815 | } | ||
816 | EXPORT_SYMBOL(console_start); | ||
817 | |||
818 | /* | ||
819 | * The console driver calls this routine during kernel initialization | ||
820 | * to register the console printing procedure with printk() and to | ||
821 | * print any messages that were printed by the kernel before the | ||
822 | * console driver was initialized. | ||
823 | */ | ||
824 | void register_console(struct console * console) | ||
825 | { | ||
826 | int i; | ||
827 | unsigned long flags; | ||
828 | |||
829 | if (preferred_console < 0) | ||
830 | preferred_console = selected_console; | ||
831 | |||
832 | /* | ||
833 | * See if we want to use this console driver. If we | ||
834 | * didn't select a console we take the first one | ||
835 | * that registers here. | ||
836 | */ | ||
837 | if (preferred_console < 0) { | ||
838 | if (console->index < 0) | ||
839 | console->index = 0; | ||
840 | if (console->setup == NULL || | ||
841 | console->setup(console, NULL) == 0) { | ||
842 | console->flags |= CON_ENABLED | CON_CONSDEV; | ||
843 | preferred_console = 0; | ||
844 | } | ||
845 | } | ||
846 | |||
847 | /* | ||
848 | * See if this console matches one we selected on | ||
849 | * the command line. | ||
850 | */ | ||
851 | for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { | ||
852 | if (strcmp(console_cmdline[i].name, console->name) != 0) | ||
853 | continue; | ||
854 | if (console->index >= 0 && | ||
855 | console->index != console_cmdline[i].index) | ||
856 | continue; | ||
857 | if (console->index < 0) | ||
858 | console->index = console_cmdline[i].index; | ||
859 | if (console->setup && | ||
860 | console->setup(console, console_cmdline[i].options) != 0) | ||
861 | break; | ||
862 | console->flags |= CON_ENABLED; | ||
863 | console->index = console_cmdline[i].index; | ||
864 | if (i == preferred_console) | ||
865 | console->flags |= CON_CONSDEV; | ||
866 | break; | ||
867 | } | ||
868 | |||
869 | if (!(console->flags & CON_ENABLED)) | ||
870 | return; | ||
871 | |||
872 | if (console_drivers && (console_drivers->flags & CON_BOOT)) { | ||
873 | unregister_console(console_drivers); | ||
874 | console->flags &= ~CON_PRINTBUFFER; | ||
875 | } | ||
876 | |||
877 | /* | ||
878 | * Put this console in the list - keep the | ||
879 | * preferred driver at the head of the list. | ||
880 | */ | ||
881 | acquire_console_sem(); | ||
882 | if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { | ||
883 | console->next = console_drivers; | ||
884 | console_drivers = console; | ||
885 | } else { | ||
886 | console->next = console_drivers->next; | ||
887 | console_drivers->next = console; | ||
888 | } | ||
889 | if (console->flags & CON_PRINTBUFFER) { | ||
890 | /* | ||
891 | * release_console_sem() will print out the buffered messages | ||
892 | * for us. | ||
893 | */ | ||
894 | spin_lock_irqsave(&logbuf_lock, flags); | ||
895 | con_start = log_start; | ||
896 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
897 | } | ||
898 | release_console_sem(); | ||
899 | } | ||
900 | EXPORT_SYMBOL(register_console); | ||
901 | |||
902 | int unregister_console(struct console * console) | ||
903 | { | ||
904 | struct console *a,*b; | ||
905 | int res = 1; | ||
906 | |||
907 | acquire_console_sem(); | ||
908 | if (console_drivers == console) { | ||
909 | console_drivers=console->next; | ||
910 | res = 0; | ||
911 | } else { | ||
912 | for (a=console_drivers->next, b=console_drivers ; | ||
913 | a; b=a, a=b->next) { | ||
914 | if (a == console) { | ||
915 | b->next = a->next; | ||
916 | res = 0; | ||
917 | break; | ||
918 | } | ||
919 | } | ||
920 | } | ||
921 | |||
922 | /* If the last console is removed, we re-enable picking the first | ||
923 | * one that gets registered. Without that, the pmac early boot console | ||
924 | * would prevent fbcon from taking over. | ||
925 | */ | ||
926 | if (console_drivers == NULL) | ||
927 | preferred_console = selected_console; | ||
928 | |||
929 | |||
930 | release_console_sem(); | ||
931 | return res; | ||
932 | } | ||
933 | EXPORT_SYMBOL(unregister_console); | ||
934 | |||
935 | /** | ||
936 | * tty_write_message - write a message to a certain tty, not just the console. | ||
937 | * | ||
938 | * This is used for messages that need to be redirected to a specific tty. | ||
939 | * We don't put it into the syslog queue right now; maybe in the future, | ||
940 | * if really needed. | ||
941 | */ | ||
942 | void tty_write_message(struct tty_struct *tty, char *msg) | ||
943 | { | ||
944 | if (tty && tty->driver->write) | ||
945 | tty->driver->write(tty, msg, strlen(msg)); | ||
946 | return; | ||
947 | } | ||
948 | |||
949 | /* | ||
950 | * printk rate limiting, lifted from the networking subsystem. | ||
951 | * | ||
952 | * This enforces a rate limit: not more than one kernel message | ||
953 | * every printk_ratelimit_jiffies to make a denial-of-service | ||
954 | * attack impossible. | ||
955 | */ | ||
956 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | ||
957 | { | ||
958 | static DEFINE_SPINLOCK(ratelimit_lock); | ||
959 | static unsigned long toks = 10*5*HZ; | ||
960 | static unsigned long last_msg; | ||
961 | static int missed; | ||
962 | unsigned long flags; | ||
963 | unsigned long now = jiffies; | ||
964 | |||
965 | spin_lock_irqsave(&ratelimit_lock, flags); | ||
966 | toks += now - last_msg; | ||
967 | last_msg = now; | ||
968 | if (toks > (ratelimit_burst * ratelimit_jiffies)) | ||
969 | toks = ratelimit_burst * ratelimit_jiffies; | ||
970 | if (toks >= ratelimit_jiffies) { | ||
971 | int lost = missed; | ||
972 | missed = 0; | ||
973 | toks -= ratelimit_jiffies; | ||
974 | spin_unlock_irqrestore(&ratelimit_lock, flags); | ||
975 | if (lost) | ||
976 | printk(KERN_WARNING "printk: %d messages suppressed.\n", lost); | ||
977 | return 1; | ||
978 | } | ||
979 | missed++; | ||
980 | spin_unlock_irqrestore(&ratelimit_lock, flags); | ||
981 | return 0; | ||
982 | } | ||
983 | EXPORT_SYMBOL(__printk_ratelimit); | ||
984 | |||
985 | /* minimum time in jiffies between messages */ | ||
986 | int printk_ratelimit_jiffies = 5*HZ; | ||
987 | |||
988 | /* number of messages we send before ratelimiting */ | ||
989 | int printk_ratelimit_burst = 10; | ||
990 | |||
991 | int printk_ratelimit(void) | ||
992 | { | ||
993 | return __printk_ratelimit(printk_ratelimit_jiffies, | ||
994 | printk_ratelimit_burst); | ||
995 | } | ||
996 | EXPORT_SYMBOL(printk_ratelimit); | ||
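
A typical caller simply guards a noisy printk() with the helper. The snippet below is only a usage sketch with a made-up driver function, not part of the file above:

#include <linux/kernel.h>

/* Hypothetical error path in a driver: at most printk_ratelimit_burst
 * messages get through per printk_ratelimit_jiffies window, and
 * __printk_ratelimit() later reports how many were suppressed. */
static void mydrv_report_error(int err)
{
	if (printk_ratelimit())
		printk(KERN_WARNING "mydrv: transfer failed: error %d\n", err);
}
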
diff --git a/kernel/profile.c b/kernel/profile.c new file mode 100644 index 000000000000..a38fa70075fe --- /dev/null +++ b/kernel/profile.c | |||
@@ -0,0 +1,563 @@ | |||
1 | /* | ||
2 | * linux/kernel/profile.c | ||
3 | * Simple profiling. Manages a direct-mapped profile hit count buffer, | ||
4 | * with configurable resolution, support for restricting the cpus on | ||
5 | * which profiling is done, and switching between cpu time and | ||
6 | * schedule() calls via kernel command line parameters passed at boot. | ||
7 | * | ||
8 | * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, | ||
9 | * Red Hat, July 2004 | ||
10 | * Consolidation of architecture support code for profiling, | ||
11 | * William Irwin, Oracle, July 2004 | ||
12 | * Amortized hit count accounting via per-cpu open-addressed hashtables | ||
13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 | ||
14 | */ | ||
15 | |||
16 | #include <linux/config.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/profile.h> | ||
19 | #include <linux/bootmem.h> | ||
20 | #include <linux/notifier.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/cpumask.h> | ||
23 | #include <linux/cpu.h> | ||
24 | #include <linux/profile.h> | ||
25 | #include <linux/highmem.h> | ||
26 | #include <asm/sections.h> | ||
27 | #include <asm/semaphore.h> | ||
28 | |||
29 | struct profile_hit { | ||
30 | u32 pc, hits; | ||
31 | }; | ||
32 | #define PROFILE_GRPSHIFT 3 | ||
33 | #define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT) | ||
34 | #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) | ||
35 | #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) | ||
36 | |||
37 | /* Oprofile timer tick hook */ | ||
38 | int (*timer_hook)(struct pt_regs *); | ||
39 | |||
40 | static atomic_t *prof_buffer; | ||
41 | static unsigned long prof_len, prof_shift; | ||
42 | static int prof_on; | ||
43 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; | ||
44 | #ifdef CONFIG_SMP | ||
45 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); | ||
46 | static DEFINE_PER_CPU(int, cpu_profile_flip); | ||
47 | static DECLARE_MUTEX(profile_flip_mutex); | ||
48 | #endif /* CONFIG_SMP */ | ||
49 | |||
50 | static int __init profile_setup(char * str) | ||
51 | { | ||
52 | int par; | ||
53 | |||
54 | if (!strncmp(str, "schedule", 8)) { | ||
55 | prof_on = SCHED_PROFILING; | ||
56 | printk(KERN_INFO "kernel schedule profiling enabled\n"); | ||
57 | if (str[7] == ',') | ||
58 | str += 8; | ||
59 | } | ||
60 | if (get_option(&str,&par)) { | ||
61 | prof_shift = par; | ||
62 | prof_on = CPU_PROFILING; | ||
63 | printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", | ||
64 | prof_shift); | ||
65 | } | ||
66 | return 1; | ||
67 | } | ||
68 | __setup("profile=", profile_setup); | ||
69 | |||
70 | |||
71 | void __init profile_init(void) | ||
72 | { | ||
73 | if (!prof_on) | ||
74 | return; | ||
75 | |||
76 | /* only text is profiled */ | ||
77 | prof_len = (_etext - _stext) >> prof_shift; | ||
78 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); | ||
79 | } | ||
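
prof_shift trades resolution for memory: each counter covers 1 << prof_shift bytes of kernel text, so booting with profile=2 gives one atomic_t for every 4 bytes of text. The sampling code (not shown in this excerpt) maps a program counter to its slot the same way. A stand-alone sketch of the arithmetic, with made-up addresses:

#include <stdio.h>

int main(void)
{
	unsigned long _stext = 0xc0100000UL;	/* made-up start of kernel text */
	unsigned long _etext = 0xc0400000UL;	/* made-up end of kernel text   */
	unsigned long prof_shift = 2;		/* as set by profile=2          */
	unsigned long prof_len = (_etext - _stext) >> prof_shift;
	unsigned long pc = 0xc0123456UL;	/* a sampled program counter    */

	printf("profile buffer has %lu counters\n", prof_len);
	printf("pc 0x%lx lands in slot %lu\n", pc, (pc - _stext) >> prof_shift);
	return 0;
}
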
80 | |||
81 | /* Profile event notifications */ | ||
82 | |||
83 | #ifdef CONFIG_PROFILING | ||
84 | |||
85 | static DECLARE_RWSEM(profile_rwsem); | ||
86 | static DEFINE_RWLOCK(handoff_lock); | ||
87 | static struct notifier_block * task_exit_notifier; | ||
88 | static struct notifier_block * task_free_notifier; | ||
89 | static struct notifier_block * munmap_notifier; | ||
90 | |||
91 | void profile_task_exit(struct task_struct * task) | ||
92 | { | ||
93 | down_read(&profile_rwsem); | ||
94 | notifier_call_chain(&task_exit_notifier, 0, task); | ||
95 | up_read(&profile_rwsem); | ||
96 | } | ||
97 | |||
98 | int profile_handoff_task(struct task_struct * task) | ||
99 | { | ||
100 | int ret; | ||
101 | read_lock(&handoff_lock); | ||
102 | ret = notifier_call_chain(&task_free_notifier, 0, task); | ||
103 | read_unlock(&handoff_lock); | ||
104 | return (ret == NOTIFY_OK) ? 1 : 0; | ||
105 | } | ||
106 | |||
107 | void profile_munmap(unsigned long addr) | ||
108 | { | ||
109 | down_read(&profile_rwsem); | ||
110 | notifier_call_chain(&munmap_notifier, 0, (void *)addr); | ||
111 | up_read(&profile_rwsem); | ||
112 | } | ||
113 | |||
114 | int task_handoff_register(struct notifier_block * n) | ||
115 | { | ||
116 | int err = -EINVAL; | ||
117 | |||
118 | write_lock(&handoff_lock); | ||
119 | err = notifier_chain_register(&task_free_notifier, n); | ||
120 | write_unlock(&handoff_lock); | ||
121 | return err; | ||
122 | } | ||
123 | |||
124 | int task_handoff_unregister(struct notifier_block * n) | ||
125 | { | ||
126 | int err = -EINVAL; | ||
127 | |||
128 | write_lock(&handoff_lock); | ||
129 | err = notifier_chain_unregister(&task_free_notifier, n); | ||
130 | write_unlock(&handoff_lock); | ||
131 | return err; | ||
132 | } | ||
133 | |||
134 | int profile_event_register(enum profile_type type, struct notifier_block * n) | ||
135 | { | ||
136 | int err = -EINVAL; | ||
137 | |||
138 | down_write(&profile_rwsem); | ||
139 | |||
140 | switch (type) { | ||
141 | case PROFILE_TASK_EXIT: | ||
142 | err = notifier_chain_register(&task_exit_notifier, n); | ||
143 | break; | ||
144 | case PROFILE_MUNMAP: | ||
145 | err = notifier_chain_register(&munmap_notifier, n); | ||
146 | break; | ||
147 | } | ||
148 | |||
149 | up_write(&profile_rwsem); | ||
150 | |||
151 | return err; | ||
152 | } | ||
153 | |||
154 | |||
155 | int profile_event_unregister(enum profile_type type, struct notifier_block * n) | ||
156 | { | ||
157 | int err = -EINVAL; | ||
158 | |||
159 | down_write(&profile_rwsem); | ||
160 | |||
161 | switch (type) { | ||
162 | case PROFILE_TASK_EXIT: | ||
163 | err = notifier_chain_unregister(&task_exit_notifier, n); | ||
164 | break; | ||
165 | case PROFILE_MUNMAP: | ||
166 | err = notifier_chain_unregister(&munmap_notifier, n); | ||
167 | break; | ||
168 | } | ||
169 | |||
170 | up_write(&profile_rwsem); | ||
171 | return err; | ||
172 | } | ||
173 | |||
174 | int register_timer_hook(int (*hook)(struct pt_regs *)) | ||
175 | { | ||
176 | if (timer_hook) | ||
177 | return -EBUSY; | ||
178 | timer_hook = hook; | ||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) | ||
183 | { | ||
184 | WARN_ON(hook != timer_hook); | ||
185 | timer_hook = NULL; | ||
186 | /* make sure all CPUs see the NULL hook */ | ||
187 | synchronize_kernel(); | ||
188 | } | ||
189 | |||
190 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
191 | EXPORT_SYMBOL_GPL(unregister_timer_hook); | ||
192 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
193 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
194 | |||
195 | #endif /* CONFIG_PROFILING */ | ||
196 | |||
197 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
198 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
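/*
 * Illustrative sketch (not from this file): a hypothetical module
 * hooking the task-exit notifier exported above.  The exiting
 * task_struct arrives as the notifier data, exactly as passed by
 * profile_task_exit().  Assumes the usual <linux/profile.h>,
 * <linux/notifier.h> and <linux/sched.h> includes; all "example_"
 * names are made up.
 */
static int example_task_exit_notify(struct notifier_block *self,
					unsigned long val, void *data)
{
	struct task_struct *task = data;

	printk(KERN_DEBUG "task %d is exiting\n", task->pid);
	return NOTIFY_OK;
}

static struct notifier_block example_task_exit_nb = {
	.notifier_call	= example_task_exit_notify,
};

/* in the module's init path:
 *	profile_event_register(PROFILE_TASK_EXIT, &example_task_exit_nb);
 */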
199 | |||
200 | #ifdef CONFIG_SMP | ||
201 | /* | ||
202 | * Each cpu has a pair of open-addressed hashtables for pending | ||
203 | * profile hits. read_profile() IPI's all cpus to request them | ||
204 | * to flip buffers and flushes their contents to prof_buffer itself. | ||
205 | * Flip requests are serialized by the profile_flip_mutex. The sole | ||
206 | * use of having a second hashtable is to avoid the cacheline | ||
207 | * contention that would otherwise happen during the flushes of | ||
208 | * pending profile hits required for the accuracy of reported profile | ||
209 | * hits, and that would thereby resurrect the interrupt livelock issue. | ||
210 | * | ||
211 | * The open-addressed hashtables are indexed by profile buffer slot | ||
212 | * and hold the number of pending hits to that profile buffer slot on | ||
213 | * a cpu in an entry. When the hashtable overflows, all pending hits | ||
214 | * are accounted to their corresponding profile buffer slots with | ||
215 | * atomic_add() and the hashtable emptied. As numerous pending hits | ||
216 | * may be accounted to a profile buffer slot in a hashtable entry, | ||
217 | * this amortizes a number of atomic profile buffer increments likely | ||
218 | * to be far larger than the number of entries in the hashtable, | ||
219 | * particularly given that the number of distinct profile buffer | ||
220 | * positions to which hits are accounted during short intervals (e.g. | ||
221 | * several seconds) is usually very small. Exclusion from buffer | ||
222 | * flipping is provided by interrupt disablement (note that for | ||
223 | * SCHED_PROFILING profile_hit() may be called from process context). | ||
224 | * The hash function is meant to be lightweight as opposed to strong, | ||
225 | * and was vaguely inspired by ppc64 firmware-supported inverted | ||
226 | * pagetable hash functions, but uses a full hashtable with finite | ||
227 | * collision chains, not just pairs of them. | ||
228 | * | ||
229 | * -- wli | ||
230 | */ | ||
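/*
 * Worked example of the sizes above (assuming 4 KB pages): struct
 * profile_hit is 8 bytes, so NR_PROFILE_HIT = 4096/8 = 512 entries per
 * hashtable and NR_PROFILE_GRP = 512/8 = 64 groups of PROFILE_GRPSZ = 8
 * slots.  profile_hit() below probes the 8 slots of the primary group
 * (pc & 63) << 3, then steps by the secondary stride
 * (~(pc << 1) & 63) << 3.  Since pc << 1 is even, ~(pc << 1) is odd, so
 * the stride is an odd number of groups and therefore coprime with 64:
 * the probe sequence visits every group before wrapping back to the
 * primary one, at which point the whole table is flushed to prof_buffer.
 */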
231 | static void __profile_flip_buffers(void *unused) | ||
232 | { | ||
233 | int cpu = smp_processor_id(); | ||
234 | |||
235 | per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); | ||
236 | } | ||
237 | |||
238 | static void profile_flip_buffers(void) | ||
239 | { | ||
240 | int i, j, cpu; | ||
241 | |||
242 | down(&profile_flip_mutex); | ||
243 | j = per_cpu(cpu_profile_flip, get_cpu()); | ||
244 | put_cpu(); | ||
245 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); | ||
246 | for_each_online_cpu(cpu) { | ||
247 | struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; | ||
248 | for (i = 0; i < NR_PROFILE_HIT; ++i) { | ||
249 | if (!hits[i].hits) { | ||
250 | if (hits[i].pc) | ||
251 | hits[i].pc = 0; | ||
252 | continue; | ||
253 | } | ||
254 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); | ||
255 | hits[i].hits = hits[i].pc = 0; | ||
256 | } | ||
257 | } | ||
258 | up(&profile_flip_mutex); | ||
259 | } | ||
260 | |||
261 | static void profile_discard_flip_buffers(void) | ||
262 | { | ||
263 | int i, cpu; | ||
264 | |||
265 | down(&profile_flip_mutex); | ||
266 | i = per_cpu(cpu_profile_flip, get_cpu()); | ||
267 | put_cpu(); | ||
268 | on_each_cpu(__profile_flip_buffers, NULL, 0, 1); | ||
269 | for_each_online_cpu(cpu) { | ||
270 | struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; | ||
271 | memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); | ||
272 | } | ||
273 | up(&profile_flip_mutex); | ||
274 | } | ||
275 | |||
276 | void profile_hit(int type, void *__pc) | ||
277 | { | ||
278 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; | ||
279 | int i, j, cpu; | ||
280 | struct profile_hit *hits; | ||
281 | |||
282 | if (prof_on != type || !prof_buffer) | ||
283 | return; | ||
284 | pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); | ||
285 | i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; | ||
286 | secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; | ||
287 | cpu = get_cpu(); | ||
288 | hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; | ||
289 | if (!hits) { | ||
290 | put_cpu(); | ||
291 | return; | ||
292 | } | ||
293 | local_irq_save(flags); | ||
294 | do { | ||
295 | for (j = 0; j < PROFILE_GRPSZ; ++j) { | ||
296 | if (hits[i + j].pc == pc) { | ||
297 | hits[i + j].hits++; | ||
298 | goto out; | ||
299 | } else if (!hits[i + j].hits) { | ||
300 | hits[i + j].pc = pc; | ||
301 | hits[i + j].hits = 1; | ||
302 | goto out; | ||
303 | } | ||
304 | } | ||
305 | i = (i + secondary) & (NR_PROFILE_HIT - 1); | ||
306 | } while (i != primary); | ||
307 | atomic_inc(&prof_buffer[pc]); | ||
308 | for (i = 0; i < NR_PROFILE_HIT; ++i) { | ||
309 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); | ||
310 | hits[i].pc = hits[i].hits = 0; | ||
311 | } | ||
312 | out: | ||
313 | local_irq_restore(flags); | ||
314 | put_cpu(); | ||
315 | } | ||
316 | |||
317 | #ifdef CONFIG_HOTPLUG_CPU | ||
318 | static int __devinit profile_cpu_callback(struct notifier_block *info, | ||
319 | unsigned long action, void *__cpu) | ||
320 | { | ||
321 | int node, cpu = (unsigned long)__cpu; | ||
322 | struct page *page; | ||
323 | |||
324 | switch (action) { | ||
325 | case CPU_UP_PREPARE: | ||
326 | node = cpu_to_node(cpu); | ||
327 | per_cpu(cpu_profile_flip, cpu) = 0; | ||
328 | if (!per_cpu(cpu_profile_hits, cpu)[1]) { | ||
329 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
330 | if (!page) | ||
331 | return NOTIFY_BAD; | ||
332 | per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); | ||
333 | } | ||
334 | if (!per_cpu(cpu_profile_hits, cpu)[0]) { | ||
335 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
336 | if (!page) | ||
337 | goto out_free; | ||
338 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | ||
339 | } | ||
340 | break; | ||
341 | out_free: | ||
342 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | ||
343 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | ||
344 | __free_page(page); | ||
345 | return NOTIFY_BAD; | ||
346 | case CPU_ONLINE: | ||
347 | cpu_set(cpu, prof_cpu_mask); | ||
348 | break; | ||
349 | case CPU_UP_CANCELED: | ||
350 | case CPU_DEAD: | ||
351 | cpu_clear(cpu, prof_cpu_mask); | ||
352 | if (per_cpu(cpu_profile_hits, cpu)[0]) { | ||
353 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); | ||
354 | per_cpu(cpu_profile_hits, cpu)[0] = NULL; | ||
355 | __free_page(page); | ||
356 | } | ||
357 | if (per_cpu(cpu_profile_hits, cpu)[1]) { | ||
358 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | ||
359 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | ||
360 | __free_page(page); | ||
361 | } | ||
362 | break; | ||
363 | } | ||
364 | return NOTIFY_OK; | ||
365 | } | ||
366 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
367 | #else /* !CONFIG_SMP */ | ||
368 | #define profile_flip_buffers() do { } while (0) | ||
369 | #define profile_discard_flip_buffers() do { } while (0) | ||
370 | |||
371 | void profile_hit(int type, void *__pc) | ||
372 | { | ||
373 | unsigned long pc; | ||
374 | |||
375 | if (prof_on != type || !prof_buffer) | ||
376 | return; | ||
377 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; | ||
378 | atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); | ||
379 | } | ||
380 | #endif /* !CONFIG_SMP */ | ||
381 | |||
382 | void profile_tick(int type, struct pt_regs *regs) | ||
383 | { | ||
384 | if (type == CPU_PROFILING && timer_hook) | ||
385 | timer_hook(regs); | ||
386 | if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) | ||
387 | profile_hit(type, (void *)profile_pc(regs)); | ||
388 | } | ||
389 | |||
390 | #ifdef CONFIG_PROC_FS | ||
391 | #include <linux/proc_fs.h> | ||
392 | #include <asm/uaccess.h> | ||
393 | #include <asm/ptrace.h> | ||
394 | |||
395 | static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | ||
396 | int count, int *eof, void *data) | ||
397 | { | ||
398 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); | ||
399 | if (count - len < 2) | ||
400 | return -EINVAL; | ||
401 | len += sprintf(page + len, "\n"); | ||
402 | return len; | ||
403 | } | ||
404 | |||
405 | static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, | ||
406 | unsigned long count, void *data) | ||
407 | { | ||
408 | cpumask_t *mask = (cpumask_t *)data; | ||
409 | unsigned long full_count = count, err; | ||
410 | cpumask_t new_value; | ||
411 | |||
412 | err = cpumask_parse(buffer, count, new_value); | ||
413 | if (err) | ||
414 | return err; | ||
415 | |||
416 | *mask = new_value; | ||
417 | return full_count; | ||
418 | } | ||
419 | |||
420 | void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) | ||
421 | { | ||
422 | struct proc_dir_entry *entry; | ||
423 | |||
424 | /* create /proc/irq/prof_cpu_mask */ | ||
425 | if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) | ||
426 | return; | ||
427 | entry->nlink = 1; | ||
428 | entry->data = (void *)&prof_cpu_mask; | ||
429 | entry->read_proc = prof_cpu_mask_read_proc; | ||
430 | entry->write_proc = prof_cpu_mask_write_proc; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * This function accesses profiling information. The returned data is | ||
435 | * binary: the sampling step and the actual contents of the profile | ||
436 | * buffer. Use of the program readprofile is recommended in order to | ||
437 | * get meaningful info out of these data. | ||
438 | */ | ||
439 | static ssize_t | ||
440 | read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
441 | { | ||
442 | unsigned long p = *ppos; | ||
443 | ssize_t read; | ||
444 | char * pnt; | ||
445 | unsigned int sample_step = 1 << prof_shift; | ||
446 | |||
447 | profile_flip_buffers(); | ||
448 | if (p >= (prof_len+1)*sizeof(unsigned int)) | ||
449 | return 0; | ||
450 | if (count > (prof_len+1)*sizeof(unsigned int) - p) | ||
451 | count = (prof_len+1)*sizeof(unsigned int) - p; | ||
452 | read = 0; | ||
453 | |||
454 | while (p < sizeof(unsigned int) && count > 0) { | ||
455 | put_user(*((char *)(&sample_step)+p),buf); | ||
456 | buf++; p++; count--; read++; | ||
457 | } | ||
458 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); | ||
459 | if (copy_to_user(buf,(void *)pnt,count)) | ||
460 | return -EFAULT; | ||
461 | read += count; | ||
462 | *ppos += read; | ||
463 | return read; | ||
464 | } | ||
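/*
 * Illustrative user-space sketch (not from this file): a minimal reader
 * for the binary format described above -- one unsigned int holding the
 * sample step (1 << prof_shift) followed by the hit counters.
 * readprofile(1) is the proper tool; this only shows the layout.
 * Slot n covers kernel text starting at _stext + n * step.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/profile", "rb");
	unsigned int step, count;
	unsigned long slot = 0;

	if (!f)
		return 1;
	if (fread(&step, sizeof(step), 1, f) != 1)
		return 1;
	printf("sample step: %u bytes of text per counter\n", step);
	while (fread(&count, sizeof(count), 1, f) == 1) {
		if (count)
			printf("slot %lu: %u hits\n", slot, count);
		slot++;
	}
	fclose(f);
	return 0;
}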
465 | |||
466 | /* | ||
467 | * Writing to /proc/profile resets the counters | ||
468 | * | ||
469 | * Writing a 'profiling multiplier' value into it also re-sets the profiling | ||
470 | * interrupt frequency, on architectures that support this. | ||
471 | */ | ||
472 | static ssize_t write_profile(struct file *file, const char __user *buf, | ||
473 | size_t count, loff_t *ppos) | ||
474 | { | ||
475 | #ifdef CONFIG_SMP | ||
476 | extern int setup_profiling_timer (unsigned int multiplier); | ||
477 | |||
478 | if (count == sizeof(int)) { | ||
479 | unsigned int multiplier; | ||
480 | |||
481 | if (copy_from_user(&multiplier, buf, sizeof(int))) | ||
482 | return -EFAULT; | ||
483 | |||
484 | if (setup_profiling_timer(multiplier)) | ||
485 | return -EINVAL; | ||
486 | } | ||
487 | #endif | ||
488 | profile_discard_flip_buffers(); | ||
489 | memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); | ||
490 | return count; | ||
491 | } | ||
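/*
 * Illustrative user-space sketch (not from this file): any write clears
 * the counters; a write of exactly sizeof(int) bytes also feeds that
 * value to setup_profiling_timer() on SMP.  The multiplier value used
 * here is hypothetical.
 */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	unsigned int multiplier = 1;	/* hypothetical multiplier */
	int fd = open("/proc/profile", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, &multiplier, sizeof(multiplier)) < 0)
		return 1;
	close(fd);
	return 0;
}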
492 | |||
493 | static struct file_operations proc_profile_operations = { | ||
494 | .read = read_profile, | ||
495 | .write = write_profile, | ||
496 | }; | ||
497 | |||
498 | #ifdef CONFIG_SMP | ||
499 | static void __init profile_nop(void *unused) | ||
500 | { | ||
501 | } | ||
502 | |||
503 | static int __init create_hash_tables(void) | ||
504 | { | ||
505 | int cpu; | ||
506 | |||
507 | for_each_online_cpu(cpu) { | ||
508 | int node = cpu_to_node(cpu); | ||
509 | struct page *page; | ||
510 | |||
511 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
512 | if (!page) | ||
513 | goto out_cleanup; | ||
514 | per_cpu(cpu_profile_hits, cpu)[1] | ||
515 | = (struct profile_hit *)page_address(page); | ||
516 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
517 | if (!page) | ||
518 | goto out_cleanup; | ||
519 | per_cpu(cpu_profile_hits, cpu)[0] | ||
520 | = (struct profile_hit *)page_address(page); | ||
521 | } | ||
522 | return 0; | ||
523 | out_cleanup: | ||
524 | prof_on = 0; | ||
525 | mb(); | ||
526 | on_each_cpu(profile_nop, NULL, 0, 1); | ||
527 | for_each_online_cpu(cpu) { | ||
528 | struct page *page; | ||
529 | |||
530 | if (per_cpu(cpu_profile_hits, cpu)[0]) { | ||
531 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); | ||
532 | per_cpu(cpu_profile_hits, cpu)[0] = NULL; | ||
533 | __free_page(page); | ||
534 | } | ||
535 | if (per_cpu(cpu_profile_hits, cpu)[1]) { | ||
536 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | ||
537 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | ||
538 | __free_page(page); | ||
539 | } | ||
540 | } | ||
541 | return -1; | ||
542 | } | ||
543 | #else | ||
544 | #define create_hash_tables() ({ 0; }) | ||
545 | #endif | ||
546 | |||
547 | static int __init create_proc_profile(void) | ||
548 | { | ||
549 | struct proc_dir_entry *entry; | ||
550 | |||
551 | if (!prof_on) | ||
552 | return 0; | ||
553 | if (create_hash_tables()) | ||
554 | return -1; | ||
555 | if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) | ||
556 | return 0; | ||
557 | entry->proc_fops = &proc_profile_operations; | ||
558 | entry->size = (1+prof_len) * sizeof(atomic_t); | ||
559 | hotcpu_notifier(profile_cpu_callback, 0); | ||
560 | return 0; | ||
561 | } | ||
562 | module_init(create_proc_profile); | ||
563 | #endif /* CONFIG_PROC_FS */ | ||
diff --git a/kernel/ptrace.c b/kernel/ptrace.c new file mode 100644 index 000000000000..88b306c4e841 --- /dev/null +++ b/kernel/ptrace.c | |||
@@ -0,0 +1,389 @@ | |||
1 | /* | ||
2 | * linux/kernel/ptrace.c | ||
3 | * | ||
4 | * (C) Copyright 1999 Linus Torvalds | ||
5 | * | ||
6 | * Common interfaces for "ptrace()" which we do not want | ||
7 | * to continually duplicate across every architecture. | ||
8 | */ | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/errno.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/highmem.h> | ||
15 | #include <linux/pagemap.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/ptrace.h> | ||
18 | #include <linux/security.h> | ||
19 | |||
20 | #include <asm/pgtable.h> | ||
21 | #include <asm/uaccess.h> | ||
22 | |||
23 | /* | ||
24 | * ptrace a task: make the debugger its new parent and | ||
25 | * move it to the ptrace list. | ||
26 | * | ||
27 | * Must be called with the tasklist lock write-held. | ||
28 | */ | ||
29 | void __ptrace_link(task_t *child, task_t *new_parent) | ||
30 | { | ||
31 | if (!list_empty(&child->ptrace_list)) | ||
32 | BUG(); | ||
33 | if (child->parent == new_parent) | ||
34 | return; | ||
35 | list_add(&child->ptrace_list, &child->parent->ptrace_children); | ||
36 | REMOVE_LINKS(child); | ||
37 | child->parent = new_parent; | ||
38 | SET_LINKS(child); | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * Turn a tracing stop into a normal stop now, since with no tracer there | ||
43 | * would be no way to wake it up with SIGCONT or SIGKILL. If there was a | ||
44 | * signal sent that would resume the child, but didn't because it was in | ||
45 | * TASK_TRACED, resume it now. | ||
46 | * Requires that irqs be disabled. | ||
47 | */ | ||
48 | void ptrace_untrace(task_t *child) | ||
49 | { | ||
50 | spin_lock(&child->sighand->siglock); | ||
51 | if (child->state == TASK_TRACED) { | ||
52 | if (child->signal->flags & SIGNAL_STOP_STOPPED) { | ||
53 | child->state = TASK_STOPPED; | ||
54 | } else { | ||
55 | signal_wake_up(child, 1); | ||
56 | } | ||
57 | } | ||
58 | spin_unlock(&child->sighand->siglock); | ||
59 | } | ||
60 | |||
61 | /* | ||
62 | * unptrace a task: move it back to its original parent and | ||
63 | * remove it from the ptrace list. | ||
64 | * | ||
65 | * Must be called with the tasklist lock write-held. | ||
66 | */ | ||
67 | void __ptrace_unlink(task_t *child) | ||
68 | { | ||
69 | if (!child->ptrace) | ||
70 | BUG(); | ||
71 | child->ptrace = 0; | ||
72 | if (!list_empty(&child->ptrace_list)) { | ||
73 | list_del_init(&child->ptrace_list); | ||
74 | REMOVE_LINKS(child); | ||
75 | child->parent = child->real_parent; | ||
76 | SET_LINKS(child); | ||
77 | } | ||
78 | |||
79 | if (child->state == TASK_TRACED) | ||
80 | ptrace_untrace(child); | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Check that we have indeed attached to the thing.. | ||
85 | */ | ||
86 | int ptrace_check_attach(struct task_struct *child, int kill) | ||
87 | { | ||
88 | int ret = -ESRCH; | ||
89 | |||
90 | /* | ||
91 | * We take the read lock around doing both checks to close a | ||
92 | * possible race where someone else was tracing our child and | ||
93 | * detached between these two checks. After this locked check, | ||
94 | * we are sure that this is our traced child and that can only | ||
95 | * be changed by us so it's not changing right after this. | ||
96 | */ | ||
97 | read_lock(&tasklist_lock); | ||
98 | if ((child->ptrace & PT_PTRACED) && child->parent == current && | ||
99 | (!(child->ptrace & PT_ATTACHED) || child->real_parent != current) | ||
100 | && child->signal != NULL) { | ||
101 | ret = 0; | ||
102 | spin_lock_irq(&child->sighand->siglock); | ||
103 | if (child->state == TASK_STOPPED) { | ||
104 | child->state = TASK_TRACED; | ||
105 | } else if (child->state != TASK_TRACED && !kill) { | ||
106 | ret = -ESRCH; | ||
107 | } | ||
108 | spin_unlock_irq(&child->sighand->siglock); | ||
109 | } | ||
110 | read_unlock(&tasklist_lock); | ||
111 | |||
112 | if (!ret && !kill) { | ||
113 | wait_task_inactive(child); | ||
114 | } | ||
115 | |||
116 | /* All systems go.. */ | ||
117 | return ret; | ||
118 | } | ||
119 | |||
120 | int ptrace_attach(struct task_struct *task) | ||
121 | { | ||
122 | int retval; | ||
123 | task_lock(task); | ||
124 | retval = -EPERM; | ||
125 | if (task->pid <= 1) | ||
126 | goto bad; | ||
127 | if (task == current) | ||
128 | goto bad; | ||
129 | if (!task->mm) | ||
130 | goto bad; | ||
131 | if(((current->uid != task->euid) || | ||
132 | (current->uid != task->suid) || | ||
133 | (current->uid != task->uid) || | ||
134 | (current->gid != task->egid) || | ||
135 | (current->gid != task->sgid) || | ||
136 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | ||
137 | goto bad; | ||
138 | rmb(); | ||
139 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | ||
140 | goto bad; | ||
141 | /* the same process cannot be attached many times */ | ||
142 | if (task->ptrace & PT_PTRACED) | ||
143 | goto bad; | ||
144 | retval = security_ptrace(current, task); | ||
145 | if (retval) | ||
146 | goto bad; | ||
147 | |||
148 | /* Go */ | ||
149 | task->ptrace |= PT_PTRACED | ((task->real_parent != current) | ||
150 | ? PT_ATTACHED : 0); | ||
151 | if (capable(CAP_SYS_PTRACE)) | ||
152 | task->ptrace |= PT_PTRACE_CAP; | ||
153 | task_unlock(task); | ||
154 | |||
155 | write_lock_irq(&tasklist_lock); | ||
156 | __ptrace_link(task, current); | ||
157 | write_unlock_irq(&tasklist_lock); | ||
158 | |||
159 | force_sig_specific(SIGSTOP, task); | ||
160 | return 0; | ||
161 | |||
162 | bad: | ||
163 | task_unlock(task); | ||
164 | return retval; | ||
165 | } | ||
166 | |||
167 | int ptrace_detach(struct task_struct *child, unsigned int data) | ||
168 | { | ||
169 | if ((unsigned long) data > _NSIG) | ||
170 | return -EIO; | ||
171 | |||
172 | /* Architecture-specific hardware disable .. */ | ||
173 | ptrace_disable(child); | ||
174 | |||
175 | /* .. re-parent .. */ | ||
176 | child->exit_code = data; | ||
177 | |||
178 | write_lock_irq(&tasklist_lock); | ||
179 | __ptrace_unlink(child); | ||
180 | /* .. and wake it up. */ | ||
181 | if (child->exit_state != EXIT_ZOMBIE) | ||
182 | wake_up_process(child); | ||
183 | write_unlock_irq(&tasklist_lock); | ||
184 | |||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * Access another process' address space. | ||
190 | * Source/target buffer must be kernel space, | ||
191 | * Do not walk the page table directly, use get_user_pages | ||
192 | */ | ||
193 | |||
194 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
195 | { | ||
196 | struct mm_struct *mm; | ||
197 | struct vm_area_struct *vma; | ||
198 | struct page *page; | ||
199 | void *old_buf = buf; | ||
200 | |||
201 | mm = get_task_mm(tsk); | ||
202 | if (!mm) | ||
203 | return 0; | ||
204 | |||
205 | down_read(&mm->mmap_sem); | ||
206 | /* ignore errors, just check how much was successfully transferred */ | ||
207 | while (len) { | ||
208 | int bytes, ret, offset; | ||
209 | void *maddr; | ||
210 | |||
211 | ret = get_user_pages(tsk, mm, addr, 1, | ||
212 | write, 1, &page, &vma); | ||
213 | if (ret <= 0) | ||
214 | break; | ||
215 | |||
216 | bytes = len; | ||
217 | offset = addr & (PAGE_SIZE-1); | ||
218 | if (bytes > PAGE_SIZE-offset) | ||
219 | bytes = PAGE_SIZE-offset; | ||
220 | |||
221 | maddr = kmap(page); | ||
222 | if (write) { | ||
223 | copy_to_user_page(vma, page, addr, | ||
224 | maddr + offset, buf, bytes); | ||
225 | set_page_dirty_lock(page); | ||
226 | } else { | ||
227 | copy_from_user_page(vma, page, addr, | ||
228 | buf, maddr + offset, bytes); | ||
229 | } | ||
230 | kunmap(page); | ||
231 | page_cache_release(page); | ||
232 | len -= bytes; | ||
233 | buf += bytes; | ||
234 | addr += bytes; | ||
235 | } | ||
236 | up_read(&mm->mmap_sem); | ||
237 | mmput(mm); | ||
238 | |||
239 | return buf - old_buf; | ||
240 | } | ||
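/*
 * Illustrative user-space sketch (not from this file): a tracer's
 * PTRACE_PEEKDATA ends up in access_process_vm() via the architecture's
 * ptrace code.  The target pid and address are hypothetical; a real
 * caller must check errno, since -1 can also be a valid word of data.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

long example_peek_word(pid_t pid, void *addr)
{
	long word;

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) < 0)
		return -1;
	waitpid(pid, NULL, 0);			/* wait for the SIGSTOP */
	word = ptrace(PTRACE_PEEKDATA, pid, addr, NULL);
	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return word;
}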
241 | |||
242 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) | ||
243 | { | ||
244 | int copied = 0; | ||
245 | |||
246 | while (len > 0) { | ||
247 | char buf[128]; | ||
248 | int this_len, retval; | ||
249 | |||
250 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; | ||
251 | retval = access_process_vm(tsk, src, buf, this_len, 0); | ||
252 | if (!retval) { | ||
253 | if (copied) | ||
254 | break; | ||
255 | return -EIO; | ||
256 | } | ||
257 | if (copy_to_user(dst, buf, retval)) | ||
258 | return -EFAULT; | ||
259 | copied += retval; | ||
260 | src += retval; | ||
261 | dst += retval; | ||
262 | len -= retval; | ||
263 | } | ||
264 | return copied; | ||
265 | } | ||
266 | |||
267 | int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len) | ||
268 | { | ||
269 | int copied = 0; | ||
270 | |||
271 | while (len > 0) { | ||
272 | char buf[128]; | ||
273 | int this_len, retval; | ||
274 | |||
275 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; | ||
276 | if (copy_from_user(buf, src, this_len)) | ||
277 | return -EFAULT; | ||
278 | retval = access_process_vm(tsk, dst, buf, this_len, 1); | ||
279 | if (!retval) { | ||
280 | if (copied) | ||
281 | break; | ||
282 | return -EIO; | ||
283 | } | ||
284 | copied += retval; | ||
285 | src += retval; | ||
286 | dst += retval; | ||
287 | len -= retval; | ||
288 | } | ||
289 | return copied; | ||
290 | } | ||
291 | |||
292 | static int ptrace_setoptions(struct task_struct *child, long data) | ||
293 | { | ||
294 | child->ptrace &= ~PT_TRACE_MASK; | ||
295 | |||
296 | if (data & PTRACE_O_TRACESYSGOOD) | ||
297 | child->ptrace |= PT_TRACESYSGOOD; | ||
298 | |||
299 | if (data & PTRACE_O_TRACEFORK) | ||
300 | child->ptrace |= PT_TRACE_FORK; | ||
301 | |||
302 | if (data & PTRACE_O_TRACEVFORK) | ||
303 | child->ptrace |= PT_TRACE_VFORK; | ||
304 | |||
305 | if (data & PTRACE_O_TRACECLONE) | ||
306 | child->ptrace |= PT_TRACE_CLONE; | ||
307 | |||
308 | if (data & PTRACE_O_TRACEEXEC) | ||
309 | child->ptrace |= PT_TRACE_EXEC; | ||
310 | |||
311 | if (data & PTRACE_O_TRACEVFORKDONE) | ||
312 | child->ptrace |= PT_TRACE_VFORK_DONE; | ||
313 | |||
314 | if (data & PTRACE_O_TRACEEXIT) | ||
315 | child->ptrace |= PT_TRACE_EXIT; | ||
316 | |||
317 | return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; | ||
318 | } | ||
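/*
 * Illustrative user-space sketch (not from this file): a tracer turning
 * on syscall-stop marking and fork tracing for a child that is already
 * traced and stopped.  Assumes the PTRACE_O_* constants are visible
 * (they come from the kernel's <linux/ptrace.h>; older libc headers may
 * not carry them).
 */
#include <sys/types.h>
#include <sys/ptrace.h>

long example_set_options(pid_t pid)
{
	long opts = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK;

	/* addr is ignored; data carries the option mask */
	return ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *)opts);
}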
319 | |||
320 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) | ||
321 | { | ||
322 | siginfo_t lastinfo; | ||
323 | int error = -ESRCH; | ||
324 | |||
325 | read_lock(&tasklist_lock); | ||
326 | if (likely(child->sighand != NULL)) { | ||
327 | error = -EINVAL; | ||
328 | spin_lock_irq(&child->sighand->siglock); | ||
329 | if (likely(child->last_siginfo != NULL)) { | ||
330 | lastinfo = *child->last_siginfo; | ||
331 | error = 0; | ||
332 | } | ||
333 | spin_unlock_irq(&child->sighand->siglock); | ||
334 | } | ||
335 | read_unlock(&tasklist_lock); | ||
336 | if (!error) | ||
337 | return copy_siginfo_to_user(data, &lastinfo); | ||
338 | return error; | ||
339 | } | ||
340 | |||
341 | static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) | ||
342 | { | ||
343 | siginfo_t newinfo; | ||
344 | int error = -ESRCH; | ||
345 | |||
346 | if (copy_from_user(&newinfo, data, sizeof (siginfo_t))) | ||
347 | return -EFAULT; | ||
348 | |||
349 | read_lock(&tasklist_lock); | ||
350 | if (likely(child->sighand != NULL)) { | ||
351 | error = -EINVAL; | ||
352 | spin_lock_irq(&child->sighand->siglock); | ||
353 | if (likely(child->last_siginfo != NULL)) { | ||
354 | *child->last_siginfo = newinfo; | ||
355 | error = 0; | ||
356 | } | ||
357 | spin_unlock_irq(&child->sighand->siglock); | ||
358 | } | ||
359 | read_unlock(&tasklist_lock); | ||
360 | return error; | ||
361 | } | ||
362 | |||
363 | int ptrace_request(struct task_struct *child, long request, | ||
364 | long addr, long data) | ||
365 | { | ||
366 | int ret = -EIO; | ||
367 | |||
368 | switch (request) { | ||
369 | #ifdef PTRACE_OLDSETOPTIONS | ||
370 | case PTRACE_OLDSETOPTIONS: | ||
371 | #endif | ||
372 | case PTRACE_SETOPTIONS: | ||
373 | ret = ptrace_setoptions(child, data); | ||
374 | break; | ||
375 | case PTRACE_GETEVENTMSG: | ||
376 | ret = put_user(child->ptrace_message, (unsigned long __user *) data); | ||
377 | break; | ||
378 | case PTRACE_GETSIGINFO: | ||
379 | ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); | ||
380 | break; | ||
381 | case PTRACE_SETSIGINFO: | ||
382 | ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); | ||
383 | break; | ||
384 | default: | ||
385 | break; | ||
386 | } | ||
387 | |||
388 | return ret; | ||
389 | } | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c new file mode 100644 index 000000000000..d00eded75d71 --- /dev/null +++ b/kernel/rcupdate.c | |||
@@ -0,0 +1,470 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2001 | ||
19 | * | ||
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | ||
21 | * Manfred Spraul <manfred@colorfullife.com> | ||
22 | * | ||
23 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
25 | * Papers: | ||
26 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
27 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
28 | * | ||
29 | * For detailed explanation of Read-Copy Update mechanism see - | ||
30 | * http://lse.sourceforge.net/locking/rcupdate.html | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/types.h> | ||
34 | #include <linux/kernel.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/spinlock.h> | ||
37 | #include <linux/smp.h> | ||
38 | #include <linux/interrupt.h> | ||
39 | #include <linux/sched.h> | ||
40 | #include <asm/atomic.h> | ||
41 | #include <linux/bitops.h> | ||
42 | #include <linux/module.h> | ||
43 | #include <linux/completion.h> | ||
44 | #include <linux/moduleparam.h> | ||
45 | #include <linux/percpu.h> | ||
46 | #include <linux/notifier.h> | ||
47 | #include <linux/rcupdate.h> | ||
48 | #include <linux/cpu.h> | ||
49 | |||
50 | /* Definition for rcupdate control block. */ | ||
51 | struct rcu_ctrlblk rcu_ctrlblk = | ||
52 | { .cur = -300, .completed = -300 }; | ||
53 | struct rcu_ctrlblk rcu_bh_ctrlblk = | ||
54 | { .cur = -300, .completed = -300 }; | ||
55 | |||
56 | /* Bookkeeping of the progress of the grace period */ | ||
57 | struct rcu_state { | ||
58 | spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ | ||
59 | cpumask_t cpumask; /* CPUs that need to switch in order */ | ||
60 | /* for current batch to proceed. */ | ||
61 | }; | ||
62 | |||
63 | static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = | ||
64 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; | ||
65 | static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = | ||
66 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; | ||
67 | |||
68 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | ||
69 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
70 | |||
71 | /* Fake initialization required by compiler */ | ||
72 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | ||
73 | static int maxbatch = 10; | ||
74 | |||
75 | /** | ||
76 | * call_rcu - Queue an RCU callback for invocation after a grace period. | ||
77 | * @head: structure to be used for queueing the RCU updates. | ||
78 | * @func: actual update function to be invoked after the grace period | ||
79 | * | ||
80 | * The update function will be invoked some time after a full grace | ||
81 | * period elapses, in other words after all currently executing RCU | ||
82 | * read-side critical sections have completed. RCU read-side critical | ||
83 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
84 | * and may be nested. | ||
85 | */ | ||
86 | void fastcall call_rcu(struct rcu_head *head, | ||
87 | void (*func)(struct rcu_head *rcu)) | ||
88 | { | ||
89 | unsigned long flags; | ||
90 | struct rcu_data *rdp; | ||
91 | |||
92 | head->func = func; | ||
93 | head->next = NULL; | ||
94 | local_irq_save(flags); | ||
95 | rdp = &__get_cpu_var(rcu_data); | ||
96 | *rdp->nxttail = head; | ||
97 | rdp->nxttail = &head->next; | ||
98 | local_irq_restore(flags); | ||
99 | } | ||
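/*
 * Illustrative sketch (not from this file): the typical call_rcu()
 * pattern of deferring a kfree() until all readers are done.
 * struct example_node and its fields are hypothetical; assumes
 * <linux/slab.h> for kfree().
 */
struct example_node {
	int		value;
	struct rcu_head	rcu;
};

static void example_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct example_node, rcu));
}

/*
 * After unlinking "old" from whatever RCU-protected structure readers
 * traverse under rcu_read_lock():
 *
 *	call_rcu(&old->rcu, example_free_rcu);
 *
 * The callback runs only after every reader that might still see "old"
 * has left its read-side critical section.
 */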
100 | |||
101 | /** | ||
102 | * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. | ||
103 | * @head: structure to be used for queueing the RCU updates. | ||
104 | * @func: actual update function to be invoked after the grace period | ||
105 | * | ||
106 | * The update function will be invoked some time after a full grace | ||
107 | * period elapses, in other words after all currently executing RCU | ||
108 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
109 | * that the read-side critical sections end on completion of a softirq | ||
110 | * handler. This means that read-side critical sections in process | ||
111 | * context must not be interrupted by softirqs. This interface is to be | ||
112 | * used when most of the read-side critical sections are in softirq context. | ||
113 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
114 | * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh() | ||
115 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
116 | */ | ||
117 | void fastcall call_rcu_bh(struct rcu_head *head, | ||
118 | void (*func)(struct rcu_head *rcu)) | ||
119 | { | ||
120 | unsigned long flags; | ||
121 | struct rcu_data *rdp; | ||
122 | |||
123 | head->func = func; | ||
124 | head->next = NULL; | ||
125 | local_irq_save(flags); | ||
126 | rdp = &__get_cpu_var(rcu_bh_data); | ||
127 | *rdp->nxttail = head; | ||
128 | rdp->nxttail = &head->next; | ||
129 | local_irq_restore(flags); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Invoke the completed RCU callbacks. They are expected to be in | ||
134 | * a per-cpu list. | ||
135 | */ | ||
136 | static void rcu_do_batch(struct rcu_data *rdp) | ||
137 | { | ||
138 | struct rcu_head *next, *list; | ||
139 | int count = 0; | ||
140 | |||
141 | list = rdp->donelist; | ||
142 | while (list) { | ||
143 | next = rdp->donelist = list->next; | ||
144 | list->func(list); | ||
145 | list = next; | ||
146 | if (++count >= maxbatch) | ||
147 | break; | ||
148 | } | ||
149 | if (!rdp->donelist) | ||
150 | rdp->donetail = &rdp->donelist; | ||
151 | else | ||
152 | tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Grace period handling: | ||
157 | * The grace period handling consists of two steps: | ||
158 | * - A new grace period is started. | ||
159 | * This is done by rcu_start_batch. The start is not broadcast to | ||
160 | * all cpus, they must pick this up by comparing rcp->cur with | ||
161 | * rdp->quiescbatch. All cpus are recorded in the | ||
162 | * rcu_state.cpumask bitmap. | ||
163 | * - All cpus must go through a quiescent state. | ||
164 | * Since the start of the grace period is not broadcast, at least two | ||
165 | * calls to rcu_check_quiescent_state are required: | ||
166 | * The first call just notices that a new grace period is running. The | ||
167 | * following calls check if there was a quiescent state since the beginning | ||
168 | * of the grace period. If so, it updates rcu_state.cpumask. If | ||
169 | * the bitmap is empty, then the grace period is completed. | ||
170 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | ||
171 | * period (if necessary). | ||
172 | */ | ||
173 | /* | ||
174 | * Register a new batch of callbacks, and start it up if there is currently no | ||
175 | * active batch and the batch to be registered has not already occurred. | ||
176 | * Caller must hold rcu_state.lock. | ||
177 | */ | ||
178 | static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, | ||
179 | int next_pending) | ||
180 | { | ||
181 | if (next_pending) | ||
182 | rcp->next_pending = 1; | ||
183 | |||
184 | if (rcp->next_pending && | ||
185 | rcp->completed == rcp->cur) { | ||
186 | /* Can't change, since spin lock held. */ | ||
187 | cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
188 | |||
189 | rcp->next_pending = 0; | ||
190 | /* next_pending == 0 must be visible in __rcu_process_callbacks() | ||
191 | * before it can see new value of cur. | ||
192 | */ | ||
193 | smp_wmb(); | ||
194 | rcp->cur++; | ||
195 | } | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * cpu went through a quiescent state since the beginning of the grace period. | ||
200 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
201 | * cpu. Start another grace period if someone has further entries pending | ||
202 | */ | ||
203 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) | ||
204 | { | ||
205 | cpu_clear(cpu, rsp->cpumask); | ||
206 | if (cpus_empty(rsp->cpumask)) { | ||
207 | /* batch completed ! */ | ||
208 | rcp->completed = rcp->cur; | ||
209 | rcu_start_batch(rcp, rsp, 0); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | /* | ||
214 | * Check if the cpu has gone through a quiescent state (say context | ||
215 | * switch). If so, and if it hasn't already done so in this RCU | ||
216 | * quiescent cycle, then indicate that it has done so. | ||
217 | */ | ||
218 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
219 | struct rcu_state *rsp, struct rcu_data *rdp) | ||
220 | { | ||
221 | if (rdp->quiescbatch != rcp->cur) { | ||
222 | /* start new grace period: */ | ||
223 | rdp->qs_pending = 1; | ||
224 | rdp->passed_quiesc = 0; | ||
225 | rdp->quiescbatch = rcp->cur; | ||
226 | return; | ||
227 | } | ||
228 | |||
229 | /* Grace period already completed for this cpu? | ||
230 | * qs_pending is checked instead of the actual bitmap to avoid | ||
231 | * cacheline thrashing. | ||
232 | */ | ||
233 | if (!rdp->qs_pending) | ||
234 | return; | ||
235 | |||
236 | /* | ||
237 | * Was there a quiescent state since the beginning of the grace | ||
238 | * period? If no, then exit and wait for the next call. | ||
239 | */ | ||
240 | if (!rdp->passed_quiesc) | ||
241 | return; | ||
242 | rdp->qs_pending = 0; | ||
243 | |||
244 | spin_lock(&rsp->lock); | ||
245 | /* | ||
246 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
247 | * during cpu startup. Ignore the quiescent state. | ||
248 | */ | ||
249 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
250 | cpu_quiet(rdp->cpu, rcp, rsp); | ||
251 | |||
252 | spin_unlock(&rsp->lock); | ||
253 | } | ||
254 | |||
255 | |||
256 | #ifdef CONFIG_HOTPLUG_CPU | ||
257 | |||
258 | /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing | ||
259 | * the locking requirements: the list it's pulling from has to belong to a cpu | ||
260 | * which is dead and hence not processing interrupts. | ||
261 | */ | ||
262 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
263 | struct rcu_head **tail) | ||
264 | { | ||
265 | local_irq_disable(); | ||
266 | *this_rdp->nxttail = list; | ||
267 | if (list) | ||
268 | this_rdp->nxttail = tail; | ||
269 | local_irq_enable(); | ||
270 | } | ||
271 | |||
272 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
273 | struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) | ||
274 | { | ||
275 | /* if the cpu going offline owns the grace period | ||
276 | * we can block indefinitely waiting for it, so flush | ||
277 | * it here | ||
278 | */ | ||
279 | spin_lock_bh(&rsp->lock); | ||
280 | if (rcp->cur != rcp->completed) | ||
281 | cpu_quiet(rdp->cpu, rcp, rsp); | ||
282 | spin_unlock_bh(&rsp->lock); | ||
283 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
284 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
285 | |||
286 | } | ||
287 | static void rcu_offline_cpu(int cpu) | ||
288 | { | ||
289 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
290 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
291 | |||
292 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, | ||
293 | &per_cpu(rcu_data, cpu)); | ||
294 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state, | ||
295 | &per_cpu(rcu_bh_data, cpu)); | ||
296 | put_cpu_var(rcu_data); | ||
297 | put_cpu_var(rcu_bh_data); | ||
298 | tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); | ||
299 | } | ||
300 | |||
301 | #else | ||
302 | |||
303 | static void rcu_offline_cpu(int cpu) | ||
304 | { | ||
305 | } | ||
306 | |||
307 | #endif | ||
308 | |||
309 | /* | ||
310 | * This does the RCU processing work from tasklet context. | ||
311 | */ | ||
312 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
313 | struct rcu_state *rsp, struct rcu_data *rdp) | ||
314 | { | ||
315 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
316 | *rdp->donetail = rdp->curlist; | ||
317 | rdp->donetail = rdp->curtail; | ||
318 | rdp->curlist = NULL; | ||
319 | rdp->curtail = &rdp->curlist; | ||
320 | } | ||
321 | |||
322 | local_irq_disable(); | ||
323 | if (rdp->nxtlist && !rdp->curlist) { | ||
324 | rdp->curlist = rdp->nxtlist; | ||
325 | rdp->curtail = rdp->nxttail; | ||
326 | rdp->nxtlist = NULL; | ||
327 | rdp->nxttail = &rdp->nxtlist; | ||
328 | local_irq_enable(); | ||
329 | |||
330 | /* | ||
331 | * start the next batch of callbacks | ||
332 | */ | ||
333 | |||
334 | /* determine batch number */ | ||
335 | rdp->batch = rcp->cur + 1; | ||
336 | /* see the comment and corresponding wmb() in | ||
337 | * the rcu_start_batch() | ||
338 | */ | ||
339 | smp_rmb(); | ||
340 | |||
341 | if (!rcp->next_pending) { | ||
342 | /* and start it/schedule start if it's a new batch */ | ||
343 | spin_lock(&rsp->lock); | ||
344 | rcu_start_batch(rcp, rsp, 1); | ||
345 | spin_unlock(&rsp->lock); | ||
346 | } | ||
347 | } else { | ||
348 | local_irq_enable(); | ||
349 | } | ||
350 | rcu_check_quiescent_state(rcp, rsp, rdp); | ||
351 | if (rdp->donelist) | ||
352 | rcu_do_batch(rdp); | ||
353 | } | ||
354 | |||
355 | static void rcu_process_callbacks(unsigned long unused) | ||
356 | { | ||
357 | __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, | ||
358 | &__get_cpu_var(rcu_data)); | ||
359 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state, | ||
360 | &__get_cpu_var(rcu_bh_data)); | ||
361 | } | ||
362 | |||
363 | void rcu_check_callbacks(int cpu, int user) | ||
364 | { | ||
365 | if (user || | ||
366 | (idle_cpu(cpu) && !in_softirq() && | ||
367 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
368 | rcu_qsctr_inc(cpu); | ||
369 | rcu_bh_qsctr_inc(cpu); | ||
370 | } else if (!in_softirq()) | ||
371 | rcu_bh_qsctr_inc(cpu); | ||
372 | tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); | ||
373 | } | ||
374 | |||
375 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
376 | struct rcu_data *rdp) | ||
377 | { | ||
378 | memset(rdp, 0, sizeof(*rdp)); | ||
379 | rdp->curtail = &rdp->curlist; | ||
380 | rdp->nxttail = &rdp->nxtlist; | ||
381 | rdp->donetail = &rdp->donelist; | ||
382 | rdp->quiescbatch = rcp->completed; | ||
383 | rdp->qs_pending = 0; | ||
384 | rdp->cpu = cpu; | ||
385 | } | ||
386 | |||
387 | static void __devinit rcu_online_cpu(int cpu) | ||
388 | { | ||
389 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
390 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
391 | |||
392 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
393 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
394 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | ||
395 | } | ||
396 | |||
397 | static int __devinit rcu_cpu_notify(struct notifier_block *self, | ||
398 | unsigned long action, void *hcpu) | ||
399 | { | ||
400 | long cpu = (long)hcpu; | ||
401 | switch (action) { | ||
402 | case CPU_UP_PREPARE: | ||
403 | rcu_online_cpu(cpu); | ||
404 | break; | ||
405 | case CPU_DEAD: | ||
406 | rcu_offline_cpu(cpu); | ||
407 | break; | ||
408 | default: | ||
409 | break; | ||
410 | } | ||
411 | return NOTIFY_OK; | ||
412 | } | ||
413 | |||
414 | static struct notifier_block __devinitdata rcu_nb = { | ||
415 | .notifier_call = rcu_cpu_notify, | ||
416 | }; | ||
417 | |||
418 | /* | ||
419 | * Initializes rcu mechanism. Assumed to be called early. | ||
420 | * That is, before the local timer (SMP) or jiffy timer (uniprocessor) is set up. | ||
421 | * Note that rcu_qsctr and friends are implicitly | ||
422 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
423 | */ | ||
424 | void __init rcu_init(void) | ||
425 | { | ||
426 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | ||
427 | (void *)(long)smp_processor_id()); | ||
428 | /* Register notifier for non-boot CPUs */ | ||
429 | register_cpu_notifier(&rcu_nb); | ||
430 | } | ||
431 | |||
432 | struct rcu_synchronize { | ||
433 | struct rcu_head head; | ||
434 | struct completion completion; | ||
435 | }; | ||
436 | |||
437 | /* Because of FASTCALL declaration of complete, we use this wrapper */ | ||
438 | static void wakeme_after_rcu(struct rcu_head *head) | ||
439 | { | ||
440 | struct rcu_synchronize *rcu; | ||
441 | |||
442 | rcu = container_of(head, struct rcu_synchronize, head); | ||
443 | complete(&rcu->completion); | ||
444 | } | ||
445 | |||
446 | /** | ||
447 | * synchronize_kernel - wait until a grace period has elapsed. | ||
448 | * | ||
449 | * Control will return to the caller some time after a full grace | ||
450 | * period has elapsed, in other words after all currently executing RCU | ||
451 | * read-side critical sections have completed. RCU read-side critical | ||
452 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
453 | * and may be nested. | ||
454 | */ | ||
455 | void synchronize_kernel(void) | ||
456 | { | ||
457 | struct rcu_synchronize rcu; | ||
458 | |||
459 | init_completion(&rcu.completion); | ||
460 | /* Will wake me after RCU finished */ | ||
461 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
462 | |||
463 | /* Wait for it */ | ||
464 | wait_for_completion(&rcu.completion); | ||
465 | } | ||
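/*
 * Illustrative sketch (not from this file): the blocking counterpart of
 * the call_rcu() pattern.  struct example_item, example_list and
 * example_list_lock are hypothetical; assumes <linux/list.h>,
 * <linux/spinlock.h> and <linux/slab.h>, with readers traversing
 * example_list under rcu_read_lock().  Because synchronize_kernel()
 * may sleep, this cannot be called from interrupt context or with a
 * spinlock held.
 */
struct example_item {
	struct list_head	list;
	int			value;
};

static LIST_HEAD(example_list);
static spinlock_t example_list_lock = SPIN_LOCK_UNLOCKED;

static void example_remove_item(struct example_item *item)
{
	spin_lock(&example_list_lock);
	list_del_rcu(&item->list);
	spin_unlock(&example_list_lock);

	/* Wait until every reader that might still see "item" is done. */
	synchronize_kernel();
	kfree(item);
}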
466 | |||
467 | module_param(maxbatch, int, 0); | ||
468 | EXPORT_SYMBOL_GPL(call_rcu); | ||
469 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
470 | EXPORT_SYMBOL_GPL(synchronize_kernel); | ||
diff --git a/kernel/resource.c b/kernel/resource.c new file mode 100644 index 000000000000..35c99ac02c7c --- /dev/null +++ b/kernel/resource.c | |||
@@ -0,0 +1,551 @@ | |||
1 | /* | ||
2 | * linux/kernel/resource.c | ||
3 | * | ||
4 | * Copyright (C) 1999 Linus Torvalds | ||
5 | * Copyright (C) 1999 Martin Mares <mj@ucw.cz> | ||
6 | * | ||
7 | * Arbitrary resource management. | ||
8 | */ | ||
9 | |||
10 | #include <linux/config.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/errno.h> | ||
14 | #include <linux/ioport.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/spinlock.h> | ||
18 | #include <linux/fs.h> | ||
19 | #include <linux/proc_fs.h> | ||
20 | #include <linux/seq_file.h> | ||
21 | #include <asm/io.h> | ||
22 | |||
23 | |||
24 | struct resource ioport_resource = { | ||
25 | .name = "PCI IO", | ||
26 | .start = 0x0000, | ||
27 | .end = IO_SPACE_LIMIT, | ||
28 | .flags = IORESOURCE_IO, | ||
29 | }; | ||
30 | |||
31 | EXPORT_SYMBOL(ioport_resource); | ||
32 | |||
33 | struct resource iomem_resource = { | ||
34 | .name = "PCI mem", | ||
35 | .start = 0UL, | ||
36 | .end = ~0UL, | ||
37 | .flags = IORESOURCE_MEM, | ||
38 | }; | ||
39 | |||
40 | EXPORT_SYMBOL(iomem_resource); | ||
41 | |||
42 | static DEFINE_RWLOCK(resource_lock); | ||
43 | |||
44 | #ifdef CONFIG_PROC_FS | ||
45 | |||
46 | enum { MAX_IORES_LEVEL = 5 }; | ||
47 | |||
48 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | ||
49 | { | ||
50 | struct resource *p = v; | ||
51 | (*pos)++; | ||
52 | if (p->child) | ||
53 | return p->child; | ||
54 | while (!p->sibling && p->parent) | ||
55 | p = p->parent; | ||
56 | return p->sibling; | ||
57 | } | ||
58 | |||
59 | static void *r_start(struct seq_file *m, loff_t *pos) | ||
60 | __acquires(resource_lock) | ||
61 | { | ||
62 | struct resource *p = m->private; | ||
63 | loff_t l = 0; | ||
64 | read_lock(&resource_lock); | ||
65 | for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) | ||
66 | ; | ||
67 | return p; | ||
68 | } | ||
69 | |||
70 | static void r_stop(struct seq_file *m, void *v) | ||
71 | __releases(resource_lock) | ||
72 | { | ||
73 | read_unlock(&resource_lock); | ||
74 | } | ||
75 | |||
76 | static int r_show(struct seq_file *m, void *v) | ||
77 | { | ||
78 | struct resource *root = m->private; | ||
79 | struct resource *r = v, *p; | ||
80 | int width = root->end < 0x10000 ? 4 : 8; | ||
81 | int depth; | ||
82 | |||
83 | for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) | ||
84 | if (p->parent == root) | ||
85 | break; | ||
86 | seq_printf(m, "%*s%0*lx-%0*lx : %s\n", | ||
87 | depth * 2, "", | ||
88 | width, r->start, | ||
89 | width, r->end, | ||
90 | r->name ? r->name : "<BAD>"); | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | static struct seq_operations resource_op = { | ||
95 | .start = r_start, | ||
96 | .next = r_next, | ||
97 | .stop = r_stop, | ||
98 | .show = r_show, | ||
99 | }; | ||
100 | |||
101 | static int ioports_open(struct inode *inode, struct file *file) | ||
102 | { | ||
103 | int res = seq_open(file, &resource_op); | ||
104 | if (!res) { | ||
105 | struct seq_file *m = file->private_data; | ||
106 | m->private = &ioport_resource; | ||
107 | } | ||
108 | return res; | ||
109 | } | ||
110 | |||
111 | static int iomem_open(struct inode *inode, struct file *file) | ||
112 | { | ||
113 | int res = seq_open(file, &resource_op); | ||
114 | if (!res) { | ||
115 | struct seq_file *m = file->private_data; | ||
116 | m->private = &iomem_resource; | ||
117 | } | ||
118 | return res; | ||
119 | } | ||
120 | |||
121 | static struct file_operations proc_ioports_operations = { | ||
122 | .open = ioports_open, | ||
123 | .read = seq_read, | ||
124 | .llseek = seq_lseek, | ||
125 | .release = seq_release, | ||
126 | }; | ||
127 | |||
128 | static struct file_operations proc_iomem_operations = { | ||
129 | .open = iomem_open, | ||
130 | .read = seq_read, | ||
131 | .llseek = seq_lseek, | ||
132 | .release = seq_release, | ||
133 | }; | ||
134 | |||
135 | static int __init ioresources_init(void) | ||
136 | { | ||
137 | struct proc_dir_entry *entry; | ||
138 | |||
139 | entry = create_proc_entry("ioports", 0, NULL); | ||
140 | if (entry) | ||
141 | entry->proc_fops = &proc_ioports_operations; | ||
142 | entry = create_proc_entry("iomem", 0, NULL); | ||
143 | if (entry) | ||
144 | entry->proc_fops = &proc_iomem_operations; | ||
145 | return 0; | ||
146 | } | ||
147 | __initcall(ioresources_init); | ||
148 | |||
149 | #endif /* CONFIG_PROC_FS */ | ||
150 | |||
151 | /* Return the conflict entry if you can't request it */ | ||
152 | static struct resource * __request_resource(struct resource *root, struct resource *new) | ||
153 | { | ||
154 | unsigned long start = new->start; | ||
155 | unsigned long end = new->end; | ||
156 | struct resource *tmp, **p; | ||
157 | |||
158 | if (end < start) | ||
159 | return root; | ||
160 | if (start < root->start) | ||
161 | return root; | ||
162 | if (end > root->end) | ||
163 | return root; | ||
164 | p = &root->child; | ||
165 | for (;;) { | ||
166 | tmp = *p; | ||
167 | if (!tmp || tmp->start > end) { | ||
168 | new->sibling = tmp; | ||
169 | *p = new; | ||
170 | new->parent = root; | ||
171 | return NULL; | ||
172 | } | ||
173 | p = &tmp->sibling; | ||
174 | if (tmp->end < start) | ||
175 | continue; | ||
176 | return tmp; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | static int __release_resource(struct resource *old) | ||
181 | { | ||
182 | struct resource *tmp, **p; | ||
183 | |||
184 | p = &old->parent->child; | ||
185 | for (;;) { | ||
186 | tmp = *p; | ||
187 | if (!tmp) | ||
188 | break; | ||
189 | if (tmp == old) { | ||
190 | *p = tmp->sibling; | ||
191 | old->parent = NULL; | ||
192 | return 0; | ||
193 | } | ||
194 | p = &tmp->sibling; | ||
195 | } | ||
196 | return -EINVAL; | ||
197 | } | ||
198 | |||
199 | int request_resource(struct resource *root, struct resource *new) | ||
200 | { | ||
201 | struct resource *conflict; | ||
202 | |||
203 | write_lock(&resource_lock); | ||
204 | conflict = __request_resource(root, new); | ||
205 | write_unlock(&resource_lock); | ||
206 | return conflict ? -EBUSY : 0; | ||
207 | } | ||
208 | |||
209 | EXPORT_SYMBOL(request_resource); | ||
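/*
 * Illustrative sketch (not from this file): a driver claiming a fixed
 * legacy I/O port range by hanging its own resource off
 * ioport_resource.  The name and port range are hypothetical.
 */
static struct resource example_port_resource = {
	.name	= "example-device",
	.start	= 0x300,
	.end	= 0x31f,
	.flags	= IORESOURCE_IO,
};

static int example_claim_ports(void)
{
	/* -EBUSY means some other resource already overlaps the range */
	return request_resource(&ioport_resource, &example_port_resource);
}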
210 | |||
211 | struct resource *____request_resource(struct resource *root, struct resource *new) | ||
212 | { | ||
213 | struct resource *conflict; | ||
214 | |||
215 | write_lock(&resource_lock); | ||
216 | conflict = __request_resource(root, new); | ||
217 | write_unlock(&resource_lock); | ||
218 | return conflict; | ||
219 | } | ||
220 | |||
221 | EXPORT_SYMBOL(____request_resource); | ||
222 | |||
223 | int release_resource(struct resource *old) | ||
224 | { | ||
225 | int retval; | ||
226 | |||
227 | write_lock(&resource_lock); | ||
228 | retval = __release_resource(old); | ||
229 | write_unlock(&resource_lock); | ||
230 | return retval; | ||
231 | } | ||
232 | |||
233 | EXPORT_SYMBOL(release_resource); | ||
234 | |||
235 | /* | ||
236 | * Find empty slot in the resource tree given range and alignment. | ||
237 | */ | ||
238 | static int find_resource(struct resource *root, struct resource *new, | ||
239 | unsigned long size, | ||
240 | unsigned long min, unsigned long max, | ||
241 | unsigned long align, | ||
242 | void (*alignf)(void *, struct resource *, | ||
243 | unsigned long, unsigned long), | ||
244 | void *alignf_data) | ||
245 | { | ||
246 | struct resource *this = root->child; | ||
247 | |||
248 | new->start = root->start; | ||
249 | /* | ||
250 | * Skip past an allocated resource that starts at 0, since the assignment | ||
251 | * of this->start - 1 to new->end below would cause an underflow. | ||
252 | */ | ||
253 | if (this && this->start == 0) { | ||
254 | new->start = this->end + 1; | ||
255 | this = this->sibling; | ||
256 | } | ||
257 | for(;;) { | ||
258 | if (this) | ||
259 | new->end = this->start - 1; | ||
260 | else | ||
261 | new->end = root->end; | ||
262 | if (new->start < min) | ||
263 | new->start = min; | ||
264 | if (new->end > max) | ||
265 | new->end = max; | ||
266 | new->start = (new->start + align - 1) & ~(align - 1); | ||
267 | if (alignf) | ||
268 | alignf(alignf_data, new, size, align); | ||
269 | if (new->start < new->end && new->end - new->start + 1 >= size) { | ||
270 | new->end = new->start + size - 1; | ||
271 | return 0; | ||
272 | } | ||
273 | if (!this) | ||
274 | break; | ||
275 | new->start = this->end + 1; | ||
276 | this = this->sibling; | ||
277 | } | ||
278 | return -EBUSY; | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * Allocate empty slot in the resource tree given range and alignment. | ||
283 | */ | ||
284 | int allocate_resource(struct resource *root, struct resource *new, | ||
285 | unsigned long size, | ||
286 | unsigned long min, unsigned long max, | ||
287 | unsigned long align, | ||
288 | void (*alignf)(void *, struct resource *, | ||
289 | unsigned long, unsigned long), | ||
290 | void *alignf_data) | ||
291 | { | ||
292 | int err; | ||
293 | |||
294 | write_lock(&resource_lock); | ||
295 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | ||
296 | if (err >= 0 && __request_resource(root, new)) | ||
297 | err = -EBUSY; | ||
298 | write_unlock(&resource_lock); | ||
299 | return err; | ||
300 | } | ||
301 | |||
302 | EXPORT_SYMBOL(allocate_resource); | ||
303 | |||
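A sketch of how a caller might ask for a dynamically placed window with allocate_resource(), matching the prototype above. The my_window node and all numeric values are invented; passing NULL for alignf/alignf_data simply skips the extra alignment callback inside find_resource():

    #include <linux/ioport.h>

    static struct resource my_window = {
    	.name	= "my-window",
    	.flags	= IORESOURCE_BUSY,
    };

    /* Find any free 4 KiB slot between 1 MiB and 16 MiB, 4 KiB aligned. */
    static int my_alloc_window(void)
    {
    	return allocate_resource(&iomem_resource, &my_window,
    				 0x1000,		/* size */
    				 0x100000, 0xffffff,	/* min, max */
    				 0x1000,		/* align */
    				 NULL, NULL);		/* alignf, alignf_data */
    }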
304 | /** | ||
305 | * insert_resource - Inserts a resource in the resource tree | ||
306 | * @parent: parent of the new resource | ||
307 | * @new: new resource to insert | ||
308 | * | ||
309 | * Returns 0 on success, -EBUSY if the resource can't be inserted. | ||
310 | * | ||
311 | * This function is equivalent to request_resource() when no conflict | ||
312 | * happens. If a conflict happens, and the conflicting resources | ||
313 | * entirely fit within the range of the new resource, then the new | ||
314 | * resource is inserted and the conflicting resources become children of | ||
315 | * the new resource. Otherwise the new resource becomes the child of | ||
316 | * the conflicting resource. | ||
317 | */ | ||
318 | int insert_resource(struct resource *parent, struct resource *new) | ||
319 | { | ||
320 | int result; | ||
321 | struct resource *first, *next; | ||
322 | |||
323 | write_lock(&resource_lock); | ||
324 | begin: | ||
325 | result = 0; | ||
326 | first = __request_resource(parent, new); | ||
327 | if (!first) | ||
328 | goto out; | ||
329 | |||
330 | result = -EBUSY; | ||
331 | if (first == parent) | ||
332 | goto out; | ||
333 | |||
334 | /* Resource fully contained by the clashing resource? Recurse into it */ | ||
335 | if (first->start <= new->start && first->end >= new->end) { | ||
336 | parent = first; | ||
337 | goto begin; | ||
338 | } | ||
339 | |||
340 | for (next = first; ; next = next->sibling) { | ||
341 | /* Partial overlap? Bad, and unfixable */ | ||
342 | if (next->start < new->start || next->end > new->end) | ||
343 | goto out; | ||
344 | if (!next->sibling) | ||
345 | break; | ||
346 | if (next->sibling->start > new->end) | ||
347 | break; | ||
348 | } | ||
349 | |||
350 | result = 0; | ||
351 | |||
352 | new->parent = parent; | ||
353 | new->sibling = next->sibling; | ||
354 | new->child = first; | ||
355 | |||
356 | next->sibling = NULL; | ||
357 | for (next = first; next; next = next->sibling) | ||
358 | next->parent = new; | ||
359 | |||
360 | if (parent->child == first) { | ||
361 | parent->child = new; | ||
362 | } else { | ||
363 | next = parent->child; | ||
364 | while (next->sibling != first) | ||
365 | next = next->sibling; | ||
366 | next->sibling = new; | ||
367 | } | ||
368 | |||
369 | out: | ||
370 | write_unlock(&resource_lock); | ||
371 | return result; | ||
372 | } | ||
373 | |||
374 | EXPORT_SYMBOL(insert_resource); | ||
375 | |||
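As a concrete illustration (addresses invented): inserting a new node spanning 0xa0000000-0xafffffff over two already-registered siblings at 0xa0000000-0xa0000fff and 0xa0010000-0xa0010fff succeeds and turns both siblings into children of the new node, whereas a sibling that ends beyond 0xafffffff is a partial overlap and the whole call returns -EBUSY. If instead the new range lies entirely inside one existing node, the loop at 'begin:' retries with that node as the parent, so the new resource simply becomes its child.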
376 | /* | ||
377 | * Given an existing resource, change its start and size to match the | ||
378 | * arguments. Returns -EBUSY if it can't fit. Existing children of | ||
379 | * the resource are assumed to be immutable. | ||
380 | */ | ||
381 | int adjust_resource(struct resource *res, unsigned long start, unsigned long size) | ||
382 | { | ||
383 | struct resource *tmp, *parent = res->parent; | ||
384 | unsigned long end = start + size - 1; | ||
385 | int result = -EBUSY; | ||
386 | |||
387 | write_lock(&resource_lock); | ||
388 | |||
389 | if ((start < parent->start) || (end > parent->end)) | ||
390 | goto out; | ||
391 | |||
392 | for (tmp = res->child; tmp; tmp = tmp->sibling) { | ||
393 | if ((tmp->start < start) || (tmp->end > end)) | ||
394 | goto out; | ||
395 | } | ||
396 | |||
397 | if (res->sibling && (res->sibling->start <= end)) | ||
398 | goto out; | ||
399 | |||
400 | tmp = parent->child; | ||
401 | if (tmp != res) { | ||
402 | while (tmp->sibling != res) | ||
403 | tmp = tmp->sibling; | ||
404 | if (start <= tmp->end) | ||
405 | goto out; | ||
406 | } | ||
407 | |||
408 | res->start = start; | ||
409 | res->end = end; | ||
410 | result = 0; | ||
411 | |||
412 | out: | ||
413 | write_unlock(&resource_lock); | ||
414 | return result; | ||
415 | } | ||
416 | |||
417 | EXPORT_SYMBOL(adjust_resource); | ||
418 | |||
419 | /* | ||
420 | * This is compatibility stuff for IO resources. | ||
421 | * | ||
422 | * Note how this, unlike the above, knows about | ||
423 | * the IO flag meanings (busy etc). | ||
424 | * | ||
425 | * Request-region creates a new busy region. | ||
426 | * | ||
427 | * Check-region returns non-zero if the area is already busy. | ||
428 | * | ||
429 | * Release-region releases a matching busy region. | ||
430 | */ | ||
431 | struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) | ||
432 | { | ||
433 | struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); | ||
434 | |||
435 | if (res) { | ||
436 | memset(res, 0, sizeof(*res)); | ||
437 | res->name = name; | ||
438 | res->start = start; | ||
439 | res->end = start + n - 1; | ||
440 | res->flags = IORESOURCE_BUSY; | ||
441 | |||
442 | write_lock(&resource_lock); | ||
443 | |||
444 | for (;;) { | ||
445 | struct resource *conflict; | ||
446 | |||
447 | conflict = __request_resource(parent, res); | ||
448 | if (!conflict) | ||
449 | break; | ||
450 | if (conflict != parent) { | ||
451 | parent = conflict; | ||
452 | if (!(conflict->flags & IORESOURCE_BUSY)) | ||
453 | continue; | ||
454 | } | ||
455 | |||
456 | /* Uhhuh, that didn't work out.. */ | ||
457 | kfree(res); | ||
458 | res = NULL; | ||
459 | break; | ||
460 | } | ||
461 | write_unlock(&resource_lock); | ||
462 | } | ||
463 | return res; | ||
464 | } | ||
465 | |||
466 | EXPORT_SYMBOL(__request_region); | ||
467 | |||
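The usual driver-facing pattern built on top of __request_region()/__release_region() looks roughly like the sketch below. request_region() and release_region() are the conventional <linux/ioport.h> wrappers that pass &ioport_resource as the parent; the mydev names and port numbers are invented:

    #include <linux/ioport.h>

    #define MYDEV_BASE	0x170	/* hypothetical I/O base */
    #define MYDEV_EXTENT	8

    static int mydev_probe(void)
    {
    	/* Claim a busy region; NULL means someone else owns the ports. */
    	if (!request_region(MYDEV_BASE, MYDEV_EXTENT, "mydev"))
    		return -EBUSY;

    	/* ... program the hardware ... */

    	release_region(MYDEV_BASE, MYDEV_EXTENT);
    	return 0;
    }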
468 | int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n) | ||
469 | { | ||
470 | struct resource * res; | ||
471 | |||
472 | res = __request_region(parent, start, n, "check-region"); | ||
473 | if (!res) | ||
474 | return -EBUSY; | ||
475 | |||
476 | release_resource(res); | ||
477 | kfree(res); | ||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | EXPORT_SYMBOL(__check_region); | ||
482 | |||
483 | void __release_region(struct resource *parent, unsigned long start, unsigned long n) | ||
484 | { | ||
485 | struct resource **p; | ||
486 | unsigned long end; | ||
487 | |||
488 | p = &parent->child; | ||
489 | end = start + n - 1; | ||
490 | |||
491 | write_lock(&resource_lock); | ||
492 | |||
493 | for (;;) { | ||
494 | struct resource *res = *p; | ||
495 | |||
496 | if (!res) | ||
497 | break; | ||
498 | if (res->start <= start && res->end >= end) { | ||
499 | if (!(res->flags & IORESOURCE_BUSY)) { | ||
500 | p = &res->child; | ||
501 | continue; | ||
502 | } | ||
503 | if (res->start != start || res->end != end) | ||
504 | break; | ||
505 | *p = res->sibling; | ||
506 | write_unlock(&resource_lock); | ||
507 | kfree(res); | ||
508 | return; | ||
509 | } | ||
510 | p = &res->sibling; | ||
511 | } | ||
512 | |||
513 | write_unlock(&resource_lock); | ||
514 | |||
515 | printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); | ||
516 | } | ||
517 | |||
518 | EXPORT_SYMBOL(__release_region); | ||
519 | |||
520 | /* | ||
521 | * Called from init/main.c to reserve IO ports. | ||
522 | */ | ||
523 | #define MAXRESERVE 4 | ||
524 | static int __init reserve_setup(char *str) | ||
525 | { | ||
526 | static int reserved; | ||
527 | static struct resource reserve[MAXRESERVE]; | ||
528 | |||
529 | for (;;) { | ||
530 | int io_start, io_num; | ||
531 | int x = reserved; | ||
532 | |||
533 | if (get_option (&str, &io_start) != 2) | ||
534 | break; | ||
535 | if (get_option (&str, &io_num) == 0) | ||
536 | break; | ||
537 | if (x < MAXRESERVE) { | ||
538 | struct resource *res = reserve + x; | ||
539 | res->name = "reserved"; | ||
540 | res->start = io_start; | ||
541 | res->end = io_start + io_num - 1; | ||
542 | res->flags = IORESOURCE_BUSY; | ||
543 | res->child = NULL; | ||
544 | if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) | ||
545 | reserved = x+1; | ||
546 | } | ||
547 | } | ||
548 | return 1; | ||
549 | } | ||
550 | |||
551 | __setup("reserve=", reserve_setup); | ||
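For example, booting with reserve=0x330,16 makes reserve_setup() register a busy "reserved" node covering I/O ports 0x330-0x33f under ioport_resource; a start value of 0x10000 or above is filed under iomem_resource instead, and at most MAXRESERVE (4) such start,length pairs are honoured.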
diff --git a/kernel/sched.c b/kernel/sched.c new file mode 100644 index 000000000000..f69c4a5361e3 --- /dev/null +++ b/kernel/sched.c | |||
@@ -0,0 +1,5004 @@ | |||
1 | /* | ||
2 | * kernel/sched.c | ||
3 | * | ||
4 | * Kernel scheduler and related syscalls | ||
5 | * | ||
6 | * Copyright (C) 1991-2002 Linus Torvalds | ||
7 | * | ||
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | ||
9 | * make semaphores SMP safe | ||
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | ||
11 | * by Andrea Arcangeli | ||
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | ||
13 | * hybrid priority-list and round-robin design with | ||
14 | * an array-switch method of distributing timeslices | ||
15 | * and per-CPU runqueues. Cleanups and useful suggestions | ||
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | ||
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | ||
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | ||
19 | */ | ||
20 | |||
21 | #include <linux/mm.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/nmi.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <asm/uaccess.h> | ||
26 | #include <linux/highmem.h> | ||
27 | #include <linux/smp_lock.h> | ||
28 | #include <asm/mmu_context.h> | ||
29 | #include <linux/interrupt.h> | ||
30 | #include <linux/completion.h> | ||
31 | #include <linux/kernel_stat.h> | ||
32 | #include <linux/security.h> | ||
33 | #include <linux/notifier.h> | ||
34 | #include <linux/profile.h> | ||
35 | #include <linux/suspend.h> | ||
36 | #include <linux/blkdev.h> | ||
37 | #include <linux/delay.h> | ||
38 | #include <linux/smp.h> | ||
39 | #include <linux/threads.h> | ||
40 | #include <linux/timer.h> | ||
41 | #include <linux/rcupdate.h> | ||
42 | #include <linux/cpu.h> | ||
43 | #include <linux/cpuset.h> | ||
44 | #include <linux/percpu.h> | ||
45 | #include <linux/kthread.h> | ||
46 | #include <linux/seq_file.h> | ||
47 | #include <linux/syscalls.h> | ||
48 | #include <linux/times.h> | ||
49 | #include <linux/acct.h> | ||
50 | #include <asm/tlb.h> | ||
51 | |||
52 | #include <asm/unistd.h> | ||
53 | |||
54 | /* | ||
55 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
56 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
57 | * and back. | ||
58 | */ | ||
59 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
60 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
61 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
62 | |||
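A quick sanity check of the mapping, assuming the conventional MAX_RT_PRIO == 100 and MAX_PRIO == 140 from this kernel's headers: NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120 and NICE_TO_PRIO(19) == 139, so static priorities occupy [MAX_RT_PRIO, MAX_PRIO-1] and PRIO_TO_NICE() simply undoes the shift.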
63 | /* | ||
64 | * 'User priority' is the nice value converted to something we | ||
65 | * can work with better when scaling various scheduler parameters; | ||
66 | * it's a [ 0 ... 39 ] range. | ||
67 | */ | ||
68 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
69 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
70 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
71 | |||
72 | /* | ||
73 | * Some helpers for converting nanosecond timing to jiffy resolution | ||
74 | */ | ||
75 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | ||
76 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | ||
77 | |||
78 | /* | ||
79 | * These are the 'tuning knobs' of the scheduler: | ||
80 | * | ||
81 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | ||
82 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | ||
83 | * Timeslices get refilled after they expire. | ||
84 | */ | ||
85 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | ||
86 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
87 | #define ON_RUNQUEUE_WEIGHT 30 | ||
88 | #define CHILD_PENALTY 95 | ||
89 | #define PARENT_PENALTY 100 | ||
90 | #define EXIT_WEIGHT 3 | ||
91 | #define PRIO_BONUS_RATIO 25 | ||
92 | #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) | ||
93 | #define INTERACTIVE_DELTA 2 | ||
94 | #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) | ||
95 | #define STARVATION_LIMIT (MAX_SLEEP_AVG) | ||
96 | #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) | ||
97 | |||
98 | /* | ||
99 | * If a task is 'interactive' then we reinsert it in the active | ||
100 | * array after it has expired its current timeslice. (it will not | ||
101 | * continue to run immediately, it will still round-robin with | ||
102 | * other interactive tasks.) | ||
103 | * | ||
104 | * This part scales the interactivity limit depending on niceness. | ||
105 | * | ||
106 | * We scale it linearly, offset by the INTERACTIVE_DELTA delta. | ||
107 | * Here are a few examples of different nice levels: | ||
108 | * | ||
109 | * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] | ||
110 | * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] | ||
111 | * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] | ||
112 | * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] | ||
113 | * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] | ||
114 | * | ||
115 | * (the X axis represents the possible -5 ... 0 ... +5 dynamic | ||
116 | * priority range a task can explore, a value of '1' means the | ||
117 | * task is rated interactive.) | ||
118 | * | ||
119 | * I.e. nice +19 tasks can never get 'interactive' enough to be | ||
120 | * reinserted into the active array. And only heavy CPU-hog nice -20 | ||
121 | * tasks will be expired. Default nice 0 tasks are somewhere in between; | ||
122 | * it takes some effort for them to get interactive, but it's not | ||
123 | * too hard. | ||
124 | */ | ||
125 | |||
126 | #define CURRENT_BONUS(p) \ | ||
127 | (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ | ||
128 | MAX_SLEEP_AVG) | ||
129 | |||
130 | #define GRANULARITY (10 * HZ / 1000 ? : 1) | ||
131 | |||
132 | #ifdef CONFIG_SMP | ||
133 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
134 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ | ||
135 | num_online_cpus()) | ||
136 | #else | ||
137 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | ||
138 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) | ||
139 | #endif | ||
140 | |||
141 | #define SCALE(v1,v1_max,v2_max) \ | ||
142 | (v1) * (v2_max) / (v1_max) | ||
143 | |||
144 | #define DELTA(p) \ | ||
145 | (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) | ||
146 | |||
147 | #define TASK_INTERACTIVE(p) \ | ||
148 | ((p)->prio <= (p)->static_prio - DELTA(p)) | ||
149 | |||
150 | #define INTERACTIVE_SLEEP(p) \ | ||
151 | (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ | ||
152 | (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) | ||
153 | |||
154 | #define TASK_PREEMPTS_CURR(p, rq) \ | ||
155 | ((p)->prio < (rq)->curr->prio) | ||
156 | |||
157 | /* | ||
158 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | ||
159 | * to time slice values: [800ms ... 100ms ... 5ms] | ||
160 | * | ||
161 | * The higher a thread's priority, the bigger timeslices | ||
162 | * it gets during one round of execution. But even the lowest | ||
163 | * priority thread gets MIN_TIMESLICE worth of execution time. | ||
164 | */ | ||
165 | |||
166 | #define SCALE_PRIO(x, prio) \ | ||
167 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | ||
168 | |||
169 | static inline unsigned int task_timeslice(task_t *p) | ||
170 | { | ||
171 | if (p->static_prio < NICE_TO_PRIO(0)) | ||
172 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | ||
173 | else | ||
174 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | ||
175 | } | ||
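Worked through with HZ == 1000 and the header values assumed above (MAX_PRIO == 140, MAX_USER_PRIO == 40, so MIN_TIMESLICE == 5 and DEF_TIMESLICE == 100 jiffies): a nice -20 task (static_prio 100) gets SCALE_PRIO(400, 100) == 800 ms, a nice 0 task (static_prio 120) gets SCALE_PRIO(100, 120) == 100 ms, and a nice +19 task (static_prio 139) bottoms out at SCALE_PRIO(100, 139) == 5 ms, matching the [800ms ... 100ms ... 5ms] range quoted in the comment.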
176 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | ||
177 | < (long long) (sd)->cache_hot_time) | ||
178 | |||
179 | /* | ||
180 | * These are the runqueue data structures: | ||
181 | */ | ||
182 | |||
183 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
184 | |||
185 | typedef struct runqueue runqueue_t; | ||
186 | |||
187 | struct prio_array { | ||
188 | unsigned int nr_active; | ||
189 | unsigned long bitmap[BITMAP_SIZE]; | ||
190 | struct list_head queue[MAX_PRIO]; | ||
191 | }; | ||
192 | |||
193 | /* | ||
194 | * This is the main, per-CPU runqueue data structure. | ||
195 | * | ||
196 | * Locking rule: in places that need to lock multiple runqueues | ||
197 | * (such as the load balancing or the thread migration code), the lock | ||
198 | * acquire operations must be ordered by ascending runqueue address (&runqueue). | ||
199 | */ | ||
200 | struct runqueue { | ||
201 | spinlock_t lock; | ||
202 | |||
203 | /* | ||
204 | * nr_running and cpu_load should be in the same cacheline because | ||
205 | * remote CPUs use both these fields when doing load calculation. | ||
206 | */ | ||
207 | unsigned long nr_running; | ||
208 | #ifdef CONFIG_SMP | ||
209 | unsigned long cpu_load; | ||
210 | #endif | ||
211 | unsigned long long nr_switches; | ||
212 | |||
213 | /* | ||
214 | * This is part of a global counter where only the total sum | ||
215 | * over all CPUs matters. A task can increase this counter on | ||
216 | * one CPU and if it got migrated afterwards it may decrease | ||
217 | * it on another CPU. Always updated under the runqueue lock: | ||
218 | */ | ||
219 | unsigned long nr_uninterruptible; | ||
220 | |||
221 | unsigned long expired_timestamp; | ||
222 | unsigned long long timestamp_last_tick; | ||
223 | task_t *curr, *idle; | ||
224 | struct mm_struct *prev_mm; | ||
225 | prio_array_t *active, *expired, arrays[2]; | ||
226 | int best_expired_prio; | ||
227 | atomic_t nr_iowait; | ||
228 | |||
229 | #ifdef CONFIG_SMP | ||
230 | struct sched_domain *sd; | ||
231 | |||
232 | /* For active balancing */ | ||
233 | int active_balance; | ||
234 | int push_cpu; | ||
235 | |||
236 | task_t *migration_thread; | ||
237 | struct list_head migration_queue; | ||
238 | #endif | ||
239 | |||
240 | #ifdef CONFIG_SCHEDSTATS | ||
241 | /* latency stats */ | ||
242 | struct sched_info rq_sched_info; | ||
243 | |||
244 | /* sys_sched_yield() stats */ | ||
245 | unsigned long yld_exp_empty; | ||
246 | unsigned long yld_act_empty; | ||
247 | unsigned long yld_both_empty; | ||
248 | unsigned long yld_cnt; | ||
249 | |||
250 | /* schedule() stats */ | ||
251 | unsigned long sched_switch; | ||
252 | unsigned long sched_cnt; | ||
253 | unsigned long sched_goidle; | ||
254 | |||
255 | /* try_to_wake_up() stats */ | ||
256 | unsigned long ttwu_cnt; | ||
257 | unsigned long ttwu_local; | ||
258 | #endif | ||
259 | }; | ||
260 | |||
261 | static DEFINE_PER_CPU(struct runqueue, runqueues); | ||
262 | |||
263 | #define for_each_domain(cpu, domain) \ | ||
264 | for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) | ||
265 | |||
266 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
267 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
268 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
269 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
270 | |||
271 | /* | ||
272 | * Default context-switch locking: | ||
273 | */ | ||
274 | #ifndef prepare_arch_switch | ||
275 | # define prepare_arch_switch(rq, next) do { } while (0) | ||
276 | # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) | ||
277 | # define task_running(rq, p) ((rq)->curr == (p)) | ||
278 | #endif | ||
279 | |||
280 | /* | ||
281 | * task_rq_lock - lock the runqueue a given task resides on and disable | ||
282 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
283 | * explicitly disabling preemption. | ||
284 | */ | ||
285 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | ||
286 | __acquires(rq->lock) | ||
287 | { | ||
288 | struct runqueue *rq; | ||
289 | |||
290 | repeat_lock_task: | ||
291 | local_irq_save(*flags); | ||
292 | rq = task_rq(p); | ||
293 | spin_lock(&rq->lock); | ||
294 | if (unlikely(rq != task_rq(p))) { | ||
295 | spin_unlock_irqrestore(&rq->lock, *flags); | ||
296 | goto repeat_lock_task; | ||
297 | } | ||
298 | return rq; | ||
299 | } | ||
300 | |||
301 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | ||
302 | __releases(rq->lock) | ||
303 | { | ||
304 | spin_unlock_irqrestore(&rq->lock, *flags); | ||
305 | } | ||
306 | |||
307 | #ifdef CONFIG_SCHEDSTATS | ||
308 | /* | ||
309 | * bump this up when changing the output format or the meaning of an existing | ||
310 | * format, so that tools can adapt (or abort) | ||
311 | */ | ||
312 | #define SCHEDSTAT_VERSION 11 | ||
313 | |||
314 | static int show_schedstat(struct seq_file *seq, void *v) | ||
315 | { | ||
316 | int cpu; | ||
317 | |||
318 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
319 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
320 | for_each_online_cpu(cpu) { | ||
321 | runqueue_t *rq = cpu_rq(cpu); | ||
322 | #ifdef CONFIG_SMP | ||
323 | struct sched_domain *sd; | ||
324 | int dcnt = 0; | ||
325 | #endif | ||
326 | |||
327 | /* runqueue-specific stats */ | ||
328 | seq_printf(seq, | ||
329 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", | ||
330 | cpu, rq->yld_both_empty, | ||
331 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, | ||
332 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, | ||
333 | rq->ttwu_cnt, rq->ttwu_local, | ||
334 | rq->rq_sched_info.cpu_time, | ||
335 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); | ||
336 | |||
337 | seq_printf(seq, "\n"); | ||
338 | |||
339 | #ifdef CONFIG_SMP | ||
340 | /* domain-specific stats */ | ||
341 | for_each_domain(cpu, sd) { | ||
342 | enum idle_type itype; | ||
343 | char mask_str[NR_CPUS]; | ||
344 | |||
345 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | ||
346 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | ||
347 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; | ||
348 | itype++) { | ||
349 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", | ||
350 | sd->lb_cnt[itype], | ||
351 | sd->lb_balanced[itype], | ||
352 | sd->lb_failed[itype], | ||
353 | sd->lb_imbalance[itype], | ||
354 | sd->lb_gained[itype], | ||
355 | sd->lb_hot_gained[itype], | ||
356 | sd->lb_nobusyq[itype], | ||
357 | sd->lb_nobusyg[itype]); | ||
358 | } | ||
359 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", | ||
360 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | ||
361 | sd->sbe_pushed, sd->sbe_attempts, | ||
362 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | ||
363 | } | ||
364 | #endif | ||
365 | } | ||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | static int schedstat_open(struct inode *inode, struct file *file) | ||
370 | { | ||
371 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
372 | char *buf = kmalloc(size, GFP_KERNEL); | ||
373 | struct seq_file *m; | ||
374 | int res; | ||
375 | |||
376 | if (!buf) | ||
377 | return -ENOMEM; | ||
378 | res = single_open(file, show_schedstat, NULL); | ||
379 | if (!res) { | ||
380 | m = file->private_data; | ||
381 | m->buf = buf; | ||
382 | m->size = size; | ||
383 | } else | ||
384 | kfree(buf); | ||
385 | return res; | ||
386 | } | ||
387 | |||
388 | struct file_operations proc_schedstat_operations = { | ||
389 | .open = schedstat_open, | ||
390 | .read = seq_read, | ||
391 | .llseek = seq_lseek, | ||
392 | .release = single_release, | ||
393 | }; | ||
394 | |||
395 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | ||
396 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | ||
397 | #else /* !CONFIG_SCHEDSTATS */ | ||
398 | # define schedstat_inc(rq, field) do { } while (0) | ||
399 | # define schedstat_add(rq, field, amt) do { } while (0) | ||
400 | #endif | ||
401 | |||
402 | /* | ||
403 | * this_rq_lock - lock this CPU's runqueue and disable interrupts. | ||
404 | */ | ||
405 | static inline runqueue_t *this_rq_lock(void) | ||
406 | __acquires(rq->lock) | ||
407 | { | ||
408 | runqueue_t *rq; | ||
409 | |||
410 | local_irq_disable(); | ||
411 | rq = this_rq(); | ||
412 | spin_lock(&rq->lock); | ||
413 | |||
414 | return rq; | ||
415 | } | ||
416 | |||
417 | #ifdef CONFIG_SCHED_SMT | ||
418 | static int cpu_and_siblings_are_idle(int cpu) | ||
419 | { | ||
420 | int sib; | ||
421 | for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { | ||
422 | if (idle_cpu(sib)) | ||
423 | continue; | ||
424 | return 0; | ||
425 | } | ||
426 | |||
427 | return 1; | ||
428 | } | ||
429 | #else | ||
430 | #define cpu_and_siblings_are_idle(A) idle_cpu(A) | ||
431 | #endif | ||
432 | |||
433 | #ifdef CONFIG_SCHEDSTATS | ||
434 | /* | ||
435 | * Called when a process is dequeued from the active array and given | ||
436 | * the cpu. We should note that with the exception of interactive | ||
437 | * tasks, the expired queue will become the active queue after the active | ||
438 | * queue is empty, without explicitly dequeuing and requeuing tasks in the | ||
439 | * expired queue. (Interactive tasks may be requeued directly to the | ||
440 | * active queue, thus delaying tasks in the expired queue from running; | ||
441 | * see scheduler_tick()). | ||
442 | * | ||
443 | * This function is only called from sched_info_arrive(), rather than | ||
444 | * dequeue_task(). Even though a task may be queued and dequeued multiple | ||
445 | * times as it is shuffled about, we're really interested in knowing how | ||
446 | * long it was from the *first* time it was queued to the time that it | ||
447 | * finally hit a cpu. | ||
448 | */ | ||
449 | static inline void sched_info_dequeued(task_t *t) | ||
450 | { | ||
451 | t->sched_info.last_queued = 0; | ||
452 | } | ||
453 | |||
454 | /* | ||
455 | * Called when a task finally hits the cpu. We can now calculate how | ||
456 | * long it was waiting to run. We also note when it began so that we | ||
457 | * can keep stats on how long its timeslice is. | ||
458 | */ | ||
459 | static inline void sched_info_arrive(task_t *t) | ||
460 | { | ||
461 | unsigned long now = jiffies, diff = 0; | ||
462 | struct runqueue *rq = task_rq(t); | ||
463 | |||
464 | if (t->sched_info.last_queued) | ||
465 | diff = now - t->sched_info.last_queued; | ||
466 | sched_info_dequeued(t); | ||
467 | t->sched_info.run_delay += diff; | ||
468 | t->sched_info.last_arrival = now; | ||
469 | t->sched_info.pcnt++; | ||
470 | |||
471 | if (!rq) | ||
472 | return; | ||
473 | |||
474 | rq->rq_sched_info.run_delay += diff; | ||
475 | rq->rq_sched_info.pcnt++; | ||
476 | } | ||
477 | |||
478 | /* | ||
479 | * Called when a process is queued into either the active or expired | ||
480 | * array. The time is noted and later used to determine how long the | ||
481 | * task had to wait before reaching the cpu. Since the expired queue will | ||
482 | * become the active queue after active queue is empty, without dequeuing | ||
483 | * and requeuing any tasks, we are interested in queuing to either. It | ||
484 | * is unusual but not impossible for tasks to be dequeued and immediately | ||
485 | * requeued in the same or another array: this can happen in sched_yield(), | ||
486 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | ||
487 | * to runqueue. | ||
488 | * | ||
489 | * This function is only called from enqueue_task(), but also only updates | ||
490 | * the timestamp if it is not already set. It's assumed that | ||
491 | * sched_info_dequeued() will clear that stamp when appropriate. | ||
492 | */ | ||
493 | static inline void sched_info_queued(task_t *t) | ||
494 | { | ||
495 | if (!t->sched_info.last_queued) | ||
496 | t->sched_info.last_queued = jiffies; | ||
497 | } | ||
498 | |||
499 | /* | ||
500 | * Called when a process ceases being the active-running process, either | ||
501 | * voluntarily or involuntarily. Now we can calculate how long we ran. | ||
502 | */ | ||
503 | static inline void sched_info_depart(task_t *t) | ||
504 | { | ||
505 | struct runqueue *rq = task_rq(t); | ||
506 | unsigned long diff = jiffies - t->sched_info.last_arrival; | ||
507 | |||
508 | t->sched_info.cpu_time += diff; | ||
509 | |||
510 | if (rq) | ||
511 | rq->rq_sched_info.cpu_time += diff; | ||
512 | } | ||
513 | |||
514 | /* | ||
515 | * Called when tasks are switched involuntarily, typically due to expiring | ||
516 | * their time slice. (This may also be called when switching to or from | ||
517 | * the idle task.) We are only called when prev != next. | ||
518 | */ | ||
519 | static inline void sched_info_switch(task_t *prev, task_t *next) | ||
520 | { | ||
521 | struct runqueue *rq = task_rq(prev); | ||
522 | |||
523 | /* | ||
524 | * prev now departs the cpu. It's not interesting to record | ||
525 | * stats about how efficient we were at scheduling the idle | ||
526 | * process, however. | ||
527 | */ | ||
528 | if (prev != rq->idle) | ||
529 | sched_info_depart(prev); | ||
530 | |||
531 | if (next != rq->idle) | ||
532 | sched_info_arrive(next); | ||
533 | } | ||
534 | #else | ||
535 | #define sched_info_queued(t) do { } while (0) | ||
536 | #define sched_info_switch(t, next) do { } while (0) | ||
537 | #endif /* CONFIG_SCHEDSTATS */ | ||
538 | |||
539 | /* | ||
540 | * Adding/removing a task to/from a priority array: | ||
541 | */ | ||
542 | static void dequeue_task(struct task_struct *p, prio_array_t *array) | ||
543 | { | ||
544 | array->nr_active--; | ||
545 | list_del(&p->run_list); | ||
546 | if (list_empty(array->queue + p->prio)) | ||
547 | __clear_bit(p->prio, array->bitmap); | ||
548 | } | ||
549 | |||
550 | static void enqueue_task(struct task_struct *p, prio_array_t *array) | ||
551 | { | ||
552 | sched_info_queued(p); | ||
553 | list_add_tail(&p->run_list, array->queue + p->prio); | ||
554 | __set_bit(p->prio, array->bitmap); | ||
555 | array->nr_active++; | ||
556 | p->array = array; | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * Put task to the end of the run list without the overhead of dequeue | ||
561 | * followed by enqueue. | ||
562 | */ | ||
563 | static void requeue_task(struct task_struct *p, prio_array_t *array) | ||
564 | { | ||
565 | list_move_tail(&p->run_list, array->queue + p->prio); | ||
566 | } | ||
567 | |||
568 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | ||
569 | { | ||
570 | list_add(&p->run_list, array->queue + p->prio); | ||
571 | __set_bit(p->prio, array->bitmap); | ||
572 | array->nr_active++; | ||
573 | p->array = array; | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * effective_prio - return the priority that is based on the static | ||
578 | * priority but is modified by bonuses/penalties. | ||
579 | * | ||
580 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | ||
581 | * into the -5 ... 0 ... +5 bonus/penalty range. | ||
582 | * | ||
583 | * We use 25% of the full 0...39 priority range so that: | ||
584 | * | ||
585 | * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | ||
586 | * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | ||
587 | * | ||
588 | * Both properties are important to certain workloads. | ||
589 | */ | ||
590 | static int effective_prio(task_t *p) | ||
591 | { | ||
592 | int bonus, prio; | ||
593 | |||
594 | if (rt_task(p)) | ||
595 | return p->prio; | ||
596 | |||
597 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | ||
598 | |||
599 | prio = p->static_prio - bonus; | ||
600 | if (prio < MAX_RT_PRIO) | ||
601 | prio = MAX_RT_PRIO; | ||
602 | if (prio > MAX_PRIO-1) | ||
603 | prio = MAX_PRIO-1; | ||
604 | return prio; | ||
605 | } | ||
606 | |||
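With the assumed MAX_USER_PRIO == 40, MAX_BONUS works out to 10, so CURRENT_BONUS(p) ranges over [0, 10] and the bonus applied here over [-5, +5]: a nice 0 task (static_prio 120) that has banked a full sleep average runs at dynamic priority 115, the same task with an empty sleep average is penalised to 125, and the result is always clamped to the [MAX_RT_PRIO, MAX_PRIO-1] window.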
607 | /* | ||
608 | * __activate_task - move a task to the runqueue. | ||
609 | */ | ||
610 | static inline void __activate_task(task_t *p, runqueue_t *rq) | ||
611 | { | ||
612 | enqueue_task(p, rq->active); | ||
613 | rq->nr_running++; | ||
614 | } | ||
615 | |||
616 | /* | ||
617 | * __activate_idle_task - move idle task to the _front_ of runqueue. | ||
618 | */ | ||
619 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | ||
620 | { | ||
621 | enqueue_task_head(p, rq->active); | ||
622 | rq->nr_running++; | ||
623 | } | ||
624 | |||
625 | static void recalc_task_prio(task_t *p, unsigned long long now) | ||
626 | { | ||
627 | /* Caller must always ensure 'now >= p->timestamp' */ | ||
628 | unsigned long long __sleep_time = now - p->timestamp; | ||
629 | unsigned long sleep_time; | ||
630 | |||
631 | if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
632 | sleep_time = NS_MAX_SLEEP_AVG; | ||
633 | else | ||
634 | sleep_time = (unsigned long)__sleep_time; | ||
635 | |||
636 | if (likely(sleep_time > 0)) { | ||
637 | /* | ||
638 | * User tasks that sleep a long time are categorised as | ||
639 | * idle and get just interactive status, enough to stay active and | ||
640 | * to prevent them from suddenly becoming cpu hogs and starving | ||
641 | * other processes. | ||
642 | */ | ||
643 | if (p->mm && p->activated != -1 && | ||
644 | sleep_time > INTERACTIVE_SLEEP(p)) { | ||
645 | p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - | ||
646 | DEF_TIMESLICE); | ||
647 | } else { | ||
648 | /* | ||
649 | * The lower the sleep avg a task has the more | ||
650 | * rapidly it will rise with sleep time. | ||
651 | */ | ||
652 | sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; | ||
653 | |||
654 | /* | ||
655 | * Tasks waking from uninterruptible sleep are | ||
656 | * limited in their sleep_avg rise as they | ||
658 | * are likely to be waiting on I/O. | ||
658 | */ | ||
659 | if (p->activated == -1 && p->mm) { | ||
660 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | ||
661 | sleep_time = 0; | ||
662 | else if (p->sleep_avg + sleep_time >= | ||
663 | INTERACTIVE_SLEEP(p)) { | ||
664 | p->sleep_avg = INTERACTIVE_SLEEP(p); | ||
665 | sleep_time = 0; | ||
666 | } | ||
667 | } | ||
668 | |||
669 | /* | ||
670 | * This code gives a bonus to interactive tasks. | ||
671 | * | ||
672 | * The boost works by updating the 'average sleep time' | ||
673 | * value here, based on ->timestamp. The more time a | ||
674 | * task spends sleeping, the higher the average gets - | ||
675 | * and the higher the priority boost gets as well. | ||
676 | */ | ||
677 | p->sleep_avg += sleep_time; | ||
678 | |||
679 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
680 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
681 | } | ||
682 | } | ||
683 | |||
684 | p->prio = effective_prio(p); | ||
685 | } | ||
686 | |||
687 | /* | ||
688 | * activate_task - move a task to the runqueue and do priority recalculation | ||
689 | * | ||
690 | * Update all the scheduling statistics stuff. (sleep average | ||
691 | * calculation, priority modifiers, etc.) | ||
692 | */ | ||
693 | static void activate_task(task_t *p, runqueue_t *rq, int local) | ||
694 | { | ||
695 | unsigned long long now; | ||
696 | |||
697 | now = sched_clock(); | ||
698 | #ifdef CONFIG_SMP | ||
699 | if (!local) { | ||
700 | /* Compensate for drifting sched_clock */ | ||
701 | runqueue_t *this_rq = this_rq(); | ||
702 | now = (now - this_rq->timestamp_last_tick) | ||
703 | + rq->timestamp_last_tick; | ||
704 | } | ||
705 | #endif | ||
706 | |||
707 | recalc_task_prio(p, now); | ||
708 | |||
709 | /* | ||
710 | * This checks to make sure it's not an uninterruptible task | ||
711 | * that is now waking up. | ||
712 | */ | ||
713 | if (!p->activated) { | ||
714 | /* | ||
715 | * Tasks which were woken up by interrupts (i.e. hw events) | ||
716 | * are most likely of interactive nature. So we give them | ||
717 | * the credit of extending their sleep time to the period | ||
718 | * of time they spend on the runqueue, waiting for execution | ||
719 | * on a CPU, first time around: | ||
720 | */ | ||
721 | if (in_interrupt()) | ||
722 | p->activated = 2; | ||
723 | else { | ||
724 | /* | ||
725 | * Normal first-time wakeups get a credit too for | ||
726 | * on-runqueue time, but it will be weighted down: | ||
727 | */ | ||
728 | p->activated = 1; | ||
729 | } | ||
730 | } | ||
731 | p->timestamp = now; | ||
732 | |||
733 | __activate_task(p, rq); | ||
734 | } | ||
735 | |||
736 | /* | ||
737 | * deactivate_task - remove a task from the runqueue. | ||
738 | */ | ||
739 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | ||
740 | { | ||
741 | rq->nr_running--; | ||
742 | dequeue_task(p, p->array); | ||
743 | p->array = NULL; | ||
744 | } | ||
745 | |||
746 | /* | ||
747 | * resched_task - mark a task 'to be rescheduled now'. | ||
748 | * | ||
749 | * On UP this means the setting of the need_resched flag, on SMP it | ||
750 | * might also involve a cross-CPU call to trigger the scheduler on | ||
751 | * the target CPU. | ||
752 | */ | ||
753 | #ifdef CONFIG_SMP | ||
754 | static void resched_task(task_t *p) | ||
755 | { | ||
756 | int need_resched, nrpolling; | ||
757 | |||
758 | assert_spin_locked(&task_rq(p)->lock); | ||
759 | |||
760 | /* minimise the chance of sending an interrupt to poll_idle() */ | ||
761 | nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); | ||
762 | need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); | ||
763 | nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); | ||
764 | |||
765 | if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) | ||
766 | smp_send_reschedule(task_cpu(p)); | ||
767 | } | ||
768 | #else | ||
769 | static inline void resched_task(task_t *p) | ||
770 | { | ||
771 | set_tsk_need_resched(p); | ||
772 | } | ||
773 | #endif | ||
774 | |||
775 | /** | ||
776 | * task_curr - is this task currently executing on a CPU? | ||
777 | * @p: the task in question. | ||
778 | */ | ||
779 | inline int task_curr(const task_t *p) | ||
780 | { | ||
781 | return cpu_curr(task_cpu(p)) == p; | ||
782 | } | ||
783 | |||
784 | #ifdef CONFIG_SMP | ||
785 | enum request_type { | ||
786 | REQ_MOVE_TASK, | ||
787 | REQ_SET_DOMAIN, | ||
788 | }; | ||
789 | |||
790 | typedef struct { | ||
791 | struct list_head list; | ||
792 | enum request_type type; | ||
793 | |||
794 | /* For REQ_MOVE_TASK */ | ||
795 | task_t *task; | ||
796 | int dest_cpu; | ||
797 | |||
798 | /* For REQ_SET_DOMAIN */ | ||
799 | struct sched_domain *sd; | ||
800 | |||
801 | struct completion done; | ||
802 | } migration_req_t; | ||
803 | |||
804 | /* | ||
805 | * The task's runqueue lock must be held. | ||
806 | * Returns true if you have to wait for the migration thread. | ||
807 | */ | ||
808 | static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | ||
809 | { | ||
810 | runqueue_t *rq = task_rq(p); | ||
811 | |||
812 | /* | ||
813 | * If the task is not on a runqueue (and not running), then | ||
814 | * it is sufficient to simply update the task's cpu field. | ||
815 | */ | ||
816 | if (!p->array && !task_running(rq, p)) { | ||
817 | set_task_cpu(p, dest_cpu); | ||
818 | return 0; | ||
819 | } | ||
820 | |||
821 | init_completion(&req->done); | ||
822 | req->type = REQ_MOVE_TASK; | ||
823 | req->task = p; | ||
824 | req->dest_cpu = dest_cpu; | ||
825 | list_add(&req->list, &rq->migration_queue); | ||
826 | return 1; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * wait_task_inactive - wait for a thread to unschedule. | ||
831 | * | ||
832 | * The caller must ensure that the task *will* unschedule sometime soon, | ||
833 | * else this function might spin for a *long* time. This function can't | ||
834 | * be called with interrupts off, or it may introduce deadlock with | ||
835 | * smp_call_function() if an IPI is sent by the same process we are | ||
836 | * waiting to become inactive. | ||
837 | */ | ||
838 | void wait_task_inactive(task_t * p) | ||
839 | { | ||
840 | unsigned long flags; | ||
841 | runqueue_t *rq; | ||
842 | int preempted; | ||
843 | |||
844 | repeat: | ||
845 | rq = task_rq_lock(p, &flags); | ||
846 | /* Must be off runqueue entirely, not preempted. */ | ||
847 | if (unlikely(p->array || task_running(rq, p))) { | ||
848 | /* If it's preempted, we yield. It could be a while. */ | ||
849 | preempted = !task_running(rq, p); | ||
850 | task_rq_unlock(rq, &flags); | ||
851 | cpu_relax(); | ||
852 | if (preempted) | ||
853 | yield(); | ||
854 | goto repeat; | ||
855 | } | ||
856 | task_rq_unlock(rq, &flags); | ||
857 | } | ||
858 | |||
859 | /*** | ||
860 | * kick_process - kick a running thread to enter/exit the kernel | ||
861 | * @p: the to-be-kicked thread | ||
862 | * | ||
863 | * Cause a process which is running on another CPU to enter | ||
864 | * kernel-mode, without any delay. (to get signals handled.) | ||
865 | * | ||
866 | * NOTE: this function doesn't have to take the runqueue lock, | ||
867 | * because all it wants to ensure is that the remote task enters | ||
868 | * the kernel. If the IPI races and the task has been migrated | ||
869 | * to another CPU then no harm is done and the purpose has been | ||
870 | * achieved as well. | ||
871 | */ | ||
872 | void kick_process(task_t *p) | ||
873 | { | ||
874 | int cpu; | ||
875 | |||
876 | preempt_disable(); | ||
877 | cpu = task_cpu(p); | ||
878 | if ((cpu != smp_processor_id()) && task_curr(p)) | ||
879 | smp_send_reschedule(cpu); | ||
880 | preempt_enable(); | ||
881 | } | ||
882 | |||
883 | /* | ||
884 | * Return a low guess at the load of a migration-source cpu. | ||
885 | * | ||
886 | * We want to under-estimate the load of migration sources, to | ||
887 | * balance conservatively. | ||
888 | */ | ||
889 | static inline unsigned long source_load(int cpu) | ||
890 | { | ||
891 | runqueue_t *rq = cpu_rq(cpu); | ||
892 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | ||
893 | |||
894 | return min(rq->cpu_load, load_now); | ||
895 | } | ||
896 | |||
897 | /* | ||
898 | * Return a high guess at the load of a migration-target cpu | ||
899 | */ | ||
900 | static inline unsigned long target_load(int cpu) | ||
901 | { | ||
902 | runqueue_t *rq = cpu_rq(cpu); | ||
903 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | ||
904 | |||
905 | return max(rq->cpu_load, load_now); | ||
906 | } | ||
907 | |||
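As a rough numeric illustration, assuming SCHED_LOAD_SCALE == 128 as in this era's headers: a CPU with two runnable tasks has load_now == 256, so if its decayed cpu_load is 300 then source_load() reports 256 (the optimistic, lower figure) while target_load() reports 300 (the pessimistic, higher one); quoting sources low and targets high is what makes the balancing conservative.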
908 | #endif | ||
909 | |||
910 | /* | ||
911 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
912 | * not idle and an idle cpu is available. The span of cpus to | ||
913 | * search starts with cpus closest then further out as needed, | ||
914 | * so we always favor a closer, idle cpu. | ||
915 | * | ||
916 | * Returns the CPU we should wake onto. | ||
917 | */ | ||
918 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
919 | static int wake_idle(int cpu, task_t *p) | ||
920 | { | ||
921 | cpumask_t tmp; | ||
922 | struct sched_domain *sd; | ||
923 | int i; | ||
924 | |||
925 | if (idle_cpu(cpu)) | ||
926 | return cpu; | ||
927 | |||
928 | for_each_domain(cpu, sd) { | ||
929 | if (sd->flags & SD_WAKE_IDLE) { | ||
930 | cpus_and(tmp, sd->span, cpu_online_map); | ||
931 | cpus_and(tmp, tmp, p->cpus_allowed); | ||
932 | for_each_cpu_mask(i, tmp) { | ||
933 | if (idle_cpu(i)) | ||
934 | return i; | ||
935 | } | ||
936 | } | ||
937 | else break; | ||
938 | } | ||
939 | return cpu; | ||
940 | } | ||
941 | #else | ||
942 | static inline int wake_idle(int cpu, task_t *p) | ||
943 | { | ||
944 | return cpu; | ||
945 | } | ||
946 | #endif | ||
947 | |||
948 | /*** | ||
949 | * try_to_wake_up - wake up a thread | ||
950 | * @p: the to-be-woken-up thread | ||
951 | * @state: the mask of task states that can be woken | ||
952 | * @sync: do a synchronous wakeup? | ||
953 | * | ||
954 | * Put it on the run-queue if it's not already there. The "current" | ||
955 | * thread is always on the run-queue (except when the actual | ||
956 | * re-schedule is in progress), and as such you're allowed to do | ||
957 | * the simpler "current->state = TASK_RUNNING" to mark yourself | ||
958 | * runnable without the overhead of this. | ||
959 | * | ||
960 | * returns failure only if the task is already active. | ||
961 | */ | ||
962 | static int try_to_wake_up(task_t * p, unsigned int state, int sync) | ||
963 | { | ||
964 | int cpu, this_cpu, success = 0; | ||
965 | unsigned long flags; | ||
966 | long old_state; | ||
967 | runqueue_t *rq; | ||
968 | #ifdef CONFIG_SMP | ||
969 | unsigned long load, this_load; | ||
970 | struct sched_domain *sd; | ||
971 | int new_cpu; | ||
972 | #endif | ||
973 | |||
974 | rq = task_rq_lock(p, &flags); | ||
975 | old_state = p->state; | ||
976 | if (!(old_state & state)) | ||
977 | goto out; | ||
978 | |||
979 | if (p->array) | ||
980 | goto out_running; | ||
981 | |||
982 | cpu = task_cpu(p); | ||
983 | this_cpu = smp_processor_id(); | ||
984 | |||
985 | #ifdef CONFIG_SMP | ||
986 | if (unlikely(task_running(rq, p))) | ||
987 | goto out_activate; | ||
988 | |||
989 | #ifdef CONFIG_SCHEDSTATS | ||
990 | schedstat_inc(rq, ttwu_cnt); | ||
991 | if (cpu == this_cpu) { | ||
992 | schedstat_inc(rq, ttwu_local); | ||
993 | } else { | ||
994 | for_each_domain(this_cpu, sd) { | ||
995 | if (cpu_isset(cpu, sd->span)) { | ||
996 | schedstat_inc(sd, ttwu_wake_remote); | ||
997 | break; | ||
998 | } | ||
999 | } | ||
1000 | } | ||
1001 | #endif | ||
1002 | |||
1003 | new_cpu = cpu; | ||
1004 | if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1005 | goto out_set_cpu; | ||
1006 | |||
1007 | load = source_load(cpu); | ||
1008 | this_load = target_load(this_cpu); | ||
1009 | |||
1010 | /* | ||
1011 | * If sync wakeup then subtract the (maximum possible) effect of | ||
1012 | * the currently running task from the load of the current CPU: | ||
1013 | */ | ||
1014 | if (sync) | ||
1015 | this_load -= SCHED_LOAD_SCALE; | ||
1016 | |||
1017 | /* Don't pull the task off an idle CPU to a busy one */ | ||
1018 | if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) | ||
1019 | goto out_set_cpu; | ||
1020 | |||
1021 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1022 | |||
1023 | /* | ||
1024 | * Scan domains for affine wakeup and passive balancing | ||
1025 | * possibilities. | ||
1026 | */ | ||
1027 | for_each_domain(this_cpu, sd) { | ||
1028 | unsigned int imbalance; | ||
1029 | /* | ||
1030 | * Start passive balancing when half the imbalance_pct | ||
1031 | * limit is reached. | ||
1032 | */ | ||
1033 | imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; | ||
1034 | |||
1035 | if ((sd->flags & SD_WAKE_AFFINE) && | ||
1036 | !task_hot(p, rq->timestamp_last_tick, sd)) { | ||
1037 | /* | ||
1038 | * This domain has SD_WAKE_AFFINE and p is cache cold | ||
1039 | * in this domain. | ||
1040 | */ | ||
1041 | if (cpu_isset(cpu, sd->span)) { | ||
1042 | schedstat_inc(sd, ttwu_move_affine); | ||
1043 | goto out_set_cpu; | ||
1044 | } | ||
1045 | } else if ((sd->flags & SD_WAKE_BALANCE) && | ||
1046 | imbalance*this_load <= 100*load) { | ||
1047 | /* | ||
1048 | * This domain has SD_WAKE_BALANCE and there is | ||
1049 | * an imbalance. | ||
1050 | */ | ||
1051 | if (cpu_isset(cpu, sd->span)) { | ||
1052 | schedstat_inc(sd, ttwu_move_balance); | ||
1053 | goto out_set_cpu; | ||
1054 | } | ||
1055 | } | ||
1056 | } | ||
1057 | |||
1058 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1059 | out_set_cpu: | ||
1060 | new_cpu = wake_idle(new_cpu, p); | ||
1061 | if (new_cpu != cpu) { | ||
1062 | set_task_cpu(p, new_cpu); | ||
1063 | task_rq_unlock(rq, &flags); | ||
1064 | /* might preempt at this point */ | ||
1065 | rq = task_rq_lock(p, &flags); | ||
1066 | old_state = p->state; | ||
1067 | if (!(old_state & state)) | ||
1068 | goto out; | ||
1069 | if (p->array) | ||
1070 | goto out_running; | ||
1071 | |||
1072 | this_cpu = smp_processor_id(); | ||
1073 | cpu = task_cpu(p); | ||
1074 | } | ||
1075 | |||
1076 | out_activate: | ||
1077 | #endif /* CONFIG_SMP */ | ||
1078 | if (old_state == TASK_UNINTERRUPTIBLE) { | ||
1079 | rq->nr_uninterruptible--; | ||
1080 | /* | ||
1081 | * Tasks on involuntary sleep don't earn | ||
1082 | * sleep_avg beyond just interactive state. | ||
1083 | */ | ||
1084 | p->activated = -1; | ||
1085 | } | ||
1086 | |||
1087 | /* | ||
1088 | * Sync wakeups (i.e. those types of wakeups where the waker | ||
1089 | * has indicated that it will leave the CPU in short order) | ||
1090 | * don't trigger a preemption, if the woken up task will run on | ||
1091 | * this cpu. (in this case the 'I will reschedule' promise of | ||
1092 | * the waker guarantees that the freshly woken up task is going | ||
1093 | * to be considered on this CPU.) | ||
1094 | */ | ||
1095 | activate_task(p, rq, cpu == this_cpu); | ||
1096 | if (!sync || cpu != this_cpu) { | ||
1097 | if (TASK_PREEMPTS_CURR(p, rq)) | ||
1098 | resched_task(rq->curr); | ||
1099 | } | ||
1100 | success = 1; | ||
1101 | |||
1102 | out_running: | ||
1103 | p->state = TASK_RUNNING; | ||
1104 | out: | ||
1105 | task_rq_unlock(rq, &flags); | ||
1106 | |||
1107 | return success; | ||
1108 | } | ||
1109 | |||
1110 | int fastcall wake_up_process(task_t * p) | ||
1111 | { | ||
1112 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | ||
1113 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | ||
1114 | } | ||
1115 | |||
1116 | EXPORT_SYMBOL(wake_up_process); | ||
1117 | |||
1118 | int fastcall wake_up_state(task_t *p, unsigned int state) | ||
1119 | { | ||
1120 | return try_to_wake_up(p, state, 0); | ||
1121 | } | ||
1122 | |||
1123 | #ifdef CONFIG_SMP | ||
1124 | static int find_idlest_cpu(struct task_struct *p, int this_cpu, | ||
1125 | struct sched_domain *sd); | ||
1126 | #endif | ||
1127 | |||
1128 | /* | ||
1129 | * Perform scheduler related setup for a newly forked process p. | ||
1130 | * p is forked by current. | ||
1131 | */ | ||
1132 | void fastcall sched_fork(task_t *p) | ||
1133 | { | ||
1134 | /* | ||
1135 | * We mark the process as running here, but have not actually | ||
1136 | * inserted it onto the runqueue yet. This guarantees that | ||
1137 | * nobody will actually run it, and a signal or other external | ||
1138 | * event cannot wake it up and insert it on the runqueue either. | ||
1139 | */ | ||
1140 | p->state = TASK_RUNNING; | ||
1141 | INIT_LIST_HEAD(&p->run_list); | ||
1142 | p->array = NULL; | ||
1143 | spin_lock_init(&p->switch_lock); | ||
1144 | #ifdef CONFIG_SCHEDSTATS | ||
1145 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||
1146 | #endif | ||
1147 | #ifdef CONFIG_PREEMPT | ||
1148 | /* | ||
1149 | * During context-switch we hold precisely one spinlock, which | ||
1150 | * schedule_tail drops. (in the common case it's this_rq()->lock, | ||
1151 | * but it also can be p->switch_lock.) So we compensate with a count | ||
1152 | * of 1. Also, we want to start with kernel preemption disabled. | ||
1153 | */ | ||
1154 | p->thread_info->preempt_count = 1; | ||
1155 | #endif | ||
1156 | /* | ||
1157 | * Share the timeslice between parent and child so that the | ||
1158 | * total amount of pending timeslices in the system doesn't change, | ||
1159 | * resulting in more scheduling fairness. | ||
1160 | */ | ||
1161 | local_irq_disable(); | ||
1162 | p->time_slice = (current->time_slice + 1) >> 1; | ||
1163 | /* | ||
1164 | * The remainder of the first timeslice might be recovered by | ||
1165 | * the parent if the child exits early enough. | ||
1166 | */ | ||
1167 | p->first_time_slice = 1; | ||
1168 | current->time_slice >>= 1; | ||
1169 | p->timestamp = sched_clock(); | ||
1170 | if (unlikely(!current->time_slice)) { | ||
1171 | /* | ||
1172 | * This case is rare, it happens when the parent has only | ||
1173 | * a single jiffy left from its timeslice. Taking the | ||
1174 | * runqueue lock is not a problem. | ||
1175 | */ | ||
1176 | current->time_slice = 1; | ||
1177 | preempt_disable(); | ||
1178 | scheduler_tick(); | ||
1179 | local_irq_enable(); | ||
1180 | preempt_enable(); | ||
1181 | } else | ||
1182 | local_irq_enable(); | ||
1183 | } | ||
1184 | |||
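The arithmetic in the timeslice split keeps the total constant: a parent with 100 ticks left hands (100 + 1) >> 1 == 50 to the child and keeps 100 >> 1 == 50 itself, and with an odd value such as 7 the child gets 4 while the parent keeps 3, so fork() can never be used to mint extra timeslice (the odd tick goes to the child and can be reclaimed via first_time_slice in sched_exit()).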
1185 | /* | ||
1186 | * wake_up_new_task - wake up a newly created task for the first time. | ||
1187 | * | ||
1188 | * This function will do some initial scheduler statistics housekeeping | ||
1189 | * that must be done for every newly created context, then puts the task | ||
1190 | * on the runqueue and wakes it. | ||
1191 | */ | ||
1192 | void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) | ||
1193 | { | ||
1194 | unsigned long flags; | ||
1195 | int this_cpu, cpu; | ||
1196 | runqueue_t *rq, *this_rq; | ||
1197 | |||
1198 | rq = task_rq_lock(p, &flags); | ||
1199 | cpu = task_cpu(p); | ||
1200 | this_cpu = smp_processor_id(); | ||
1201 | |||
1202 | BUG_ON(p->state != TASK_RUNNING); | ||
1203 | |||
1204 | /* | ||
1205 | * We decrease the sleep average of forking parents | ||
1206 | * and children as well, to keep max-interactive tasks | ||
1207 | * from forking tasks that are max-interactive. The parent | ||
1208 | * (current) is done further down, under its lock. | ||
1209 | */ | ||
1210 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | ||
1211 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
1212 | |||
1213 | p->prio = effective_prio(p); | ||
1214 | |||
1215 | if (likely(cpu == this_cpu)) { | ||
1216 | if (!(clone_flags & CLONE_VM)) { | ||
1217 | /* | ||
1218 | * The VM isn't cloned, so we're in a good position to | ||
1219 | * do child-runs-first in anticipation of an exec. This | ||
1220 | * usually avoids a lot of COW overhead. | ||
1221 | */ | ||
1222 | if (unlikely(!current->array)) | ||
1223 | __activate_task(p, rq); | ||
1224 | else { | ||
1225 | p->prio = current->prio; | ||
1226 | list_add_tail(&p->run_list, ¤t->run_list); | ||
1227 | p->array = current->array; | ||
1228 | p->array->nr_active++; | ||
1229 | rq->nr_running++; | ||
1230 | } | ||
1231 | set_need_resched(); | ||
1232 | } else | ||
1233 | /* Run child last */ | ||
1234 | __activate_task(p, rq); | ||
1235 | /* | ||
1236 | * We skip the following code due to cpu == this_cpu | ||
1237 | * | ||
1238 | * task_rq_unlock(rq, &flags); | ||
1239 | * this_rq = task_rq_lock(current, &flags); | ||
1240 | */ | ||
1241 | this_rq = rq; | ||
1242 | } else { | ||
1243 | this_rq = cpu_rq(this_cpu); | ||
1244 | |||
1245 | /* | ||
1246 | * Not the local CPU - must adjust timestamp. This should | ||
1247 | * get optimised away in the !CONFIG_SMP case. | ||
1248 | */ | ||
1249 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) | ||
1250 | + rq->timestamp_last_tick; | ||
1251 | __activate_task(p, rq); | ||
1252 | if (TASK_PREEMPTS_CURR(p, rq)) | ||
1253 | resched_task(rq->curr); | ||
1254 | |||
1255 | /* | ||
1256 | * Parent and child are on different CPUs, now get the | ||
1257 | * parent runqueue to update the parent's ->sleep_avg: | ||
1258 | */ | ||
1259 | task_rq_unlock(rq, &flags); | ||
1260 | this_rq = task_rq_lock(current, &flags); | ||
1261 | } | ||
1262 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | ||
1263 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
1264 | task_rq_unlock(this_rq, &flags); | ||
1265 | } | ||
1266 | |||
1267 | /* | ||
1268 | * Potentially available exiting-child timeslices are | ||
1269 | * retrieved here - this way the parent does not get | ||
1270 | * penalized for creating too many threads. | ||
1271 | * | ||
1272 | * (this cannot be used to 'generate' timeslices | ||
1273 | * artificially, because any timeslice recovered here | ||
1274 | * was given away by the parent in the first place.) | ||
1275 | */ | ||
1276 | void fastcall sched_exit(task_t * p) | ||
1277 | { | ||
1278 | unsigned long flags; | ||
1279 | runqueue_t *rq; | ||
1280 | |||
1281 | /* | ||
1282 | * If the child was a (relative-) CPU hog then decrease | ||
1283 | * the sleep_avg of the parent as well. | ||
1284 | */ | ||
1285 | rq = task_rq_lock(p->parent, &flags); | ||
1286 | if (p->first_time_slice) { | ||
1287 | p->parent->time_slice += p->time_slice; | ||
1288 | if (unlikely(p->parent->time_slice > task_timeslice(p))) | ||
1289 | p->parent->time_slice = task_timeslice(p); | ||
1290 | } | ||
1291 | if (p->sleep_avg < p->parent->sleep_avg) | ||
1292 | p->parent->sleep_avg = p->parent->sleep_avg / | ||
1293 | (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / | ||
1294 | (EXIT_WEIGHT + 1); | ||
1295 | task_rq_unlock(rq, &flags); | ||
1296 | } | ||
1297 | |||
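With EXIT_WEIGHT == 3 the blend above is new_parent_avg = 3/4 * parent_avg + 1/4 * child_avg, so a CPU-hog child that exits with a low sleep_avg pulls the parent's interactivity bonus down, while a child whose sleep_avg is already higher than the parent's leaves it untouched.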
1298 | /** | ||
1299 | * finish_task_switch - clean up after a task-switch | ||
1300 | * @prev: the thread we just switched away from. | ||
1301 | * | ||
1302 | * We enter this with the runqueue still locked, and finish_arch_switch() | ||
1303 | * will unlock it along with doing any other architecture-specific cleanup | ||
1304 | * actions. | ||
1305 | * | ||
1306 | * Note that we may have delayed dropping an mm in context_switch(). If | ||
1307 | * so, we finish that here outside of the runqueue lock. (Doing it | ||
1308 | * with the lock held can cause deadlocks; see schedule() for | ||
1309 | * details.) | ||
1310 | */ | ||
1311 | static inline void finish_task_switch(task_t *prev) | ||
1312 | __releases(rq->lock) | ||
1313 | { | ||
1314 | runqueue_t *rq = this_rq(); | ||
1315 | struct mm_struct *mm = rq->prev_mm; | ||
1316 | unsigned long prev_task_flags; | ||
1317 | |||
1318 | rq->prev_mm = NULL; | ||
1319 | |||
1320 | /* | ||
1321 | * A task struct has one reference for its use as "current". | ||
1322 | * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and | ||
1323 | * calls schedule one last time. The schedule call will never return, | ||
1324 | * and the scheduled task must drop that reference. | ||
1325 | * The test for EXIT_ZOMBIE must occur while the runqueue locks are | ||
1326 | * still held, otherwise prev could be scheduled on another cpu, die | ||
1327 | * there before we look at prev->state, and then the reference would | ||
1328 | * be dropped twice. | ||
1329 | * Manfred Spraul <manfred@colorfullife.com> | ||
1330 | */ | ||
1331 | prev_task_flags = prev->flags; | ||
1332 | finish_arch_switch(rq, prev); | ||
1333 | if (mm) | ||
1334 | mmdrop(mm); | ||
1335 | if (unlikely(prev_task_flags & PF_DEAD)) | ||
1336 | put_task_struct(prev); | ||
1337 | } | ||
1338 | |||
1339 | /** | ||
1340 | * schedule_tail - first thing a freshly forked thread must call. | ||
1341 | * @prev: the thread we just switched away from. | ||
1342 | */ | ||
1343 | asmlinkage void schedule_tail(task_t *prev) | ||
1344 | __releases(rq->lock) | ||
1345 | { | ||
1346 | finish_task_switch(prev); | ||
1347 | |||
1348 | if (current->set_child_tid) | ||
1349 | put_user(current->pid, current->set_child_tid); | ||
1350 | } | ||
1351 | |||
1352 | /* | ||
1353 | * context_switch - switch to the new MM and the new | ||
1354 | * thread's register state. | ||
1355 | */ | ||
1356 | static inline | ||
1357 | task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | ||
1358 | { | ||
1359 | struct mm_struct *mm = next->mm; | ||
1360 | struct mm_struct *oldmm = prev->active_mm; | ||
1361 | |||
1362 | if (unlikely(!mm)) { | ||
1363 | next->active_mm = oldmm; | ||
1364 | atomic_inc(&oldmm->mm_count); | ||
1365 | enter_lazy_tlb(oldmm, next); | ||
1366 | } else | ||
1367 | switch_mm(oldmm, mm, next); | ||
1368 | |||
1369 | if (unlikely(!prev->mm)) { | ||
1370 | prev->active_mm = NULL; | ||
1371 | WARN_ON(rq->prev_mm); | ||
1372 | rq->prev_mm = oldmm; | ||
1373 | } | ||
1374 | |||
1375 | /* Here we just switch the register state and the stack. */ | ||
1376 | switch_to(prev, next, prev); | ||
1377 | |||
1378 | return prev; | ||
1379 | } | ||
1380 | |||
1381 | /* | ||
1382 | * nr_running, nr_uninterruptible and nr_context_switches: | ||
1383 | * | ||
1384 | * externally visible scheduler statistics: current number of runnable | ||
1385 | * threads, current number of uninterruptible-sleeping threads, total | ||
1386 | * number of context switches performed since bootup. | ||
1387 | */ | ||
1388 | unsigned long nr_running(void) | ||
1389 | { | ||
1390 | unsigned long i, sum = 0; | ||
1391 | |||
1392 | for_each_online_cpu(i) | ||
1393 | sum += cpu_rq(i)->nr_running; | ||
1394 | |||
1395 | return sum; | ||
1396 | } | ||
1397 | |||
1398 | unsigned long nr_uninterruptible(void) | ||
1399 | { | ||
1400 | unsigned long i, sum = 0; | ||
1401 | |||
1402 | for_each_cpu(i) | ||
1403 | sum += cpu_rq(i)->nr_uninterruptible; | ||
1404 | |||
1405 | /* | ||
1406 | * Since we read the counters lockless, it might be slightly | ||
1407 | * inaccurate. Do not allow it to go below zero though: | ||
1408 | */ | ||
1409 | if (unlikely((long)sum < 0)) | ||
1410 | sum = 0; | ||
1411 | |||
1412 | return sum; | ||
1413 | } | ||
1414 | |||
1415 | unsigned long long nr_context_switches(void) | ||
1416 | { | ||
1417 | unsigned long long i, sum = 0; | ||
1418 | |||
1419 | for_each_cpu(i) | ||
1420 | sum += cpu_rq(i)->nr_switches; | ||
1421 | |||
1422 | return sum; | ||
1423 | } | ||
1424 | |||
1425 | unsigned long nr_iowait(void) | ||
1426 | { | ||
1427 | unsigned long i, sum = 0; | ||
1428 | |||
1429 | for_each_cpu(i) | ||
1430 | sum += atomic_read(&cpu_rq(i)->nr_iowait); | ||
1431 | |||
1432 | return sum; | ||
1433 | } | ||
1434 | |||
1435 | #ifdef CONFIG_SMP | ||
1436 | |||
1437 | /* | ||
1438 | * double_rq_lock - safely lock two runqueues | ||
1439 | * | ||
1440 | * Note this does not disable interrupts like task_rq_lock, | ||
1441 | * you need to do so manually before calling. | ||
1442 | */ | ||
1443 | static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | ||
1444 | __acquires(rq1->lock) | ||
1445 | __acquires(rq2->lock) | ||
1446 | { | ||
1447 | if (rq1 == rq2) { | ||
1448 | spin_lock(&rq1->lock); | ||
1449 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1450 | } else { | ||
1451 | if (rq1 < rq2) { | ||
1452 | spin_lock(&rq1->lock); | ||
1453 | spin_lock(&rq2->lock); | ||
1454 | } else { | ||
1455 | spin_lock(&rq2->lock); | ||
1456 | spin_lock(&rq1->lock); | ||
1457 | } | ||
1458 | } | ||
1459 | } | ||
1460 | |||
1461 | /* | ||
1462 | * double_rq_unlock - safely unlock two runqueues | ||
1463 | * | ||
1464 | * Note this does not restore interrupts like task_rq_unlock, | ||
1465 | * you need to do so manually after calling. | ||
1466 | */ | ||
1467 | static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | ||
1468 | __releases(rq1->lock) | ||
1469 | __releases(rq2->lock) | ||
1470 | { | ||
1471 | spin_unlock(&rq1->lock); | ||
1472 | if (rq1 != rq2) | ||
1473 | spin_unlock(&rq2->lock); | ||
1474 | else | ||
1475 | __release(rq2->lock); | ||
1476 | } | ||
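double_rq_lock() avoids the classic ABBA deadlock by always taking the lower-addressed lock first when two runqueues must be held. The same ordering rule, sketched as a runnable userspace analogue with pthread mutexes (an analogy under stated assumptions, not kernel code):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace analogue of double_rq_lock()'s address ordering:
 * whoever needs both locks acquires the lower-addressed one first,
 * so two threads can never hold one lock each and wait forever
 * for the other. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m2, &m1);	/* argument order does not matter */
	printf("both locks held\n");
	unlock_pair(&m2, &m1);
	return 0;
}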
1477 | |||
1478 | /* | ||
1479 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
1480 | */ | ||
1481 | static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | ||
1482 | __releases(this_rq->lock) | ||
1483 | __acquires(busiest->lock) | ||
1484 | __acquires(this_rq->lock) | ||
1485 | { | ||
1486 | if (unlikely(!spin_trylock(&busiest->lock))) { | ||
1487 | if (busiest < this_rq) { | ||
1488 | spin_unlock(&this_rq->lock); | ||
1489 | spin_lock(&busiest->lock); | ||
1490 | spin_lock(&this_rq->lock); | ||
1491 | } else | ||
1492 | spin_lock(&busiest->lock); | ||
1493 | } | ||
1494 | } | ||
1495 | |||
1496 | /* | ||
1497 | * find_idlest_cpu - find the least busy runqueue. | ||
1498 | */ | ||
1499 | static int find_idlest_cpu(struct task_struct *p, int this_cpu, | ||
1500 | struct sched_domain *sd) | ||
1501 | { | ||
1502 | unsigned long load, min_load, this_load; | ||
1503 | int i, min_cpu; | ||
1504 | cpumask_t mask; | ||
1505 | |||
1506 | min_cpu = UINT_MAX; | ||
1507 | min_load = ULONG_MAX; | ||
1508 | |||
1509 | cpus_and(mask, sd->span, p->cpus_allowed); | ||
1510 | |||
1511 | for_each_cpu_mask(i, mask) { | ||
1512 | load = target_load(i); | ||
1513 | |||
1514 | if (load < min_load) { | ||
1515 | min_cpu = i; | ||
1516 | min_load = load; | ||
1517 | |||
1518 | /* break out early on an idle CPU: */ | ||
1519 | if (!min_load) | ||
1520 | break; | ||
1521 | } | ||
1522 | } | ||
1523 | |||
1524 | /* add +1 to account for the new task */ | ||
1525 | this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; | ||
1526 | |||
1527 | /* | ||
1528 | * Would the addition of the new task to the | ||
1529 | * current CPU create an imbalance between this | ||
1530 | * CPU and the idlest CPU? | ||
1531 | * | ||
1532 | * Use half of the balancing threshold - new-context is | ||
1533 | * a good opportunity to balance. | ||
1534 | */ | ||
1535 | if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) | ||
1536 | return min_cpu; | ||
1537 | |||
1538 | return this_cpu; | ||
1539 | } | ||
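The final test in find_idlest_cpu() uses only half of the domain's imbalance threshold, since a fresh context is a cheap moment to migrate. A worked example of that check; imbalance_pct = 125 and SCHED_LOAD_SCALE = 128 are assumed values for illustration:

#include <stdio.h>

/* Worked example of find_idlest_cpu()'s half-threshold test.
 * All constants below are illustrative assumptions. */
int main(void)
{
	unsigned long scale = 128;		/* assumed SCHED_LOAD_SCALE */
	unsigned long imbalance_pct = 125;	/* assumed sd->imbalance_pct */
	unsigned long min_load = 2 * scale;	/* idlest CPU: ~2 tasks */
	unsigned long this_load = 3 * scale + scale;	/* 3 tasks + the new one */

	/* Half of the threshold: 100 + (125 - 100) / 2 = 112, i.e. 12% */
	if (min_load * (100 + (imbalance_pct - 100) / 2) < this_load * 100)
		printf("migrate to the idlest CPU\n");
	else
		printf("stay on this CPU\n");
	return 0;
}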
1540 | |||
1541 | /* | ||
1542 | * If dest_cpu is allowed for this process, migrate the task to it. | ||
1543 | * This is accomplished by forcing the cpu_allowed mask to only | ||
1544 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | ||
1545 | * the cpu_allowed mask is restored. | ||
1546 | */ | ||
1547 | static void sched_migrate_task(task_t *p, int dest_cpu) | ||
1548 | { | ||
1549 | migration_req_t req; | ||
1550 | runqueue_t *rq; | ||
1551 | unsigned long flags; | ||
1552 | |||
1553 | rq = task_rq_lock(p, &flags); | ||
1554 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | ||
1555 | || unlikely(cpu_is_offline(dest_cpu))) | ||
1556 | goto out; | ||
1557 | |||
1558 | /* force the process onto the specified CPU */ | ||
1559 | if (migrate_task(p, dest_cpu, &req)) { | ||
1560 | /* Need to wait for migration thread (might exit: take ref). */ | ||
1561 | struct task_struct *mt = rq->migration_thread; | ||
1562 | get_task_struct(mt); | ||
1563 | task_rq_unlock(rq, &flags); | ||
1564 | wake_up_process(mt); | ||
1565 | put_task_struct(mt); | ||
1566 | wait_for_completion(&req.done); | ||
1567 | return; | ||
1568 | } | ||
1569 | out: | ||
1570 | task_rq_unlock(rq, &flags); | ||
1571 | } | ||
1572 | |||
1573 | /* | ||
1574 | * sched_exec(): find the highest-level, exec-balance-capable | ||
1575 | * domain and try to migrate the task to the least loaded CPU. | ||
1576 | * | ||
1577 | * execve() is a valuable balancing opportunity, because at this point | ||
1578 | * the task has the smallest effective memory and cache footprint. | ||
1579 | */ | ||
1580 | void sched_exec(void) | ||
1581 | { | ||
1582 | struct sched_domain *tmp, *sd = NULL; | ||
1583 | int new_cpu, this_cpu = get_cpu(); | ||
1584 | |||
1585 | /* Prefer the current CPU if there's only this task running */ | ||
1586 | if (this_rq()->nr_running <= 1) | ||
1587 | goto out; | ||
1588 | |||
1589 | for_each_domain(this_cpu, tmp) | ||
1590 | if (tmp->flags & SD_BALANCE_EXEC) | ||
1591 | sd = tmp; | ||
1592 | |||
1593 | if (sd) { | ||
1594 | schedstat_inc(sd, sbe_attempts); | ||
1595 | new_cpu = find_idlest_cpu(current, this_cpu, sd); | ||
1596 | if (new_cpu != this_cpu) { | ||
1597 | schedstat_inc(sd, sbe_pushed); | ||
1598 | put_cpu(); | ||
1599 | sched_migrate_task(current, new_cpu); | ||
1600 | return; | ||
1601 | } | ||
1602 | } | ||
1603 | out: | ||
1604 | put_cpu(); | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * pull_task - move a task from a remote runqueue to the local runqueue. | ||
1609 | * Both runqueues must be locked. | ||
1610 | */ | ||
1611 | static inline | ||
1612 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | ||
1613 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | ||
1614 | { | ||
1615 | dequeue_task(p, src_array); | ||
1616 | src_rq->nr_running--; | ||
1617 | set_task_cpu(p, this_cpu); | ||
1618 | this_rq->nr_running++; | ||
1619 | enqueue_task(p, this_array); | ||
1620 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | ||
1621 | + this_rq->timestamp_last_tick; | ||
1622 | /* | ||
1623 | * Note that idle threads have a prio of MAX_PRIO, so this test | ||
1624 | * is always true for them. | ||
1625 | */ | ||
1626 | if (TASK_PREEMPTS_CURR(p, this_rq)) | ||
1627 | resched_task(this_rq->curr); | ||
1628 | } | ||
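The p->timestamp line rebases the task's timestamp from the source runqueue's clock to the destination's: the offset relative to the source's last tick is preserved and re-applied against the destination's last tick (the same conversion appears earlier in this file). A small arithmetic sketch with made-up clock values:

#include <stdio.h>

/* Per-CPU sched_clock() values are not synchronized, so a migrated
 * task's timestamp is rebased: keep its offset from the source
 * runqueue's last tick and re-apply it on the destination.
 * The numbers below are arbitrary. */
int main(void)
{
	unsigned long long src_last_tick = 5000000ULL;
	unsigned long long dst_last_tick = 9300000ULL;	/* different CPU clock */
	unsigned long long timestamp     = 4999000ULL;	/* 1000 ns before src tick */

	timestamp = (timestamp - src_last_tick) + dst_last_tick;

	/* Still 1000 ns before the (destination) last tick: 9299000 */
	printf("rebased timestamp = %llu\n", timestamp);
	return 0;
}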
1629 | |||
1630 | /* | ||
1631 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | ||
1632 | */ | ||
1633 | static inline | ||
1634 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | ||
1635 | struct sched_domain *sd, enum idle_type idle) | ||
1636 | { | ||
1637 | /* | ||
1638 | * We do not migrate tasks that are: | ||
1639 | * 1) running (obviously), or | ||
1640 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | ||
1641 | * 3) are cache-hot on their current CPU. | ||
1642 | */ | ||
1643 | if (task_running(rq, p)) | ||
1644 | return 0; | ||
1645 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | ||
1646 | return 0; | ||
1647 | |||
1648 | /* | ||
1649 | * Aggressive migration if: | ||
1650 | * 1) the [whole] cpu is idle, or | ||
1651 | * 2) too many balance attempts have failed. | ||
1652 | */ | ||
1653 | |||
1654 | if (cpu_and_siblings_are_idle(this_cpu) || \ | ||
1655 | sd->nr_balance_failed > sd->cache_nice_tries) | ||
1656 | return 1; | ||
1657 | |||
1658 | if (task_hot(p, rq->timestamp_last_tick, sd)) | ||
1659 | return 0; | ||
1660 | return 1; | ||
1661 | } | ||
1662 | |||
1663 | /* | ||
1664 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | ||
1665 | * as part of a balancing operation within "domain". Returns the number of | ||
1666 | * tasks moved. | ||
1667 | * | ||
1668 | * Called with both runqueues locked. | ||
1669 | */ | ||
1670 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | ||
1671 | unsigned long max_nr_move, struct sched_domain *sd, | ||
1672 | enum idle_type idle) | ||
1673 | { | ||
1674 | prio_array_t *array, *dst_array; | ||
1675 | struct list_head *head, *curr; | ||
1676 | int idx, pulled = 0; | ||
1677 | task_t *tmp; | ||
1678 | |||
1679 | if (max_nr_move <= 0 || busiest->nr_running <= 1) | ||
1680 | goto out; | ||
1681 | |||
1682 | /* | ||
1683 | * We first consider expired tasks. Those will likely not be | ||
1684 | * executed in the near future, and they are most likely to | ||
1685 | * be cache-cold, thus switching CPUs has the least effect | ||
1686 | * on them. | ||
1687 | */ | ||
1688 | if (busiest->expired->nr_active) { | ||
1689 | array = busiest->expired; | ||
1690 | dst_array = this_rq->expired; | ||
1691 | } else { | ||
1692 | array = busiest->active; | ||
1693 | dst_array = this_rq->active; | ||
1694 | } | ||
1695 | |||
1696 | new_array: | ||
1697 | /* Start searching at priority 0: */ | ||
1698 | idx = 0; | ||
1699 | skip_bitmap: | ||
1700 | if (!idx) | ||
1701 | idx = sched_find_first_bit(array->bitmap); | ||
1702 | else | ||
1703 | idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | ||
1704 | if (idx >= MAX_PRIO) { | ||
1705 | if (array == busiest->expired && busiest->active->nr_active) { | ||
1706 | array = busiest->active; | ||
1707 | dst_array = this_rq->active; | ||
1708 | goto new_array; | ||
1709 | } | ||
1710 | goto out; | ||
1711 | } | ||
1712 | |||
1713 | head = array->queue + idx; | ||
1714 | curr = head->prev; | ||
1715 | skip_queue: | ||
1716 | tmp = list_entry(curr, task_t, run_list); | ||
1717 | |||
1718 | curr = curr->prev; | ||
1719 | |||
1720 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { | ||
1721 | if (curr != head) | ||
1722 | goto skip_queue; | ||
1723 | idx++; | ||
1724 | goto skip_bitmap; | ||
1725 | } | ||
1726 | |||
1727 | #ifdef CONFIG_SCHEDSTATS | ||
1728 | if (task_hot(tmp, busiest->timestamp_last_tick, sd)) | ||
1729 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
1730 | #endif | ||
1731 | |||
1732 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | ||
1733 | pulled++; | ||
1734 | |||
1735 | /* We only want to steal up to the prescribed number of tasks. */ | ||
1736 | if (pulled < max_nr_move) { | ||
1737 | if (curr != head) | ||
1738 | goto skip_queue; | ||
1739 | idx++; | ||
1740 | goto skip_bitmap; | ||
1741 | } | ||
1742 | out: | ||
1743 | /* | ||
1744 | * Right now, this is the only place pull_task() is called, | ||
1745 | * so we can safely collect pull_task() stats here rather than | ||
1746 | * inside pull_task(). | ||
1747 | */ | ||
1748 | schedstat_add(sd, lb_gained[idle], pulled); | ||
1749 | return pulled; | ||
1750 | } | ||
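The scan in move_tasks() walks the priority bitmap upwards from priority 0 and, inside each non-empty queue, picks candidates from the tail of the list. A simplified, runnable model of the bitmap walk; a 64-bit word stands in for the kernel's MAX_PRIO-sized bitmap, an assumption made purely for brevity:

#include <stdio.h>

/* Simplified model of the move_tasks() bitmap scan: each set bit
 * marks a non-empty priority queue; we visit them from priority 0
 * upwards. A 64-bit word stands in for the kernel's bitmap. */
static int find_next_set(unsigned long long bitmap, int idx)
{
	for (; idx < 64; idx++)
		if (bitmap & (1ULL << idx))
			return idx;
	return 64;	/* nothing left, like idx >= MAX_PRIO */
}

int main(void)
{
	/* queues at priorities 3, 17 and 40 are non-empty */
	unsigned long long bitmap = (1ULL << 3) | (1ULL << 17) | (1ULL << 40);
	int idx = 0;

	while ((idx = find_next_set(bitmap, idx)) < 64) {
		printf("scan queue at priority %d\n", idx);
		idx++;	/* mirrors the idx++; goto skip_bitmap pattern */
	}
	return 0;
}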
1751 | |||
1752 | /* | ||
1753 | * find_busiest_group finds and returns the busiest CPU group within the | ||
1754 | * domain. It calculates and returns the number of tasks which should be | ||
1755 | * moved to restore balance via the imbalance parameter. | ||
1756 | */ | ||
1757 | static struct sched_group * | ||
1758 | find_busiest_group(struct sched_domain *sd, int this_cpu, | ||
1759 | unsigned long *imbalance, enum idle_type idle) | ||
1760 | { | ||
1761 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | ||
1762 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | ||
1763 | |||
1764 | max_load = this_load = total_load = total_pwr = 0; | ||
1765 | |||
1766 | do { | ||
1767 | unsigned long load; | ||
1768 | int local_group; | ||
1769 | int i; | ||
1770 | |||
1771 | local_group = cpu_isset(this_cpu, group->cpumask); | ||
1772 | |||
1773 | /* Tally up the load of all CPUs in the group */ | ||
1774 | avg_load = 0; | ||
1775 | |||
1776 | for_each_cpu_mask(i, group->cpumask) { | ||
1777 | /* Bias balancing toward cpus of our domain */ | ||
1778 | if (local_group) | ||
1779 | load = target_load(i); | ||
1780 | else | ||
1781 | load = source_load(i); | ||
1782 | |||
1783 | avg_load += load; | ||
1784 | } | ||
1785 | |||
1786 | total_load += avg_load; | ||
1787 | total_pwr += group->cpu_power; | ||
1788 | |||
1789 | /* Adjust by relative CPU power of the group */ | ||
1790 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
1791 | |||
1792 | if (local_group) { | ||
1793 | this_load = avg_load; | ||
1794 | this = group; | ||
1795 | goto nextgroup; | ||
1796 | } else if (avg_load > max_load) { | ||
1797 | max_load = avg_load; | ||
1798 | busiest = group; | ||
1799 | } | ||
1800 | nextgroup: | ||
1801 | group = group->next; | ||
1802 | } while (group != sd->groups); | ||
1803 | |||
1804 | if (!busiest || this_load >= max_load) | ||
1805 | goto out_balanced; | ||
1806 | |||
1807 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | ||
1808 | |||
1809 | if (this_load >= avg_load || | ||
1810 | 100*max_load <= sd->imbalance_pct*this_load) | ||
1811 | goto out_balanced; | ||
1812 | |||
1813 | /* | ||
1814 | * We're trying to get all the cpus to the average_load, so we don't | ||
1815 | * want to push ourselves above the average load, nor do we wish to | ||
1816 | * reduce the max loaded cpu below the average load, as either of these | ||
1817 | * actions would just result in more rebalancing later, and ping-pong | ||
1818 | * tasks around. Thus we look for the minimum possible imbalance. | ||
1819 | * Negative imbalances (*we* are more loaded than anyone else) will | ||
1820 | * be counted as no imbalance for these purposes -- we can't fix that | ||
1821 | * by pulling tasks to us. Be careful of negative numbers as they'll | ||
1822 | * appear as very large values with unsigned longs. | ||
1823 | */ | ||
1824 | /* How much load to actually move to equalise the imbalance */ | ||
1825 | *imbalance = min((max_load - avg_load) * busiest->cpu_power, | ||
1826 | (avg_load - this_load) * this->cpu_power) | ||
1827 | / SCHED_LOAD_SCALE; | ||
1828 | |||
1829 | if (*imbalance < SCHED_LOAD_SCALE) { | ||
1830 | unsigned long pwr_now = 0, pwr_move = 0; | ||
1831 | unsigned long tmp; | ||
1832 | |||
1833 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | ||
1834 | *imbalance = 1; | ||
1835 | return busiest; | ||
1836 | } | ||
1837 | |||
1838 | /* | ||
1839 | * OK, we don't have enough imbalance to justify moving tasks, | ||
1840 | * however we may be able to increase total CPU power used by | ||
1841 | * moving them. | ||
1842 | */ | ||
1843 | |||
1844 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | ||
1845 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | ||
1846 | pwr_now /= SCHED_LOAD_SCALE; | ||
1847 | |||
1848 | /* Amount of load we'd subtract */ | ||
1849 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | ||
1850 | if (max_load > tmp) | ||
1851 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | ||
1852 | max_load - tmp); | ||
1853 | |||
1854 | /* Amount of load we'd add */ | ||
1855 | if (max_load*busiest->cpu_power < | ||
1856 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | ||
1857 | tmp = max_load*busiest->cpu_power/this->cpu_power; | ||
1858 | else | ||
1859 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | ||
1860 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | ||
1861 | pwr_move /= SCHED_LOAD_SCALE; | ||
1862 | |||
1863 | /* Move if we gain throughput */ | ||
1864 | if (pwr_move <= pwr_now) | ||
1865 | goto out_balanced; | ||
1866 | |||
1867 | *imbalance = 1; | ||
1868 | return busiest; | ||
1869 | } | ||
1870 | |||
1871 | /* Get rid of the scaling factor, rounding down as we divide */ | ||
1872 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | ||
1873 | |||
1874 | return busiest; | ||
1875 | |||
1876 | out_balanced: | ||
1877 | if (busiest && (idle == NEWLY_IDLE || | ||
1878 | (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { | ||
1879 | *imbalance = 1; | ||
1880 | return busiest; | ||
1881 | } | ||
1882 | |||
1883 | *imbalance = 0; | ||
1884 | return NULL; | ||
1885 | } | ||
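The imbalance computed above is the smaller of "how far the busiest group sits above the domain average" and "how far this group sits below it", each weighted by the group's cpu_power. A worked example with assumed numbers (single-CPU groups with cpu_power equal to an assumed SCHED_LOAD_SCALE of 128):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Worked example of find_busiest_group()'s imbalance calculation.
 * Loads are in SCHED_LOAD_SCALE units; all values are made up. */
int main(void)
{
	unsigned long scale = 128;		/* assumed SCHED_LOAD_SCALE */
	unsigned long busiest_power = 128;	/* one CPU per group, for simplicity */
	unsigned long this_power = 128;
	unsigned long max_load  = 6 * scale;	/* busiest group: ~6 tasks */
	unsigned long this_load = 1 * scale;	/* this group:    ~1 task  */
	unsigned long avg_load  = 3 * scale;	/* domain average */
	unsigned long imbalance;

	imbalance = min_ul((max_load - avg_load) * busiest_power,
			   (avg_load - this_load) * this_power) / scale;

	/* min(384, 256) load units = 256; dividing by the scale once more,
	 * as the function does at the end, gives ~2 tasks to move */
	printf("imbalance = %lu load units, ~%lu tasks\n",
	       imbalance, imbalance / scale);
	return 0;
}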
1886 | |||
1887 | /* | ||
1888 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | ||
1889 | */ | ||
1890 | static runqueue_t *find_busiest_queue(struct sched_group *group) | ||
1891 | { | ||
1892 | unsigned long load, max_load = 0; | ||
1893 | runqueue_t *busiest = NULL; | ||
1894 | int i; | ||
1895 | |||
1896 | for_each_cpu_mask(i, group->cpumask) { | ||
1897 | load = source_load(i); | ||
1898 | |||
1899 | if (load > max_load) { | ||
1900 | max_load = load; | ||
1901 | busiest = cpu_rq(i); | ||
1902 | } | ||
1903 | } | ||
1904 | |||
1905 | return busiest; | ||
1906 | } | ||
1907 | |||
1908 | /* | ||
1909 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
1910 | * tasks if there is an imbalance. | ||
1911 | * | ||
1912 | * Called with this_rq unlocked. | ||
1913 | */ | ||
1914 | static int load_balance(int this_cpu, runqueue_t *this_rq, | ||
1915 | struct sched_domain *sd, enum idle_type idle) | ||
1916 | { | ||
1917 | struct sched_group *group; | ||
1918 | runqueue_t *busiest; | ||
1919 | unsigned long imbalance; | ||
1920 | int nr_moved; | ||
1921 | |||
1922 | spin_lock(&this_rq->lock); | ||
1923 | schedstat_inc(sd, lb_cnt[idle]); | ||
1924 | |||
1925 | group = find_busiest_group(sd, this_cpu, &imbalance, idle); | ||
1926 | if (!group) { | ||
1927 | schedstat_inc(sd, lb_nobusyg[idle]); | ||
1928 | goto out_balanced; | ||
1929 | } | ||
1930 | |||
1931 | busiest = find_busiest_queue(group); | ||
1932 | if (!busiest) { | ||
1933 | schedstat_inc(sd, lb_nobusyq[idle]); | ||
1934 | goto out_balanced; | ||
1935 | } | ||
1936 | |||
1937 | /* | ||
1938 | * This should be "impossible", but since load | ||
1939 | * balancing is inherently racy and statistical, | ||
1940 | * it could happen in theory. | ||
1941 | */ | ||
1942 | if (unlikely(busiest == this_rq)) { | ||
1943 | WARN_ON(1); | ||
1944 | goto out_balanced; | ||
1945 | } | ||
1946 | |||
1947 | schedstat_add(sd, lb_imbalance[idle], imbalance); | ||
1948 | |||
1949 | nr_moved = 0; | ||
1950 | if (busiest->nr_running > 1) { | ||
1951 | /* | ||
1952 | * Attempt to move tasks. If find_busiest_group has found | ||
1953 | * an imbalance but busiest->nr_running <= 1, the group is | ||
1954 | * still unbalanced. nr_moved simply stays zero, so it is | ||
1955 | * correctly treated as an imbalance. | ||
1956 | */ | ||
1957 | double_lock_balance(this_rq, busiest); | ||
1958 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | ||
1959 | imbalance, sd, idle); | ||
1960 | spin_unlock(&busiest->lock); | ||
1961 | } | ||
1962 | spin_unlock(&this_rq->lock); | ||
1963 | |||
1964 | if (!nr_moved) { | ||
1965 | schedstat_inc(sd, lb_failed[idle]); | ||
1966 | sd->nr_balance_failed++; | ||
1967 | |||
1968 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | ||
1969 | int wake = 0; | ||
1970 | |||
1971 | spin_lock(&busiest->lock); | ||
1972 | if (!busiest->active_balance) { | ||
1973 | busiest->active_balance = 1; | ||
1974 | busiest->push_cpu = this_cpu; | ||
1975 | wake = 1; | ||
1976 | } | ||
1977 | spin_unlock(&busiest->lock); | ||
1978 | if (wake) | ||
1979 | wake_up_process(busiest->migration_thread); | ||
1980 | |||
1981 | /* | ||
1982 | * We've kicked active balancing, reset the failure | ||
1983 | * counter. | ||
1984 | */ | ||
1985 | sd->nr_balance_failed = sd->cache_nice_tries; | ||
1986 | } | ||
1987 | |||
1988 | /* | ||
1989 | * We were unbalanced, but unsuccessful in move_tasks(), | ||
1990 | * so bump the balance_interval to lessen the lock contention. | ||
1991 | */ | ||
1992 | if (sd->balance_interval < sd->max_interval) | ||
1993 | sd->balance_interval++; | ||
1994 | } else { | ||
1995 | sd->nr_balance_failed = 0; | ||
1996 | |||
1997 | /* We were unbalanced, so reset the balancing interval */ | ||
1998 | sd->balance_interval = sd->min_interval; | ||
1999 | } | ||
2000 | |||
2001 | return nr_moved; | ||
2002 | |||
2003 | out_balanced: | ||
2004 | spin_unlock(&this_rq->lock); | ||
2005 | |||
2006 | schedstat_inc(sd, lb_balanced[idle]); | ||
2007 | |||
2008 | /* tune up the balancing interval */ | ||
2009 | if (sd->balance_interval < sd->max_interval) | ||
2010 | sd->balance_interval *= 2; | ||
2011 | |||
2012 | return 0; | ||
2013 | } | ||
2014 | |||
2015 | /* | ||
2016 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | ||
2017 | * tasks if there is an imbalance. | ||
2018 | * | ||
2019 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). | ||
2020 | * this_rq is locked. | ||
2021 | */ | ||
2022 | static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | ||
2023 | struct sched_domain *sd) | ||
2024 | { | ||
2025 | struct sched_group *group; | ||
2026 | runqueue_t *busiest = NULL; | ||
2027 | unsigned long imbalance; | ||
2028 | int nr_moved = 0; | ||
2029 | |||
2030 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | ||
2031 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); | ||
2032 | if (!group) { | ||
2033 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
2034 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | ||
2035 | goto out; | ||
2036 | } | ||
2037 | |||
2038 | busiest = find_busiest_queue(group); | ||
2039 | if (!busiest || busiest == this_rq) { | ||
2040 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
2041 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | ||
2042 | goto out; | ||
2043 | } | ||
2044 | |||
2045 | /* Attempt to move tasks */ | ||
2046 | double_lock_balance(this_rq, busiest); | ||
2047 | |||
2048 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | ||
2049 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | ||
2050 | imbalance, sd, NEWLY_IDLE); | ||
2051 | if (!nr_moved) | ||
2052 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | ||
2053 | |||
2054 | spin_unlock(&busiest->lock); | ||
2055 | |||
2056 | out: | ||
2057 | return nr_moved; | ||
2058 | } | ||
2059 | |||
2060 | /* | ||
2061 | * idle_balance is called by schedule() if this_cpu is about to become | ||
2062 | * idle. Attempts to pull tasks from other CPUs. | ||
2063 | */ | ||
2064 | static inline void idle_balance(int this_cpu, runqueue_t *this_rq) | ||
2065 | { | ||
2066 | struct sched_domain *sd; | ||
2067 | |||
2068 | for_each_domain(this_cpu, sd) { | ||
2069 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
2070 | if (load_balance_newidle(this_cpu, this_rq, sd)) { | ||
2071 | /* We've pulled tasks over so stop searching */ | ||
2072 | break; | ||
2073 | } | ||
2074 | } | ||
2075 | } | ||
2076 | } | ||
2077 | |||
2078 | /* | ||
2079 | * active_load_balance is run by migration threads. It pushes running tasks | ||
2080 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | ||
2081 | * running on each physical CPU where possible, and avoids physical / | ||
2082 | * logical imbalances. | ||
2083 | * | ||
2084 | * Called with busiest_rq locked. | ||
2085 | */ | ||
2086 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | ||
2087 | { | ||
2088 | struct sched_domain *sd; | ||
2089 | struct sched_group *cpu_group; | ||
2090 | runqueue_t *target_rq; | ||
2091 | cpumask_t visited_cpus; | ||
2092 | int cpu; | ||
2093 | |||
2094 | /* | ||
2095 | * Search for suitable CPUs to push tasks to in successively higher | ||
2096 | * domains with SD_LOAD_BALANCE set. | ||
2097 | */ | ||
2098 | visited_cpus = CPU_MASK_NONE; | ||
2099 | for_each_domain(busiest_cpu, sd) { | ||
2100 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
2101 | /* no more domains to search */ | ||
2102 | break; | ||
2103 | |||
2104 | schedstat_inc(sd, alb_cnt); | ||
2105 | |||
2106 | cpu_group = sd->groups; | ||
2107 | do { | ||
2108 | for_each_cpu_mask(cpu, cpu_group->cpumask) { | ||
2109 | if (busiest_rq->nr_running <= 1) | ||
2110 | /* no more tasks left to move */ | ||
2111 | return; | ||
2112 | if (cpu_isset(cpu, visited_cpus)) | ||
2113 | continue; | ||
2114 | cpu_set(cpu, visited_cpus); | ||
2115 | if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) | ||
2116 | continue; | ||
2117 | |||
2118 | target_rq = cpu_rq(cpu); | ||
2119 | /* | ||
2120 | * This condition is "impossible", if it occurs | ||
2121 | * we need to fix it. Originally reported by | ||
2122 | * Bjorn Helgaas on a 128-cpu setup. | ||
2123 | */ | ||
2124 | BUG_ON(busiest_rq == target_rq); | ||
2125 | |||
2126 | /* move a task from busiest_rq to target_rq */ | ||
2127 | double_lock_balance(busiest_rq, target_rq); | ||
2128 | if (move_tasks(target_rq, cpu, busiest_rq, | ||
2129 | 1, sd, SCHED_IDLE)) { | ||
2130 | schedstat_inc(sd, alb_pushed); | ||
2131 | } else { | ||
2132 | schedstat_inc(sd, alb_failed); | ||
2133 | } | ||
2134 | spin_unlock(&target_rq->lock); | ||
2135 | } | ||
2136 | cpu_group = cpu_group->next; | ||
2137 | } while (cpu_group != sd->groups); | ||
2138 | } | ||
2139 | } | ||
2140 | |||
2141 | /* | ||
2142 | * rebalance_tick will get called every timer tick, on every CPU. | ||
2143 | * | ||
2144 | * It checks each scheduling domain to see if it is due to be balanced, | ||
2145 | * and initiates a balancing operation if so. | ||
2146 | * | ||
2147 | * Balancing parameters are set up in arch_init_sched_domains. | ||
2148 | */ | ||
2149 | |||
2150 | /* Don't have all balancing operations going off at once */ | ||
2151 | #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) | ||
2152 | |||
2153 | static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | ||
2154 | enum idle_type idle) | ||
2155 | { | ||
2156 | unsigned long old_load, this_load; | ||
2157 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | ||
2158 | struct sched_domain *sd; | ||
2159 | |||
2160 | /* Update our load */ | ||
2161 | old_load = this_rq->cpu_load; | ||
2162 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | ||
2163 | /* | ||
2164 | * Round up the averaging division if load is increasing. This | ||
2165 | * prevents us from getting stuck on 9 if the load is 10, for | ||
2166 | * example. | ||
2167 | */ | ||
2168 | if (this_load > old_load) | ||
2169 | old_load++; | ||
2170 | this_rq->cpu_load = (old_load + this_load) / 2; | ||
2171 | |||
2172 | for_each_domain(this_cpu, sd) { | ||
2173 | unsigned long interval; | ||
2174 | |||
2175 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
2176 | continue; | ||
2177 | |||
2178 | interval = sd->balance_interval; | ||
2179 | if (idle != SCHED_IDLE) | ||
2180 | interval *= sd->busy_factor; | ||
2181 | |||
2182 | /* scale ms to jiffies */ | ||
2183 | interval = msecs_to_jiffies(interval); | ||
2184 | if (unlikely(!interval)) | ||
2185 | interval = 1; | ||
2186 | |||
2187 | if (j - sd->last_balance >= interval) { | ||
2188 | if (load_balance(this_cpu, this_rq, sd, idle)) { | ||
2189 | /* We've pulled tasks over so no longer idle */ | ||
2190 | idle = NOT_IDLE; | ||
2191 | } | ||
2192 | sd->last_balance += interval; | ||
2193 | } | ||
2194 | } | ||
2195 | } | ||
2196 | #else | ||
2197 | /* | ||
2198 | * on UP we do not need to balance between CPUs: | ||
2199 | */ | ||
2200 | static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) | ||
2201 | { | ||
2202 | } | ||
2203 | static inline void idle_balance(int cpu, runqueue_t *rq) | ||
2204 | { | ||
2205 | } | ||
2206 | #endif | ||
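The cpu_load smoothing in rebalance_tick() above halves the distance to the instantaneous load on every tick, and the old_load++ round-up is what lets the average actually reach a steady target instead of sticking one below it. A tiny arithmetic demo using raw integers (the real code works in SCHED_LOAD_SCALE units):

#include <stdio.h>

/* Demo of rebalance_tick()'s load averaging. Without the round-up,
 * integer division would leave cpu_load stuck one below a steady
 * target (e.g. stuck at 9 while the real load is 10). */
int main(void)
{
	unsigned long cpu_load = 0, target = 10;
	int tick;

	for (tick = 1; tick <= 8; tick++) {
		unsigned long old_load = cpu_load;

		if (target > old_load)	/* round up while load is rising */
			old_load++;
		cpu_load = (old_load + target) / 2;
		printf("tick %d: cpu_load = %lu\n", tick, cpu_load);
	}
	return 0;
}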
2207 | |||
2208 | static inline int wake_priority_sleeper(runqueue_t *rq) | ||
2209 | { | ||
2210 | int ret = 0; | ||
2211 | #ifdef CONFIG_SCHED_SMT | ||
2212 | spin_lock(&rq->lock); | ||
2213 | /* | ||
2214 | * If an SMT sibling task has been put to sleep for priority | ||
2215 | * reasons, reschedule the idle task to see if it can now run. | ||
2216 | */ | ||
2217 | if (rq->nr_running) { | ||
2218 | resched_task(rq->idle); | ||
2219 | ret = 1; | ||
2220 | } | ||
2221 | spin_unlock(&rq->lock); | ||
2222 | #endif | ||
2223 | return ret; | ||
2224 | } | ||
2225 | |||
2226 | DEFINE_PER_CPU(struct kernel_stat, kstat); | ||
2227 | |||
2228 | EXPORT_PER_CPU_SYMBOL(kstat); | ||
2229 | |||
2230 | /* | ||
2231 | * This is called on clock ticks and on context switches. | ||
2232 | * Bank in p->sched_time the ns elapsed since the last tick or switch. | ||
2233 | */ | ||
2234 | static inline void update_cpu_clock(task_t *p, runqueue_t *rq, | ||
2235 | unsigned long long now) | ||
2236 | { | ||
2237 | unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); | ||
2238 | p->sched_time += now - last; | ||
2239 | } | ||
2240 | |||
2241 | /* | ||
2242 | * Return current->sched_time plus any more ns on the sched_clock | ||
2243 | * that have not yet been banked. | ||
2244 | */ | ||
2245 | unsigned long long current_sched_time(const task_t *tsk) | ||
2246 | { | ||
2247 | unsigned long long ns; | ||
2248 | unsigned long flags; | ||
2249 | local_irq_save(flags); | ||
2250 | ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); | ||
2251 | ns = tsk->sched_time + (sched_clock() - ns); | ||
2252 | local_irq_restore(flags); | ||
2253 | return ns; | ||
2254 | } | ||
2255 | |||
2256 | /* | ||
2257 | * We place interactive tasks back into the active array, if possible. | ||
2258 | * | ||
2259 | * To guarantee that this does not starve expired tasks we ignore the | ||
2260 | * interactivity of a task if the first expired task had to wait more | ||
2261 | * than a 'reasonable' amount of time. This deadline timeout is | ||
2262 | * load-dependent, as the frequency of array switches decreases with | ||
2263 | * an increasing number of running tasks. We also ignore the interactivity | ||
2264 | * if a better static_prio task has expired: | ||
2265 | */ | ||
2266 | #define EXPIRED_STARVING(rq) \ | ||
2267 | ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ | ||
2268 | (jiffies - (rq)->expired_timestamp >= \ | ||
2269 | STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ | ||
2270 | ((rq)->curr->static_prio > (rq)->best_expired_prio)) | ||
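The EXPIRED_STARVING() deadline scales with the number of runnable tasks: with more runners, array switches naturally happen less often, so expired tasks are allowed to wait proportionally longer before interactivity bonuses stop being honoured. A worked check with an assumed STARVATION_LIMIT; the real constant is defined earlier in the file:

#include <stdio.h>

/* Worked example of the EXPIRED_STARVING() deadline. The value of
 * STARVATION_LIMIT below is an assumption for illustration only. */
#define STARVATION_LIMIT 500	/* assumed, in jiffies */

int main(void)
{
	unsigned long jiffies = 12000;
	unsigned long expired_timestamp = 10000;	/* first task expired here */
	unsigned long nr_running = 3;

	unsigned long waited   = jiffies - expired_timestamp;		/* 2000 */
	unsigned long deadline = STARVATION_LIMIT * nr_running + 1;	/* 1501 */

	if (waited >= deadline)
		printf("expired tasks are starving: ignore interactivity\n");
	else
		printf("within the deadline: interactive tasks may be requeued\n");
	return 0;
}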
2271 | |||
2272 | /* | ||
2273 | * Account user cpu time to a process. | ||
2274 | * @p: the process that the cpu time gets accounted to | ||
2276 | * @cputime: the cpu time spent in user space since the last update | ||
2277 | */ | ||
2278 | void account_user_time(struct task_struct *p, cputime_t cputime) | ||
2279 | { | ||
2280 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2281 | cputime64_t tmp; | ||
2282 | |||
2283 | p->utime = cputime_add(p->utime, cputime); | ||
2284 | |||
2285 | /* Add user time to cpustat. */ | ||
2286 | tmp = cputime_to_cputime64(cputime); | ||
2287 | if (TASK_NICE(p) > 0) | ||
2288 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | ||
2289 | else | ||
2290 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
2291 | } | ||
2292 | |||
2293 | /* | ||
2294 | * Account system cpu time to a process. | ||
2295 | * @p: the process that the cpu time gets accounted to | ||
2296 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
2297 | * @cputime: the cpu time spent in kernel space since the last update | ||
2298 | */ | ||
2299 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
2300 | cputime_t cputime) | ||
2301 | { | ||
2302 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2303 | runqueue_t *rq = this_rq(); | ||
2304 | cputime64_t tmp; | ||
2305 | |||
2306 | p->stime = cputime_add(p->stime, cputime); | ||
2307 | |||
2308 | /* Add system time to cpustat. */ | ||
2309 | tmp = cputime_to_cputime64(cputime); | ||
2310 | if (hardirq_count() - hardirq_offset) | ||
2311 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | ||
2312 | else if (softirq_count()) | ||
2313 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | ||
2314 | else if (p != rq->idle) | ||
2315 | cpustat->system = cputime64_add(cpustat->system, tmp); | ||
2316 | else if (atomic_read(&rq->nr_iowait) > 0) | ||
2317 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | ||
2318 | else | ||
2319 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | ||
2320 | /* Account for system time used */ | ||
2321 | acct_update_integrals(p); | ||
2322 | /* Update rss highwater mark */ | ||
2323 | update_mem_hiwater(p); | ||
2324 | } | ||
2325 | |||
2326 | /* | ||
2327 | * Account for involuntary wait time. | ||
2328 | * @p: the process from which the cpu time has been stolen | ||
2329 | * @steal: the cpu time spent in involuntary wait | ||
2330 | */ | ||
2331 | void account_steal_time(struct task_struct *p, cputime_t steal) | ||
2332 | { | ||
2333 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
2334 | cputime64_t tmp = cputime_to_cputime64(steal); | ||
2335 | runqueue_t *rq = this_rq(); | ||
2336 | |||
2337 | if (p == rq->idle) { | ||
2338 | p->stime = cputime_add(p->stime, steal); | ||
2339 | if (atomic_read(&rq->nr_iowait) > 0) | ||
2340 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | ||
2341 | else | ||
2342 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | ||
2343 | } else | ||
2344 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | ||
2345 | } | ||
2346 | |||
2347 | /* | ||
2348 | * This function gets called by the timer code, with HZ frequency. | ||
2349 | * We call it with interrupts disabled. | ||
2350 | * | ||
2351 | * It also gets called by the fork code, when changing the parent's | ||
2352 | * timeslices. | ||
2353 | */ | ||
2354 | void scheduler_tick(void) | ||
2355 | { | ||
2356 | int cpu = smp_processor_id(); | ||
2357 | runqueue_t *rq = this_rq(); | ||
2358 | task_t *p = current; | ||
2359 | unsigned long long now = sched_clock(); | ||
2360 | |||
2361 | update_cpu_clock(p, rq, now); | ||
2362 | |||
2363 | rq->timestamp_last_tick = now; | ||
2364 | |||
2365 | if (p == rq->idle) { | ||
2366 | if (wake_priority_sleeper(rq)) | ||
2367 | goto out; | ||
2368 | rebalance_tick(cpu, rq, SCHED_IDLE); | ||
2369 | return; | ||
2370 | } | ||
2371 | |||
2372 | /* Task might have expired already, but not scheduled off yet */ | ||
2373 | if (p->array != rq->active) { | ||
2374 | set_tsk_need_resched(p); | ||
2375 | goto out; | ||
2376 | } | ||
2377 | spin_lock(&rq->lock); | ||
2378 | /* | ||
2379 | * The task was running during this tick - update the | ||
2380 | * time slice counter. Note: we do not update a thread's | ||
2381 | * priority until it either goes to sleep or uses up its | ||
2382 | * timeslice. This makes it possible for interactive tasks | ||
2383 | * to use up their timeslices at their highest priority levels. | ||
2384 | */ | ||
2385 | if (rt_task(p)) { | ||
2386 | /* | ||
2387 | * RR tasks need a special form of timeslice management. | ||
2388 | * FIFO tasks have no timeslices. | ||
2389 | */ | ||
2390 | if ((p->policy == SCHED_RR) && !--p->time_slice) { | ||
2391 | p->time_slice = task_timeslice(p); | ||
2392 | p->first_time_slice = 0; | ||
2393 | set_tsk_need_resched(p); | ||
2394 | |||
2395 | /* put it at the end of the queue: */ | ||
2396 | requeue_task(p, rq->active); | ||
2397 | } | ||
2398 | goto out_unlock; | ||
2399 | } | ||
2400 | if (!--p->time_slice) { | ||
2401 | dequeue_task(p, rq->active); | ||
2402 | set_tsk_need_resched(p); | ||
2403 | p->prio = effective_prio(p); | ||
2404 | p->time_slice = task_timeslice(p); | ||
2405 | p->first_time_slice = 0; | ||
2406 | |||
2407 | if (!rq->expired_timestamp) | ||
2408 | rq->expired_timestamp = jiffies; | ||
2409 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | ||
2410 | enqueue_task(p, rq->expired); | ||
2411 | if (p->static_prio < rq->best_expired_prio) | ||
2412 | rq->best_expired_prio = p->static_prio; | ||
2413 | } else | ||
2414 | enqueue_task(p, rq->active); | ||
2415 | } else { | ||
2416 | /* | ||
2417 | * Prevent a too long timeslice allowing a task to monopolize | ||
2418 | * the CPU. We do this by splitting up the timeslice into | ||
2419 | * smaller pieces. | ||
2420 | * | ||
2421 | * Note: this does not mean the task's timeslices expire or | ||
2422 | * get lost in any way, they just might be preempted by | ||
2423 | * another task of equal priority. (one with higher | ||
2424 | * priority would have preempted this task already.) We | ||
2425 | * requeue this task to the end of the list on this priority | ||
2426 | * level, which is in essence a round-robin of tasks with | ||
2427 | * equal priority. | ||
2428 | * | ||
2429 | * This only applies to tasks in the interactive | ||
2430 | * delta range with at least TIMESLICE_GRANULARITY to requeue. | ||
2431 | */ | ||
2432 | if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | ||
2433 | p->time_slice) % TIMESLICE_GRANULARITY(p)) && | ||
2434 | (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | ||
2435 | (p->array == rq->active)) { | ||
2436 | |||
2437 | requeue_task(p, rq->active); | ||
2438 | set_tsk_need_resched(p); | ||
2439 | } | ||
2440 | } | ||
2441 | out_unlock: | ||
2442 | spin_unlock(&rq->lock); | ||
2443 | out: | ||
2444 | rebalance_tick(cpu, rq, NOT_IDLE); | ||
2445 | } | ||
2446 | |||
2447 | #ifdef CONFIG_SCHED_SMT | ||
2448 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | ||
2449 | { | ||
2450 | struct sched_domain *sd = this_rq->sd; | ||
2451 | cpumask_t sibling_map; | ||
2452 | int i; | ||
2453 | |||
2454 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | ||
2455 | return; | ||
2456 | |||
2457 | /* | ||
2458 | * Unlock the current runqueue because we have to lock in | ||
2459 | * CPU order to avoid deadlocks. Caller knows that we might | ||
2460 | * unlock. We keep IRQs disabled. | ||
2461 | */ | ||
2462 | spin_unlock(&this_rq->lock); | ||
2463 | |||
2464 | sibling_map = sd->span; | ||
2465 | |||
2466 | for_each_cpu_mask(i, sibling_map) | ||
2467 | spin_lock(&cpu_rq(i)->lock); | ||
2468 | /* | ||
2469 | * We clear this CPU from the mask. This both simplifies the | ||
2470 | * inner loop and keeps this_rq locked when we exit: | ||
2471 | */ | ||
2472 | cpu_clear(this_cpu, sibling_map); | ||
2473 | |||
2474 | for_each_cpu_mask(i, sibling_map) { | ||
2475 | runqueue_t *smt_rq = cpu_rq(i); | ||
2476 | |||
2477 | /* | ||
2478 | * If an SMT sibling task is sleeping due to priority | ||
2479 | * reasons, wake it up now. | ||
2480 | */ | ||
2481 | if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) | ||
2482 | resched_task(smt_rq->idle); | ||
2483 | } | ||
2484 | |||
2485 | for_each_cpu_mask(i, sibling_map) | ||
2486 | spin_unlock(&cpu_rq(i)->lock); | ||
2487 | /* | ||
2488 | * We exit with this_cpu's rq still held and IRQs | ||
2489 | * still disabled: | ||
2490 | */ | ||
2491 | } | ||
2492 | |||
2493 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | ||
2494 | { | ||
2495 | struct sched_domain *sd = this_rq->sd; | ||
2496 | cpumask_t sibling_map; | ||
2497 | prio_array_t *array; | ||
2498 | int ret = 0, i; | ||
2499 | task_t *p; | ||
2500 | |||
2501 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | ||
2502 | return 0; | ||
2503 | |||
2504 | /* | ||
2505 | * The same locking rules and details apply as for | ||
2506 | * wake_sleeping_dependent(): | ||
2507 | */ | ||
2508 | spin_unlock(&this_rq->lock); | ||
2509 | sibling_map = sd->span; | ||
2510 | for_each_cpu_mask(i, sibling_map) | ||
2511 | spin_lock(&cpu_rq(i)->lock); | ||
2512 | cpu_clear(this_cpu, sibling_map); | ||
2513 | |||
2514 | /* | ||
2515 | * Establish next task to be run - it might have gone away because | ||
2516 | * we released the runqueue lock above: | ||
2517 | */ | ||
2518 | if (!this_rq->nr_running) | ||
2519 | goto out_unlock; | ||
2520 | array = this_rq->active; | ||
2521 | if (!array->nr_active) | ||
2522 | array = this_rq->expired; | ||
2523 | BUG_ON(!array->nr_active); | ||
2524 | |||
2525 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | ||
2526 | task_t, run_list); | ||
2527 | |||
2528 | for_each_cpu_mask(i, sibling_map) { | ||
2529 | runqueue_t *smt_rq = cpu_rq(i); | ||
2530 | task_t *smt_curr = smt_rq->curr; | ||
2531 | |||
2532 | /* | ||
2533 | * If a user task with lower static priority than the | ||
2534 | * running task on the SMT sibling is trying to schedule, | ||
2535 | * delay it till there is proportionately less timeslice | ||
2536 | * left of the sibling task to prevent a lower priority | ||
2537 | * task from using an unfair proportion of the | ||
2538 | * physical cpu's resources. -ck | ||
2539 | */ | ||
2540 | if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > | ||
2541 | task_timeslice(p) || rt_task(smt_curr)) && | ||
2542 | p->mm && smt_curr->mm && !rt_task(p)) | ||
2543 | ret = 1; | ||
2544 | |||
2545 | /* | ||
2546 | * Reschedule a lower priority task on the SMT sibling, | ||
2547 | * or wake it up if it has been put to sleep for priority | ||
2548 | * reasons. | ||
2549 | */ | ||
2550 | if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > | ||
2551 | task_timeslice(smt_curr) || rt_task(p)) && | ||
2552 | smt_curr->mm && p->mm && !rt_task(smt_curr)) || | ||
2553 | (smt_curr == smt_rq->idle && smt_rq->nr_running)) | ||
2554 | resched_task(smt_curr); | ||
2555 | } | ||
2556 | out_unlock: | ||
2557 | for_each_cpu_mask(i, sibling_map) | ||
2558 | spin_unlock(&cpu_rq(i)->lock); | ||
2559 | return ret; | ||
2560 | } | ||
2561 | #else | ||
2562 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | ||
2563 | { | ||
2564 | } | ||
2565 | |||
2566 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | ||
2567 | { | ||
2568 | return 0; | ||
2569 | } | ||
2570 | #endif | ||
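Ignoring the rt_task() and ->mm qualifiers, the throttling test in dependent_sleeper() delays a task only while the sibling's currently running task still has proportionally more timeslice left, with sd->per_cpu_gain expressing how much of a physical CPU one SMT sibling is worth. A worked check; per_cpu_gain = 25 is an assumed value for illustration:

#include <stdio.h>

/* Worked example of dependent_sleeper()'s throttling test.
 * per_cpu_gain = 25 is an assumption for illustration. */
int main(void)
{
	unsigned long per_cpu_gain = 25;	/* assumed sd->per_cpu_gain (%) */
	unsigned long smt_curr_slice = 100;	/* sibling task's remaining timeslice */
	unsigned long p_full_slice = 60;	/* task_timeslice() of the lower
						   priority task wanting to run here */

	/* 100 * (100 - 25) / 100 = 75 > 60: the sibling's task still has
	 * enough timeslice left, so the lower priority task is delayed. */
	if (smt_curr_slice * (100 - per_cpu_gain) / 100 > p_full_slice)
		printf("delay the lower-priority task on this sibling\n");
	else
		printf("let it run\n");
	return 0;
}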
2571 | |||
2572 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) | ||
2573 | |||
2574 | void fastcall add_preempt_count(int val) | ||
2575 | { | ||
2576 | /* | ||
2577 | * Underflow? | ||
2578 | */ | ||
2579 | BUG_ON(((int)preempt_count() < 0)); | ||
2580 | preempt_count() += val; | ||
2581 | /* | ||
2582 | * Spinlock count overflowing soon? | ||
2583 | */ | ||
2584 | BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | ||
2585 | } | ||
2586 | EXPORT_SYMBOL(add_preempt_count); | ||
2587 | |||
2588 | void fastcall sub_preempt_count(int val) | ||
2589 | { | ||
2590 | /* | ||
2591 | * Underflow? | ||
2592 | */ | ||
2593 | BUG_ON(val > preempt_count()); | ||
2594 | /* | ||
2595 | * Is the spinlock portion underflowing? | ||
2596 | */ | ||
2597 | BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); | ||
2598 | preempt_count() -= val; | ||
2599 | } | ||
2600 | EXPORT_SYMBOL(sub_preempt_count); | ||
2601 | |||
2602 | #endif | ||
2603 | |||
2604 | /* | ||
2605 | * schedule() is the main scheduler function. | ||
2606 | */ | ||
2607 | asmlinkage void __sched schedule(void) | ||
2608 | { | ||
2609 | long *switch_count; | ||
2610 | task_t *prev, *next; | ||
2611 | runqueue_t *rq; | ||
2612 | prio_array_t *array; | ||
2613 | struct list_head *queue; | ||
2614 | unsigned long long now; | ||
2615 | unsigned long run_time; | ||
2616 | int cpu, idx; | ||
2617 | |||
2618 | /* | ||
2619 | * Test if we are atomic. Since do_exit() needs to call into | ||
2620 | * schedule() atomically, we ignore that path for now. | ||
2621 | * Otherwise, whine if we are scheduling when we should not be. | ||
2622 | */ | ||
2623 | if (likely(!current->exit_state)) { | ||
2624 | if (unlikely(in_atomic())) { | ||
2625 | printk(KERN_ERR "scheduling while atomic: " | ||
2626 | "%s/0x%08x/%d\n", | ||
2627 | current->comm, preempt_count(), current->pid); | ||
2628 | dump_stack(); | ||
2629 | } | ||
2630 | } | ||
2631 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | ||
2632 | |||
2633 | need_resched: | ||
2634 | preempt_disable(); | ||
2635 | prev = current; | ||
2636 | release_kernel_lock(prev); | ||
2637 | need_resched_nonpreemptible: | ||
2638 | rq = this_rq(); | ||
2639 | |||
2640 | /* | ||
2641 | * The idle thread is not allowed to schedule! | ||
2642 | * Remove this check after it has been exercised a bit. | ||
2643 | */ | ||
2644 | if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { | ||
2645 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
2646 | dump_stack(); | ||
2647 | } | ||
2648 | |||
2649 | schedstat_inc(rq, sched_cnt); | ||
2650 | now = sched_clock(); | ||
2651 | if (likely((long long)now - prev->timestamp < NS_MAX_SLEEP_AVG)) { | ||
2652 | run_time = now - prev->timestamp; | ||
2653 | if (unlikely((long long)now - prev->timestamp < 0)) | ||
2654 | run_time = 0; | ||
2655 | } else | ||
2656 | run_time = NS_MAX_SLEEP_AVG; | ||
2657 | |||
2658 | /* | ||
2659 | * Tasks are charged proportionately less run_time at high sleep_avg to | ||
2660 | * delay the loss of their interactive status. | ||
2661 | */ | ||
2662 | run_time /= (CURRENT_BONUS(prev) ? : 1); | ||
2663 | |||
2664 | spin_lock_irq(&rq->lock); | ||
2665 | |||
2666 | if (unlikely(prev->flags & PF_DEAD)) | ||
2667 | prev->state = EXIT_DEAD; | ||
2668 | |||
2669 | switch_count = &prev->nivcsw; | ||
2670 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | ||
2671 | switch_count = &prev->nvcsw; | ||
2672 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | ||
2673 | unlikely(signal_pending(prev)))) | ||
2674 | prev->state = TASK_RUNNING; | ||
2675 | else { | ||
2676 | if (prev->state == TASK_UNINTERRUPTIBLE) | ||
2677 | rq->nr_uninterruptible++; | ||
2678 | deactivate_task(prev, rq); | ||
2679 | } | ||
2680 | } | ||
2681 | |||
2682 | cpu = smp_processor_id(); | ||
2683 | if (unlikely(!rq->nr_running)) { | ||
2684 | go_idle: | ||
2685 | idle_balance(cpu, rq); | ||
2686 | if (!rq->nr_running) { | ||
2687 | next = rq->idle; | ||
2688 | rq->expired_timestamp = 0; | ||
2689 | wake_sleeping_dependent(cpu, rq); | ||
2690 | /* | ||
2691 | * wake_sleeping_dependent() might have released | ||
2692 | * the runqueue, so break out if we got new | ||
2693 | * tasks meanwhile: | ||
2694 | */ | ||
2695 | if (!rq->nr_running) | ||
2696 | goto switch_tasks; | ||
2697 | } | ||
2698 | } else { | ||
2699 | if (dependent_sleeper(cpu, rq)) { | ||
2700 | next = rq->idle; | ||
2701 | goto switch_tasks; | ||
2702 | } | ||
2703 | /* | ||
2704 | * dependent_sleeper() releases and reacquires the runqueue | ||
2705 | * lock, hence go into the idle loop if the rq went | ||
2706 | * empty meanwhile: | ||
2707 | */ | ||
2708 | if (unlikely(!rq->nr_running)) | ||
2709 | goto go_idle; | ||
2710 | } | ||
2711 | |||
2712 | array = rq->active; | ||
2713 | if (unlikely(!array->nr_active)) { | ||
2714 | /* | ||
2715 | * Switch the active and expired arrays. | ||
2716 | */ | ||
2717 | schedstat_inc(rq, sched_switch); | ||
2718 | rq->active = rq->expired; | ||
2719 | rq->expired = array; | ||
2720 | array = rq->active; | ||
2721 | rq->expired_timestamp = 0; | ||
2722 | rq->best_expired_prio = MAX_PRIO; | ||
2723 | } | ||
2724 | |||
2725 | idx = sched_find_first_bit(array->bitmap); | ||
2726 | queue = array->queue + idx; | ||
2727 | next = list_entry(queue->next, task_t, run_list); | ||
2728 | |||
2729 | if (!rt_task(next) && next->activated > 0) { | ||
2730 | unsigned long long delta = now - next->timestamp; | ||
2731 | if (unlikely((long long)now - next->timestamp < 0)) | ||
2732 | delta = 0; | ||
2733 | |||
2734 | if (next->activated == 1) | ||
2735 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | ||
2736 | |||
2737 | array = next->array; | ||
2738 | dequeue_task(next, array); | ||
2739 | recalc_task_prio(next, next->timestamp + delta); | ||
2740 | enqueue_task(next, array); | ||
2741 | } | ||
2742 | next->activated = 0; | ||
2743 | switch_tasks: | ||
2744 | if (next == rq->idle) | ||
2745 | schedstat_inc(rq, sched_goidle); | ||
2746 | prefetch(next); | ||
2747 | clear_tsk_need_resched(prev); | ||
2748 | rcu_qsctr_inc(task_cpu(prev)); | ||
2749 | |||
2750 | update_cpu_clock(prev, rq, now); | ||
2751 | |||
2752 | prev->sleep_avg -= run_time; | ||
2753 | if ((long)prev->sleep_avg <= 0) | ||
2754 | prev->sleep_avg = 0; | ||
2755 | prev->timestamp = prev->last_ran = now; | ||
2756 | |||
2757 | sched_info_switch(prev, next); | ||
2758 | if (likely(prev != next)) { | ||
2759 | next->timestamp = now; | ||
2760 | rq->nr_switches++; | ||
2761 | rq->curr = next; | ||
2762 | ++*switch_count; | ||
2763 | |||
2764 | prepare_arch_switch(rq, next); | ||
2765 | prev = context_switch(rq, prev, next); | ||
2766 | barrier(); | ||
2767 | |||
2768 | finish_task_switch(prev); | ||
2769 | } else | ||
2770 | spin_unlock_irq(&rq->lock); | ||
2771 | |||
2772 | prev = current; | ||
2773 | if (unlikely(reacquire_kernel_lock(prev) < 0)) | ||
2774 | goto need_resched_nonpreemptible; | ||
2775 | preempt_enable_no_resched(); | ||
2776 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | ||
2777 | goto need_resched; | ||
2778 | } | ||
2779 | |||
2780 | EXPORT_SYMBOL(schedule); | ||
2781 | |||
2782 | #ifdef CONFIG_PREEMPT | ||
2783 | /* | ||
2784 | * This is the entry point to schedule() from in-kernel preemption | ||
2785 | * off of preempt_enable. Kernel preemptions off of return-from-interrupt | ||
2786 | * occur there and call schedule() directly. | ||
2787 | */ | ||
2788 | asmlinkage void __sched preempt_schedule(void) | ||
2789 | { | ||
2790 | struct thread_info *ti = current_thread_info(); | ||
2791 | #ifdef CONFIG_PREEMPT_BKL | ||
2792 | struct task_struct *task = current; | ||
2793 | int saved_lock_depth; | ||
2794 | #endif | ||
2795 | /* | ||
2796 | * If there is a non-zero preempt_count or interrupts are disabled, | ||
2797 | * we do not want to preempt the current task. Just return.. | ||
2798 | */ | ||
2799 | if (unlikely(ti->preempt_count || irqs_disabled())) | ||
2800 | return; | ||
2801 | |||
2802 | need_resched: | ||
2803 | add_preempt_count(PREEMPT_ACTIVE); | ||
2804 | /* | ||
2805 | * We keep the big kernel semaphore locked, but we | ||
2806 | * clear ->lock_depth so that schedule() doesn't | ||
2807 | * auto-release the semaphore: | ||
2808 | */ | ||
2809 | #ifdef CONFIG_PREEMPT_BKL | ||
2810 | saved_lock_depth = task->lock_depth; | ||
2811 | task->lock_depth = -1; | ||
2812 | #endif | ||
2813 | schedule(); | ||
2814 | #ifdef CONFIG_PREEMPT_BKL | ||
2815 | task->lock_depth = saved_lock_depth; | ||
2816 | #endif | ||
2817 | sub_preempt_count(PREEMPT_ACTIVE); | ||
2818 | |||
2819 | /* we could miss a preemption opportunity between schedule and now */ | ||
2820 | barrier(); | ||
2821 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | ||
2822 | goto need_resched; | ||
2823 | } | ||
2824 | |||
2825 | EXPORT_SYMBOL(preempt_schedule); | ||
2826 | |||
2827 | /* | ||
2828 | * This is the entry point to schedule() from kernel preemption | ||
2829 | * off of irq context. | ||
2830 | * Note that this is called and returns with irqs disabled. This | ||
2831 | * protects us against recursive calls from irq context. | ||
2832 | */ | ||
2833 | asmlinkage void __sched preempt_schedule_irq(void) | ||
2834 | { | ||
2835 | struct thread_info *ti = current_thread_info(); | ||
2836 | #ifdef CONFIG_PREEMPT_BKL | ||
2837 | struct task_struct *task = current; | ||
2838 | int saved_lock_depth; | ||
2839 | #endif | ||
2840 | /* Catch callers which need to be fixed */ | ||
2841 | BUG_ON(ti->preempt_count || !irqs_disabled()); | ||
2842 | |||
2843 | need_resched: | ||
2844 | add_preempt_count(PREEMPT_ACTIVE); | ||
2845 | /* | ||
2846 | * We keep the big kernel semaphore locked, but we | ||
2847 | * clear ->lock_depth so that schedule() doesn't | ||
2848 | * auto-release the semaphore: | ||
2849 | */ | ||
2850 | #ifdef CONFIG_PREEMPT_BKL | ||
2851 | saved_lock_depth = task->lock_depth; | ||
2852 | task->lock_depth = -1; | ||
2853 | #endif | ||
2854 | local_irq_enable(); | ||
2855 | schedule(); | ||
2856 | local_irq_disable(); | ||
2857 | #ifdef CONFIG_PREEMPT_BKL | ||
2858 | task->lock_depth = saved_lock_depth; | ||
2859 | #endif | ||
2860 | sub_preempt_count(PREEMPT_ACTIVE); | ||
2861 | |||
2862 | /* we could miss a preemption opportunity between schedule and now */ | ||
2863 | barrier(); | ||
2864 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | ||
2865 | goto need_resched; | ||
2866 | } | ||
2867 | |||
2868 | #endif /* CONFIG_PREEMPT */ | ||
2869 | |||
2870 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) | ||
2871 | { | ||
2872 | task_t *p = curr->task; | ||
2873 | return try_to_wake_up(p, mode, sync); | ||
2874 | } | ||
2875 | |||
2876 | EXPORT_SYMBOL(default_wake_function); | ||
2877 | |||
2878 | /* | ||
2879 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | ||
2880 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | ||
2881 | * number) then we wake all the non-exclusive tasks and one exclusive task. | ||
2882 | * | ||
2883 | * There are circumstances in which we can try to wake a task which has already | ||
2884 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | ||
2885 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | ||
2886 | */ | ||
2887 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | ||
2888 | int nr_exclusive, int sync, void *key) | ||
2889 | { | ||
2890 | struct list_head *tmp, *next; | ||
2891 | |||
2892 | list_for_each_safe(tmp, next, &q->task_list) { | ||
2893 | wait_queue_t *curr; | ||
2894 | unsigned flags; | ||
2895 | curr = list_entry(tmp, wait_queue_t, task_list); | ||
2896 | flags = curr->flags; | ||
2897 | if (curr->func(curr, mode, sync, key) && | ||
2898 | (flags & WQ_FLAG_EXCLUSIVE) && | ||
2899 | !--nr_exclusive) | ||
2900 | break; | ||
2901 | } | ||
2902 | } | ||
2903 | |||
2904 | /** | ||
2905 | * __wake_up - wake up threads blocked on a waitqueue. | ||
2906 | * @q: the waitqueue | ||
2907 | * @mode: which threads | ||
2908 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
2909 | */ | ||
2910 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | ||
2911 | int nr_exclusive, void *key) | ||
2912 | { | ||
2913 | unsigned long flags; | ||
2914 | |||
2915 | spin_lock_irqsave(&q->lock, flags); | ||
2916 | __wake_up_common(q, mode, nr_exclusive, 0, key); | ||
2917 | spin_unlock_irqrestore(&q->lock, flags); | ||
2918 | } | ||
2919 | |||
2920 | EXPORT_SYMBOL(__wake_up); | ||
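/*
 * Editor's illustrative sketch -- not part of the original sched.c diff.
 * A minimal producer/consumer pattern that reaches __wake_up() above via
 * the wake_up()/wait_event() wrappers in <linux/wait.h>; all names below
 * prefixed with hypothetical_ are hypothetical.
 */
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(hypothetical_wq);
static int hypothetical_data_ready;

static void hypothetical_producer(void)
{
	hypothetical_data_ready = 1;
	/* wakes all non-exclusive waiters plus one exclusive waiter */
	wake_up(&hypothetical_wq);
}

static void hypothetical_consumer(void)
{
	/* sleeps until the condition is true; woken via __wake_up_common() */
	wait_event(hypothetical_wq, hypothetical_data_ready != 0);
}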
2921 | |||
2922 | /* | ||
2923 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | ||
2924 | */ | ||
2925 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | ||
2926 | { | ||
2927 | __wake_up_common(q, mode, 1, 0, NULL); | ||
2928 | } | ||
2929 | |||
2930 | /** | ||
2931 | * __wake_up_sync - wake up threads blocked on a waitqueue. | ||
2932 | * @q: the waitqueue | ||
2933 | * @mode: which threads | ||
2934 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
2935 | * | ||
2936 | * The sync wakeup differs in that the waker knows that it will schedule | ||
2937 | * away soon, so while the target thread will be woken up, it will not | ||
2938 | * be migrated to another CPU - ie. the two threads are 'synchronized' | ||
2939 | * with each other. This can prevent needless bouncing between CPUs. | ||
2940 | * | ||
2941 | * On UP it can prevent extra preemption. | ||
2942 | */ | ||
2943 | void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
2944 | { | ||
2945 | unsigned long flags; | ||
2946 | int sync = 1; | ||
2947 | |||
2948 | if (unlikely(!q)) | ||
2949 | return; | ||
2950 | |||
2951 | if (unlikely(!nr_exclusive)) | ||
2952 | sync = 0; | ||
2953 | |||
2954 | spin_lock_irqsave(&q->lock, flags); | ||
2955 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); | ||
2956 | spin_unlock_irqrestore(&q->lock, flags); | ||
2957 | } | ||
2958 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | ||
2959 | |||
2960 | void fastcall complete(struct completion *x) | ||
2961 | { | ||
2962 | unsigned long flags; | ||
2963 | |||
2964 | spin_lock_irqsave(&x->wait.lock, flags); | ||
2965 | x->done++; | ||
2966 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | ||
2967 | 1, 0, NULL); | ||
2968 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
2969 | } | ||
2970 | EXPORT_SYMBOL(complete); | ||
2971 | |||
2972 | void fastcall complete_all(struct completion *x) | ||
2973 | { | ||
2974 | unsigned long flags; | ||
2975 | |||
2976 | spin_lock_irqsave(&x->wait.lock, flags); | ||
2977 | x->done += UINT_MAX/2; | ||
2978 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | ||
2979 | 0, 0, NULL); | ||
2980 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
2981 | } | ||
2982 | EXPORT_SYMBOL(complete_all); | ||
2983 | |||
2984 | void fastcall __sched wait_for_completion(struct completion *x) | ||
2985 | { | ||
2986 | might_sleep(); | ||
2987 | spin_lock_irq(&x->wait.lock); | ||
2988 | if (!x->done) { | ||
2989 | DECLARE_WAITQUEUE(wait, current); | ||
2990 | |||
2991 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
2992 | __add_wait_queue_tail(&x->wait, &wait); | ||
2993 | do { | ||
2994 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
2995 | spin_unlock_irq(&x->wait.lock); | ||
2996 | schedule(); | ||
2997 | spin_lock_irq(&x->wait.lock); | ||
2998 | } while (!x->done); | ||
2999 | __remove_wait_queue(&x->wait, &wait); | ||
3000 | } | ||
3001 | x->done--; | ||
3002 | spin_unlock_irq(&x->wait.lock); | ||
3003 | } | ||
3004 | EXPORT_SYMBOL(wait_for_completion); | ||
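/*
 * Editor's illustrative sketch -- not part of the original sched.c diff.
 * The typical pairing of complete() and wait_for_completion() above,
 * assuming a hypothetical driver whose worker thread signals that its
 * setup has finished.
 */
#include <linux/completion.h>

static DECLARE_COMPLETION(hypothetical_setup_done);

static int hypothetical_worker(void *unused)
{
	/* ... perform the initialization work ... */
	complete(&hypothetical_setup_done);	/* release exactly one waiter */
	return 0;
}

static void hypothetical_caller(void)
{
	/* blocks in TASK_UNINTERRUPTIBLE until the worker calls complete() */
	wait_for_completion(&hypothetical_setup_done);
}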
3005 | |||
3006 | unsigned long fastcall __sched | ||
3007 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
3008 | { | ||
3009 | might_sleep(); | ||
3010 | |||
3011 | spin_lock_irq(&x->wait.lock); | ||
3012 | if (!x->done) { | ||
3013 | DECLARE_WAITQUEUE(wait, current); | ||
3014 | |||
3015 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3016 | __add_wait_queue_tail(&x->wait, &wait); | ||
3017 | do { | ||
3018 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
3019 | spin_unlock_irq(&x->wait.lock); | ||
3020 | timeout = schedule_timeout(timeout); | ||
3021 | spin_lock_irq(&x->wait.lock); | ||
3022 | if (!timeout) { | ||
3023 | __remove_wait_queue(&x->wait, &wait); | ||
3024 | goto out; | ||
3025 | } | ||
3026 | } while (!x->done); | ||
3027 | __remove_wait_queue(&x->wait, &wait); | ||
3028 | } | ||
3029 | x->done--; | ||
3030 | out: | ||
3031 | spin_unlock_irq(&x->wait.lock); | ||
3032 | return timeout; | ||
3033 | } | ||
3034 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
3035 | |||
3036 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) | ||
3037 | { | ||
3038 | int ret = 0; | ||
3039 | |||
3040 | might_sleep(); | ||
3041 | |||
3042 | spin_lock_irq(&x->wait.lock); | ||
3043 | if (!x->done) { | ||
3044 | DECLARE_WAITQUEUE(wait, current); | ||
3045 | |||
3046 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3047 | __add_wait_queue_tail(&x->wait, &wait); | ||
3048 | do { | ||
3049 | if (signal_pending(current)) { | ||
3050 | ret = -ERESTARTSYS; | ||
3051 | __remove_wait_queue(&x->wait, &wait); | ||
3052 | goto out; | ||
3053 | } | ||
3054 | __set_current_state(TASK_INTERRUPTIBLE); | ||
3055 | spin_unlock_irq(&x->wait.lock); | ||
3056 | schedule(); | ||
3057 | spin_lock_irq(&x->wait.lock); | ||
3058 | } while (!x->done); | ||
3059 | __remove_wait_queue(&x->wait, &wait); | ||
3060 | } | ||
3061 | x->done--; | ||
3062 | out: | ||
3063 | spin_unlock_irq(&x->wait.lock); | ||
3064 | |||
3065 | return ret; | ||
3066 | } | ||
3067 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
3068 | |||
3069 | unsigned long fastcall __sched | ||
3070 | wait_for_completion_interruptible_timeout(struct completion *x, | ||
3071 | unsigned long timeout) | ||
3072 | { | ||
3073 | might_sleep(); | ||
3074 | |||
3075 | spin_lock_irq(&x->wait.lock); | ||
3076 | if (!x->done) { | ||
3077 | DECLARE_WAITQUEUE(wait, current); | ||
3078 | |||
3079 | wait.flags |= WQ_FLAG_EXCLUSIVE; | ||
3080 | __add_wait_queue_tail(&x->wait, &wait); | ||
3081 | do { | ||
3082 | if (signal_pending(current)) { | ||
3083 | timeout = -ERESTARTSYS; | ||
3084 | __remove_wait_queue(&x->wait, &wait); | ||
3085 | goto out; | ||
3086 | } | ||
3087 | __set_current_state(TASK_INTERRUPTIBLE); | ||
3088 | spin_unlock_irq(&x->wait.lock); | ||
3089 | timeout = schedule_timeout(timeout); | ||
3090 | spin_lock_irq(&x->wait.lock); | ||
3091 | if (!timeout) { | ||
3092 | __remove_wait_queue(&x->wait, &wait); | ||
3093 | goto out; | ||
3094 | } | ||
3095 | } while (!x->done); | ||
3096 | __remove_wait_queue(&x->wait, &wait); | ||
3097 | } | ||
3098 | x->done--; | ||
3099 | out: | ||
3100 | spin_unlock_irq(&x->wait.lock); | ||
3101 | return timeout; | ||
3102 | } | ||
3103 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
3104 | |||
3105 | |||
3106 | #define SLEEP_ON_VAR \ | ||
3107 | unsigned long flags; \ | ||
3108 | wait_queue_t wait; \ | ||
3109 | init_waitqueue_entry(&wait, current); | ||
3110 | |||
3111 | #define SLEEP_ON_HEAD \ | ||
3112 | spin_lock_irqsave(&q->lock,flags); \ | ||
3113 | __add_wait_queue(q, &wait); \ | ||
3114 | spin_unlock(&q->lock); | ||
3115 | |||
3116 | #define SLEEP_ON_TAIL \ | ||
3117 | spin_lock_irq(&q->lock); \ | ||
3118 | __remove_wait_queue(q, &wait); \ | ||
3119 | spin_unlock_irqrestore(&q->lock, flags); | ||
3120 | |||
3121 | void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | ||
3122 | { | ||
3123 | SLEEP_ON_VAR | ||
3124 | |||
3125 | current->state = TASK_INTERRUPTIBLE; | ||
3126 | |||
3127 | SLEEP_ON_HEAD | ||
3128 | schedule(); | ||
3129 | SLEEP_ON_TAIL | ||
3130 | } | ||
3131 | |||
3132 | EXPORT_SYMBOL(interruptible_sleep_on); | ||
3133 | |||
3134 | long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | ||
3135 | { | ||
3136 | SLEEP_ON_VAR | ||
3137 | |||
3138 | current->state = TASK_INTERRUPTIBLE; | ||
3139 | |||
3140 | SLEEP_ON_HEAD | ||
3141 | timeout = schedule_timeout(timeout); | ||
3142 | SLEEP_ON_TAIL | ||
3143 | |||
3144 | return timeout; | ||
3145 | } | ||
3146 | |||
3147 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | ||
3148 | |||
3149 | void fastcall __sched sleep_on(wait_queue_head_t *q) | ||
3150 | { | ||
3151 | SLEEP_ON_VAR | ||
3152 | |||
3153 | current->state = TASK_UNINTERRUPTIBLE; | ||
3154 | |||
3155 | SLEEP_ON_HEAD | ||
3156 | schedule(); | ||
3157 | SLEEP_ON_TAIL | ||
3158 | } | ||
3159 | |||
3160 | EXPORT_SYMBOL(sleep_on); | ||
3161 | |||
3162 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | ||
3163 | { | ||
3164 | SLEEP_ON_VAR | ||
3165 | |||
3166 | current->state = TASK_UNINTERRUPTIBLE; | ||
3167 | |||
3168 | SLEEP_ON_HEAD | ||
3169 | timeout = schedule_timeout(timeout); | ||
3170 | SLEEP_ON_TAIL | ||
3171 | |||
3172 | return timeout; | ||
3173 | } | ||
3174 | |||
3175 | EXPORT_SYMBOL(sleep_on_timeout); | ||
3176 | |||
3177 | void set_user_nice(task_t *p, long nice) | ||
3178 | { | ||
3179 | unsigned long flags; | ||
3180 | prio_array_t *array; | ||
3181 | runqueue_t *rq; | ||
3182 | int old_prio, new_prio, delta; | ||
3183 | |||
3184 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | ||
3185 | return; | ||
3186 | /* | ||
3187 | * We have to be careful, if called from sys_setpriority(), | ||
3188 | * the task might be in the middle of scheduling on another CPU. | ||
3189 | */ | ||
3190 | rq = task_rq_lock(p, &flags); | ||
3191 | /* | ||
3192 | * The RT priorities are set via sched_setscheduler(), but we still | ||
3193 | * allow the 'normal' nice value to be set - but as expected | ||
3194 | * it won't have any effect on scheduling until the task | ||
3195 | * becomes SCHED_NORMAL again: | ||
3196 | */ | ||
3197 | if (rt_task(p)) { | ||
3198 | p->static_prio = NICE_TO_PRIO(nice); | ||
3199 | goto out_unlock; | ||
3200 | } | ||
3201 | array = p->array; | ||
3202 | if (array) | ||
3203 | dequeue_task(p, array); | ||
3204 | |||
3205 | old_prio = p->prio; | ||
3206 | new_prio = NICE_TO_PRIO(nice); | ||
3207 | delta = new_prio - old_prio; | ||
3208 | p->static_prio = NICE_TO_PRIO(nice); | ||
3209 | p->prio += delta; | ||
3210 | |||
3211 | if (array) { | ||
3212 | enqueue_task(p, array); | ||
3213 | /* | ||
3214 | * If the task increased its priority or is running and | ||
3215 | * lowered its priority, then reschedule its CPU: | ||
3216 | */ | ||
3217 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | ||
3218 | resched_task(rq->curr); | ||
3219 | } | ||
3220 | out_unlock: | ||
3221 | task_rq_unlock(rq, &flags); | ||
3222 | } | ||
3223 | |||
3224 | EXPORT_SYMBOL(set_user_nice); | ||
3225 | |||
3226 | #ifdef __ARCH_WANT_SYS_NICE | ||
3227 | |||
3228 | /* | ||
3229 | * sys_nice - change the priority of the current process. | ||
3230 | * @increment: priority increment | ||
3231 | * | ||
3232 | * sys_setpriority is a more generic, but much slower function that | ||
3233 | * does similar things. | ||
3234 | */ | ||
3235 | asmlinkage long sys_nice(int increment) | ||
3236 | { | ||
3237 | int retval; | ||
3238 | long nice; | ||
3239 | |||
3240 | /* | ||
3241 | * Setpriority might change our priority at the same moment. | ||
3242 | * We don't have to worry. Conceptually one call occurs first | ||
3243 | * and we have a single winner. | ||
3244 | */ | ||
3245 | if (increment < 0) { | ||
3246 | if (!capable(CAP_SYS_NICE)) | ||
3247 | return -EPERM; | ||
3248 | if (increment < -40) | ||
3249 | increment = -40; | ||
3250 | } | ||
3251 | if (increment > 40) | ||
3252 | increment = 40; | ||
3253 | |||
3254 | nice = PRIO_TO_NICE(current->static_prio) + increment; | ||
3255 | if (nice < -20) | ||
3256 | nice = -20; | ||
3257 | if (nice > 19) | ||
3258 | nice = 19; | ||
3259 | |||
3260 | retval = security_task_setnice(current, nice); | ||
3261 | if (retval) | ||
3262 | return retval; | ||
3263 | |||
3264 | set_user_nice(current, nice); | ||
3265 | return 0; | ||
3266 | } | ||
3267 | |||
3268 | #endif | ||
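/*
 * Editor's illustrative sketch -- not part of the original sched.c diff.
 * The clamping arithmetic used by sys_nice() above, restated as a
 * standalone helper: the increment is limited to [-40, 40] and the
 * resulting nice value to the [-20, 19] range used throughout this file.
 */
static inline long hypothetical_clamped_nice(long cur_nice, long increment)
{
	long nice;

	if (increment < -40)
		increment = -40;
	if (increment > 40)
		increment = 40;

	nice = cur_nice + increment;
	if (nice < -20)
		nice = -20;
	if (nice > 19)
		nice = 19;

	return nice;	/* e.g. cur_nice = 0, increment = 100  ->  19 */
}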
3269 | |||
3270 | /** | ||
3271 | * task_prio - return the priority value of a given task. | ||
3272 | * @p: the task in question. | ||
3273 | * | ||
3274 | * This is the priority value as seen by users in /proc. | ||
3275 | * RT tasks are offset by -200. Normal tasks are centered | ||
3276 | * around 0, value goes from -16 to +15. | ||
3277 | */ | ||
3278 | int task_prio(const task_t *p) | ||
3279 | { | ||
3280 | return p->prio - MAX_RT_PRIO; | ||
3281 | } | ||
3282 | |||
3283 | /** | ||
3284 | * task_nice - return the nice value of a given task. | ||
3285 | * @p: the task in question. | ||
3286 | */ | ||
3287 | int task_nice(const task_t *p) | ||
3288 | { | ||
3289 | return TASK_NICE(p); | ||
3290 | } | ||
3291 | |||
3292 | /* | ||
3293 | * The only users of task_nice are binfmt_elf and binfmt_elf32. | ||
3294 | * binfmt_elf is no longer modular, but binfmt_elf32 still is. | ||
3295 | * Therefore, task_nice is needed if there is a compat_mode. | ||
3296 | */ | ||
3297 | #ifdef CONFIG_COMPAT | ||
3298 | EXPORT_SYMBOL_GPL(task_nice); | ||
3299 | #endif | ||
3300 | |||
3301 | /** | ||
3302 | * idle_cpu - is a given cpu idle currently? | ||
3303 | * @cpu: the processor in question. | ||
3304 | */ | ||
3305 | int idle_cpu(int cpu) | ||
3306 | { | ||
3307 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | ||
3308 | } | ||
3309 | |||
3310 | EXPORT_SYMBOL_GPL(idle_cpu); | ||
3311 | |||
3312 | /** | ||
3313 | * idle_task - return the idle task for a given cpu. | ||
3314 | * @cpu: the processor in question. | ||
3315 | */ | ||
3316 | task_t *idle_task(int cpu) | ||
3317 | { | ||
3318 | return cpu_rq(cpu)->idle; | ||
3319 | } | ||
3320 | |||
3321 | /** | ||
3322 | * find_process_by_pid - find a process with a matching PID value. | ||
3323 | * @pid: the pid in question. | ||
3324 | */ | ||
3325 | static inline task_t *find_process_by_pid(pid_t pid) | ||
3326 | { | ||
3327 | return pid ? find_task_by_pid(pid) : current; | ||
3328 | } | ||
3329 | |||
3330 | /* Actually do priority change: must hold rq lock. */ | ||
3331 | static void __setscheduler(struct task_struct *p, int policy, int prio) | ||
3332 | { | ||
3333 | BUG_ON(p->array); | ||
3334 | p->policy = policy; | ||
3335 | p->rt_priority = prio; | ||
3336 | if (policy != SCHED_NORMAL) | ||
3337 | p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; | ||
3338 | else | ||
3339 | p->prio = p->static_prio; | ||
3340 | } | ||
3341 | |||
3342 | /** | ||
3343 | * sched_setscheduler - change the scheduling policy and/or RT priority of | ||
3344 | * a thread. | ||
3345 | * @p: the task in question. | ||
3346 | * @policy: new policy. | ||
3347 | * @param: structure containing the new RT priority. | ||
3348 | */ | ||
3349 | int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) | ||
3350 | { | ||
3351 | int retval; | ||
3352 | int oldprio, oldpolicy = -1; | ||
3353 | prio_array_t *array; | ||
3354 | unsigned long flags; | ||
3355 | runqueue_t *rq; | ||
3356 | |||
3357 | recheck: | ||
3358 | /* double check policy once rq lock held */ | ||
3359 | if (policy < 0) | ||
3360 | policy = oldpolicy = p->policy; | ||
3361 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
3362 | policy != SCHED_NORMAL) | ||
3363 | return -EINVAL; | ||
3364 | /* | ||
3365 | * Valid priorities for SCHED_FIFO and SCHED_RR are | ||
3366 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. | ||
3367 | */ | ||
3368 | if (param->sched_priority < 0 || | ||
3369 | param->sched_priority > MAX_USER_RT_PRIO-1) | ||
3370 | return -EINVAL; | ||
3371 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) | ||
3372 | return -EINVAL; | ||
3373 | |||
3374 | if ((policy == SCHED_FIFO || policy == SCHED_RR) && | ||
3375 | !capable(CAP_SYS_NICE)) | ||
3376 | return -EPERM; | ||
3377 | if ((current->euid != p->euid) && (current->euid != p->uid) && | ||
3378 | !capable(CAP_SYS_NICE)) | ||
3379 | return -EPERM; | ||
3380 | |||
3381 | retval = security_task_setscheduler(p, policy, param); | ||
3382 | if (retval) | ||
3383 | return retval; | ||
3384 | /* | ||
3385 | * To be able to change p->policy safely, the appropriate | ||
3386 | * runqueue lock must be held. | ||
3387 | */ | ||
3388 | rq = task_rq_lock(p, &flags); | ||
3389 | /* recheck policy now with rq lock held */ | ||
3390 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | ||
3391 | policy = oldpolicy = -1; | ||
3392 | task_rq_unlock(rq, &flags); | ||
3393 | goto recheck; | ||
3394 | } | ||
3395 | array = p->array; | ||
3396 | if (array) | ||
3397 | deactivate_task(p, rq); | ||
3398 | oldprio = p->prio; | ||
3399 | __setscheduler(p, policy, param->sched_priority); | ||
3400 | if (array) { | ||
3401 | __activate_task(p, rq); | ||
3402 | /* | ||
3403 | * Reschedule if we are currently running on this runqueue and | ||
3404 | * our priority decreased, or if we are not currently running on | ||
3405 | * this runqueue and our priority is higher than the current task's | ||
3406 | */ | ||
3407 | if (task_running(rq, p)) { | ||
3408 | if (p->prio > oldprio) | ||
3409 | resched_task(rq->curr); | ||
3410 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
3411 | resched_task(rq->curr); | ||
3412 | } | ||
3413 | task_rq_unlock(rq, &flags); | ||
3414 | return 0; | ||
3415 | } | ||
3416 | EXPORT_SYMBOL_GPL(sched_setscheduler); | ||
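/*
 * Editor's illustrative sketch -- not part of the original sched.c diff.
 * In-kernel use of the exported sched_setscheduler() above to make a
 * kernel thread real-time, similar in spirit to how migration_call()
 * later boosts the per-CPU migration thread (which holds the rq lock
 * and calls __setscheduler() directly).
 */
static int hypothetical_make_fifo(struct task_struct *p)
{
	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO - 1 };

	/* SCHED_FIFO at the highest priority available to user tasks */
	return sched_setscheduler(p, SCHED_FIFO, &param);
}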
3417 | |||
3418 | static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | ||
3419 | { | ||
3420 | int retval; | ||
3421 | struct sched_param lparam; | ||
3422 | struct task_struct *p; | ||
3423 | |||
3424 | if (!param || pid < 0) | ||
3425 | return -EINVAL; | ||
3426 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) | ||
3427 | return -EFAULT; | ||
3428 | read_lock_irq(&tasklist_lock); | ||
3429 | p = find_process_by_pid(pid); | ||
3430 | if (!p) { | ||
3431 | read_unlock_irq(&tasklist_lock); | ||
3432 | return -ESRCH; | ||
3433 | } | ||
3434 | retval = sched_setscheduler(p, policy, &lparam); | ||
3435 | read_unlock_irq(&tasklist_lock); | ||
3436 | return retval; | ||
3437 | } | ||
3438 | |||
3439 | /** | ||
3440 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | ||
3441 | * @pid: the pid in question. | ||
3442 | * @policy: new policy. | ||
3443 | * @param: structure containing the new RT priority. | ||
3444 | */ | ||
3445 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | ||
3446 | struct sched_param __user *param) | ||
3447 | { | ||
3448 | return do_sched_setscheduler(pid, policy, param); | ||
3449 | } | ||
3450 | |||
3451 | /** | ||
3452 | * sys_sched_setparam - set/change the RT priority of a thread | ||
3453 | * @pid: the pid in question. | ||
3454 | * @param: structure containing the new RT priority. | ||
3455 | */ | ||
3456 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | ||
3457 | { | ||
3458 | return do_sched_setscheduler(pid, -1, param); | ||
3459 | } | ||
3460 | |||
3461 | /** | ||
3462 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | ||
3463 | * @pid: the pid in question. | ||
3464 | */ | ||
3465 | asmlinkage long sys_sched_getscheduler(pid_t pid) | ||
3466 | { | ||
3467 | int retval = -EINVAL; | ||
3468 | task_t *p; | ||
3469 | |||
3470 | if (pid < 0) | ||
3471 | goto out_nounlock; | ||
3472 | |||
3473 | retval = -ESRCH; | ||
3474 | read_lock(&tasklist_lock); | ||
3475 | p = find_process_by_pid(pid); | ||
3476 | if (p) { | ||
3477 | retval = security_task_getscheduler(p); | ||
3478 | if (!retval) | ||
3479 | retval = p->policy; | ||
3480 | } | ||
3481 | read_unlock(&tasklist_lock); | ||
3482 | |||
3483 | out_nounlock: | ||
3484 | return retval; | ||
3485 | } | ||
3486 | |||
3487 | /** | ||
3488 | * sys_sched_getparam - get the RT priority of a thread | ||
3489 | * @pid: the pid in question. | ||
3490 | * @param: structure containing the RT priority. | ||
3491 | */ | ||
3492 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | ||
3493 | { | ||
3494 | struct sched_param lp; | ||
3495 | int retval = -EINVAL; | ||
3496 | task_t *p; | ||
3497 | |||
3498 | if (!param || pid < 0) | ||
3499 | goto out_nounlock; | ||
3500 | |||
3501 | read_lock(&tasklist_lock); | ||
3502 | p = find_process_by_pid(pid); | ||
3503 | retval = -ESRCH; | ||
3504 | if (!p) | ||
3505 | goto out_unlock; | ||
3506 | |||
3507 | retval = security_task_getscheduler(p); | ||
3508 | if (retval) | ||
3509 | goto out_unlock; | ||
3510 | |||
3511 | lp.sched_priority = p->rt_priority; | ||
3512 | read_unlock(&tasklist_lock); | ||
3513 | |||
3514 | /* | ||
3515 | * This one might sleep; we cannot do it with a spinlock held ... | ||
3516 | */ | ||
3517 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | ||
3518 | |||
3519 | out_nounlock: | ||
3520 | return retval; | ||
3521 | |||
3522 | out_unlock: | ||
3523 | read_unlock(&tasklist_lock); | ||
3524 | return retval; | ||
3525 | } | ||
3526 | |||
3527 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | ||
3528 | { | ||
3529 | task_t *p; | ||
3530 | int retval; | ||
3531 | cpumask_t cpus_allowed; | ||
3532 | |||
3533 | lock_cpu_hotplug(); | ||
3534 | read_lock(&tasklist_lock); | ||
3535 | |||
3536 | p = find_process_by_pid(pid); | ||
3537 | if (!p) { | ||
3538 | read_unlock(&tasklist_lock); | ||
3539 | unlock_cpu_hotplug(); | ||
3540 | return -ESRCH; | ||
3541 | } | ||
3542 | |||
3543 | /* | ||
3544 | * It is not safe to call set_cpus_allowed with the | ||
3545 | * tasklist_lock held. We will bump the task_struct's | ||
3546 | * usage count and then drop tasklist_lock. | ||
3547 | */ | ||
3548 | get_task_struct(p); | ||
3549 | read_unlock(&tasklist_lock); | ||
3550 | |||
3551 | retval = -EPERM; | ||
3552 | if ((current->euid != p->euid) && (current->euid != p->uid) && | ||
3553 | !capable(CAP_SYS_NICE)) | ||
3554 | goto out_unlock; | ||
3555 | |||
3556 | cpus_allowed = cpuset_cpus_allowed(p); | ||
3557 | cpus_and(new_mask, new_mask, cpus_allowed); | ||
3558 | retval = set_cpus_allowed(p, new_mask); | ||
3559 | |||
3560 | out_unlock: | ||
3561 | put_task_struct(p); | ||
3562 | unlock_cpu_hotplug(); | ||
3563 | return retval; | ||
3564 | } | ||
3565 | |||
3566 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | ||
3567 | cpumask_t *new_mask) | ||
3568 | { | ||
3569 | if (len < sizeof(cpumask_t)) { | ||
3570 | memset(new_mask, 0, sizeof(cpumask_t)); | ||
3571 | } else if (len > sizeof(cpumask_t)) { | ||
3572 | len = sizeof(cpumask_t); | ||
3573 | } | ||
3574 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; | ||
3575 | } | ||
3576 | |||
3577 | /** | ||
3578 | * sys_sched_setaffinity - set the cpu affinity of a process | ||
3579 | * @pid: pid of the process | ||
3580 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | ||
3581 | * @user_mask_ptr: user-space pointer to the new cpu mask | ||
3582 | */ | ||
3583 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | ||
3584 | unsigned long __user *user_mask_ptr) | ||
3585 | { | ||
3586 | cpumask_t new_mask; | ||
3587 | int retval; | ||
3588 | |||
3589 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); | ||
3590 | if (retval) | ||
3591 | return retval; | ||
3592 | |||
3593 | return sched_setaffinity(pid, new_mask); | ||
3594 | } | ||
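/*
 * Editor's illustrative sketch -- not part of the original sched.c diff.
 * Pinning a task to a single CPU through the sched_setaffinity() helper
 * above; only the cpumask primitives are real, the helper name is
 * hypothetical.
 */
static long hypothetical_pin_to_cpu(pid_t pid, int cpu)
{
	cpumask_t mask = CPU_MASK_NONE;

	cpu_set(cpu, mask);			/* allow only 'cpu' */
	return sched_setaffinity(pid, mask);	/* mask is passed by value */
}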
3595 | |||
3596 | /* | ||
3597 | * Represents all CPUs present in the system. | ||
3598 | * In systems capable of hotplug, this map could dynamically grow | ||
3599 | * as new CPUs are detected in the system via any platform-specific | ||
3600 | * method, such as ACPI. | ||
3601 | */ | ||
3602 | |||
3603 | cpumask_t cpu_present_map; | ||
3604 | EXPORT_SYMBOL(cpu_present_map); | ||
3605 | |||
3606 | #ifndef CONFIG_SMP | ||
3607 | cpumask_t cpu_online_map = CPU_MASK_ALL; | ||
3608 | cpumask_t cpu_possible_map = CPU_MASK_ALL; | ||
3609 | #endif | ||
3610 | |||
3611 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | ||
3612 | { | ||
3613 | int retval; | ||
3614 | task_t *p; | ||
3615 | |||
3616 | lock_cpu_hotplug(); | ||
3617 | read_lock(&tasklist_lock); | ||
3618 | |||
3619 | retval = -ESRCH; | ||
3620 | p = find_process_by_pid(pid); | ||
3621 | if (!p) | ||
3622 | goto out_unlock; | ||
3623 | |||
3624 | retval = 0; | ||
3625 | cpus_and(*mask, p->cpus_allowed, cpu_possible_map); | ||
3626 | |||
3627 | out_unlock: | ||
3628 | read_unlock(&tasklist_lock); | ||
3629 | unlock_cpu_hotplug(); | ||
3630 | if (retval) | ||
3631 | return retval; | ||
3632 | |||
3633 | return 0; | ||
3634 | } | ||
3635 | |||
3636 | /** | ||
3637 | * sys_sched_getaffinity - get the cpu affinity of a process | ||
3638 | * @pid: pid of the process | ||
3639 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | ||
3640 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | ||
3641 | */ | ||
3642 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | ||
3643 | unsigned long __user *user_mask_ptr) | ||
3644 | { | ||
3645 | int ret; | ||
3646 | cpumask_t mask; | ||
3647 | |||
3648 | if (len < sizeof(cpumask_t)) | ||
3649 | return -EINVAL; | ||
3650 | |||
3651 | ret = sched_getaffinity(pid, &mask); | ||
3652 | if (ret < 0) | ||
3653 | return ret; | ||
3654 | |||
3655 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) | ||
3656 | return -EFAULT; | ||
3657 | |||
3658 | return sizeof(cpumask_t); | ||
3659 | } | ||
3660 | |||
3661 | /** | ||
3662 | * sys_sched_yield - yield the current processor to other threads. | ||
3663 | * | ||
3664 | * this function yields the current CPU by moving the calling thread | ||
3665 | * to the expired array. If there are no other threads running on this | ||
3666 | * CPU then this function will return. | ||
3667 | */ | ||
3668 | asmlinkage long sys_sched_yield(void) | ||
3669 | { | ||
3670 | runqueue_t *rq = this_rq_lock(); | ||
3671 | prio_array_t *array = current->array; | ||
3672 | prio_array_t *target = rq->expired; | ||
3673 | |||
3674 | schedstat_inc(rq, yld_cnt); | ||
3675 | /* | ||
3676 | * We implement yielding by moving the task into the expired | ||
3677 | * queue. | ||
3678 | * | ||
3679 | * (special rule: RT tasks will just round-robin in the active | ||
3680 | * array.) | ||
3681 | */ | ||
3682 | if (rt_task(current)) | ||
3683 | target = rq->active; | ||
3684 | |||
3685 | if (current->array->nr_active == 1) { | ||
3686 | schedstat_inc(rq, yld_act_empty); | ||
3687 | if (!rq->expired->nr_active) | ||
3688 | schedstat_inc(rq, yld_both_empty); | ||
3689 | } else if (!rq->expired->nr_active) | ||
3690 | schedstat_inc(rq, yld_exp_empty); | ||
3691 | |||
3692 | if (array != target) { | ||
3693 | dequeue_task(current, array); | ||
3694 | enqueue_task(current, target); | ||
3695 | } else | ||
3696 | /* | ||
3697 | * requeue_task is cheaper so perform that if possible. | ||
3698 | */ | ||
3699 | requeue_task(current, array); | ||
3700 | |||
3701 | /* | ||
3702 | * Since we are going to call schedule() anyway, there's | ||
3703 | * no need to preempt or enable interrupts: | ||
3704 | */ | ||
3705 | __release(rq->lock); | ||
3706 | _raw_spin_unlock(&rq->lock); | ||
3707 | preempt_enable_no_resched(); | ||
3708 | |||
3709 | schedule(); | ||
3710 | |||
3711 | return 0; | ||
3712 | } | ||
3713 | |||
3714 | static inline void __cond_resched(void) | ||
3715 | { | ||
3716 | do { | ||
3717 | add_preempt_count(PREEMPT_ACTIVE); | ||
3718 | schedule(); | ||
3719 | sub_preempt_count(PREEMPT_ACTIVE); | ||
3720 | } while (need_resched()); | ||
3721 | } | ||
3722 | |||
3723 | int __sched cond_resched(void) | ||
3724 | { | ||
3725 | if (need_resched()) { | ||
3726 | __cond_resched(); | ||
3727 | return 1; | ||
3728 | } | ||
3729 | return 0; | ||
3730 | } | ||
3731 | |||
3732 | EXPORT_SYMBOL(cond_resched); | ||
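/*
 * Editor's illustrative sketch -- not part of the original sched.c diff.
 * The usual way cond_resched() above is used from a long-running loop in
 * process context; the loop itself is hypothetical.
 */
static void hypothetical_long_loop(unsigned long iterations)
{
	unsigned long i;

	for (i = 0; i < iterations; i++) {
		/* ... one bounded unit of work ... */
		cond_resched();		/* reschedule if TIF_NEED_RESCHED is set */
	}
}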
3733 | |||
3734 | /* | ||
3735 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | ||
3736 | * call schedule, and on return reacquire the lock. | ||
3737 | * | ||
3738 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | ||
3739 | * operations here to prevent schedule() from being called twice (once via | ||
3740 | * spin_unlock(), once by hand). | ||
3741 | */ | ||
3742 | int cond_resched_lock(spinlock_t * lock) | ||
3743 | { | ||
3744 | if (need_lockbreak(lock)) { | ||
3745 | spin_unlock(lock); | ||
3746 | cpu_relax(); | ||
3747 | spin_lock(lock); | ||
3748 | } | ||
3749 | if (need_resched()) { | ||
3750 | _raw_spin_unlock(lock); | ||
3751 | preempt_enable_no_resched(); | ||
3752 | __cond_resched(); | ||
3753 | spin_lock(lock); | ||
3754 | return 1; | ||
3755 | } | ||
3756 | return 0; | ||
3757 | } | ||
3758 | |||
3759 | EXPORT_SYMBOL(cond_resched_lock); | ||
3760 | |||
3761 | int __sched cond_resched_softirq(void) | ||
3762 | { | ||
3763 | BUG_ON(!in_softirq()); | ||
3764 | |||
3765 | if (need_resched()) { | ||
3766 | __local_bh_enable(); | ||
3767 | __cond_resched(); | ||
3768 | local_bh_disable(); | ||
3769 | return 1; | ||
3770 | } | ||
3771 | return 0; | ||
3772 | } | ||
3773 | |||
3774 | EXPORT_SYMBOL(cond_resched_softirq); | ||
3775 | |||
3776 | |||
3777 | /** | ||
3778 | * yield - yield the current processor to other threads. | ||
3779 | * | ||
3780 | * this is a shortcut for kernel-space yielding - it marks the | ||
3781 | * thread runnable and calls sys_sched_yield(). | ||
3782 | */ | ||
3783 | void __sched yield(void) | ||
3784 | { | ||
3785 | set_current_state(TASK_RUNNING); | ||
3786 | sys_sched_yield(); | ||
3787 | } | ||
3788 | |||
3789 | EXPORT_SYMBOL(yield); | ||
3790 | |||
3791 | /* | ||
3792 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | ||
3793 | * that process accounting knows that this is a task in IO wait state. | ||
3794 | * | ||
3795 | * But don't do that if it is a deliberate, throttling IO wait (this task | ||
3796 | * has set its backing_dev_info: the queue against which it should throttle) | ||
3797 | */ | ||
3798 | void __sched io_schedule(void) | ||
3799 | { | ||
3800 | struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); | ||
3801 | |||
3802 | atomic_inc(&rq->nr_iowait); | ||
3803 | schedule(); | ||
3804 | atomic_dec(&rq->nr_iowait); | ||
3805 | } | ||
3806 | |||
3807 | EXPORT_SYMBOL(io_schedule); | ||
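/*
 * Editor's illustrative sketch -- not part of the original sched.c diff.
 * One common way a sleeper uses io_schedule() above so that its wait is
 * counted in rq->nr_iowait; the wait-queue head and the condition flag
 * are hypothetical.
 */
static void hypothetical_wait_for_io(wait_queue_head_t *wq, int *io_done)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		if (*io_done)
			break;
		io_schedule();		/* accounted as iowait, then sleeps */
	}
	finish_wait(wq, &wait);
}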
3808 | |||
3809 | long __sched io_schedule_timeout(long timeout) | ||
3810 | { | ||
3811 | struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); | ||
3812 | long ret; | ||
3813 | |||
3814 | atomic_inc(&rq->nr_iowait); | ||
3815 | ret = schedule_timeout(timeout); | ||
3816 | atomic_dec(&rq->nr_iowait); | ||
3817 | return ret; | ||
3818 | } | ||
3819 | |||
3820 | /** | ||
3821 | * sys_sched_get_priority_max - return maximum RT priority. | ||
3822 | * @policy: scheduling class. | ||
3823 | * | ||
3824 | * this syscall returns the maximum rt_priority that can be used | ||
3825 | * by a given scheduling class. | ||
3826 | */ | ||
3827 | asmlinkage long sys_sched_get_priority_max(int policy) | ||
3828 | { | ||
3829 | int ret = -EINVAL; | ||
3830 | |||
3831 | switch (policy) { | ||
3832 | case SCHED_FIFO: | ||
3833 | case SCHED_RR: | ||
3834 | ret = MAX_USER_RT_PRIO-1; | ||
3835 | break; | ||
3836 | case SCHED_NORMAL: | ||
3837 | ret = 0; | ||
3838 | break; | ||
3839 | } | ||
3840 | return ret; | ||
3841 | } | ||
3842 | |||
3843 | /** | ||
3844 | * sys_sched_get_priority_min - return minimum RT priority. | ||
3845 | * @policy: scheduling class. | ||
3846 | * | ||
3847 | * this syscall returns the minimum rt_priority that can be used | ||
3848 | * by a given scheduling class. | ||
3849 | */ | ||
3850 | asmlinkage long sys_sched_get_priority_min(int policy) | ||
3851 | { | ||
3852 | int ret = -EINVAL; | ||
3853 | |||
3854 | switch (policy) { | ||
3855 | case SCHED_FIFO: | ||
3856 | case SCHED_RR: | ||
3857 | ret = 1; | ||
3858 | break; | ||
3859 | case SCHED_NORMAL: | ||
3860 | ret = 0; | ||
3861 | } | ||
3862 | return ret; | ||
3863 | } | ||
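/*
 * Editor's note -- worked values for the two syscalls above, assuming the
 * usual MAX_USER_RT_PRIO of 100:
 *
 *	policy		get_priority_max	get_priority_min
 *	SCHED_FIFO	99			1
 *	SCHED_RR	99			1
 *	SCHED_NORMAL	0			0
 *
 * Any other policy value yields -EINVAL from both.
 */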
3864 | |||
3865 | /** | ||
3866 | * sys_sched_rr_get_interval - return the default timeslice of a process. | ||
3867 | * @pid: pid of the process. | ||
3868 | * @interval: userspace pointer to the timeslice value. | ||
3869 | * | ||
3870 | * this syscall writes the default timeslice value of a given process | ||
3871 | * into the user-space timespec buffer. A value of '0' means infinity. | ||
3872 | */ | ||
3873 | asmlinkage | ||
3874 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | ||
3875 | { | ||
3876 | int retval = -EINVAL; | ||
3877 | struct timespec t; | ||
3878 | task_t *p; | ||
3879 | |||
3880 | if (pid < 0) | ||
3881 | goto out_nounlock; | ||
3882 | |||
3883 | retval = -ESRCH; | ||
3884 | read_lock(&tasklist_lock); | ||
3885 | p = find_process_by_pid(pid); | ||
3886 | if (!p) | ||
3887 | goto out_unlock; | ||
3888 | |||
3889 | retval = security_task_getscheduler(p); | ||
3890 | if (retval) | ||
3891 | goto out_unlock; | ||
3892 | |||
3893 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | ||
3894 | 0 : task_timeslice(p), &t); | ||
3895 | read_unlock(&tasklist_lock); | ||
3896 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | ||
3897 | out_nounlock: | ||
3898 | return retval; | ||
3899 | out_unlock: | ||
3900 | read_unlock(&tasklist_lock); | ||
3901 | return retval; | ||
3902 | } | ||
3903 | |||
3904 | static inline struct task_struct *eldest_child(struct task_struct *p) | ||
3905 | { | ||
3906 | if (list_empty(&p->children)) return NULL; | ||
3907 | return list_entry(p->children.next, struct task_struct, sibling); | ||
3908 | } | ||
3909 | |||
3910 | static inline struct task_struct *older_sibling(struct task_struct *p) | ||
3911 | { | ||
3912 | if (p->sibling.prev == &p->parent->children) return NULL; | ||
3913 | return list_entry(p->sibling.prev, struct task_struct, sibling); | ||
3914 | } | ||
3915 | |||
3916 | static inline struct task_struct *younger_sibling(struct task_struct *p) | ||
3917 | { | ||
3918 | if (p->sibling.next == &p->parent->children) return NULL; | ||
3919 | return list_entry(p->sibling.next, struct task_struct, sibling); | ||
3920 | } | ||
3921 | |||
3922 | static void show_task(task_t * p) | ||
3923 | { | ||
3924 | task_t *relative; | ||
3925 | unsigned state; | ||
3926 | unsigned long free = 0; | ||
3927 | static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; | ||
3928 | |||
3929 | printk("%-13.13s ", p->comm); | ||
3930 | state = p->state ? __ffs(p->state) + 1 : 0; | ||
3931 | if (state < ARRAY_SIZE(stat_nam)) | ||
3932 | printk(stat_nam[state]); | ||
3933 | else | ||
3934 | printk("?"); | ||
3935 | #if (BITS_PER_LONG == 32) | ||
3936 | if (state == TASK_RUNNING) | ||
3937 | printk(" running "); | ||
3938 | else | ||
3939 | printk(" %08lX ", thread_saved_pc(p)); | ||
3940 | #else | ||
3941 | if (state == TASK_RUNNING) | ||
3942 | printk(" running task "); | ||
3943 | else | ||
3944 | printk(" %016lx ", thread_saved_pc(p)); | ||
3945 | #endif | ||
3946 | #ifdef CONFIG_DEBUG_STACK_USAGE | ||
3947 | { | ||
3948 | unsigned long * n = (unsigned long *) (p->thread_info+1); | ||
3949 | while (!*n) | ||
3950 | n++; | ||
3951 | free = (unsigned long) n - (unsigned long)(p->thread_info+1); | ||
3952 | } | ||
3953 | #endif | ||
3954 | printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); | ||
3955 | if ((relative = eldest_child(p))) | ||
3956 | printk("%5d ", relative->pid); | ||
3957 | else | ||
3958 | printk(" "); | ||
3959 | if ((relative = younger_sibling(p))) | ||
3960 | printk("%7d", relative->pid); | ||
3961 | else | ||
3962 | printk(" "); | ||
3963 | if ((relative = older_sibling(p))) | ||
3964 | printk(" %5d", relative->pid); | ||
3965 | else | ||
3966 | printk(" "); | ||
3967 | if (!p->mm) | ||
3968 | printk(" (L-TLB)\n"); | ||
3969 | else | ||
3970 | printk(" (NOTLB)\n"); | ||
3971 | |||
3972 | if (state != TASK_RUNNING) | ||
3973 | show_stack(p, NULL); | ||
3974 | } | ||
3975 | |||
3976 | void show_state(void) | ||
3977 | { | ||
3978 | task_t *g, *p; | ||
3979 | |||
3980 | #if (BITS_PER_LONG == 32) | ||
3981 | printk("\n" | ||
3982 | " sibling\n"); | ||
3983 | printk(" task PC pid father child younger older\n"); | ||
3984 | #else | ||
3985 | printk("\n" | ||
3986 | " sibling\n"); | ||
3987 | printk(" task PC pid father child younger older\n"); | ||
3988 | #endif | ||
3989 | read_lock(&tasklist_lock); | ||
3990 | do_each_thread(g, p) { | ||
3991 | /* | ||
3992 | * reset the NMI-timeout, listing all tasks on a slow | ||
3993 | * console might take a lot of time: | ||
3994 | */ | ||
3995 | touch_nmi_watchdog(); | ||
3996 | show_task(p); | ||
3997 | } while_each_thread(g, p); | ||
3998 | |||
3999 | read_unlock(&tasklist_lock); | ||
4000 | } | ||
4001 | |||
4002 | void __devinit init_idle(task_t *idle, int cpu) | ||
4003 | { | ||
4004 | runqueue_t *rq = cpu_rq(cpu); | ||
4005 | unsigned long flags; | ||
4006 | |||
4007 | idle->sleep_avg = 0; | ||
4008 | idle->array = NULL; | ||
4009 | idle->prio = MAX_PRIO; | ||
4010 | idle->state = TASK_RUNNING; | ||
4011 | idle->cpus_allowed = cpumask_of_cpu(cpu); | ||
4012 | set_task_cpu(idle, cpu); | ||
4013 | |||
4014 | spin_lock_irqsave(&rq->lock, flags); | ||
4015 | rq->curr = rq->idle = idle; | ||
4016 | set_tsk_need_resched(idle); | ||
4017 | spin_unlock_irqrestore(&rq->lock, flags); | ||
4018 | |||
4019 | /* Set the preempt count _outside_ the spinlocks! */ | ||
4020 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | ||
4021 | idle->thread_info->preempt_count = (idle->lock_depth >= 0); | ||
4022 | #else | ||
4023 | idle->thread_info->preempt_count = 0; | ||
4024 | #endif | ||
4025 | } | ||
4026 | |||
4027 | /* | ||
4028 | * In a system that switches off the HZ timer nohz_cpu_mask | ||
4029 | * indicates which cpus entered this state. This is used | ||
4030 | * in the rcu update to wait only for active cpus. For system | ||
4031 | * which do not switch off the HZ timer nohz_cpu_mask should | ||
4032 | * always be CPU_MASK_NONE. | ||
4033 | */ | ||
4034 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | ||
4035 | |||
4036 | #ifdef CONFIG_SMP | ||
4037 | /* | ||
4038 | * This is how migration works: | ||
4039 | * | ||
4040 | * 1) we queue a migration_req_t structure in the source CPU's | ||
4041 | * runqueue and wake up that CPU's migration thread. | ||
4042 | * 2) we down() the locked semaphore => thread blocks. | ||
4043 | * 3) migration thread wakes up (implicitly it forces the migrated | ||
4044 | * thread off the CPU) | ||
4045 | * 4) it gets the migration request and checks whether the migrated | ||
4046 | * task is still in the wrong runqueue. | ||
4047 | * 5) if it's in the wrong runqueue then the migration thread removes | ||
4048 | * it and puts it into the right queue. | ||
4049 | * 6) migration thread up()s the semaphore. | ||
4050 | * 7) we wake up and the migration is done. | ||
4051 | */ | ||
4052 | |||
4053 | /* | ||
4054 | * Change a given task's CPU affinity. Migrate the thread to a | ||
4055 | * proper CPU and schedule it away if the CPU it's executing on | ||
4056 | * is removed from the allowed bitmask. | ||
4057 | * | ||
4058 | * NOTE: the caller must have a valid reference to the task, the | ||
4059 | * task must not exit() & deallocate itself prematurely. The | ||
4060 | * call is not atomic; no spinlocks may be held. | ||
4061 | */ | ||
4062 | int set_cpus_allowed(task_t *p, cpumask_t new_mask) | ||
4063 | { | ||
4064 | unsigned long flags; | ||
4065 | int ret = 0; | ||
4066 | migration_req_t req; | ||
4067 | runqueue_t *rq; | ||
4068 | |||
4069 | rq = task_rq_lock(p, &flags); | ||
4070 | if (!cpus_intersects(new_mask, cpu_online_map)) { | ||
4071 | ret = -EINVAL; | ||
4072 | goto out; | ||
4073 | } | ||
4074 | |||
4075 | p->cpus_allowed = new_mask; | ||
4076 | /* Can the task run on the task's current CPU? If so, we're done */ | ||
4077 | if (cpu_isset(task_cpu(p), new_mask)) | ||
4078 | goto out; | ||
4079 | |||
4080 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | ||
4081 | /* Need help from migration thread: drop lock and wait. */ | ||
4082 | task_rq_unlock(rq, &flags); | ||
4083 | wake_up_process(rq->migration_thread); | ||
4084 | wait_for_completion(&req.done); | ||
4085 | tlb_migrate_finish(p->mm); | ||
4086 | return 0; | ||
4087 | } | ||
4088 | out: | ||
4089 | task_rq_unlock(rq, &flags); | ||
4090 | return ret; | ||
4091 | } | ||
4092 | |||
4093 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | ||
4094 | |||
4095 | /* | ||
4096 | * Move (not current) task off this cpu, onto dest cpu. We're doing | ||
4097 | * this because either it can't run here any more (set_cpus_allowed() | ||
4098 | * away from this CPU, or CPU going down), or because we're | ||
4099 | * attempting to rebalance this task on exec (sched_exec). | ||
4100 | * | ||
4101 | * So we race with normal scheduler movements, but that's OK, as long | ||
4102 | * as the task is no longer on this CPU. | ||
4103 | */ | ||
4104 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | ||
4105 | { | ||
4106 | runqueue_t *rq_dest, *rq_src; | ||
4107 | |||
4108 | if (unlikely(cpu_is_offline(dest_cpu))) | ||
4109 | return; | ||
4110 | |||
4111 | rq_src = cpu_rq(src_cpu); | ||
4112 | rq_dest = cpu_rq(dest_cpu); | ||
4113 | |||
4114 | double_rq_lock(rq_src, rq_dest); | ||
4115 | /* Already moved. */ | ||
4116 | if (task_cpu(p) != src_cpu) | ||
4117 | goto out; | ||
4118 | /* Affinity changed (again). */ | ||
4119 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | ||
4120 | goto out; | ||
4121 | |||
4122 | set_task_cpu(p, dest_cpu); | ||
4123 | if (p->array) { | ||
4124 | /* | ||
4125 | * Sync timestamp with rq_dest's before activating. | ||
4126 | * The same thing could be achieved by doing this step | ||
4127 | * afterwards, and pretending it was a local activate. | ||
4128 | * This way is cleaner and logically correct. | ||
4129 | */ | ||
4130 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | ||
4131 | + rq_dest->timestamp_last_tick; | ||
4132 | deactivate_task(p, rq_src); | ||
4133 | activate_task(p, rq_dest, 0); | ||
4134 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | ||
4135 | resched_task(rq_dest->curr); | ||
4136 | } | ||
4137 | |||
4138 | out: | ||
4139 | double_rq_unlock(rq_src, rq_dest); | ||
4140 | } | ||
4141 | |||
4142 | /* | ||
4143 | * migration_thread - this is a highprio system thread that performs | ||
4144 | * thread migration by bumping thread off CPU then 'pushing' onto | ||
4145 | * another runqueue. | ||
4146 | */ | ||
4147 | static int migration_thread(void * data) | ||
4148 | { | ||
4149 | runqueue_t *rq; | ||
4150 | int cpu = (long)data; | ||
4151 | |||
4152 | rq = cpu_rq(cpu); | ||
4153 | BUG_ON(rq->migration_thread != current); | ||
4154 | |||
4155 | set_current_state(TASK_INTERRUPTIBLE); | ||
4156 | while (!kthread_should_stop()) { | ||
4157 | struct list_head *head; | ||
4158 | migration_req_t *req; | ||
4159 | |||
4160 | if (current->flags & PF_FREEZE) | ||
4161 | refrigerator(PF_FREEZE); | ||
4162 | |||
4163 | spin_lock_irq(&rq->lock); | ||
4164 | |||
4165 | if (cpu_is_offline(cpu)) { | ||
4166 | spin_unlock_irq(&rq->lock); | ||
4167 | goto wait_to_die; | ||
4168 | } | ||
4169 | |||
4170 | if (rq->active_balance) { | ||
4171 | active_load_balance(rq, cpu); | ||
4172 | rq->active_balance = 0; | ||
4173 | } | ||
4174 | |||
4175 | head = &rq->migration_queue; | ||
4176 | |||
4177 | if (list_empty(head)) { | ||
4178 | spin_unlock_irq(&rq->lock); | ||
4179 | schedule(); | ||
4180 | set_current_state(TASK_INTERRUPTIBLE); | ||
4181 | continue; | ||
4182 | } | ||
4183 | req = list_entry(head->next, migration_req_t, list); | ||
4184 | list_del_init(head->next); | ||
4185 | |||
4186 | if (req->type == REQ_MOVE_TASK) { | ||
4187 | spin_unlock(&rq->lock); | ||
4188 | __migrate_task(req->task, cpu, req->dest_cpu); | ||
4189 | local_irq_enable(); | ||
4190 | } else if (req->type == REQ_SET_DOMAIN) { | ||
4191 | rq->sd = req->sd; | ||
4192 | spin_unlock_irq(&rq->lock); | ||
4193 | } else { | ||
4194 | spin_unlock_irq(&rq->lock); | ||
4195 | WARN_ON(1); | ||
4196 | } | ||
4197 | |||
4198 | complete(&req->done); | ||
4199 | } | ||
4200 | __set_current_state(TASK_RUNNING); | ||
4201 | return 0; | ||
4202 | |||
4203 | wait_to_die: | ||
4204 | /* Wait for kthread_stop */ | ||
4205 | set_current_state(TASK_INTERRUPTIBLE); | ||
4206 | while (!kthread_should_stop()) { | ||
4207 | schedule(); | ||
4208 | set_current_state(TASK_INTERRUPTIBLE); | ||
4209 | } | ||
4210 | __set_current_state(TASK_RUNNING); | ||
4211 | return 0; | ||
4212 | } | ||
4213 | |||
4214 | #ifdef CONFIG_HOTPLUG_CPU | ||
4215 | /* Figure out where task on dead CPU should go, use force if necessary. */ | ||
4216 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | ||
4217 | { | ||
4218 | int dest_cpu; | ||
4219 | cpumask_t mask; | ||
4220 | |||
4221 | /* On same node? */ | ||
4222 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | ||
4223 | cpus_and(mask, mask, tsk->cpus_allowed); | ||
4224 | dest_cpu = any_online_cpu(mask); | ||
4225 | |||
4226 | /* On any allowed CPU? */ | ||
4227 | if (dest_cpu == NR_CPUS) | ||
4228 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | ||
4229 | |||
4230 | /* No more Mr. Nice Guy. */ | ||
4231 | if (dest_cpu == NR_CPUS) { | ||
4232 | tsk->cpus_allowed = cpuset_cpus_allowed(tsk); | ||
4233 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | ||
4234 | |||
4235 | /* | ||
4236 | * Don't tell them about moving exiting tasks or | ||
4237 | * kernel threads (both mm NULL), since they never | ||
4238 | * leave kernel. | ||
4239 | */ | ||
4240 | if (tsk->mm && printk_ratelimit()) | ||
4241 | printk(KERN_INFO "process %d (%s) no " | ||
4242 | "longer affine to cpu%d\n", | ||
4243 | tsk->pid, tsk->comm, dead_cpu); | ||
4244 | } | ||
4245 | __migrate_task(tsk, dead_cpu, dest_cpu); | ||
4246 | } | ||
4247 | |||
4248 | /* | ||
4249 | * While a dead CPU has no uninterruptible tasks queued at this point, | ||
4250 | * it might still have a nonzero ->nr_uninterruptible counter, because | ||
4251 | * for performance reasons the counter is not strictly tracking tasks to | ||
4252 | * their home CPUs. So we just add the counter to another CPU's counter, | ||
4253 | * to keep the global sum constant after CPU-down: | ||
4254 | */ | ||
4255 | static void migrate_nr_uninterruptible(runqueue_t *rq_src) | ||
4256 | { | ||
4257 | runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | ||
4258 | unsigned long flags; | ||
4259 | |||
4260 | local_irq_save(flags); | ||
4261 | double_rq_lock(rq_src, rq_dest); | ||
4262 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | ||
4263 | rq_src->nr_uninterruptible = 0; | ||
4264 | double_rq_unlock(rq_src, rq_dest); | ||
4265 | local_irq_restore(flags); | ||
4266 | } | ||
4267 | |||
4268 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
4269 | static void migrate_live_tasks(int src_cpu) | ||
4270 | { | ||
4271 | struct task_struct *tsk, *t; | ||
4272 | |||
4273 | write_lock_irq(&tasklist_lock); | ||
4274 | |||
4275 | do_each_thread(t, tsk) { | ||
4276 | if (tsk == current) | ||
4277 | continue; | ||
4278 | |||
4279 | if (task_cpu(tsk) == src_cpu) | ||
4280 | move_task_off_dead_cpu(src_cpu, tsk); | ||
4281 | } while_each_thread(t, tsk); | ||
4282 | |||
4283 | write_unlock_irq(&tasklist_lock); | ||
4284 | } | ||
4285 | |||
4286 | /* Schedules the idle task to be the next runnable task on the current CPU. | ||
4287 | * It does so by boosting its priority to the highest possible and adding it to | ||
4288 | * the _front_ of the runqueue. Used by CPU offline code. | ||
4289 | */ | ||
4290 | void sched_idle_next(void) | ||
4291 | { | ||
4292 | int cpu = smp_processor_id(); | ||
4293 | runqueue_t *rq = this_rq(); | ||
4294 | struct task_struct *p = rq->idle; | ||
4295 | unsigned long flags; | ||
4296 | |||
4297 | /* cpu has to be offline */ | ||
4298 | BUG_ON(cpu_online(cpu)); | ||
4299 | |||
4300 | /* Strictly not necessary, since the rest of the CPUs are stopped by now | ||
4301 | * and interrupts are disabled on the current cpu. | ||
4302 | */ | ||
4303 | spin_lock_irqsave(&rq->lock, flags); | ||
4304 | |||
4305 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
4306 | /* Add idle task to the _front_ of its priority queue */ | ||
4307 | __activate_idle_task(p, rq); | ||
4308 | |||
4309 | spin_unlock_irqrestore(&rq->lock, flags); | ||
4310 | } | ||
4311 | |||
4312 | /* Ensures that the idle task is using init_mm right before its cpu goes | ||
4313 | * offline. | ||
4314 | */ | ||
4315 | void idle_task_exit(void) | ||
4316 | { | ||
4317 | struct mm_struct *mm = current->active_mm; | ||
4318 | |||
4319 | BUG_ON(cpu_online(smp_processor_id())); | ||
4320 | |||
4321 | if (mm != &init_mm) | ||
4322 | switch_mm(mm, &init_mm, current); | ||
4323 | mmdrop(mm); | ||
4324 | } | ||
4325 | |||
4326 | static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | ||
4327 | { | ||
4328 | struct runqueue *rq = cpu_rq(dead_cpu); | ||
4329 | |||
4330 | /* Must be exiting, otherwise would be on tasklist. */ | ||
4331 | BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); | ||
4332 | |||
4333 | /* Cannot have done final schedule yet: would have vanished. */ | ||
4334 | BUG_ON(tsk->flags & PF_DEAD); | ||
4335 | |||
4336 | get_task_struct(tsk); | ||
4337 | |||
4338 | /* | ||
4339 | * Drop lock around migration; if someone else moves it, | ||
4340 | * that's OK. No task can be added to this CPU, so iteration is | ||
4341 | * fine. | ||
4342 | */ | ||
4343 | spin_unlock_irq(&rq->lock); | ||
4344 | move_task_off_dead_cpu(dead_cpu, tsk); | ||
4345 | spin_lock_irq(&rq->lock); | ||
4346 | |||
4347 | put_task_struct(tsk); | ||
4348 | } | ||
4349 | |||
4350 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
4351 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
4352 | { | ||
4353 | unsigned arr, i; | ||
4354 | struct runqueue *rq = cpu_rq(dead_cpu); | ||
4355 | |||
4356 | for (arr = 0; arr < 2; arr++) { | ||
4357 | for (i = 0; i < MAX_PRIO; i++) { | ||
4358 | struct list_head *list = &rq->arrays[arr].queue[i]; | ||
4359 | while (!list_empty(list)) | ||
4360 | migrate_dead(dead_cpu, | ||
4361 | list_entry(list->next, task_t, | ||
4362 | run_list)); | ||
4363 | } | ||
4364 | } | ||
4365 | } | ||
4366 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
4367 | |||
4368 | /* | ||
4369 | * migration_call - callback that gets triggered when a CPU is added. | ||
4370 | * Here we can start up the necessary migration thread for the new CPU. | ||
4371 | */ | ||
4372 | static int migration_call(struct notifier_block *nfb, unsigned long action, | ||
4373 | void *hcpu) | ||
4374 | { | ||
4375 | int cpu = (long)hcpu; | ||
4376 | struct task_struct *p; | ||
4377 | struct runqueue *rq; | ||
4378 | unsigned long flags; | ||
4379 | |||
4380 | switch (action) { | ||
4381 | case CPU_UP_PREPARE: | ||
4382 | p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); | ||
4383 | if (IS_ERR(p)) | ||
4384 | return NOTIFY_BAD; | ||
4385 | p->flags |= PF_NOFREEZE; | ||
4386 | kthread_bind(p, cpu); | ||
4387 | /* Must be high prio: stop_machine expects to yield to it. */ | ||
4388 | rq = task_rq_lock(p, &flags); | ||
4389 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
4390 | task_rq_unlock(rq, &flags); | ||
4391 | cpu_rq(cpu)->migration_thread = p; | ||
4392 | break; | ||
4393 | case CPU_ONLINE: | ||
4394 | /* Strictly unnecessary, as the first user will wake it. */ | ||
4395 | wake_up_process(cpu_rq(cpu)->migration_thread); | ||
4396 | break; | ||
4397 | #ifdef CONFIG_HOTPLUG_CPU | ||
4398 | case CPU_UP_CANCELED: | ||
4399 | /* Unbind it from the offline cpu so it can run. Fall through. */ | ||
4400 | kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); | ||
4401 | kthread_stop(cpu_rq(cpu)->migration_thread); | ||
4402 | cpu_rq(cpu)->migration_thread = NULL; | ||
4403 | break; | ||
4404 | case CPU_DEAD: | ||
4405 | migrate_live_tasks(cpu); | ||
4406 | rq = cpu_rq(cpu); | ||
4407 | kthread_stop(rq->migration_thread); | ||
4408 | rq->migration_thread = NULL; | ||
4409 | /* Idle task back to normal (off runqueue, low prio) */ | ||
4410 | rq = task_rq_lock(rq->idle, &flags); | ||
4411 | deactivate_task(rq->idle, rq); | ||
4412 | rq->idle->static_prio = MAX_PRIO; | ||
4413 | __setscheduler(rq->idle, SCHED_NORMAL, 0); | ||
4414 | migrate_dead_tasks(cpu); | ||
4415 | task_rq_unlock(rq, &flags); | ||
4416 | migrate_nr_uninterruptible(rq); | ||
4417 | BUG_ON(rq->nr_running != 0); | ||
4418 | |||
4419 | /* No need to migrate the tasks: it was best-effort if | ||
4420 | * they didn't do lock_cpu_hotplug(). Just wake up | ||
4421 | * the requestors. */ | ||
4422 | spin_lock_irq(&rq->lock); | ||
4423 | while (!list_empty(&rq->migration_queue)) { | ||
4424 | migration_req_t *req; | ||
4425 | req = list_entry(rq->migration_queue.next, | ||
4426 | migration_req_t, list); | ||
4427 | BUG_ON(req->type != REQ_MOVE_TASK); | ||
4428 | list_del_init(&req->list); | ||
4429 | complete(&req->done); | ||
4430 | } | ||
4431 | spin_unlock_irq(&rq->lock); | ||
4432 | break; | ||
4433 | #endif | ||
4434 | } | ||
4435 | return NOTIFY_OK; | ||
4436 | } | ||
4437 | |||
4438 | /* Register at highest priority so that task migration (migrate_all_tasks) | ||
4439 | * happens before everything else. | ||
4440 | */ | ||
4441 | static struct notifier_block __devinitdata migration_notifier = { | ||
4442 | .notifier_call = migration_call, | ||
4443 | .priority = 10 | ||
4444 | }; | ||
4445 | |||
4446 | int __init migration_init(void) | ||
4447 | { | ||
4448 | void *cpu = (void *)(long)smp_processor_id(); | ||
4449 | /* Start one for boot CPU. */ | ||
4450 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | ||
4451 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | ||
4452 | register_cpu_notifier(&migration_notifier); | ||
4453 | return 0; | ||
4454 | } | ||
4455 | #endif | ||
4456 | |||
4457 | #ifdef CONFIG_SMP | ||
4458 | #define SCHED_DOMAIN_DEBUG | ||
4459 | #ifdef SCHED_DOMAIN_DEBUG | ||
4460 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
4461 | { | ||
4462 | int level = 0; | ||
4463 | |||
4464 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
4465 | |||
4466 | do { | ||
4467 | int i; | ||
4468 | char str[NR_CPUS]; | ||
4469 | struct sched_group *group = sd->groups; | ||
4470 | cpumask_t groupmask; | ||
4471 | |||
4472 | cpumask_scnprintf(str, NR_CPUS, sd->span); | ||
4473 | cpus_clear(groupmask); | ||
4474 | |||
4475 | printk(KERN_DEBUG); | ||
4476 | for (i = 0; i < level + 1; i++) | ||
4477 | printk(" "); | ||
4478 | printk("domain %d: ", level); | ||
4479 | |||
4480 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
4481 | printk("does not load-balance\n"); | ||
4482 | if (sd->parent) | ||
4483 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); | ||
4484 | break; | ||
4485 | } | ||
4486 | |||
4487 | printk("span %s\n", str); | ||
4488 | |||
4489 | if (!cpu_isset(cpu, sd->span)) | ||
4490 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); | ||
4491 | if (!cpu_isset(cpu, group->cpumask)) | ||
4492 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); | ||
4493 | |||
4494 | printk(KERN_DEBUG); | ||
4495 | for (i = 0; i < level + 2; i++) | ||
4496 | printk(" "); | ||
4497 | printk("groups:"); | ||
4498 | do { | ||
4499 | if (!group) { | ||
4500 | printk("\n"); | ||
4501 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
4502 | break; | ||
4503 | } | ||
4504 | |||
4505 | if (!group->cpu_power) { | ||
4506 | printk("\n"); | ||
4507 | printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); | ||
4508 | } | ||
4509 | |||
4510 | if (!cpus_weight(group->cpumask)) { | ||
4511 | printk("\n"); | ||
4512 | printk(KERN_ERR "ERROR: empty group\n"); | ||
4513 | } | ||
4514 | |||
4515 | if (cpus_intersects(groupmask, group->cpumask)) { | ||
4516 | printk("\n"); | ||
4517 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
4518 | } | ||
4519 | |||
4520 | cpus_or(groupmask, groupmask, group->cpumask); | ||
4521 | |||
4522 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | ||
4523 | printk(" %s", str); | ||
4524 | |||
4525 | group = group->next; | ||
4526 | } while (group != sd->groups); | ||
4527 | printk("\n"); | ||
4528 | |||
4529 | if (!cpus_equal(sd->span, groupmask)) | ||
4530 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
4531 | |||
4532 | level++; | ||
4533 | sd = sd->parent; | ||
4534 | |||
4535 | if (sd) { | ||
4536 | if (!cpus_subset(groupmask, sd->span)) | ||
4537 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); | ||
4538 | } | ||
4539 | |||
4540 | } while (sd); | ||
4541 | } | ||
4542 | #else | ||
4543 | #define sched_domain_debug(sd, cpu) {} | ||
4544 | #endif | ||
4545 | |||
4546 | /* | ||
4547 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | ||
4548 | * hold the hotplug lock. | ||
4549 | */ | ||
4550 | void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) | ||
4551 | { | ||
4552 | migration_req_t req; | ||
4553 | unsigned long flags; | ||
4554 | runqueue_t *rq = cpu_rq(cpu); | ||
4555 | int local = 1; | ||
4556 | |||
4557 | sched_domain_debug(sd, cpu); | ||
4558 | |||
4559 | spin_lock_irqsave(&rq->lock, flags); | ||
4560 | |||
4561 | if (cpu == smp_processor_id() || !cpu_online(cpu)) { | ||
4562 | rq->sd = sd; | ||
4563 | } else { | ||
4564 | init_completion(&req.done); | ||
4565 | req.type = REQ_SET_DOMAIN; | ||
4566 | req.sd = sd; | ||
4567 | list_add(&req.list, &rq->migration_queue); | ||
4568 | local = 0; | ||
4569 | } | ||
4570 | |||
4571 | spin_unlock_irqrestore(&rq->lock, flags); | ||
4572 | |||
4573 | if (!local) { | ||
4574 | wake_up_process(rq->migration_thread); | ||
4575 | wait_for_completion(&req.done); | ||
4576 | } | ||
4577 | } | ||
4578 | |||
4579 | /* cpus with isolated domains */ | ||
4580 | cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; | ||
4581 | |||
4582 | /* Setup the mask of cpus configured for isolated domains */ | ||
4583 | static int __init isolated_cpu_setup(char *str) | ||
4584 | { | ||
4585 | int ints[NR_CPUS], i; | ||
4586 | |||
4587 | str = get_options(str, ARRAY_SIZE(ints), ints); | ||
4588 | cpus_clear(cpu_isolated_map); | ||
4589 | for (i = 1; i <= ints[0]; i++) | ||
4590 | if (ints[i] < NR_CPUS) | ||
4591 | cpu_set(ints[i], cpu_isolated_map); | ||
4592 | return 1; | ||
4593 | } | ||
4594 | |||
4595 | __setup ("isolcpus=", isolated_cpu_setup); | ||
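[Editor's note] The isolcpus= handler above removes the listed CPUs from the default scheduler domains; get_options() takes a plain comma-separated list, so something like "isolcpus=2,3" on the kernel command line keeps CPUs 2 and 3 out of load balancing. Tasks still run there only if explicitly bound. A hedged userspace sketch (illustrative only, not part of this patch):

    /* Illustrative only: pin the calling task onto isolated CPU 2 */
    #define _GNU_SOURCE
    #include <sched.h>

    void move_to_isolated_cpu(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(2, &set);
            sched_setaffinity(0, sizeof(set), &set);   /* pid 0 == current task */
    }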
4596 | |||
4597 | /* | ||
4598 | * init_sched_build_groups takes an array of groups, the cpumask we wish | ||
4599 | * to span, and a pointer to a function which identifies what group a CPU | ||
4600 | * belongs to. The return value of group_fn must be a valid index into the | ||
4601 | * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we | ||
4602 | * keep track of groups covered with a cpumask_t). | ||
4603 | * | ||
4604 | * init_sched_build_groups will build a circular linked list of the groups | ||
4605 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
4606 | * and ->cpu_power to 0. | ||
4607 | */ | ||
4608 | void __devinit init_sched_build_groups(struct sched_group groups[], | ||
4609 | cpumask_t span, int (*group_fn)(int cpu)) | ||
4610 | { | ||
4611 | struct sched_group *first = NULL, *last = NULL; | ||
4612 | cpumask_t covered = CPU_MASK_NONE; | ||
4613 | int i; | ||
4614 | |||
4615 | for_each_cpu_mask(i, span) { | ||
4616 | int group = group_fn(i); | ||
4617 | struct sched_group *sg = &groups[group]; | ||
4618 | int j; | ||
4619 | |||
4620 | if (cpu_isset(i, covered)) | ||
4621 | continue; | ||
4622 | |||
4623 | sg->cpumask = CPU_MASK_NONE; | ||
4624 | sg->cpu_power = 0; | ||
4625 | |||
4626 | for_each_cpu_mask(j, span) { | ||
4627 | if (group_fn(j) != group) | ||
4628 | continue; | ||
4629 | |||
4630 | cpu_set(j, covered); | ||
4631 | cpu_set(j, sg->cpumask); | ||
4632 | } | ||
4633 | if (!first) | ||
4634 | first = sg; | ||
4635 | if (last) | ||
4636 | last->next = sg; | ||
4637 | last = sg; | ||
4638 | } | ||
4639 | last->next = first; | ||
4640 | } | ||
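[Editor's note] As the comment above says, init_sched_build_groups() only needs a span mask and a cpu-to-group-index function. A minimal hypothetical caller (mirroring cpu_to_phys_group() below; the names here are made up for illustration) might look like:

    /* Hypothetical sketch: one group per CPU over a given span */
    static struct sched_group trivial_groups[NR_CPUS];

    static int cpu_to_trivial_group(int cpu)
    {
            return cpu;                 /* group index == CPU number */
    }

    /* builds a circular list; each group's ->cpumask holds one CPU,
     * and ->cpu_power is reset to 0 for the caller to fill in later */
    init_sched_build_groups(trivial_groups, cpu_online_map,
                            &cpu_to_trivial_group);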
4641 | |||
4642 | |||
4643 | #ifdef ARCH_HAS_SCHED_DOMAIN | ||
4644 | extern void __devinit arch_init_sched_domains(void); | ||
4645 | extern void __devinit arch_destroy_sched_domains(void); | ||
4646 | #else | ||
4647 | #ifdef CONFIG_SCHED_SMT | ||
4648 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | ||
4649 | static struct sched_group sched_group_cpus[NR_CPUS]; | ||
4650 | static int __devinit cpu_to_cpu_group(int cpu) | ||
4651 | { | ||
4652 | return cpu; | ||
4653 | } | ||
4654 | #endif | ||
4655 | |||
4656 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | ||
4657 | static struct sched_group sched_group_phys[NR_CPUS]; | ||
4658 | static int __devinit cpu_to_phys_group(int cpu) | ||
4659 | { | ||
4660 | #ifdef CONFIG_SCHED_SMT | ||
4661 | return first_cpu(cpu_sibling_map[cpu]); | ||
4662 | #else | ||
4663 | return cpu; | ||
4664 | #endif | ||
4665 | } | ||
4666 | |||
4667 | #ifdef CONFIG_NUMA | ||
4668 | |||
4669 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | ||
4670 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; | ||
4671 | static int __devinit cpu_to_node_group(int cpu) | ||
4672 | { | ||
4673 | return cpu_to_node(cpu); | ||
4674 | } | ||
4675 | #endif | ||
4676 | |||
4677 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
4678 | /* | ||
4679 | * The domains setup code relies on siblings not spanning | ||
4680 | * multiple nodes. Make sure the architecture has a proper | ||
4681 | * siblings map: | ||
4682 | */ | ||
4683 | static void check_sibling_maps(void) | ||
4684 | { | ||
4685 | int i, j; | ||
4686 | |||
4687 | for_each_online_cpu(i) { | ||
4688 | for_each_cpu_mask(j, cpu_sibling_map[i]) { | ||
4689 | if (cpu_to_node(i) != cpu_to_node(j)) { | ||
4690 | printk(KERN_INFO "warning: CPU %d siblings map " | ||
4691 | "to different node - isolating " | ||
4692 | "them.\n", i); | ||
4693 | cpu_sibling_map[i] = cpumask_of_cpu(i); | ||
4694 | break; | ||
4695 | } | ||
4696 | } | ||
4697 | } | ||
4698 | } | ||
4699 | #endif | ||
4700 | |||
4701 | /* | ||
4702 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
4703 | */ | ||
4704 | static void __devinit arch_init_sched_domains(void) | ||
4705 | { | ||
4706 | int i; | ||
4707 | cpumask_t cpu_default_map; | ||
4708 | |||
4709 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
4710 | check_sibling_maps(); | ||
4711 | #endif | ||
4712 | /* | ||
4713 | * Setup mask for cpus without special case scheduling requirements. | ||
4714 | * For now this just excludes isolated cpus, but could be used to | ||
4715 | * exclude other special cases in the future. | ||
4716 | */ | ||
4717 | cpus_complement(cpu_default_map, cpu_isolated_map); | ||
4718 | cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); | ||
4719 | |||
4720 | /* | ||
4721 | * Set up domains. Isolated domains just stay on the dummy domain. | ||
4722 | */ | ||
4723 | for_each_cpu_mask(i, cpu_default_map) { | ||
4724 | int group; | ||
4725 | struct sched_domain *sd = NULL, *p; | ||
4726 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | ||
4727 | |||
4728 | cpus_and(nodemask, nodemask, cpu_default_map); | ||
4729 | |||
4730 | #ifdef CONFIG_NUMA | ||
4731 | sd = &per_cpu(node_domains, i); | ||
4732 | group = cpu_to_node_group(i); | ||
4733 | *sd = SD_NODE_INIT; | ||
4734 | sd->span = cpu_default_map; | ||
4735 | sd->groups = &sched_group_nodes[group]; | ||
4736 | #endif | ||
4737 | |||
4738 | p = sd; | ||
4739 | sd = &per_cpu(phys_domains, i); | ||
4740 | group = cpu_to_phys_group(i); | ||
4741 | *sd = SD_CPU_INIT; | ||
4742 | sd->span = nodemask; | ||
4743 | sd->parent = p; | ||
4744 | sd->groups = &sched_group_phys[group]; | ||
4745 | |||
4746 | #ifdef CONFIG_SCHED_SMT | ||
4747 | p = sd; | ||
4748 | sd = &per_cpu(cpu_domains, i); | ||
4749 | group = cpu_to_cpu_group(i); | ||
4750 | *sd = SD_SIBLING_INIT; | ||
4751 | sd->span = cpu_sibling_map[i]; | ||
4752 | cpus_and(sd->span, sd->span, cpu_default_map); | ||
4753 | sd->parent = p; | ||
4754 | sd->groups = &sched_group_cpus[group]; | ||
4755 | #endif | ||
4756 | } | ||
4757 | |||
4758 | #ifdef CONFIG_SCHED_SMT | ||
4759 | /* Set up CPU (sibling) groups */ | ||
4760 | for_each_online_cpu(i) { | ||
4761 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | ||
4762 | cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); | ||
4763 | if (i != first_cpu(this_sibling_map)) | ||
4764 | continue; | ||
4765 | |||
4766 | init_sched_build_groups(sched_group_cpus, this_sibling_map, | ||
4767 | &cpu_to_cpu_group); | ||
4768 | } | ||
4769 | #endif | ||
4770 | |||
4771 | /* Set up physical groups */ | ||
4772 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
4773 | cpumask_t nodemask = node_to_cpumask(i); | ||
4774 | |||
4775 | cpus_and(nodemask, nodemask, cpu_default_map); | ||
4776 | if (cpus_empty(nodemask)) | ||
4777 | continue; | ||
4778 | |||
4779 | init_sched_build_groups(sched_group_phys, nodemask, | ||
4780 | &cpu_to_phys_group); | ||
4781 | } | ||
4782 | |||
4783 | #ifdef CONFIG_NUMA | ||
4784 | /* Set up node groups */ | ||
4785 | init_sched_build_groups(sched_group_nodes, cpu_default_map, | ||
4786 | &cpu_to_node_group); | ||
4787 | #endif | ||
4788 | |||
4789 | /* Calculate CPU power for physical packages and nodes */ | ||
4790 | for_each_cpu_mask(i, cpu_default_map) { | ||
4791 | int power; | ||
4792 | struct sched_domain *sd; | ||
4793 | #ifdef CONFIG_SCHED_SMT | ||
4794 | sd = &per_cpu(cpu_domains, i); | ||
4795 | power = SCHED_LOAD_SCALE; | ||
4796 | sd->groups->cpu_power = power; | ||
4797 | #endif | ||
4798 | |||
4799 | sd = &per_cpu(phys_domains, i); | ||
4800 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
4801 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
4802 | sd->groups->cpu_power = power; | ||
4803 | |||
4804 | #ifdef CONFIG_NUMA | ||
4805 | if (i == first_cpu(sd->groups->cpumask)) { | ||
4806 | /* Only add "power" once for each physical package. */ | ||
4807 | sd = &per_cpu(node_domains, i); | ||
4808 | sd->groups->cpu_power += power; | ||
4809 | } | ||
4810 | #endif | ||
4811 | } | ||
4812 | |||
4813 | /* Attach the domains */ | ||
4814 | for_each_online_cpu(i) { | ||
4815 | struct sched_domain *sd; | ||
4816 | #ifdef CONFIG_SCHED_SMT | ||
4817 | sd = &per_cpu(cpu_domains, i); | ||
4818 | #else | ||
4819 | sd = &per_cpu(phys_domains, i); | ||
4820 | #endif | ||
4821 | cpu_attach_domain(sd, i); | ||
4822 | } | ||
4823 | } | ||
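[Editor's note] The cpu_power loop above scales a physical group's capacity only slightly with the number of siblings it spans. As a worked example, taking SCHED_LOAD_SCALE as 128, a package whose group covers 2 hyperthreaded siblings gets (integer arithmetic):

    power = 128 + 128 * (2 - 1) / 10
          = 128 + 12
          = 140

so two siblings count as roughly 10% more capacity than one CPU rather than twice as much, while the SMT sibling groups themselves are pinned at exactly SCHED_LOAD_SCALE, and each NUMA node group accumulates one such "power" per physical package (the first_cpu() check).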
4824 | |||
4825 | #ifdef CONFIG_HOTPLUG_CPU | ||
4826 | static void __devinit arch_destroy_sched_domains(void) | ||
4827 | { | ||
4828 | /* Do nothing: everything is statically allocated. */ | ||
4829 | } | ||
4830 | #endif | ||
4831 | |||
4832 | #endif /* ARCH_HAS_SCHED_DOMAIN */ | ||
4833 | |||
4834 | /* | ||
4835 | * Initial dummy domain for early boot and for hotplug cpu. Being static, | ||
4836 | * it is initialized to zero, so all balancing flags are cleared which is | ||
4837 | * what we want. | ||
4838 | */ | ||
4839 | static struct sched_domain sched_domain_dummy; | ||
4840 | |||
4841 | #ifdef CONFIG_HOTPLUG_CPU | ||
4842 | /* | ||
4843 | * Force a reinitialization of the sched domains hierarchy. The domains | ||
4844 | * and groups cannot be updated in place without racing with the balancing | ||
4845 | * code, so we temporarily attach all running cpus to a "dummy" domain | ||
4846 | * which will prevent rebalancing while the sched domains are recalculated. | ||
4847 | */ | ||
4848 | static int update_sched_domains(struct notifier_block *nfb, | ||
4849 | unsigned long action, void *hcpu) | ||
4850 | { | ||
4851 | int i; | ||
4852 | |||
4853 | switch (action) { | ||
4854 | case CPU_UP_PREPARE: | ||
4855 | case CPU_DOWN_PREPARE: | ||
4856 | for_each_online_cpu(i) | ||
4857 | cpu_attach_domain(&sched_domain_dummy, i); | ||
4858 | arch_destroy_sched_domains(); | ||
4859 | return NOTIFY_OK; | ||
4860 | |||
4861 | case CPU_UP_CANCELED: | ||
4862 | case CPU_DOWN_FAILED: | ||
4863 | case CPU_ONLINE: | ||
4864 | case CPU_DEAD: | ||
4865 | /* | ||
4866 | * Fall through and re-initialise the domains. | ||
4867 | */ | ||
4868 | break; | ||
4869 | default: | ||
4870 | return NOTIFY_DONE; | ||
4871 | } | ||
4872 | |||
4873 | /* The hotplug lock is already held by cpu_up/cpu_down */ | ||
4874 | arch_init_sched_domains(); | ||
4875 | |||
4876 | return NOTIFY_OK; | ||
4877 | } | ||
4878 | #endif | ||
4879 | |||
4880 | void __init sched_init_smp(void) | ||
4881 | { | ||
4882 | lock_cpu_hotplug(); | ||
4883 | arch_init_sched_domains(); | ||
4884 | unlock_cpu_hotplug(); | ||
4885 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | ||
4886 | hotcpu_notifier(update_sched_domains, 0); | ||
4887 | } | ||
4888 | #else | ||
4889 | void __init sched_init_smp(void) | ||
4890 | { | ||
4891 | } | ||
4892 | #endif /* CONFIG_SMP */ | ||
4893 | |||
4894 | int in_sched_functions(unsigned long addr) | ||
4895 | { | ||
4896 | /* Linker adds these: start and end of __sched functions */ | ||
4897 | extern char __sched_text_start[], __sched_text_end[]; | ||
4898 | return in_lock_functions(addr) || | ||
4899 | (addr >= (unsigned long)__sched_text_start | ||
4900 | && addr < (unsigned long)__sched_text_end); | ||
4901 | } | ||
4902 | |||
4903 | void __init sched_init(void) | ||
4904 | { | ||
4905 | runqueue_t *rq; | ||
4906 | int i, j, k; | ||
4907 | |||
4908 | for (i = 0; i < NR_CPUS; i++) { | ||
4909 | prio_array_t *array; | ||
4910 | |||
4911 | rq = cpu_rq(i); | ||
4912 | spin_lock_init(&rq->lock); | ||
4913 | rq->active = rq->arrays; | ||
4914 | rq->expired = rq->arrays + 1; | ||
4915 | rq->best_expired_prio = MAX_PRIO; | ||
4916 | |||
4917 | #ifdef CONFIG_SMP | ||
4918 | rq->sd = &sched_domain_dummy; | ||
4919 | rq->cpu_load = 0; | ||
4920 | rq->active_balance = 0; | ||
4921 | rq->push_cpu = 0; | ||
4922 | rq->migration_thread = NULL; | ||
4923 | INIT_LIST_HEAD(&rq->migration_queue); | ||
4924 | #endif | ||
4925 | atomic_set(&rq->nr_iowait, 0); | ||
4926 | |||
4927 | for (j = 0; j < 2; j++) { | ||
4928 | array = rq->arrays + j; | ||
4929 | for (k = 0; k < MAX_PRIO; k++) { | ||
4930 | INIT_LIST_HEAD(array->queue + k); | ||
4931 | __clear_bit(k, array->bitmap); | ||
4932 | } | ||
4933 | // delimiter for bitsearch | ||
4934 | __set_bit(MAX_PRIO, array->bitmap); | ||
4935 | } | ||
4936 | } | ||
4937 | |||
4938 | /* | ||
4939 | * The boot idle thread does lazy MMU switching as well: | ||
4940 | */ | ||
4941 | atomic_inc(&init_mm.mm_count); | ||
4942 | enter_lazy_tlb(&init_mm, current); | ||
4943 | |||
4944 | /* | ||
4945 | * Make us the idle thread. Technically, schedule() should not be | ||
4946 | * called from this thread, however somewhere below it might be, | ||
4947 | * but because we are the idle thread, we just pick up running again | ||
4948 | * when this runqueue becomes "idle". | ||
4949 | */ | ||
4950 | init_idle(current, smp_processor_id()); | ||
4951 | } | ||
4952 | |||
4953 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | ||
4954 | void __might_sleep(char *file, int line) | ||
4955 | { | ||
4956 | #if defined(in_atomic) | ||
4957 | static unsigned long prev_jiffy; /* ratelimiting */ | ||
4958 | |||
4959 | if ((in_atomic() || irqs_disabled()) && | ||
4960 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | ||
4961 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | ||
4962 | return; | ||
4963 | prev_jiffy = jiffies; | ||
4964 | printk(KERN_ERR "Debug: sleeping function called from invalid" | ||
4965 | " context at %s:%d\n", file, line); | ||
4966 | printk("in_atomic():%d, irqs_disabled():%d\n", | ||
4967 | in_atomic(), irqs_disabled()); | ||
4968 | dump_stack(); | ||
4969 | } | ||
4970 | #endif | ||
4971 | } | ||
4972 | EXPORT_SYMBOL(__might_sleep); | ||
4973 | #endif | ||
4974 | |||
4975 | #ifdef CONFIG_MAGIC_SYSRQ | ||
4976 | void normalize_rt_tasks(void) | ||
4977 | { | ||
4978 | struct task_struct *p; | ||
4979 | prio_array_t *array; | ||
4980 | unsigned long flags; | ||
4981 | runqueue_t *rq; | ||
4982 | |||
4983 | read_lock_irq(&tasklist_lock); | ||
4984 | for_each_process (p) { | ||
4985 | if (!rt_task(p)) | ||
4986 | continue; | ||
4987 | |||
4988 | rq = task_rq_lock(p, &flags); | ||
4989 | |||
4990 | array = p->array; | ||
4991 | if (array) | ||
4992 | deactivate_task(p, task_rq(p)); | ||
4993 | __setscheduler(p, SCHED_NORMAL, 0); | ||
4994 | if (array) { | ||
4995 | __activate_task(p, task_rq(p)); | ||
4996 | resched_task(rq->curr); | ||
4997 | } | ||
4998 | |||
4999 | task_rq_unlock(rq, &flags); | ||
5000 | } | ||
5001 | read_unlock_irq(&tasklist_lock); | ||
5002 | } | ||
5003 | |||
5004 | #endif /* CONFIG_MAGIC_SYSRQ */ | ||
diff --git a/kernel/seccomp.c b/kernel/seccomp.c new file mode 100644 index 000000000000..c3391b6020e8 --- /dev/null +++ b/kernel/seccomp.c | |||
@@ -0,0 +1,56 @@ | |||
1 | /* | ||
2 | * linux/kernel/seccomp.c | ||
3 | * | ||
4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> | ||
5 | * | ||
6 | * This defines a simple but solid secure-computing mode. | ||
7 | */ | ||
8 | |||
9 | #include <linux/seccomp.h> | ||
10 | #include <linux/sched.h> | ||
11 | |||
12 | /* #define SECCOMP_DEBUG 1 */ | ||
13 | |||
14 | /* | ||
15 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | ||
16 | * To be fully secure this must be combined with rlimit | ||
17 | * to limit the stack allocations too. | ||
18 | */ | ||
19 | static int mode1_syscalls[] = { | ||
20 | __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, | ||
21 | 0, /* null terminated */ | ||
22 | }; | ||
23 | |||
24 | #ifdef TIF_32BIT | ||
25 | static int mode1_syscalls_32[] = { | ||
26 | __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, | ||
27 | 0, /* null terminated */ | ||
28 | }; | ||
29 | #endif | ||
30 | |||
31 | void __secure_computing(int this_syscall) | ||
32 | { | ||
33 | int mode = current->seccomp.mode; | ||
34 | int * syscall; | ||
35 | |||
36 | switch (mode) { | ||
37 | case 1: | ||
38 | syscall = mode1_syscalls; | ||
39 | #ifdef TIF_32BIT | ||
40 | if (test_thread_flag(TIF_32BIT)) | ||
41 | syscall = mode1_syscalls_32; | ||
42 | #endif | ||
43 | do { | ||
44 | if (*syscall == this_syscall) | ||
45 | return; | ||
46 | } while (*++syscall); | ||
47 | break; | ||
48 | default: | ||
49 | BUG(); | ||
50 | } | ||
51 | |||
52 | #ifdef SECCOMP_DEBUG | ||
53 | dump_stack(); | ||
54 | #endif | ||
55 | do_exit(SIGKILL); | ||
56 | } | ||
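[Editor's note] __secure_computing() simply walks the mode-1 allow list and kills the task with SIGKILL on any other syscall. A hedged userspace sketch of a confined computation, assuming the /proc/<pid>/seccomp control file that drove this mode at the time (illustrative only):

    /* Illustrative only: enter seccomp mode 1, then do pure read/write work */
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/self/seccomp", O_WRONLY);

            if (fd < 0)
                    return 1;
            if (write(fd, "1", 1) != 1)     /* mode 1 is active once this returns */
                    return 1;
            /* From here on only read, write, _exit and sigreturn are allowed;
             * even close(fd) would be fatal, so the fd is simply left open. */
            write(1, "confined\n", 9);
            _exit(0);
    }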
diff --git a/kernel/signal.c b/kernel/signal.c new file mode 100644 index 000000000000..f00a1d610f0b --- /dev/null +++ b/kernel/signal.c | |||
@@ -0,0 +1,2662 @@ | |||
1 | /* | ||
2 | * linux/kernel/signal.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * | ||
6 | * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson | ||
7 | * | ||
8 | * 2003-06-02 Jim Houston - Concurrent Computer Corp. | ||
9 | * Changes to use preallocated sigqueue structures | ||
10 | * to allow signals to be sent reliably. | ||
11 | */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/init.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/tty.h> | ||
21 | #include <linux/binfmts.h> | ||
22 | #include <linux/security.h> | ||
23 | #include <linux/syscalls.h> | ||
24 | #include <linux/ptrace.h> | ||
25 | #include <linux/posix-timers.h> | ||
26 | #include <asm/param.h> | ||
27 | #include <asm/uaccess.h> | ||
28 | #include <asm/unistd.h> | ||
29 | #include <asm/siginfo.h> | ||
30 | |||
31 | /* | ||
32 | * SLAB caches for signal bits. | ||
33 | */ | ||
34 | |||
35 | static kmem_cache_t *sigqueue_cachep; | ||
36 | |||
37 | /* | ||
38 | * In POSIX a signal is sent either to a specific thread (Linux task) | ||
39 | * or to the process as a whole (Linux thread group). How the signal | ||
40 | * is sent determines whether it's to one thread or the whole group, | ||
41 | * which determines which signal mask(s) are involved in blocking it | ||
42 | * from being delivered until later. When the signal is delivered, | ||
43 | * either it's caught or ignored by a user handler or it has a default | ||
44 | * effect that applies to the whole thread group (POSIX process). | ||
45 | * | ||
46 | * The possible effects an unblocked signal set to SIG_DFL can have are: | ||
47 | * ignore - Nothing Happens | ||
48 | * terminate - kill the process, i.e. all threads in the group, | ||
49 | * similar to exit_group. The group leader (only) reports | ||
50 | * WIFSIGNALED status to its parent. | ||
51 | * coredump - write a core dump file describing all threads using | ||
52 | * the same mm and then kill all those threads | ||
53 | * stop - stop all the threads in the group, i.e. TASK_STOPPED state | ||
54 | * | ||
55 | * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. | ||
56 | * Other signals when not blocked and set to SIG_DFL behave as follows. | ||

57 | * The job control signals also have other special effects. | ||
58 | * | ||
59 | * +--------------------+------------------+ | ||
60 | * | POSIX signal | default action | | ||
61 | * +--------------------+------------------+ | ||
62 | * | SIGHUP | terminate | | ||
63 | * | SIGINT | terminate | | ||
64 | * | SIGQUIT | coredump | | ||
65 | * | SIGILL | coredump | | ||
66 | * | SIGTRAP | coredump | | ||
67 | * | SIGABRT/SIGIOT | coredump | | ||
68 | * | SIGBUS | coredump | | ||
69 | * | SIGFPE | coredump | | ||
70 | * | SIGKILL | terminate(+) | | ||
71 | * | SIGUSR1 | terminate | | ||
72 | * | SIGSEGV | coredump | | ||
73 | * | SIGUSR2 | terminate | | ||
74 | * | SIGPIPE | terminate | | ||
75 | * | SIGALRM | terminate | | ||
76 | * | SIGTERM | terminate | | ||
77 | * | SIGCHLD | ignore | | ||
78 | * | SIGCONT | ignore(*) | | ||
79 | * | SIGSTOP | stop(*)(+) | | ||
80 | * | SIGTSTP | stop(*) | | ||
81 | * | SIGTTIN | stop(*) | | ||
82 | * | SIGTTOU | stop(*) | | ||
83 | * | SIGURG | ignore | | ||
84 | * | SIGXCPU | coredump | | ||
85 | * | SIGXFSZ | coredump | | ||
86 | * | SIGVTALRM | terminate | | ||
87 | * | SIGPROF | terminate | | ||
88 | * | SIGPOLL/SIGIO | terminate | | ||
89 | * | SIGSYS/SIGUNUSED | coredump | | ||
90 | * | SIGSTKFLT | terminate | | ||
91 | * | SIGWINCH | ignore | | ||
92 | * | SIGPWR | terminate | | ||
93 | * | SIGRTMIN-SIGRTMAX | terminate | | ||
94 | * +--------------------+------------------+ | ||
95 | * | non-POSIX signal | default action | | ||
96 | * +--------------------+------------------+ | ||
97 | * | SIGEMT | coredump | | ||
98 | * +--------------------+------------------+ | ||
99 | * | ||
100 | * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". | ||
101 | * (*) Special job control effects: | ||
102 | * When SIGCONT is sent, it resumes the process (all threads in the group) | ||
103 | * from TASK_STOPPED state and also clears any pending/queued stop signals | ||
104 | * (any of those marked with "stop(*)"). This happens regardless of blocking, | ||
105 | * catching, or ignoring SIGCONT. When any stop signal is sent, it clears | ||
106 | * any pending/queued SIGCONT signals; this happens regardless of blocking, | ||
107 | * catching, or ignoring the stop signal, though (except for SIGSTOP) the | ||
108 | * default action of stopping the process may happen later or never. | ||
109 | */ | ||
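[Editor's note] The job-control behaviour marked (*) and (+) above is visible directly from userspace. A hedged POSIX sketch, not part of the patch:

    /* Illustrative only: SIGCONT resumes a stopped child; SIGKILL is always fatal */
    #include <signal.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t pid = fork();

            if (pid == 0)
                    for (;;)
                            pause();                /* child just waits for signals */

            kill(pid, SIGSTOP);                     /* default action: stop(*)(+) */
            waitpid(pid, NULL, WUNTRACED);          /* parent observes the stop */
            kill(pid, SIGCONT);                     /* resumes and clears queued stop signals */
            kill(pid, SIGKILL);                     /* terminate(+): cannot be caught or blocked */
            waitpid(pid, NULL, 0);
            return 0;
    }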
110 | |||
111 | #ifdef SIGEMT | ||
112 | #define M_SIGEMT M(SIGEMT) | ||
113 | #else | ||
114 | #define M_SIGEMT 0 | ||
115 | #endif | ||
116 | |||
117 | #if SIGRTMIN > BITS_PER_LONG | ||
118 | #define M(sig) (1ULL << ((sig)-1)) | ||
119 | #else | ||
120 | #define M(sig) (1UL << ((sig)-1)) | ||
121 | #endif | ||
122 | #define T(sig, mask) (M(sig) & (mask)) | ||
123 | |||
124 | #define SIG_KERNEL_ONLY_MASK (\ | ||
125 | M(SIGKILL) | M(SIGSTOP) ) | ||
126 | |||
127 | #define SIG_KERNEL_STOP_MASK (\ | ||
128 | M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) ) | ||
129 | |||
130 | #define SIG_KERNEL_COREDUMP_MASK (\ | ||
131 | M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \ | ||
132 | M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \ | ||
133 | M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT ) | ||
134 | |||
135 | #define SIG_KERNEL_IGNORE_MASK (\ | ||
136 | M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) ) | ||
137 | |||
138 | #define sig_kernel_only(sig) \ | ||
139 | (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK)) | ||
140 | #define sig_kernel_coredump(sig) \ | ||
141 | (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK)) | ||
142 | #define sig_kernel_ignore(sig) \ | ||
143 | (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK)) | ||
144 | #define sig_kernel_stop(sig) \ | ||
145 | (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) | ||
146 | |||
147 | #define sig_user_defined(t, signr) \ | ||
148 | (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ | ||
149 | ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) | ||
150 | |||
151 | #define sig_fatal(t, signr) \ | ||
152 | (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ | ||
153 | (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) | ||
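[Editor's note] A concrete reading of these helpers: SIGTSTP is 20 on most architectures, so sig_kernel_stop(SIGTSTP) expands to roughly

    (20 < SIGRTMIN) && ((1UL << 19) & SIG_KERNEL_STOP_MASK)

which is true because the stop mask has that bit set, whereas SIGUSR1 appears in none of the special masks, so it falls through to the ordinary "terminate" default listed in the table above.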
154 | |||
155 | static int sig_ignored(struct task_struct *t, int sig) | ||
156 | { | ||
157 | void __user * handler; | ||
158 | |||
159 | /* | ||
160 | * Tracers always want to know about signals.. | ||
161 | */ | ||
162 | if (t->ptrace & PT_PTRACED) | ||
163 | return 0; | ||
164 | |||
165 | /* | ||
166 | * Blocked signals are never ignored, since the | ||
167 | * signal handler may change by the time it is | ||
168 | * unblocked. | ||
169 | */ | ||
170 | if (sigismember(&t->blocked, sig)) | ||
171 | return 0; | ||
172 | |||
173 | /* Is it explicitly or implicitly ignored? */ | ||
174 | handler = t->sighand->action[sig-1].sa.sa_handler; | ||
175 | return handler == SIG_IGN || | ||
176 | (handler == SIG_DFL && sig_kernel_ignore(sig)); | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * Re-calculate pending state from the set of locally pending | ||
181 | * signals, globally pending signals, and blocked signals. | ||
182 | */ | ||
183 | static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | ||
184 | { | ||
185 | unsigned long ready; | ||
186 | long i; | ||
187 | |||
188 | switch (_NSIG_WORDS) { | ||
189 | default: | ||
190 | for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) | ||
191 | ready |= signal->sig[i] &~ blocked->sig[i]; | ||
192 | break; | ||
193 | |||
194 | case 4: ready = signal->sig[3] &~ blocked->sig[3]; | ||
195 | ready |= signal->sig[2] &~ blocked->sig[2]; | ||
196 | ready |= signal->sig[1] &~ blocked->sig[1]; | ||
197 | ready |= signal->sig[0] &~ blocked->sig[0]; | ||
198 | break; | ||
199 | |||
200 | case 2: ready = signal->sig[1] &~ blocked->sig[1]; | ||
201 | ready |= signal->sig[0] &~ blocked->sig[0]; | ||
202 | break; | ||
203 | |||
204 | case 1: ready = signal->sig[0] &~ blocked->sig[0]; | ||
205 | } | ||
206 | return ready != 0; | ||
207 | } | ||
208 | |||
209 | #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) | ||
210 | |||
211 | fastcall void recalc_sigpending_tsk(struct task_struct *t) | ||
212 | { | ||
213 | if (t->signal->group_stop_count > 0 || | ||
214 | PENDING(&t->pending, &t->blocked) || | ||
215 | PENDING(&t->signal->shared_pending, &t->blocked)) | ||
216 | set_tsk_thread_flag(t, TIF_SIGPENDING); | ||
217 | else | ||
218 | clear_tsk_thread_flag(t, TIF_SIGPENDING); | ||
219 | } | ||
220 | |||
221 | void recalc_sigpending(void) | ||
222 | { | ||
223 | recalc_sigpending_tsk(current); | ||
224 | } | ||
225 | |||
226 | /* Given the mask, find the first available signal that should be serviced. */ | ||
227 | |||
228 | static int | ||
229 | next_signal(struct sigpending *pending, sigset_t *mask) | ||
230 | { | ||
231 | unsigned long i, *s, *m, x; | ||
232 | int sig = 0; | ||
233 | |||
234 | s = pending->signal.sig; | ||
235 | m = mask->sig; | ||
236 | switch (_NSIG_WORDS) { | ||
237 | default: | ||
238 | for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) | ||
239 | if ((x = *s &~ *m) != 0) { | ||
240 | sig = ffz(~x) + i*_NSIG_BPW + 1; | ||
241 | break; | ||
242 | } | ||
243 | break; | ||
244 | |||
245 | case 2: if ((x = s[0] &~ m[0]) != 0) | ||
246 | sig = 1; | ||
247 | else if ((x = s[1] &~ m[1]) != 0) | ||
248 | sig = _NSIG_BPW + 1; | ||
249 | else | ||
250 | break; | ||
251 | sig += ffz(~x); | ||
252 | break; | ||
253 | |||
254 | case 1: if ((x = *s &~ *m) != 0) | ||
255 | sig = ffz(~x) + 1; | ||
256 | break; | ||
257 | } | ||
258 | |||
259 | return sig; | ||
260 | } | ||
261 | |||
262 | static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, | ||
263 | int override_rlimit) | ||
264 | { | ||
265 | struct sigqueue *q = NULL; | ||
266 | |||
267 | atomic_inc(&t->user->sigpending); | ||
268 | if (override_rlimit || | ||
269 | atomic_read(&t->user->sigpending) <= | ||
270 | t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) | ||
271 | q = kmem_cache_alloc(sigqueue_cachep, flags); | ||
272 | if (unlikely(q == NULL)) { | ||
273 | atomic_dec(&t->user->sigpending); | ||
274 | } else { | ||
275 | INIT_LIST_HEAD(&q->list); | ||
276 | q->flags = 0; | ||
277 | q->lock = NULL; | ||
278 | q->user = get_uid(t->user); | ||
279 | } | ||
280 | return(q); | ||
281 | } | ||
282 | |||
283 | static inline void __sigqueue_free(struct sigqueue *q) | ||
284 | { | ||
285 | if (q->flags & SIGQUEUE_PREALLOC) | ||
286 | return; | ||
287 | atomic_dec(&q->user->sigpending); | ||
288 | free_uid(q->user); | ||
289 | kmem_cache_free(sigqueue_cachep, q); | ||
290 | } | ||
291 | |||
292 | static void flush_sigqueue(struct sigpending *queue) | ||
293 | { | ||
294 | struct sigqueue *q; | ||
295 | |||
296 | sigemptyset(&queue->signal); | ||
297 | while (!list_empty(&queue->list)) { | ||
298 | q = list_entry(queue->list.next, struct sigqueue , list); | ||
299 | list_del_init(&q->list); | ||
300 | __sigqueue_free(q); | ||
301 | } | ||
302 | } | ||
303 | |||
304 | /* | ||
305 | * Flush all pending signals for a task. | ||
306 | */ | ||
307 | |||
308 | void | ||
309 | flush_signals(struct task_struct *t) | ||
310 | { | ||
311 | unsigned long flags; | ||
312 | |||
313 | spin_lock_irqsave(&t->sighand->siglock, flags); | ||
314 | clear_tsk_thread_flag(t,TIF_SIGPENDING); | ||
315 | flush_sigqueue(&t->pending); | ||
316 | flush_sigqueue(&t->signal->shared_pending); | ||
317 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | ||
318 | } | ||
319 | |||
320 | /* | ||
321 | * This function expects the tasklist_lock write-locked. | ||
322 | */ | ||
323 | void __exit_sighand(struct task_struct *tsk) | ||
324 | { | ||
325 | struct sighand_struct * sighand = tsk->sighand; | ||
326 | |||
327 | /* Ok, we're done with the signal handlers */ | ||
328 | tsk->sighand = NULL; | ||
329 | if (atomic_dec_and_test(&sighand->count)) | ||
330 | kmem_cache_free(sighand_cachep, sighand); | ||
331 | } | ||
332 | |||
333 | void exit_sighand(struct task_struct *tsk) | ||
334 | { | ||
335 | write_lock_irq(&tasklist_lock); | ||
336 | __exit_sighand(tsk); | ||
337 | write_unlock_irq(&tasklist_lock); | ||
338 | } | ||
339 | |||
340 | /* | ||
341 | * This function expects the tasklist_lock write-locked. | ||
342 | */ | ||
343 | void __exit_signal(struct task_struct *tsk) | ||
344 | { | ||
345 | struct signal_struct * sig = tsk->signal; | ||
346 | struct sighand_struct * sighand = tsk->sighand; | ||
347 | |||
348 | if (!sig) | ||
349 | BUG(); | ||
350 | if (!atomic_read(&sig->count)) | ||
351 | BUG(); | ||
352 | spin_lock(&sighand->siglock); | ||
353 | posix_cpu_timers_exit(tsk); | ||
354 | if (atomic_dec_and_test(&sig->count)) { | ||
355 | posix_cpu_timers_exit_group(tsk); | ||
356 | if (tsk == sig->curr_target) | ||
357 | sig->curr_target = next_thread(tsk); | ||
358 | tsk->signal = NULL; | ||
359 | spin_unlock(&sighand->siglock); | ||
360 | flush_sigqueue(&sig->shared_pending); | ||
361 | } else { | ||
362 | /* | ||
363 | * If there is any task waiting for the group exit | ||
364 | * then notify it: | ||
365 | */ | ||
366 | if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { | ||
367 | wake_up_process(sig->group_exit_task); | ||
368 | sig->group_exit_task = NULL; | ||
369 | } | ||
370 | if (tsk == sig->curr_target) | ||
371 | sig->curr_target = next_thread(tsk); | ||
372 | tsk->signal = NULL; | ||
373 | /* | ||
374 | * Accumulate here the counters for all threads but the | ||
375 | * group leader as they die, so they can be added into | ||
376 | * the process-wide totals when those are taken. | ||
377 | * The group leader stays around as a zombie as long | ||
378 | * as there are other threads. When it gets reaped, | ||
379 | * the exit.c code will add its counts into these totals. | ||
380 | * We won't ever get here for the group leader, since it | ||
381 | * will have been the last reference on the signal_struct. | ||
382 | */ | ||
383 | sig->utime = cputime_add(sig->utime, tsk->utime); | ||
384 | sig->stime = cputime_add(sig->stime, tsk->stime); | ||
385 | sig->min_flt += tsk->min_flt; | ||
386 | sig->maj_flt += tsk->maj_flt; | ||
387 | sig->nvcsw += tsk->nvcsw; | ||
388 | sig->nivcsw += tsk->nivcsw; | ||
389 | sig->sched_time += tsk->sched_time; | ||
390 | spin_unlock(&sighand->siglock); | ||
391 | sig = NULL; /* Marker for below. */ | ||
392 | } | ||
393 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | ||
394 | flush_sigqueue(&tsk->pending); | ||
395 | if (sig) { | ||
396 | /* | ||
397 | * We are cleaning up the signal_struct here. We delayed | ||
398 | * calling exit_itimers until after flush_sigqueue, just in | ||
399 | * case our thread-local pending queue contained a queued | ||
400 | * timer signal that would have been cleared in | ||
401 | * exit_itimers. When that called sigqueue_free, it would | ||
402 | * attempt to re-take the tasklist_lock and deadlock. This | ||
403 | * can never happen if we ensure that all queues the | ||
404 | * timer's signal might be queued on have been flushed | ||
405 | * first. The shared_pending queue, and our own pending | ||
406 | * queue are the only queues the timer could be on, since | ||
407 | * there are no other threads left in the group and timer | ||
408 | * signals are constrained to threads inside the group. | ||
409 | */ | ||
410 | exit_itimers(sig); | ||
411 | exit_thread_group_keys(sig); | ||
412 | kmem_cache_free(signal_cachep, sig); | ||
413 | } | ||
414 | } | ||
415 | |||
416 | void exit_signal(struct task_struct *tsk) | ||
417 | { | ||
418 | write_lock_irq(&tasklist_lock); | ||
419 | __exit_signal(tsk); | ||
420 | write_unlock_irq(&tasklist_lock); | ||
421 | } | ||
422 | |||
423 | /* | ||
424 | * Flush all handlers for a task. | ||
425 | */ | ||
426 | |||
427 | void | ||
428 | flush_signal_handlers(struct task_struct *t, int force_default) | ||
429 | { | ||
430 | int i; | ||
431 | struct k_sigaction *ka = &t->sighand->action[0]; | ||
432 | for (i = _NSIG ; i != 0 ; i--) { | ||
433 | if (force_default || ka->sa.sa_handler != SIG_IGN) | ||
434 | ka->sa.sa_handler = SIG_DFL; | ||
435 | ka->sa.sa_flags = 0; | ||
436 | sigemptyset(&ka->sa.sa_mask); | ||
437 | ka++; | ||
438 | } | ||
439 | } | ||
440 | |||
441 | |||
442 | /* Notify the system that a driver wants to block all signals for this | ||
443 | * process, and wants to be notified if any signals at all were to be | ||
444 | * sent/acted upon. If the notifier routine returns non-zero, then the | ||
445 | * signal will be acted upon after all. If the notifier routine returns 0, | ||
446 | * then the signal will be blocked. Only one block per process is | ||
447 | * allowed. priv is a pointer to private data that the notifier routine | ||
448 | * can use to determine if the signal should be blocked or not. */ | ||
449 | |||
450 | void | ||
451 | block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) | ||
452 | { | ||
453 | unsigned long flags; | ||
454 | |||
455 | spin_lock_irqsave(¤t->sighand->siglock, flags); | ||
456 | current->notifier_mask = mask; | ||
457 | current->notifier_data = priv; | ||
458 | current->notifier = notifier; | ||
459 | spin_unlock_irqrestore(¤t->sighand->siglock, flags); | ||
460 | } | ||
461 | |||
462 | /* Notify the system that blocking has ended. */ | ||
463 | |||
464 | void | ||
465 | unblock_all_signals(void) | ||
466 | { | ||
467 | unsigned long flags; | ||
468 | |||
469 | spin_lock_irqsave(¤t->sighand->siglock, flags); | ||
470 | current->notifier = NULL; | ||
471 | current->notifier_data = NULL; | ||
472 | recalc_sigpending(); | ||
473 | spin_unlock_irqrestore(¤t->sighand->siglock, flags); | ||
474 | } | ||
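[Editor's note] block_all_signals()/unblock_all_signals() form a small in-kernel notifier API: a driver installs a callback plus a mask, and signal dequeueing consults the callback before delivering any signal in that mask. A hedged sketch of a hypothetical in-kernel user (struct my_dev, its field and the function names are made up for illustration):

    /* Hypothetical driver-side sketch of the notifier API above */
    static int my_notifier(void *priv)
    {
            struct my_dev *dev = priv;
            return !dev->hold_signals;      /* non-zero: deliver the signal after all */
    }

    static void my_critical_section(struct my_dev *dev)
    {
            sigset_t mask;

            sigfillset(&mask);
            block_all_signals(my_notifier, dev, &mask);
            /* ... work that must not race with signal delivery ... */
            unblock_all_signals();
    }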
475 | |||
476 | static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info) | ||
477 | { | ||
478 | struct sigqueue *q, *first = NULL; | ||
479 | int still_pending = 0; | ||
480 | |||
481 | if (unlikely(!sigismember(&list->signal, sig))) | ||
482 | return 0; | ||
483 | |||
484 | /* | ||
485 | * Collect the siginfo appropriate to this signal. Check if | ||
486 | * there is another siginfo for the same signal. | ||
487 | */ | ||
488 | list_for_each_entry(q, &list->list, list) { | ||
489 | if (q->info.si_signo == sig) { | ||
490 | if (first) { | ||
491 | still_pending = 1; | ||
492 | break; | ||
493 | } | ||
494 | first = q; | ||
495 | } | ||
496 | } | ||
497 | if (first) { | ||
498 | list_del_init(&first->list); | ||
499 | copy_siginfo(info, &first->info); | ||
500 | __sigqueue_free(first); | ||
501 | if (!still_pending) | ||
502 | sigdelset(&list->signal, sig); | ||
503 | } else { | ||
504 | |||
505 | /* Ok, it wasn't in the queue. This must be | ||
506 | a fast-pathed signal or we must have been | ||
507 | out of queue space. So zero out the info. | ||
508 | */ | ||
509 | sigdelset(&list->signal, sig); | ||
510 | info->si_signo = sig; | ||
511 | info->si_errno = 0; | ||
512 | info->si_code = 0; | ||
513 | info->si_pid = 0; | ||
514 | info->si_uid = 0; | ||
515 | } | ||
516 | return 1; | ||
517 | } | ||
518 | |||
519 | static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | ||
520 | siginfo_t *info) | ||
521 | { | ||
522 | int sig = 0; | ||
523 | |||
524 | sig = next_signal(pending, mask); | ||
525 | if (sig) { | ||
526 | if (current->notifier) { | ||
527 | if (sigismember(current->notifier_mask, sig)) { | ||
528 | if (!(current->notifier)(current->notifier_data)) { | ||
529 | clear_thread_flag(TIF_SIGPENDING); | ||
530 | return 0; | ||
531 | } | ||
532 | } | ||
533 | } | ||
534 | |||
535 | if (!collect_signal(sig, pending, info)) | ||
536 | sig = 0; | ||
537 | |||
538 | } | ||
539 | recalc_sigpending(); | ||
540 | |||
541 | return sig; | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | * Dequeue a signal and return the element to the caller, which is | ||
546 | * expected to free it. | ||
547 | * | ||
548 | * All callers have to hold the siglock. | ||
549 | */ | ||
550 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | ||
551 | { | ||
552 | int signr = __dequeue_signal(&tsk->pending, mask, info); | ||
553 | if (!signr) | ||
554 | signr = __dequeue_signal(&tsk->signal->shared_pending, | ||
555 | mask, info); | ||
556 | if (signr && unlikely(sig_kernel_stop(signr))) { | ||
557 | /* | ||
558 | * Set a marker that we have dequeued a stop signal. Our | ||
559 | * caller might release the siglock and then the pending | ||
560 | * stop signal it is about to process is no longer in the | ||
561 | * pending bitmasks, but must still be cleared by a SIGCONT | ||
562 | * (and overruled by a SIGKILL). So those cases clear this | ||
563 | * shared flag after we've set it. Note that this flag may | ||
564 | * remain set after the signal we return is ignored or | ||
565 | * handled. That doesn't matter because its only purpose | ||
566 | * is to alert stop-signal processing code when another | ||
567 | * processor has come along and cleared the flag. | ||
568 | */ | ||
569 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | ||
570 | } | ||
571 | if ( signr && | ||
572 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | ||
573 | info->si_sys_private){ | ||
574 | /* | ||
575 | * Release the siglock to ensure proper locking order | ||
576 | * of timer locks outside of siglocks. Note, we leave | ||
577 | * irqs disabled here, since the posix-timers code is | ||
578 | * about to disable them again anyway. | ||
579 | */ | ||
580 | spin_unlock(&tsk->sighand->siglock); | ||
581 | do_schedule_next_timer(info); | ||
582 | spin_lock(&tsk->sighand->siglock); | ||
583 | } | ||
584 | return signr; | ||
585 | } | ||
586 | |||
587 | /* | ||
588 | * Tell a process that it has a new active signal.. | ||
589 | * | ||
590 | * NOTE! we rely on the previous spin_lock to | ||
591 | * lock interrupts for us! We can only be called with | ||
592 | * "siglock" held, and the local interrupt must | ||
593 | * have been disabled when that got acquired! | ||
594 | * | ||
595 | * No need to set need_resched since signal event passing | ||
596 | * goes through ->blocked | ||
597 | */ | ||
598 | void signal_wake_up(struct task_struct *t, int resume) | ||
599 | { | ||
600 | unsigned int mask; | ||
601 | |||
602 | set_tsk_thread_flag(t, TIF_SIGPENDING); | ||
603 | |||
604 | /* | ||
605 | * For SIGKILL, we want to wake it up in the stopped/traced case. | ||
606 | * We don't check t->state here because there is a race with it | ||
607 | * executing on another processor and just now entering stopped state. | ||
608 | * By using wake_up_state, we ensure the process will wake up and | ||
609 | * handle its death signal. | ||
610 | */ | ||
611 | mask = TASK_INTERRUPTIBLE; | ||
612 | if (resume) | ||
613 | mask |= TASK_STOPPED | TASK_TRACED; | ||
614 | if (!wake_up_state(t, mask)) | ||
615 | kick_process(t); | ||
616 | } | ||
617 | |||
618 | /* | ||
619 | * Remove signals in mask from the pending set and queue. | ||
620 | * Returns 1 if any signals were found. | ||
621 | * | ||
622 | * All callers must be holding the siglock. | ||
623 | */ | ||
624 | static int rm_from_queue(unsigned long mask, struct sigpending *s) | ||
625 | { | ||
626 | struct sigqueue *q, *n; | ||
627 | |||
628 | if (!sigtestsetmask(&s->signal, mask)) | ||
629 | return 0; | ||
630 | |||
631 | sigdelsetmask(&s->signal, mask); | ||
632 | list_for_each_entry_safe(q, n, &s->list, list) { | ||
633 | if (q->info.si_signo < SIGRTMIN && | ||
634 | (mask & sigmask(q->info.si_signo))) { | ||
635 | list_del_init(&q->list); | ||
636 | __sigqueue_free(q); | ||
637 | } | ||
638 | } | ||
639 | return 1; | ||
640 | } | ||
641 | |||
642 | /* | ||
643 | * Bad permissions for sending the signal | ||
644 | */ | ||
645 | static int check_kill_permission(int sig, struct siginfo *info, | ||
646 | struct task_struct *t) | ||
647 | { | ||
648 | int error = -EINVAL; | ||
649 | if (sig < 0 || sig > _NSIG) | ||
650 | return error; | ||
651 | error = -EPERM; | ||
652 | if ((!info || ((unsigned long)info != 1 && | ||
653 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
654 | && ((sig != SIGCONT) || | ||
655 | (current->signal->session != t->signal->session)) | ||
656 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | ||
657 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | ||
658 | && !capable(CAP_KILL)) | ||
659 | return error; | ||
660 | return security_task_kill(t, info, sig); | ||
661 | } | ||
662 | |||
663 | /* forward decl */ | ||
664 | static void do_notify_parent_cldstop(struct task_struct *tsk, | ||
665 | struct task_struct *parent, | ||
666 | int why); | ||
667 | |||
668 | /* | ||
669 | * Handle magic process-wide effects of stop/continue signals. | ||
670 | * Unlike the signal actions, these happen immediately at signal-generation | ||
671 | * time regardless of blocking, ignoring, or handling. This does the | ||
672 | * actual continuing for SIGCONT, but not the actual stopping for stop | ||
673 | * signals. The process stop is done as a signal action for SIG_DFL. | ||
674 | */ | ||
675 | static void handle_stop_signal(int sig, struct task_struct *p) | ||
676 | { | ||
677 | struct task_struct *t; | ||
678 | |||
679 | if (p->signal->flags & SIGNAL_GROUP_EXIT) | ||
680 | /* | ||
681 | * The process is in the middle of dying already. | ||
682 | */ | ||
683 | return; | ||
684 | |||
685 | if (sig_kernel_stop(sig)) { | ||
686 | /* | ||
687 | * This is a stop signal. Remove SIGCONT from all queues. | ||
688 | */ | ||
689 | rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); | ||
690 | t = p; | ||
691 | do { | ||
692 | rm_from_queue(sigmask(SIGCONT), &t->pending); | ||
693 | t = next_thread(t); | ||
694 | } while (t != p); | ||
695 | } else if (sig == SIGCONT) { | ||
696 | /* | ||
697 | * Remove all stop signals from all queues, | ||
698 | * and wake all threads. | ||
699 | */ | ||
700 | if (unlikely(p->signal->group_stop_count > 0)) { | ||
701 | /* | ||
702 | * There was a group stop in progress. We'll | ||
703 | * pretend it finished before we got here. We are | ||
704 | * obliged to report it to the parent: if the | ||
705 | * SIGSTOP happened "after" this SIGCONT, then it | ||
706 | * would have cleared this pending SIGCONT. If it | ||
707 | * happened "before" this SIGCONT, then the parent | ||
708 | * got the SIGCHLD about the stop finishing before | ||
709 | * the continue happened. We do the notification | ||
710 | * now, and it's as if the stop had finished and | ||
711 | * the SIGCHLD was pending on entry to this kill. | ||
712 | */ | ||
713 | p->signal->group_stop_count = 0; | ||
714 | p->signal->flags = SIGNAL_STOP_CONTINUED; | ||
715 | spin_unlock(&p->sighand->siglock); | ||
716 | if (p->ptrace & PT_PTRACED) | ||
717 | do_notify_parent_cldstop(p, p->parent, | ||
718 | CLD_STOPPED); | ||
719 | else | ||
720 | do_notify_parent_cldstop( | ||
721 | p->group_leader, | ||
722 | p->group_leader->real_parent, | ||
723 | CLD_STOPPED); | ||
724 | spin_lock(&p->sighand->siglock); | ||
725 | } | ||
726 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); | ||
727 | t = p; | ||
728 | do { | ||
729 | unsigned int state; | ||
730 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
731 | |||
732 | /* | ||
733 | * If there is a handler for SIGCONT, we must make | ||
734 | * sure that no thread returns to user mode before | ||
735 | * we post the signal, in case it was the only | ||
736 | * thread eligible to run the signal handler--then | ||
737 | * it must not do anything between resuming and | ||
738 | * running the handler. With the TIF_SIGPENDING | ||
739 | * flag set, the thread will pause and acquire the | ||
740 | * siglock that we hold now and until we've queued | ||
741 | * the pending signal. | ||
742 | * | ||
743 | * Wake up the stopped thread _after_ setting | ||
744 | * TIF_SIGPENDING | ||
745 | */ | ||
746 | state = TASK_STOPPED; | ||
747 | if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { | ||
748 | set_tsk_thread_flag(t, TIF_SIGPENDING); | ||
749 | state |= TASK_INTERRUPTIBLE; | ||
750 | } | ||
751 | wake_up_state(t, state); | ||
752 | |||
753 | t = next_thread(t); | ||
754 | } while (t != p); | ||
755 | |||
756 | if (p->signal->flags & SIGNAL_STOP_STOPPED) { | ||
757 | /* | ||
758 | * We were in fact stopped, and are now continued. | ||
759 | * Notify the parent with CLD_CONTINUED. | ||
760 | */ | ||
761 | p->signal->flags = SIGNAL_STOP_CONTINUED; | ||
762 | p->signal->group_exit_code = 0; | ||
763 | spin_unlock(&p->sighand->siglock); | ||
764 | if (p->ptrace & PT_PTRACED) | ||
765 | do_notify_parent_cldstop(p, p->parent, | ||
766 | CLD_CONTINUED); | ||
767 | else | ||
768 | do_notify_parent_cldstop( | ||
769 | p->group_leader, | ||
770 | p->group_leader->real_parent, | ||
771 | CLD_CONTINUED); | ||
772 | spin_lock(&p->sighand->siglock); | ||
773 | } else { | ||
774 | /* | ||
775 | * We are not stopped, but there could be a stop | ||
776 | * signal in the middle of being processed after | ||
777 | * being removed from the queue. Clear that too. | ||
778 | */ | ||
779 | p->signal->flags = 0; | ||
780 | } | ||
781 | } else if (sig == SIGKILL) { | ||
782 | /* | ||
783 | * Make sure that any pending stop signal already dequeued | ||
784 | * is undone by the wakeup for SIGKILL. | ||
785 | */ | ||
786 | p->signal->flags = 0; | ||
787 | } | ||
788 | } | ||
789 | |||
790 | static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | ||
791 | struct sigpending *signals) | ||
792 | { | ||
793 | struct sigqueue * q = NULL; | ||
794 | int ret = 0; | ||
795 | |||
796 | /* | ||
797 | * fast-pathed signals for kernel-internal things like SIGSTOP | ||
798 | * or SIGKILL. | ||
799 | */ | ||
800 | if ((unsigned long)info == 2) | ||
801 | goto out_set; | ||
802 | |||
803 | /* Real-time signals must be queued if sent by sigqueue, or | ||
804 | some other real-time mechanism. It is implementation | ||
805 | defined whether kill() does so. We attempt to do so, on | ||
806 | the principle of least surprise, but since kill is not | ||
807 | allowed to fail with EAGAIN when low on memory we just | ||
808 | make sure at least one signal gets delivered and don't | ||
809 | pass on the info struct. */ | ||
810 | |||
811 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && | ||
812 | ((unsigned long) info < 2 || | ||
813 | info->si_code >= 0))); | ||
814 | if (q) { | ||
815 | list_add_tail(&q->list, &signals->list); | ||
816 | switch ((unsigned long) info) { | ||
817 | case 0: | ||
818 | q->info.si_signo = sig; | ||
819 | q->info.si_errno = 0; | ||
820 | q->info.si_code = SI_USER; | ||
821 | q->info.si_pid = current->pid; | ||
822 | q->info.si_uid = current->uid; | ||
823 | break; | ||
824 | case 1: | ||
825 | q->info.si_signo = sig; | ||
826 | q->info.si_errno = 0; | ||
827 | q->info.si_code = SI_KERNEL; | ||
828 | q->info.si_pid = 0; | ||
829 | q->info.si_uid = 0; | ||
830 | break; | ||
831 | default: | ||
832 | copy_siginfo(&q->info, info); | ||
833 | break; | ||
834 | } | ||
835 | } else { | ||
836 | if (sig >= SIGRTMIN && info && (unsigned long)info != 1 | ||
837 | && info->si_code != SI_USER) | ||
838 | /* | ||
839 | * Queue overflow, abort. We may abort if the signal was rt | ||
840 | * and sent by user using something other than kill(). | ||
841 | */ | ||
842 | return -EAGAIN; | ||
843 | if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) | ||
844 | /* | ||
845 | * Set up a return to indicate that we dropped | ||
846 | * the signal. | ||
847 | */ | ||
848 | ret = info->si_sys_private; | ||
849 | } | ||
850 | |||
851 | out_set: | ||
852 | sigaddset(&signals->signal, sig); | ||
853 | return ret; | ||
854 | } | ||
855 | |||
856 | #define LEGACY_QUEUE(sigptr, sig) \ | ||
857 | (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) | ||
858 | |||
859 | |||
860 | static int | ||
861 | specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) | ||
862 | { | ||
863 | int ret = 0; | ||
864 | |||
865 | if (!irqs_disabled()) | ||
866 | BUG(); | ||
867 | assert_spin_locked(&t->sighand->siglock); | ||
868 | |||
869 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
870 | /* | ||
871 | * Set up a return to indicate that we dropped the signal. | ||
872 | */ | ||
873 | ret = info->si_sys_private; | ||
874 | |||
875 | /* Short-circuit ignored signals. */ | ||
876 | if (sig_ignored(t, sig)) | ||
877 | goto out; | ||
878 | |||
879 | /* Support queueing exactly one non-rt signal, so that we | ||
880 | can get more detailed information about the cause of | ||
881 | the signal. */ | ||
882 | if (LEGACY_QUEUE(&t->pending, sig)) | ||
883 | goto out; | ||
884 | |||
885 | ret = send_signal(sig, info, t, &t->pending); | ||
886 | if (!ret && !sigismember(&t->blocked, sig)) | ||
887 | signal_wake_up(t, sig == SIGKILL); | ||
888 | out: | ||
889 | return ret; | ||
890 | } | ||
891 | |||
892 | /* | ||
893 | * Force a signal that the process can't ignore: if necessary | ||
894 | * we unblock the signal and change any SIG_IGN to SIG_DFL. | ||
895 | */ | ||
896 | |||
897 | int | ||
898 | force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | ||
899 | { | ||
900 | unsigned long int flags; | ||
901 | int ret; | ||
902 | |||
903 | spin_lock_irqsave(&t->sighand->siglock, flags); | ||
904 | if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { | ||
905 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | ||
906 | sigdelset(&t->blocked, sig); | ||
907 | recalc_sigpending_tsk(t); | ||
908 | } | ||
909 | ret = specific_send_sig_info(sig, info, t); | ||
910 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | ||
911 | |||
912 | return ret; | ||
913 | } | ||
914 | |||
915 | void | ||
916 | force_sig_specific(int sig, struct task_struct *t) | ||
917 | { | ||
918 | unsigned long int flags; | ||
919 | |||
920 | spin_lock_irqsave(&t->sighand->siglock, flags); | ||
921 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) | ||
922 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | ||
923 | sigdelset(&t->blocked, sig); | ||
924 | recalc_sigpending_tsk(t); | ||
925 | specific_send_sig_info(sig, (void *)2, t); | ||
926 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | ||
927 | } | ||
928 | |||
929 | /* | ||
930 | * Test if P wants to take SIG. After we've checked all threads with this, | ||
931 | * it's equivalent to finding no threads not blocking SIG. Any threads not | ||
932 | * blocking SIG were ruled out because they are not running and already | ||
933 | * have pending signals. Such threads will dequeue from the shared queue | ||
934 | * as soon as they're available, so putting the signal on the shared queue | ||
935 | * will be equivalent to sending it to one such thread. | ||
936 | */ | ||
937 | #define wants_signal(sig, p, mask) \ | ||
938 | (!sigismember(&(p)->blocked, sig) \ | ||
939 | && !((p)->state & mask) \ | ||
940 | && !((p)->flags & PF_EXITING) \ | ||
941 | && (task_curr(p) || !signal_pending(p))) | ||
942 | |||
943 | |||
944 | static void | ||
945 | __group_complete_signal(int sig, struct task_struct *p) | ||
946 | { | ||
947 | unsigned int mask; | ||
948 | struct task_struct *t; | ||
949 | |||
950 | /* | ||
951 | * Don't bother traced and stopped tasks (but | ||
952 | * SIGKILL will punch through that). | ||
953 | */ | ||
954 | mask = TASK_STOPPED | TASK_TRACED; | ||
955 | if (sig == SIGKILL) | ||
956 | mask = 0; | ||
957 | |||
958 | /* | ||
959 | * Now find a thread we can wake up to take the signal off the queue. | ||
960 | * | ||
961 | * If the main thread wants the signal, it gets first crack. | ||
962 | * Probably the least surprising to the average bear. | ||
963 | */ | ||
964 | if (wants_signal(sig, p, mask)) | ||
965 | t = p; | ||
966 | else if (thread_group_empty(p)) | ||
967 | /* | ||
968 | * There is just one thread and it does not need to be woken. | ||
969 | * It will dequeue unblocked signals before it runs again. | ||
970 | */ | ||
971 | return; | ||
972 | else { | ||
973 | /* | ||
974 | * Otherwise try to find a suitable thread. | ||
975 | */ | ||
976 | t = p->signal->curr_target; | ||
977 | if (t == NULL) | ||
978 | /* restart balancing at this thread */ | ||
979 | t = p->signal->curr_target = p; | ||
980 | BUG_ON(t->tgid != p->tgid); | ||
981 | |||
982 | while (!wants_signal(sig, t, mask)) { | ||
983 | t = next_thread(t); | ||
984 | if (t == p->signal->curr_target) | ||
985 | /* | ||
986 | * No thread needs to be woken. | ||
987 | * Any eligible threads will see | ||
988 | * the signal in the queue soon. | ||
989 | */ | ||
990 | return; | ||
991 | } | ||
992 | p->signal->curr_target = t; | ||
993 | } | ||
994 | |||
995 | /* | ||
996 | * Found a killable thread. If the signal will be fatal, | ||
997 | * then start taking the whole group down immediately. | ||
998 | */ | ||
999 | if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) && | ||
1000 | !sigismember(&t->real_blocked, sig) && | ||
1001 | (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { | ||
1002 | /* | ||
1003 | * This signal will be fatal to the whole group. | ||
1004 | */ | ||
1005 | if (!sig_kernel_coredump(sig)) { | ||
1006 | /* | ||
1007 | * Start a group exit and wake everybody up. | ||
1008 | * This way we don't have other threads | ||
1009 | * running and doing things after a slower | ||
1010 | * thread has the fatal signal pending. | ||
1011 | */ | ||
1012 | p->signal->flags = SIGNAL_GROUP_EXIT; | ||
1013 | p->signal->group_exit_code = sig; | ||
1014 | p->signal->group_stop_count = 0; | ||
1015 | t = p; | ||
1016 | do { | ||
1017 | sigaddset(&t->pending.signal, SIGKILL); | ||
1018 | signal_wake_up(t, 1); | ||
1019 | t = next_thread(t); | ||
1020 | } while (t != p); | ||
1021 | return; | ||
1022 | } | ||
1023 | |||
1024 | /* | ||
1025 | * There will be a core dump. We make all threads other | ||
1026 | * than the chosen one go into a group stop so that nothing | ||
1027 | * happens until it gets scheduled, takes the signal off | ||
1028 | * the shared queue, and does the core dump. This is a | ||
1029 | * little more complicated than strictly necessary, but it | ||
1030 | * keeps the signal state that winds up in the core dump | ||
1031 | * unchanged from the death state, e.g. which thread had | ||
1032 | * the core-dump signal unblocked. | ||
1033 | */ | ||
1034 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
1035 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); | ||
1036 | p->signal->group_stop_count = 0; | ||
1037 | p->signal->group_exit_task = t; | ||
1038 | t = p; | ||
1039 | do { | ||
1040 | p->signal->group_stop_count++; | ||
1041 | signal_wake_up(t, 0); | ||
1042 | t = next_thread(t); | ||
1043 | } while (t != p); | ||
1044 | wake_up_process(p->signal->group_exit_task); | ||
1045 | return; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * The signal is already in the shared-pending queue. | ||
1050 | * Tell the chosen thread to wake up and dequeue it. | ||
1051 | */ | ||
1052 | signal_wake_up(t, sig == SIGKILL); | ||
1053 | return; | ||
1054 | } | ||
1055 | |||
1056 | int | ||
1057 | __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | ||
1058 | { | ||
1059 | int ret = 0; | ||
1060 | |||
1061 | assert_spin_locked(&p->sighand->siglock); | ||
1062 | handle_stop_signal(sig, p); | ||
1063 | |||
1064 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
1065 | /* | ||
1066 | * Set up a return to indicate that we dropped the signal. | ||
1067 | */ | ||
1068 | ret = info->si_sys_private; | ||
1069 | |||
1070 | /* Short-circuit ignored signals. */ | ||
1071 | if (sig_ignored(p, sig)) | ||
1072 | return ret; | ||
1073 | |||
1074 | if (LEGACY_QUEUE(&p->signal->shared_pending, sig)) | ||
1075 | /* This is a non-RT signal and we already have one queued. */ | ||
1076 | return ret; | ||
1077 | |||
1078 | /* | ||
1079 | * Put this signal on the shared-pending queue, or fail with EAGAIN. | ||
1080 | * We always use the shared queue for process-wide signals, | ||
1081 | * to avoid several races. | ||
1082 | */ | ||
1083 | ret = send_signal(sig, info, p, &p->signal->shared_pending); | ||
1084 | if (unlikely(ret)) | ||
1085 | return ret; | ||
1086 | |||
1087 | __group_complete_signal(sig, p); | ||
1088 | return 0; | ||
1089 | } | ||
1090 | |||
1091 | /* | ||
1092 | * Nuke all other threads in the group. | ||
1093 | */ | ||
1094 | void zap_other_threads(struct task_struct *p) | ||
1095 | { | ||
1096 | struct task_struct *t; | ||
1097 | |||
1098 | p->signal->flags = SIGNAL_GROUP_EXIT; | ||
1099 | p->signal->group_stop_count = 0; | ||
1100 | |||
1101 | if (thread_group_empty(p)) | ||
1102 | return; | ||
1103 | |||
1104 | for (t = next_thread(p); t != p; t = next_thread(t)) { | ||
1105 | /* | ||
1106 | * Don't bother with already dead threads | ||
1107 | */ | ||
1108 | if (t->exit_state) | ||
1109 | continue; | ||
1110 | |||
1111 | /* | ||
1112 | * We don't want to notify the parent, since we are | ||
1113 | * killed as part of a thread group due to another | ||
1114 | * thread doing an execve() or similar. So set the | ||
1115 | * exit signal to -1 to allow immediate reaping of | ||
1116 | * the process. But don't detach the thread group | ||
1117 | * leader. | ||
1118 | */ | ||
1119 | if (t != p->group_leader) | ||
1120 | t->exit_signal = -1; | ||
1121 | |||
1122 | sigaddset(&t->pending.signal, SIGKILL); | ||
1123 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
1124 | signal_wake_up(t, 1); | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | /* | ||
1129 | * Must be called with the tasklist_lock held for reading! | ||
1130 | */ | ||
1131 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | ||
1132 | { | ||
1133 | unsigned long flags; | ||
1134 | int ret; | ||
1135 | |||
1136 | ret = check_kill_permission(sig, info, p); | ||
1137 | if (!ret && sig && p->sighand) { | ||
1138 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1139 | ret = __group_send_sig_info(sig, info, p); | ||
1140 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1141 | } | ||
1142 | |||
1143 | return ret; | ||
1144 | } | ||
1145 | |||
1146 | /* | ||
1147 | * kill_pg_info() sends a signal to a process group: this is what the tty | ||
1148 | * control characters do (^C, ^Z etc) | ||
1149 | */ | ||
1150 | |||
1151 | int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) | ||
1152 | { | ||
1153 | struct task_struct *p = NULL; | ||
1154 | int retval, success; | ||
1155 | |||
1156 | if (pgrp <= 0) | ||
1157 | return -EINVAL; | ||
1158 | |||
1159 | success = 0; | ||
1160 | retval = -ESRCH; | ||
1161 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | ||
1162 | int err = group_send_sig_info(sig, info, p); | ||
1163 | success |= !err; | ||
1164 | retval = err; | ||
1165 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); | ||
1166 | return success ? 0 : retval; | ||
1167 | } | ||
1168 | |||
1169 | int | ||
1170 | kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) | ||
1171 | { | ||
1172 | int retval; | ||
1173 | |||
1174 | read_lock(&tasklist_lock); | ||
1175 | retval = __kill_pg_info(sig, info, pgrp); | ||
1176 | read_unlock(&tasklist_lock); | ||
1177 | |||
1178 | return retval; | ||
1179 | } | ||
1180 | |||
1181 | int | ||
1182 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) | ||
1183 | { | ||
1184 | int error; | ||
1185 | struct task_struct *p; | ||
1186 | |||
1187 | read_lock(&tasklist_lock); | ||
1188 | p = find_task_by_pid(pid); | ||
1189 | error = -ESRCH; | ||
1190 | if (p) | ||
1191 | error = group_send_sig_info(sig, info, p); | ||
1192 | read_unlock(&tasklist_lock); | ||
1193 | return error; | ||
1194 | } | ||
1195 | |||
1196 | |||
1197 | /* | ||
1198 | * kill_something_info() interprets pid in interesting ways just like kill(2). | ||
1199 | * | ||
1200 | * POSIX specifies that kill(-1,sig) is unspecified, but what we have | ||
1201 | * is probably wrong. Should make it like BSD or SYSV. | ||
1202 | */ | ||
1203 | |||
1204 | static int kill_something_info(int sig, struct siginfo *info, int pid) | ||
1205 | { | ||
1206 | if (!pid) { | ||
1207 | return kill_pg_info(sig, info, process_group(current)); | ||
1208 | } else if (pid == -1) { | ||
1209 | int retval = 0, count = 0; | ||
1210 | struct task_struct * p; | ||
1211 | |||
1212 | read_lock(&tasklist_lock); | ||
1213 | for_each_process(p) { | ||
1214 | if (p->pid > 1 && p->tgid != current->tgid) { | ||
1215 | int err = group_send_sig_info(sig, info, p); | ||
1216 | ++count; | ||
1217 | if (err != -EPERM) | ||
1218 | retval = err; | ||
1219 | } | ||
1220 | } | ||
1221 | read_unlock(&tasklist_lock); | ||
1222 | return count ? retval : -ESRCH; | ||
1223 | } else if (pid < 0) { | ||
1224 | return kill_pg_info(sig, info, -pid); | ||
1225 | } else { | ||
1226 | return kill_proc_info(sig, info, pid); | ||
1227 | } | ||
1228 | } | ||
1229 | |||
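A minimal user-space sketch of how kill(2)'s pid argument maps onto the cases handled by kill_something_info() above; the forked child and the signals used are illustrative only:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t child = fork();

            if (child == 0) {
                    pause();                /* wait to be signalled */
                    _exit(0);
            }

            kill(child, SIGTERM);           /* pid > 0: one specific process */
            kill(0, 0);                     /* pid == 0: caller's process group */
            kill(-getpgrp(), 0);            /* pid < -1: that process group */
            /* kill(-1, sig) would signal every process we are allowed to,
             * except init and our own thread group -- the for_each_process
             * loop above. */

            waitpid(child, NULL, 0);
            return 0;
    }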
1230 | /* | ||
1231 | * These are for backward compatibility with the rest of the kernel source. | ||
1232 | */ | ||
1233 | |||
1234 | /* | ||
1235 | * These two are the most common entry points. They send a signal | ||
1236 | * just to the specific thread. | ||
1237 | */ | ||
1238 | int | ||
1239 | send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | ||
1240 | { | ||
1241 | int ret; | ||
1242 | unsigned long flags; | ||
1243 | |||
1244 | /* | ||
1245 | * Make sure legacy kernel users don't send in bad values | ||
1246 | * (normal paths check this in check_kill_permission). | ||
1247 | */ | ||
1248 | if (sig < 0 || sig > _NSIG) | ||
1249 | return -EINVAL; | ||
1250 | |||
1251 | /* | ||
1252 | * We need the tasklist lock even for the specific | ||
1253 | * thread case (when we don't need to follow the group | ||
1254 | * lists) in order to avoid races with "p->sighand" | ||
1255 | * going away or changing from under us. | ||
1256 | */ | ||
1257 | read_lock(&tasklist_lock); | ||
1258 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1259 | ret = specific_send_sig_info(sig, info, p); | ||
1260 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1261 | read_unlock(&tasklist_lock); | ||
1262 | return ret; | ||
1263 | } | ||
1264 | |||
1265 | int | ||
1266 | send_sig(int sig, struct task_struct *p, int priv) | ||
1267 | { | ||
1268 | return send_sig_info(sig, (void*)(long)(priv != 0), p); | ||
1269 | } | ||
1270 | |||
1271 | /* | ||
1272 | * This is the entry point for "process-wide" signals. | ||
1273 | * They will go to an appropriate thread in the thread group. | ||
1274 | */ | ||
1275 | int | ||
1276 | send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) | ||
1277 | { | ||
1278 | int ret; | ||
1279 | read_lock(&tasklist_lock); | ||
1280 | ret = group_send_sig_info(sig, info, p); | ||
1281 | read_unlock(&tasklist_lock); | ||
1282 | return ret; | ||
1283 | } | ||
1284 | |||
1285 | void | ||
1286 | force_sig(int sig, struct task_struct *p) | ||
1287 | { | ||
1288 | force_sig_info(sig, (void*)1L, p); | ||
1289 | } | ||
1290 | |||
1291 | /* | ||
1292 | * When things go south during signal handling, we | ||
1293 | * will force a SIGSEGV. And if the signal that caused | ||
1294 | * the problem was already a SIGSEGV, we'll want to | ||
1295 | * make sure we don't even try to deliver the signal.. | ||
1296 | */ | ||
1297 | int | ||
1298 | force_sigsegv(int sig, struct task_struct *p) | ||
1299 | { | ||
1300 | if (sig == SIGSEGV) { | ||
1301 | unsigned long flags; | ||
1302 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1303 | p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; | ||
1304 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1305 | } | ||
1306 | force_sig(SIGSEGV, p); | ||
1307 | return 0; | ||
1308 | } | ||
1309 | |||
1310 | int | ||
1311 | kill_pg(pid_t pgrp, int sig, int priv) | ||
1312 | { | ||
1313 | return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); | ||
1314 | } | ||
1315 | |||
1316 | int | ||
1317 | kill_proc(pid_t pid, int sig, int priv) | ||
1318 | { | ||
1319 | return kill_proc_info(sig, (void *)(long)(priv != 0), pid); | ||
1320 | } | ||
1321 | |||
1322 | /* | ||
1323 | * These functions support sending signals using preallocated sigqueue | ||
1324 | * structures. This is needed "because realtime applications cannot | ||
1325 | * afford to lose notifications of asynchronous events, like timer | ||
1326 | * expirations or I/O completions". In the case of Posix Timers | ||
1327 | * we allocate the sigqueue structure from the timer_create. If this | ||
1328 | * allocation fails we are able to report the failure to the application | ||
1329 | * with an EAGAIN error. | ||
1330 | */ | ||
1331 | |||
1332 | struct sigqueue *sigqueue_alloc(void) | ||
1333 | { | ||
1334 | struct sigqueue *q; | ||
1335 | |||
1336 | if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) | ||
1337 | q->flags |= SIGQUEUE_PREALLOC; | ||
1338 | return(q); | ||
1339 | } | ||
1340 | |||
1341 | void sigqueue_free(struct sigqueue *q) | ||
1342 | { | ||
1343 | unsigned long flags; | ||
1344 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | ||
1345 | /* | ||
1346 | * If the signal is still pending remove it from the | ||
1347 | * pending queue. | ||
1348 | */ | ||
1349 | if (unlikely(!list_empty(&q->list))) { | ||
1350 | read_lock(&tasklist_lock); | ||
1351 | spin_lock_irqsave(q->lock, flags); | ||
1352 | if (!list_empty(&q->list)) | ||
1353 | list_del_init(&q->list); | ||
1354 | spin_unlock_irqrestore(q->lock, flags); | ||
1355 | read_unlock(&tasklist_lock); | ||
1356 | } | ||
1357 | q->flags &= ~SIGQUEUE_PREALLOC; | ||
1358 | __sigqueue_free(q); | ||
1359 | } | ||
1360 | |||
1361 | int | ||
1362 | send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | ||
1363 | { | ||
1364 | unsigned long flags; | ||
1365 | int ret = 0; | ||
1366 | |||
1367 | /* | ||
1368 | * We need the tasklist lock even for the specific | ||
1369 | * thread case (when we don't need to follow the group | ||
1370 | * lists) in order to avoid races with "p->sighand" | ||
1371 | * going away or changing from under us. | ||
1372 | */ | ||
1373 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | ||
1374 | read_lock(&tasklist_lock); | ||
1375 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1376 | |||
1377 | if (unlikely(!list_empty(&q->list))) { | ||
1378 | /* | ||
1379 | * If an SI_TIMER entry is already queued, just increment | ||
1380 | * the overrun count. | ||
1381 | */ | ||
1382 | if (q->info.si_code != SI_TIMER) | ||
1383 | BUG(); | ||
1384 | q->info.si_overrun++; | ||
1385 | goto out; | ||
1386 | } | ||
1387 | /* Short-circuit ignored signals. */ | ||
1388 | if (sig_ignored(p, sig)) { | ||
1389 | ret = 1; | ||
1390 | goto out; | ||
1391 | } | ||
1392 | |||
1393 | q->lock = &p->sighand->siglock; | ||
1394 | list_add_tail(&q->list, &p->pending.list); | ||
1395 | sigaddset(&p->pending.signal, sig); | ||
1396 | if (!sigismember(&p->blocked, sig)) | ||
1397 | signal_wake_up(p, sig == SIGKILL); | ||
1398 | |||
1399 | out: | ||
1400 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1401 | read_unlock(&tasklist_lock); | ||
1402 | return(ret); | ||
1403 | } | ||
1404 | |||
1405 | int | ||
1406 | send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | ||
1407 | { | ||
1408 | unsigned long flags; | ||
1409 | int ret = 0; | ||
1410 | |||
1411 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | ||
1412 | read_lock(&tasklist_lock); | ||
1413 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1414 | handle_stop_signal(sig, p); | ||
1415 | |||
1416 | /* Short-circuit ignored signals. */ | ||
1417 | if (sig_ignored(p, sig)) { | ||
1418 | ret = 1; | ||
1419 | goto out; | ||
1420 | } | ||
1421 | |||
1422 | if (unlikely(!list_empty(&q->list))) { | ||
1423 | /* | ||
1424 | * If an SI_TIMER entry is already queued, just increment | ||
1425 | * the overrun count. Other uses should not try to | ||
1426 | * send the signal multiple times. | ||
1427 | */ | ||
1428 | if (q->info.si_code != SI_TIMER) | ||
1429 | BUG(); | ||
1430 | q->info.si_overrun++; | ||
1431 | goto out; | ||
1432 | } | ||
1433 | |||
1434 | /* | ||
1435 | * Put this signal on the shared-pending queue. | ||
1436 | * We always use the shared queue for process-wide signals, | ||
1437 | * to avoid several races. | ||
1438 | */ | ||
1439 | q->lock = &p->sighand->siglock; | ||
1440 | list_add_tail(&q->list, &p->signal->shared_pending.list); | ||
1441 | sigaddset(&p->signal->shared_pending.signal, sig); | ||
1442 | |||
1443 | __group_complete_signal(sig, p); | ||
1444 | out: | ||
1445 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1446 | read_unlock(&tasklist_lock); | ||
1447 | return(ret); | ||
1448 | } | ||
1449 | |||
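The user-visible side of the preallocation described above is that a POSIX timer reports resource exhaustion at creation time rather than silently dropping expirations later. A minimal sketch (link with -lrt on glibc of this era; the real-time signal choice is illustrative):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    int main(void)
    {
            struct sigevent sev;
            timer_t timerid;

            memset(&sev, 0, sizeof(sev));
            sev.sigev_notify = SIGEV_SIGNAL;
            sev.sigev_signo = SIGRTMIN;             /* a queued (real-time) signal */
            sev.sigev_value.sival_int = 42;

            /* The kernel preallocates the sigqueue entry here, so failure
             * shows up now as EAGAIN instead of a lost expiration later. */
            if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
                    perror("timer_create");
                    return 1;
            }

            timer_delete(timerid);
            return 0;
    }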
1450 | /* | ||
1451 | * Wake up any threads in the parent blocked in wait* syscalls. | ||
1452 | */ | ||
1453 | static inline void __wake_up_parent(struct task_struct *p, | ||
1454 | struct task_struct *parent) | ||
1455 | { | ||
1456 | wake_up_interruptible_sync(&parent->signal->wait_chldexit); | ||
1457 | } | ||
1458 | |||
1459 | /* | ||
1460 | * Let a parent know about the death of a child. | ||
1461 | * For a stopped/continued status change, use do_notify_parent_cldstop instead. | ||
1462 | */ | ||
1463 | |||
1464 | void do_notify_parent(struct task_struct *tsk, int sig) | ||
1465 | { | ||
1466 | struct siginfo info; | ||
1467 | unsigned long flags; | ||
1468 | struct sighand_struct *psig; | ||
1469 | |||
1470 | BUG_ON(sig == -1); | ||
1471 | |||
1472 | /* do_notify_parent_cldstop should have been called instead. */ | ||
1473 | BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED)); | ||
1474 | |||
1475 | BUG_ON(!tsk->ptrace && | ||
1476 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); | ||
1477 | |||
1478 | info.si_signo = sig; | ||
1479 | info.si_errno = 0; | ||
1480 | info.si_pid = tsk->pid; | ||
1481 | info.si_uid = tsk->uid; | ||
1482 | |||
1483 | /* FIXME: find out whether or not this is supposed to be c*time. */ | ||
1484 | info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime, | ||
1485 | tsk->signal->utime)); | ||
1486 | info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime, | ||
1487 | tsk->signal->stime)); | ||
1488 | |||
1489 | info.si_status = tsk->exit_code & 0x7f; | ||
1490 | if (tsk->exit_code & 0x80) | ||
1491 | info.si_code = CLD_DUMPED; | ||
1492 | else if (tsk->exit_code & 0x7f) | ||
1493 | info.si_code = CLD_KILLED; | ||
1494 | else { | ||
1495 | info.si_code = CLD_EXITED; | ||
1496 | info.si_status = tsk->exit_code >> 8; | ||
1497 | } | ||
1498 | |||
1499 | psig = tsk->parent->sighand; | ||
1500 | spin_lock_irqsave(&psig->siglock, flags); | ||
1501 | if (sig == SIGCHLD && | ||
1502 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || | ||
1503 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { | ||
1504 | /* | ||
1505 | * We are exiting and our parent doesn't care. POSIX.1 | ||
1506 | * defines special semantics for setting SIGCHLD to SIG_IGN | ||
1507 | * or setting the SA_NOCLDWAIT flag: we should be reaped | ||
1508 | * automatically and not left for our parent's wait4 call. | ||
1509 | * Rather than having the parent do it as a magic kind of | ||
1510 | * signal handler, we just set this to tell do_exit that we | ||
1511 | * can be cleaned up without becoming a zombie. Note that | ||
1512 | * we still call __wake_up_parent in this case, because a | ||
1513 | * blocked sys_wait4 might now return -ECHILD. | ||
1514 | * | ||
1515 | * Whether we send SIGCHLD or not for SA_NOCLDWAIT | ||
1516 | * is implementation-defined: we do (if you don't want | ||
1517 | * it, just use SIG_IGN instead). | ||
1518 | */ | ||
1519 | tsk->exit_signal = -1; | ||
1520 | if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) | ||
1521 | sig = 0; | ||
1522 | } | ||
1523 | if (sig > 0 && sig <= _NSIG) | ||
1524 | __group_send_sig_info(sig, &info, tsk->parent); | ||
1525 | __wake_up_parent(tsk, tsk->parent); | ||
1526 | spin_unlock_irqrestore(&psig->siglock, flags); | ||
1527 | } | ||
1528 | |||
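The SIG_IGN/SA_NOCLDWAIT semantics described in the comment above are visible from user space: with SIGCHLD ignored, children are reaped automatically and a blocked wait() comes back with ECHILD. A small sketch:

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            /* Tell the kernel we don't care about child exit status. */
            signal(SIGCHLD, SIG_IGN);

            if (fork() == 0)
                    _exit(0);               /* child exits immediately */

            sleep(1);                       /* give the child time to exit */

            /* The child was reaped automatically, so there is nothing
             * left to wait for: wait() fails with ECHILD instead of
             * blocking or returning the child's status. */
            if (wait(NULL) == -1 && errno == ECHILD)
                    printf("child was auto-reaped\n");
            return 0;
    }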
1529 | static void | ||
1530 | do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, | ||
1531 | int why) | ||
1532 | { | ||
1533 | struct siginfo info; | ||
1534 | unsigned long flags; | ||
1535 | struct sighand_struct *sighand; | ||
1536 | |||
1537 | info.si_signo = SIGCHLD; | ||
1538 | info.si_errno = 0; | ||
1539 | info.si_pid = tsk->pid; | ||
1540 | info.si_uid = tsk->uid; | ||
1541 | |||
1542 | /* FIXME: find out whether or not this is supposed to be c*time. */ | ||
1543 | info.si_utime = cputime_to_jiffies(tsk->utime); | ||
1544 | info.si_stime = cputime_to_jiffies(tsk->stime); | ||
1545 | |||
1546 | info.si_code = why; | ||
1547 | switch (why) { | ||
1548 | case CLD_CONTINUED: | ||
1549 | info.si_status = SIGCONT; | ||
1550 | break; | ||
1551 | case CLD_STOPPED: | ||
1552 | info.si_status = tsk->signal->group_exit_code & 0x7f; | ||
1553 | break; | ||
1554 | case CLD_TRAPPED: | ||
1555 | info.si_status = tsk->exit_code & 0x7f; | ||
1556 | break; | ||
1557 | default: | ||
1558 | BUG(); | ||
1559 | } | ||
1560 | |||
1561 | sighand = parent->sighand; | ||
1562 | spin_lock_irqsave(&sighand->siglock, flags); | ||
1563 | if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && | ||
1564 | !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) | ||
1565 | __group_send_sig_info(SIGCHLD, &info, parent); | ||
1566 | /* | ||
1567 | * Even if SIGCHLD is not generated, we must wake up wait4 calls. | ||
1568 | */ | ||
1569 | __wake_up_parent(tsk, parent); | ||
1570 | spin_unlock_irqrestore(&sighand->siglock, flags); | ||
1571 | } | ||
1572 | |||
1573 | /* | ||
1574 | * This must be called with current->sighand->siglock held. | ||
1575 | * | ||
1576 | * This should be the path for all ptrace stops. | ||
1577 | * We always set current->last_siginfo while stopped here. | ||
1578 | * That makes it a way to test a stopped process for | ||
1579 | * being ptrace-stopped vs being job-control-stopped. | ||
1580 | * | ||
1581 | * If we actually decide not to stop at all because the tracer is gone, | ||
1582 | * we leave nostop_code in current->exit_code. | ||
1583 | */ | ||
1584 | static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) | ||
1585 | { | ||
1586 | /* | ||
1587 | * If there is a group stop in progress, | ||
1588 | * we must participate in the bookkeeping. | ||
1589 | */ | ||
1590 | if (current->signal->group_stop_count > 0) | ||
1591 | --current->signal->group_stop_count; | ||
1592 | |||
1593 | current->last_siginfo = info; | ||
1594 | current->exit_code = exit_code; | ||
1595 | |||
1596 | /* Let the debugger run. */ | ||
1597 | set_current_state(TASK_TRACED); | ||
1598 | spin_unlock_irq(¤t->sighand->siglock); | ||
1599 | read_lock(&tasklist_lock); | ||
1600 | if (likely(current->ptrace & PT_PTRACED) && | ||
1601 | likely(current->parent != current->real_parent || | ||
1602 | !(current->ptrace & PT_ATTACHED)) && | ||
1603 | (likely(current->parent->signal != current->signal) || | ||
1604 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { | ||
1605 | do_notify_parent_cldstop(current, current->parent, | ||
1606 | CLD_TRAPPED); | ||
1607 | read_unlock(&tasklist_lock); | ||
1608 | schedule(); | ||
1609 | } else { | ||
1610 | /* | ||
1611 | * By the time we got the lock, our tracer went away. | ||
1612 | * Don't stop here. | ||
1613 | */ | ||
1614 | read_unlock(&tasklist_lock); | ||
1615 | set_current_state(TASK_RUNNING); | ||
1616 | current->exit_code = nostop_code; | ||
1617 | } | ||
1618 | |||
1619 | /* | ||
1620 | * We are back. Now reacquire the siglock before touching | ||
1621 | * last_siginfo, so that we are sure to have synchronized with | ||
1622 | * any signal-sending on another CPU that wants to examine it. | ||
1623 | */ | ||
1624 | spin_lock_irq(¤t->sighand->siglock); | ||
1625 | current->last_siginfo = NULL; | ||
1626 | |||
1627 | /* | ||
1628 | * Queued signals ignored us while we were stopped for tracing. | ||
1629 | * So check for any that we should take before resuming user mode. | ||
1630 | */ | ||
1631 | recalc_sigpending(); | ||
1632 | } | ||
1633 | |||
1634 | void ptrace_notify(int exit_code) | ||
1635 | { | ||
1636 | siginfo_t info; | ||
1637 | |||
1638 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); | ||
1639 | |||
1640 | memset(&info, 0, sizeof info); | ||
1641 | info.si_signo = SIGTRAP; | ||
1642 | info.si_code = exit_code; | ||
1643 | info.si_pid = current->pid; | ||
1644 | info.si_uid = current->uid; | ||
1645 | |||
1646 | /* Let the debugger run. */ | ||
1647 | spin_lock_irq(¤t->sighand->siglock); | ||
1648 | ptrace_stop(exit_code, 0, &info); | ||
1649 | spin_unlock_irq(¤t->sighand->siglock); | ||
1650 | } | ||
1651 | |||
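A rough user-space sketch of the notification path that ptrace_stop() feeds: the tracee traps, is parked in TASK_TRACED, and the tracer observes the stop through waitpid(). The use of /bin/true as the traced program is purely illustrative:

    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            int status;
            pid_t child = fork();

            if (child == 0) {
                    /* Ask to be traced; execve then raises SIGTRAP. */
                    ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                    execlp("true", "true", (char *)NULL);
                    _exit(1);
            }

            /* ptrace_stop() in the child notifies us via CLD_TRAPPED,
             * so waitpid() reports a stopped child. */
            waitpid(child, &status, 0);
            if (WIFSTOPPED(status))
                    printf("tracee stopped by signal %d\n", WSTOPSIG(status));

            ptrace(PTRACE_CONT, child, NULL, NULL);
            waitpid(child, &status, 0);
            return 0;
    }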
1652 | #ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER | ||
1653 | |||
1654 | static void | ||
1655 | finish_stop(int stop_count) | ||
1656 | { | ||
1657 | /* | ||
1658 | * If there are no other threads in the group, or if there is | ||
1659 | * a group stop in progress and we are the last to stop, | ||
1660 | * report to the parent. When ptraced, every thread reports itself. | ||
1661 | */ | ||
1662 | if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { | ||
1663 | read_lock(&tasklist_lock); | ||
1664 | do_notify_parent_cldstop(current, current->parent, | ||
1665 | CLD_STOPPED); | ||
1666 | read_unlock(&tasklist_lock); | ||
1667 | } | ||
1668 | else if (stop_count == 0) { | ||
1669 | read_lock(&tasklist_lock); | ||
1670 | do_notify_parent_cldstop(current->group_leader, | ||
1671 | current->group_leader->real_parent, | ||
1672 | CLD_STOPPED); | ||
1673 | read_unlock(&tasklist_lock); | ||
1674 | } | ||
1675 | |||
1676 | schedule(); | ||
1677 | /* | ||
1678 | * Now we don't run again until continued. | ||
1679 | */ | ||
1680 | current->exit_code = 0; | ||
1681 | } | ||
1682 | |||
1683 | /* | ||
1684 | * This performs the stopping for SIGSTOP and other stop signals. | ||
1685 | * We have to stop all threads in the thread group. | ||
1686 | * Returns nonzero if we've actually stopped and released the siglock. | ||
1687 | * Returns zero if we didn't stop and still hold the siglock. | ||
1688 | */ | ||
1689 | static int | ||
1690 | do_signal_stop(int signr) | ||
1691 | { | ||
1692 | struct signal_struct *sig = current->signal; | ||
1693 | struct sighand_struct *sighand = current->sighand; | ||
1694 | int stop_count = -1; | ||
1695 | |||
1696 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) | ||
1697 | return 0; | ||
1698 | |||
1699 | if (sig->group_stop_count > 0) { | ||
1700 | /* | ||
1701 | * There is a group stop in progress. We don't need to | ||
1702 | * start another one. | ||
1703 | */ | ||
1704 | signr = sig->group_exit_code; | ||
1705 | stop_count = --sig->group_stop_count; | ||
1706 | current->exit_code = signr; | ||
1707 | set_current_state(TASK_STOPPED); | ||
1708 | if (stop_count == 0) | ||
1709 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1710 | spin_unlock_irq(&sighand->siglock); | ||
1711 | } | ||
1712 | else if (thread_group_empty(current)) { | ||
1713 | /* | ||
1714 | * Lock must be held through transition to stopped state. | ||
1715 | */ | ||
1716 | current->exit_code = current->signal->group_exit_code = signr; | ||
1717 | set_current_state(TASK_STOPPED); | ||
1718 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1719 | spin_unlock_irq(&sighand->siglock); | ||
1720 | } | ||
1721 | else { | ||
1722 | /* | ||
1723 | * There is no group stop already in progress. | ||
1724 | * We must initiate one now, but that requires | ||
1725 | * dropping siglock to get both the tasklist lock | ||
1726 | * and siglock again in the proper order. Note that | ||
1727 | * this allows an intervening SIGCONT to be posted. | ||
1728 | * We need to check for that and bail out if necessary. | ||
1729 | */ | ||
1730 | struct task_struct *t; | ||
1731 | |||
1732 | spin_unlock_irq(&sighand->siglock); | ||
1733 | |||
1734 | /* signals can be posted during this window */ | ||
1735 | |||
1736 | read_lock(&tasklist_lock); | ||
1737 | spin_lock_irq(&sighand->siglock); | ||
1738 | |||
1739 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) { | ||
1740 | /* | ||
1741 | * Another stop or continue happened while we | ||
1742 | * didn't have the lock. We can just swallow this | ||
1743 | * signal now. If we raced with a SIGCONT, that | ||
1744 | * should have just cleared it now. If we raced | ||
1745 | * with another processor delivering a stop signal, | ||
1746 | * then the SIGCONT that wakes us up should clear it. | ||
1747 | */ | ||
1748 | read_unlock(&tasklist_lock); | ||
1749 | return 0; | ||
1750 | } | ||
1751 | |||
1752 | if (sig->group_stop_count == 0) { | ||
1753 | sig->group_exit_code = signr; | ||
1754 | stop_count = 0; | ||
1755 | for (t = next_thread(current); t != current; | ||
1756 | t = next_thread(t)) | ||
1757 | /* | ||
1758 | * Setting state to TASK_STOPPED for a group | ||
1759 | * stop is always done with the siglock held, | ||
1760 | * so this check has no races. | ||
1761 | */ | ||
1762 | if (t->state < TASK_STOPPED) { | ||
1763 | stop_count++; | ||
1764 | signal_wake_up(t, 0); | ||
1765 | } | ||
1766 | sig->group_stop_count = stop_count; | ||
1767 | } | ||
1768 | else { | ||
1769 | /* A race with another thread while unlocked. */ | ||
1770 | signr = sig->group_exit_code; | ||
1771 | stop_count = --sig->group_stop_count; | ||
1772 | } | ||
1773 | |||
1774 | current->exit_code = signr; | ||
1775 | set_current_state(TASK_STOPPED); | ||
1776 | if (stop_count == 0) | ||
1777 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1778 | |||
1779 | spin_unlock_irq(&sighand->siglock); | ||
1780 | read_unlock(&tasklist_lock); | ||
1781 | } | ||
1782 | |||
1783 | finish_stop(stop_count); | ||
1784 | return 1; | ||
1785 | } | ||
1786 | |||
1787 | /* | ||
1788 | * Do appropriate magic when group_stop_count > 0. | ||
1789 | * We return nonzero if we stopped, after releasing the siglock. | ||
1790 | * We return zero if we still hold the siglock and should look | ||
1791 | * for another signal without checking group_stop_count again. | ||
1792 | */ | ||
1793 | static inline int handle_group_stop(void) | ||
1794 | { | ||
1795 | int stop_count; | ||
1796 | |||
1797 | if (current->signal->group_exit_task == current) { | ||
1798 | /* | ||
1799 | * Group stop is so we can do a core dump; | ||
1800 | * we are the initiating thread, so get on with it. | ||
1801 | */ | ||
1802 | current->signal->group_exit_task = NULL; | ||
1803 | return 0; | ||
1804 | } | ||
1805 | |||
1806 | if (current->signal->flags & SIGNAL_GROUP_EXIT) | ||
1807 | /* | ||
1808 | * Group stop is so another thread can do a core dump, | ||
1809 | * or else we are racing against a death signal. | ||
1810 | * Just punt the stop so we can get the next signal. | ||
1811 | */ | ||
1812 | return 0; | ||
1813 | |||
1814 | /* | ||
1815 | * There is a group stop in progress. We stop | ||
1816 | * without any associated signal being in our queue. | ||
1817 | */ | ||
1818 | stop_count = --current->signal->group_stop_count; | ||
1819 | if (stop_count == 0) | ||
1820 | current->signal->flags = SIGNAL_STOP_STOPPED; | ||
1821 | current->exit_code = current->signal->group_exit_code; | ||
1822 | set_current_state(TASK_STOPPED); | ||
1823 | spin_unlock_irq(¤t->sighand->siglock); | ||
1824 | finish_stop(stop_count); | ||
1825 | return 1; | ||
1826 | } | ||
1827 | |||
1828 | int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | ||
1829 | struct pt_regs *regs, void *cookie) | ||
1830 | { | ||
1831 | sigset_t *mask = ¤t->blocked; | ||
1832 | int signr = 0; | ||
1833 | |||
1834 | relock: | ||
1835 | spin_lock_irq(¤t->sighand->siglock); | ||
1836 | for (;;) { | ||
1837 | struct k_sigaction *ka; | ||
1838 | |||
1839 | if (unlikely(current->signal->group_stop_count > 0) && | ||
1840 | handle_group_stop()) | ||
1841 | goto relock; | ||
1842 | |||
1843 | signr = dequeue_signal(current, mask, info); | ||
1844 | |||
1845 | if (!signr) | ||
1846 | break; /* will return 0 */ | ||
1847 | |||
1848 | if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { | ||
1849 | ptrace_signal_deliver(regs, cookie); | ||
1850 | |||
1851 | /* Let the debugger run. */ | ||
1852 | ptrace_stop(signr, signr, info); | ||
1853 | |||
1854 | /* We're back. Did the debugger cancel the sig? */ | ||
1855 | signr = current->exit_code; | ||
1856 | if (signr == 0) | ||
1857 | continue; | ||
1858 | |||
1859 | current->exit_code = 0; | ||
1860 | |||
1861 | /* Update the siginfo structure if the signal has | ||
1862 | changed. If the debugger wanted something | ||
1863 | specific in the siginfo structure then it should | ||
1864 | have updated *info via PTRACE_SETSIGINFO. */ | ||
1865 | if (signr != info->si_signo) { | ||
1866 | info->si_signo = signr; | ||
1867 | info->si_errno = 0; | ||
1868 | info->si_code = SI_USER; | ||
1869 | info->si_pid = current->parent->pid; | ||
1870 | info->si_uid = current->parent->uid; | ||
1871 | } | ||
1872 | |||
1873 | /* If the (new) signal is now blocked, requeue it. */ | ||
1874 | if (sigismember(¤t->blocked, signr)) { | ||
1875 | specific_send_sig_info(signr, info, current); | ||
1876 | continue; | ||
1877 | } | ||
1878 | } | ||
1879 | |||
1880 | ka = ¤t->sighand->action[signr-1]; | ||
1881 | if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ | ||
1882 | continue; | ||
1883 | if (ka->sa.sa_handler != SIG_DFL) { | ||
1884 | /* Run the handler. */ | ||
1885 | *return_ka = *ka; | ||
1886 | |||
1887 | if (ka->sa.sa_flags & SA_ONESHOT) | ||
1888 | ka->sa.sa_handler = SIG_DFL; | ||
1889 | |||
1890 | break; /* will return non-zero "signr" value */ | ||
1891 | } | ||
1892 | |||
1893 | /* | ||
1894 | * Now we are doing the default action for this signal. | ||
1895 | */ | ||
1896 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ | ||
1897 | continue; | ||
1898 | |||
1899 | /* Init gets no signals it doesn't want. */ | ||
1900 | if (current->pid == 1) | ||
1901 | continue; | ||
1902 | |||
1903 | if (sig_kernel_stop(signr)) { | ||
1904 | /* | ||
1905 | * The default action is to stop all threads in | ||
1906 | * the thread group. The job control signals | ||
1907 | * do nothing in an orphaned pgrp, but SIGSTOP | ||
1908 | * always works. Note that siglock needs to be | ||
1909 | * dropped during the call to is_orphaned_pgrp() | ||
1910 | * because of lock ordering with tasklist_lock. | ||
1911 | * This allows an intervening SIGCONT to be posted. | ||
1912 | * We need to check for that and bail out if necessary. | ||
1913 | */ | ||
1914 | if (signr != SIGSTOP) { | ||
1915 | spin_unlock_irq(¤t->sighand->siglock); | ||
1916 | |||
1917 | /* signals can be posted during this window */ | ||
1918 | |||
1919 | if (is_orphaned_pgrp(process_group(current))) | ||
1920 | goto relock; | ||
1921 | |||
1922 | spin_lock_irq(¤t->sighand->siglock); | ||
1923 | } | ||
1924 | |||
1925 | if (likely(do_signal_stop(signr))) { | ||
1926 | /* It released the siglock. */ | ||
1927 | goto relock; | ||
1928 | } | ||
1929 | |||
1930 | /* | ||
1931 | * We didn't actually stop, due to a race | ||
1932 | * with SIGCONT or something like that. | ||
1933 | */ | ||
1934 | continue; | ||
1935 | } | ||
1936 | |||
1937 | spin_unlock_irq(¤t->sighand->siglock); | ||
1938 | |||
1939 | /* | ||
1940 | * Anything else is fatal, maybe with a core dump. | ||
1941 | */ | ||
1942 | current->flags |= PF_SIGNALED; | ||
1943 | if (sig_kernel_coredump(signr)) { | ||
1944 | /* | ||
1945 | * If it was able to dump core, this kills all | ||
1946 | * other threads in the group and synchronizes with | ||
1947 | * their demise. If we lost the race with another | ||
1948 | * thread getting here, it set group_exit_code | ||
1949 | * first and our do_group_exit call below will use | ||
1950 | * that value and ignore the one we pass it. | ||
1951 | */ | ||
1952 | do_coredump((long)signr, signr, regs); | ||
1953 | } | ||
1954 | |||
1955 | /* | ||
1956 | * Death signals, no core dump. | ||
1957 | */ | ||
1958 | do_group_exit(signr); | ||
1959 | /* NOTREACHED */ | ||
1960 | } | ||
1961 | spin_unlock_irq(¤t->sighand->siglock); | ||
1962 | return signr; | ||
1963 | } | ||
1964 | |||
1965 | #endif | ||
1966 | |||
1967 | EXPORT_SYMBOL(recalc_sigpending); | ||
1968 | EXPORT_SYMBOL_GPL(dequeue_signal); | ||
1969 | EXPORT_SYMBOL(flush_signals); | ||
1970 | EXPORT_SYMBOL(force_sig); | ||
1971 | EXPORT_SYMBOL(kill_pg); | ||
1972 | EXPORT_SYMBOL(kill_proc); | ||
1973 | EXPORT_SYMBOL(ptrace_notify); | ||
1974 | EXPORT_SYMBOL(send_sig); | ||
1975 | EXPORT_SYMBOL(send_sig_info); | ||
1976 | EXPORT_SYMBOL(sigprocmask); | ||
1977 | EXPORT_SYMBOL(block_all_signals); | ||
1978 | EXPORT_SYMBOL(unblock_all_signals); | ||
1979 | |||
1980 | |||
1981 | /* | ||
1982 | * System call entry points. | ||
1983 | */ | ||
1984 | |||
1985 | asmlinkage long sys_restart_syscall(void) | ||
1986 | { | ||
1987 | struct restart_block *restart = ¤t_thread_info()->restart_block; | ||
1988 | return restart->fn(restart); | ||
1989 | } | ||
1990 | |||
1991 | long do_no_restart_syscall(struct restart_block *param) | ||
1992 | { | ||
1993 | return -EINTR; | ||
1994 | } | ||
1995 | |||
1996 | /* | ||
1997 | * We don't need to get the kernel lock - this is all local to this | ||
1998 | * particular thread.. (and that's good, because this is _heavily_ | ||
1999 | * used by various programs) | ||
2000 | */ | ||
2001 | |||
2002 | /* | ||
2003 | * This is also useful for kernel threads that want to temporarily | ||
2004 | * (or permanently) block certain signals. | ||
2005 | * | ||
2006 | * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel | ||
2007 | * interface happily blocks "unblockable" signals like SIGKILL | ||
2008 | * and friends. | ||
2009 | */ | ||
2010 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | ||
2011 | { | ||
2012 | int error; | ||
2013 | sigset_t old_block; | ||
2014 | |||
2015 | spin_lock_irq(¤t->sighand->siglock); | ||
2016 | old_block = current->blocked; | ||
2017 | error = 0; | ||
2018 | switch (how) { | ||
2019 | case SIG_BLOCK: | ||
2020 | sigorsets(¤t->blocked, ¤t->blocked, set); | ||
2021 | break; | ||
2022 | case SIG_UNBLOCK: | ||
2023 | signandsets(¤t->blocked, ¤t->blocked, set); | ||
2024 | break; | ||
2025 | case SIG_SETMASK: | ||
2026 | current->blocked = *set; | ||
2027 | break; | ||
2028 | default: | ||
2029 | error = -EINVAL; | ||
2030 | } | ||
2031 | recalc_sigpending(); | ||
2032 | spin_unlock_irq(¤t->sighand->siglock); | ||
2033 | if (oldset) | ||
2034 | *oldset = old_block; | ||
2035 | return error; | ||
2036 | } | ||
2037 | |||
2038 | asmlinkage long | ||
2039 | sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize) | ||
2040 | { | ||
2041 | int error = -EINVAL; | ||
2042 | sigset_t old_set, new_set; | ||
2043 | |||
2044 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
2045 | if (sigsetsize != sizeof(sigset_t)) | ||
2046 | goto out; | ||
2047 | |||
2048 | if (set) { | ||
2049 | error = -EFAULT; | ||
2050 | if (copy_from_user(&new_set, set, sizeof(*set))) | ||
2051 | goto out; | ||
2052 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
2053 | |||
2054 | error = sigprocmask(how, &new_set, &old_set); | ||
2055 | if (error) | ||
2056 | goto out; | ||
2057 | if (oset) | ||
2058 | goto set_old; | ||
2059 | } else if (oset) { | ||
2060 | spin_lock_irq(¤t->sighand->siglock); | ||
2061 | old_set = current->blocked; | ||
2062 | spin_unlock_irq(¤t->sighand->siglock); | ||
2063 | |||
2064 | set_old: | ||
2065 | error = -EFAULT; | ||
2066 | if (copy_to_user(oset, &old_set, sizeof(*oset))) | ||
2067 | goto out; | ||
2068 | } | ||
2069 | error = 0; | ||
2070 | out: | ||
2071 | return error; | ||
2072 | } | ||
2073 | |||
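The user-space counterpart of the syscall above; blocking SIGINT around a critical section is a typical use (the "critical section" here is just a placeholder comment):

    #include <signal.h>

    int main(void)
    {
            sigset_t block, old;

            sigemptyset(&block);
            sigaddset(&block, SIGINT);

            /* SIG_BLOCK ORs the set into ->blocked, as in the kernel code. */
            sigprocmask(SIG_BLOCK, &block, &old);

            /* ... critical section: a SIGINT raised now stays pending ... */

            /* Restoring the old mask delivers anything that became
             * pending while we were blocked. */
            sigprocmask(SIG_SETMASK, &old, NULL);
            return 0;
    }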
2074 | long do_sigpending(void __user *set, unsigned long sigsetsize) | ||
2075 | { | ||
2076 | long error = -EINVAL; | ||
2077 | sigset_t pending; | ||
2078 | |||
2079 | if (sigsetsize > sizeof(sigset_t)) | ||
2080 | goto out; | ||
2081 | |||
2082 | spin_lock_irq(¤t->sighand->siglock); | ||
2083 | sigorsets(&pending, ¤t->pending.signal, | ||
2084 | ¤t->signal->shared_pending.signal); | ||
2085 | spin_unlock_irq(¤t->sighand->siglock); | ||
2086 | |||
2087 | /* Outside the lock because only this thread touches it. */ | ||
2088 | sigandsets(&pending, ¤t->blocked, &pending); | ||
2089 | |||
2090 | error = -EFAULT; | ||
2091 | if (!copy_to_user(set, &pending, sigsetsize)) | ||
2092 | error = 0; | ||
2093 | |||
2094 | out: | ||
2095 | return error; | ||
2096 | } | ||
2097 | |||
2098 | asmlinkage long | ||
2099 | sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize) | ||
2100 | { | ||
2101 | return do_sigpending(set, sigsetsize); | ||
2102 | } | ||
2103 | |||
2104 | #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER | ||
2105 | |||
2106 | int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | ||
2107 | { | ||
2108 | int err; | ||
2109 | |||
2110 | if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) | ||
2111 | return -EFAULT; | ||
2112 | if (from->si_code < 0) | ||
2113 | return __copy_to_user(to, from, sizeof(siginfo_t)) | ||
2114 | ? -EFAULT : 0; | ||
2115 | /* | ||
2116 | * If you change siginfo_t structure, please be sure | ||
2117 | * this code is fixed accordingly. | ||
2118 | * It should never copy any pad contained in the structure | ||
2119 | * to avoid security leaks, but must copy the generic | ||
2120 | * 3 ints plus the relevant union member. | ||
2121 | */ | ||
2122 | err = __put_user(from->si_signo, &to->si_signo); | ||
2123 | err |= __put_user(from->si_errno, &to->si_errno); | ||
2124 | err |= __put_user((short)from->si_code, &to->si_code); | ||
2125 | switch (from->si_code & __SI_MASK) { | ||
2126 | case __SI_KILL: | ||
2127 | err |= __put_user(from->si_pid, &to->si_pid); | ||
2128 | err |= __put_user(from->si_uid, &to->si_uid); | ||
2129 | break; | ||
2130 | case __SI_TIMER: | ||
2131 | err |= __put_user(from->si_tid, &to->si_tid); | ||
2132 | err |= __put_user(from->si_overrun, &to->si_overrun); | ||
2133 | err |= __put_user(from->si_ptr, &to->si_ptr); | ||
2134 | break; | ||
2135 | case __SI_POLL: | ||
2136 | err |= __put_user(from->si_band, &to->si_band); | ||
2137 | err |= __put_user(from->si_fd, &to->si_fd); | ||
2138 | break; | ||
2139 | case __SI_FAULT: | ||
2140 | err |= __put_user(from->si_addr, &to->si_addr); | ||
2141 | #ifdef __ARCH_SI_TRAPNO | ||
2142 | err |= __put_user(from->si_trapno, &to->si_trapno); | ||
2143 | #endif | ||
2144 | break; | ||
2145 | case __SI_CHLD: | ||
2146 | err |= __put_user(from->si_pid, &to->si_pid); | ||
2147 | err |= __put_user(from->si_uid, &to->si_uid); | ||
2148 | err |= __put_user(from->si_status, &to->si_status); | ||
2149 | err |= __put_user(from->si_utime, &to->si_utime); | ||
2150 | err |= __put_user(from->si_stime, &to->si_stime); | ||
2151 | break; | ||
2152 | case __SI_RT: /* This is not generated by the kernel as of now. */ | ||
2153 | case __SI_MESGQ: /* But this is */ | ||
2154 | err |= __put_user(from->si_pid, &to->si_pid); | ||
2155 | err |= __put_user(from->si_uid, &to->si_uid); | ||
2156 | err |= __put_user(from->si_ptr, &to->si_ptr); | ||
2157 | break; | ||
2158 | default: /* this is just in case for now ... */ | ||
2159 | err |= __put_user(from->si_pid, &to->si_pid); | ||
2160 | err |= __put_user(from->si_uid, &to->si_uid); | ||
2161 | break; | ||
2162 | } | ||
2163 | return err; | ||
2164 | } | ||
2165 | |||
2166 | #endif | ||
2167 | |||
2168 | asmlinkage long | ||
2169 | sys_rt_sigtimedwait(const sigset_t __user *uthese, | ||
2170 | siginfo_t __user *uinfo, | ||
2171 | const struct timespec __user *uts, | ||
2172 | size_t sigsetsize) | ||
2173 | { | ||
2174 | int ret, sig; | ||
2175 | sigset_t these; | ||
2176 | struct timespec ts; | ||
2177 | siginfo_t info; | ||
2178 | long timeout = 0; | ||
2179 | |||
2180 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
2181 | if (sigsetsize != sizeof(sigset_t)) | ||
2182 | return -EINVAL; | ||
2183 | |||
2184 | if (copy_from_user(&these, uthese, sizeof(these))) | ||
2185 | return -EFAULT; | ||
2186 | |||
2187 | /* | ||
2188 | * Invert the set of allowed signals to get those we | ||
2189 | * want to block. | ||
2190 | */ | ||
2191 | sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
2192 | signotset(&these); | ||
2193 | |||
2194 | if (uts) { | ||
2195 | if (copy_from_user(&ts, uts, sizeof(ts))) | ||
2196 | return -EFAULT; | ||
2197 | if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 | ||
2198 | || ts.tv_sec < 0) | ||
2199 | return -EINVAL; | ||
2200 | } | ||
2201 | |||
2202 | spin_lock_irq(¤t->sighand->siglock); | ||
2203 | sig = dequeue_signal(current, &these, &info); | ||
2204 | if (!sig) { | ||
2205 | timeout = MAX_SCHEDULE_TIMEOUT; | ||
2206 | if (uts) | ||
2207 | timeout = (timespec_to_jiffies(&ts) | ||
2208 | + (ts.tv_sec || ts.tv_nsec)); | ||
2209 | |||
2210 | if (timeout) { | ||
2211 | /* None ready -- temporarily unblock those we're | ||
2212 | * interested in while we are sleeping, so that we'll | ||
2213 | * be awakened when they arrive. */ | ||
2214 | current->real_blocked = current->blocked; | ||
2215 | sigandsets(¤t->blocked, ¤t->blocked, &these); | ||
2216 | recalc_sigpending(); | ||
2217 | spin_unlock_irq(¤t->sighand->siglock); | ||
2218 | |||
2219 | current->state = TASK_INTERRUPTIBLE; | ||
2220 | timeout = schedule_timeout(timeout); | ||
2221 | |||
2222 | if (current->flags & PF_FREEZE) | ||
2223 | refrigerator(PF_FREEZE); | ||
2224 | spin_lock_irq(¤t->sighand->siglock); | ||
2225 | sig = dequeue_signal(current, &these, &info); | ||
2226 | current->blocked = current->real_blocked; | ||
2227 | siginitset(¤t->real_blocked, 0); | ||
2228 | recalc_sigpending(); | ||
2229 | } | ||
2230 | } | ||
2231 | spin_unlock_irq(¤t->sighand->siglock); | ||
2232 | |||
2233 | if (sig) { | ||
2234 | ret = sig; | ||
2235 | if (uinfo) { | ||
2236 | if (copy_siginfo_to_user(uinfo, &info)) | ||
2237 | ret = -EFAULT; | ||
2238 | } | ||
2239 | } else { | ||
2240 | ret = -EAGAIN; | ||
2241 | if (timeout) | ||
2242 | ret = -EINTR; | ||
2243 | } | ||
2244 | |||
2245 | return ret; | ||
2246 | } | ||
2247 | |||
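A short user-space sketch of the API this implements; the two-second timeout and SIGUSR1 are illustrative. Note that the signal has to be blocked so it stays queued for sigtimedwait() rather than being delivered the normal way:

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            sigset_t set;
            siginfo_t info;
            struct timespec timeout;
            int sig;

            sigemptyset(&set);
            sigaddset(&set, SIGUSR1);
            sigprocmask(SIG_BLOCK, &set, NULL);

            timeout.tv_sec = 2;
            timeout.tv_nsec = 0;

            sig = sigtimedwait(&set, &info, &timeout);
            if (sig == SIGUSR1)
                    printf("got SIGUSR1 from pid %d\n", (int)info.si_pid);
            else if (sig == -1 && errno == EAGAIN)
                    printf("timed out\n");  /* the -EAGAIN case above */
            return 0;
    }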
2248 | asmlinkage long | ||
2249 | sys_kill(int pid, int sig) | ||
2250 | { | ||
2251 | struct siginfo info; | ||
2252 | |||
2253 | info.si_signo = sig; | ||
2254 | info.si_errno = 0; | ||
2255 | info.si_code = SI_USER; | ||
2256 | info.si_pid = current->tgid; | ||
2257 | info.si_uid = current->uid; | ||
2258 | |||
2259 | return kill_something_info(sig, &info, pid); | ||
2260 | } | ||
2261 | |||
2262 | /** | ||
2263 | * sys_tgkill - send signal to one specific thread | ||
2264 | * @tgid: the thread group ID of the thread | ||
2265 | * @pid: the PID of the thread | ||
2266 | * @sig: signal to be sent | ||
2267 | * | ||
2268 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
2269 | * exists but no longer belongs to the target process. This | ||
2270 | * method solves the problem of threads exiting and PIDs getting reused. | ||
2271 | */ | ||
2272 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
2273 | { | ||
2274 | struct siginfo info; | ||
2275 | int error; | ||
2276 | struct task_struct *p; | ||
2277 | |||
2278 | /* This is only valid for single tasks */ | ||
2279 | if (pid <= 0 || tgid <= 0) | ||
2280 | return -EINVAL; | ||
2281 | |||
2282 | info.si_signo = sig; | ||
2283 | info.si_errno = 0; | ||
2284 | info.si_code = SI_TKILL; | ||
2285 | info.si_pid = current->tgid; | ||
2286 | info.si_uid = current->uid; | ||
2287 | |||
2288 | read_lock(&tasklist_lock); | ||
2289 | p = find_task_by_pid(pid); | ||
2290 | error = -ESRCH; | ||
2291 | if (p && (p->tgid == tgid)) { | ||
2292 | error = check_kill_permission(sig, &info, p); | ||
2293 | /* | ||
2294 | * The null signal is a permissions and process existence | ||
2295 | * probe. No signal is actually delivered. | ||
2296 | */ | ||
2297 | if (!error && sig && p->sighand) { | ||
2298 | spin_lock_irq(&p->sighand->siglock); | ||
2299 | handle_stop_signal(sig, p); | ||
2300 | error = specific_send_sig_info(sig, &info, p); | ||
2301 | spin_unlock_irq(&p->sighand->siglock); | ||
2302 | } | ||
2303 | } | ||
2304 | read_unlock(&tasklist_lock); | ||
2305 | return error; | ||
2306 | } | ||
2307 | |||
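glibc of this era has no tgkill() wrapper, so user space reaches this entry point with syscall(2). A sketch, assuming SYS_tgkill is defined in <sys/syscall.h>; using the main thread as its own target is only for illustration, real callers pass the tid obtained from the target thread:

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t tgid = getpid();
            pid_t tid  = getpid();  /* main thread: tid == tgid */

            /* The tgid argument lets the kernel return ESRCH if the tid
             * has since been reused by an unrelated process. Signal 0 is
             * the usual existence/permission probe. */
            if (syscall(SYS_tgkill, tgid, tid, 0) == -1)
                    perror("tgkill");
            return 0;
    }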
2308 | /* | ||
2309 | * Send a signal to only one task, even if it's a CLONE_THREAD task. | ||
2310 | */ | ||
2311 | asmlinkage long | ||
2312 | sys_tkill(int pid, int sig) | ||
2313 | { | ||
2314 | struct siginfo info; | ||
2315 | int error; | ||
2316 | struct task_struct *p; | ||
2317 | |||
2318 | /* This is only valid for single tasks */ | ||
2319 | if (pid <= 0) | ||
2320 | return -EINVAL; | ||
2321 | |||
2322 | info.si_signo = sig; | ||
2323 | info.si_errno = 0; | ||
2324 | info.si_code = SI_TKILL; | ||
2325 | info.si_pid = current->tgid; | ||
2326 | info.si_uid = current->uid; | ||
2327 | |||
2328 | read_lock(&tasklist_lock); | ||
2329 | p = find_task_by_pid(pid); | ||
2330 | error = -ESRCH; | ||
2331 | if (p) { | ||
2332 | error = check_kill_permission(sig, &info, p); | ||
2333 | /* | ||
2334 | * The null signal is a permissions and process existence | ||
2335 | * probe. No signal is actually delivered. | ||
2336 | */ | ||
2337 | if (!error && sig && p->sighand) { | ||
2338 | spin_lock_irq(&p->sighand->siglock); | ||
2339 | handle_stop_signal(sig, p); | ||
2340 | error = specific_send_sig_info(sig, &info, p); | ||
2341 | spin_unlock_irq(&p->sighand->siglock); | ||
2342 | } | ||
2343 | } | ||
2344 | read_unlock(&tasklist_lock); | ||
2345 | return error; | ||
2346 | } | ||
2347 | |||
2348 | asmlinkage long | ||
2349 | sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) | ||
2350 | { | ||
2351 | siginfo_t info; | ||
2352 | |||
2353 | if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) | ||
2354 | return -EFAULT; | ||
2355 | |||
2356 | /* Not even root can pretend to send signals from the kernel. | ||
2357 | Nor can they impersonate a kill(), which adds source info. */ | ||
2358 | if (info.si_code >= 0) | ||
2359 | return -EPERM; | ||
2360 | info.si_signo = sig; | ||
2361 | |||
2362 | /* POSIX.1b doesn't mention process groups. */ | ||
2363 | return kill_proc_info(sig, &info, pid); | ||
2364 | } | ||
2365 | |||
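User space normally reaches this through sigqueue(3), which fills in a negative si_code (SI_QUEUE) and therefore passes the si_code >= 0 check above. A sketch of sending a value-carrying signal to ourselves (printf in a handler is not async-signal-safe and is used here only to keep the example short):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static void handler(int sig, siginfo_t *info, void *ctx)
    {
            (void)ctx;
            /* si_value carries the payload passed to sigqueue(). */
            printf("sig %d, value %d\n", sig, info->si_value.sival_int);
    }

    int main(void)
    {
            struct sigaction sa;
            union sigval value;

            memset(&sa, 0, sizeof(sa));
            sa.sa_sigaction = handler;
            sa.sa_flags = SA_SIGINFO;
            sigaction(SIGUSR1, &sa, NULL);

            value.sival_int = 7;
            sigqueue(getpid(), SIGUSR1, value);     /* handler runs on the
                                                       way back to user mode */
            return 0;
    }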
2366 | int | ||
2367 | do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | ||
2368 | { | ||
2369 | struct k_sigaction *k; | ||
2370 | |||
2371 | if (sig < 1 || sig > _NSIG || (act && sig_kernel_only(sig))) | ||
2372 | return -EINVAL; | ||
2373 | |||
2374 | k = ¤t->sighand->action[sig-1]; | ||
2375 | |||
2376 | spin_lock_irq(¤t->sighand->siglock); | ||
2377 | if (signal_pending(current)) { | ||
2378 | /* | ||
2379 | * If there might be a fatal signal pending on multiple | ||
2380 | * threads, make sure we take it before changing the action. | ||
2381 | */ | ||
2382 | spin_unlock_irq(¤t->sighand->siglock); | ||
2383 | return -ERESTARTNOINTR; | ||
2384 | } | ||
2385 | |||
2386 | if (oact) | ||
2387 | *oact = *k; | ||
2388 | |||
2389 | if (act) { | ||
2390 | /* | ||
2391 | * POSIX 3.3.1.3: | ||
2392 | * "Setting a signal action to SIG_IGN for a signal that is | ||
2393 | * pending shall cause the pending signal to be discarded, | ||
2394 | * whether or not it is blocked." | ||
2395 | * | ||
2396 | * "Setting a signal action to SIG_DFL for a signal that is | ||
2397 | * pending and whose default action is to ignore the signal | ||
2398 | * (for example, SIGCHLD), shall cause the pending signal to | ||
2399 | * be discarded, whether or not it is blocked" | ||
2400 | */ | ||
2401 | if (act->sa.sa_handler == SIG_IGN || | ||
2402 | (act->sa.sa_handler == SIG_DFL && | ||
2403 | sig_kernel_ignore(sig))) { | ||
2404 | /* | ||
2405 | * This is a fairly rare case, so we only take the | ||
2406 | * tasklist_lock once we're sure we'll need it. | ||
2407 | * Now we must do this little unlock and relock | ||
2408 | * dance to maintain the lock hierarchy. | ||
2409 | */ | ||
2410 | struct task_struct *t = current; | ||
2411 | spin_unlock_irq(&t->sighand->siglock); | ||
2412 | read_lock(&tasklist_lock); | ||
2413 | spin_lock_irq(&t->sighand->siglock); | ||
2414 | *k = *act; | ||
2415 | sigdelsetmask(&k->sa.sa_mask, | ||
2416 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2417 | rm_from_queue(sigmask(sig), &t->signal->shared_pending); | ||
2418 | do { | ||
2419 | rm_from_queue(sigmask(sig), &t->pending); | ||
2420 | recalc_sigpending_tsk(t); | ||
2421 | t = next_thread(t); | ||
2422 | } while (t != current); | ||
2423 | spin_unlock_irq(¤t->sighand->siglock); | ||
2424 | read_unlock(&tasklist_lock); | ||
2425 | return 0; | ||
2426 | } | ||
2427 | |||
2428 | *k = *act; | ||
2429 | sigdelsetmask(&k->sa.sa_mask, | ||
2430 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2431 | } | ||
2432 | |||
2433 | spin_unlock_irq(¤t->sighand->siglock); | ||
2434 | return 0; | ||
2435 | } | ||
2436 | |||
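The POSIX 3.3.1.3 behaviour quoted above can be observed directly from user space: a blocked, pending signal disappears once its action is set to SIG_IGN. A small sketch:

    #include <signal.h>
    #include <stdio.h>

    int main(void)
    {
            sigset_t set, pending;

            sigemptyset(&set);
            sigaddset(&set, SIGUSR1);
            sigprocmask(SIG_BLOCK, &set, NULL);

            raise(SIGUSR1);                 /* now pending (blocked) */
            sigpending(&pending);
            printf("pending before SIG_IGN: %d\n",
                   sigismember(&pending, SIGUSR1));         /* prints 1 */

            /* Setting the action to SIG_IGN discards the pending
             * instance, as do_sigaction() above implements. */
            signal(SIGUSR1, SIG_IGN);

            sigpending(&pending);
            printf("pending after SIG_IGN:  %d\n",
                   sigismember(&pending, SIGUSR1));         /* prints 0 */
            return 0;
    }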
2437 | int | ||
2438 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) | ||
2439 | { | ||
2440 | stack_t oss; | ||
2441 | int error; | ||
2442 | |||
2443 | if (uoss) { | ||
2444 | oss.ss_sp = (void __user *) current->sas_ss_sp; | ||
2445 | oss.ss_size = current->sas_ss_size; | ||
2446 | oss.ss_flags = sas_ss_flags(sp); | ||
2447 | } | ||
2448 | |||
2449 | if (uss) { | ||
2450 | void __user *ss_sp; | ||
2451 | size_t ss_size; | ||
2452 | int ss_flags; | ||
2453 | |||
2454 | error = -EFAULT; | ||
2455 | if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) | ||
2456 | || __get_user(ss_sp, &uss->ss_sp) | ||
2457 | || __get_user(ss_flags, &uss->ss_flags) | ||
2458 | || __get_user(ss_size, &uss->ss_size)) | ||
2459 | goto out; | ||
2460 | |||
2461 | error = -EPERM; | ||
2462 | if (on_sig_stack(sp)) | ||
2463 | goto out; | ||
2464 | |||
2465 | error = -EINVAL; | ||
2466 | /* | ||
2467 | * | ||
2468 | * Note - this code used to test ss_flags incorrectly: | ||
2469 | * old code may have been written using ss_flags==0 | ||
2470 | * to mean ss_flags==SS_ONSTACK (as this was the only | ||
2471 | * way that worked) - this fix preserves that older | ||
2472 | * mechanism. | ||
2473 | */ | ||
2474 | if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) | ||
2475 | goto out; | ||
2476 | |||
2477 | if (ss_flags == SS_DISABLE) { | ||
2478 | ss_size = 0; | ||
2479 | ss_sp = NULL; | ||
2480 | } else { | ||
2481 | error = -ENOMEM; | ||
2482 | if (ss_size < MINSIGSTKSZ) | ||
2483 | goto out; | ||
2484 | } | ||
2485 | |||
2486 | current->sas_ss_sp = (unsigned long) ss_sp; | ||
2487 | current->sas_ss_size = ss_size; | ||
2488 | } | ||
2489 | |||
2490 | if (uoss) { | ||
2491 | error = -EFAULT; | ||
2492 | if (copy_to_user(uoss, &oss, sizeof(oss))) | ||
2493 | goto out; | ||
2494 | } | ||
2495 | |||
2496 | error = 0; | ||
2497 | out: | ||
2498 | return error; | ||
2499 | } | ||
2500 | |||
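A typical user of this interface installs an alternate stack so that a stack-overflow SIGSEGV can still run a handler. A sketch; raising SIGSEGV by hand stands in for a real fault:

    #include <signal.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    static void on_segv(int sig)
    {
            (void)sig;
            /* We are running on the alternate stack installed below. */
            write(2, "caught SIGSEGV on alt stack\n", 28);
            _exit(1);
    }

    int main(void)
    {
            stack_t ss;
            struct sigaction sa;

            ss.ss_sp = malloc(SIGSTKSZ);    /* must be >= MINSIGSTKSZ */
            ss.ss_size = SIGSTKSZ;
            ss.ss_flags = 0;                /* the "0 behaves like enable"
                                               compatibility case above */
            sigaltstack(&ss, NULL);

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = on_segv;
            sa.sa_flags = SA_ONSTACK;       /* deliver on the alt stack */
            sigaction(SIGSEGV, &sa, NULL);

            raise(SIGSEGV);                 /* illustrative trigger */
            return 0;
    }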
2501 | #ifdef __ARCH_WANT_SYS_SIGPENDING | ||
2502 | |||
2503 | asmlinkage long | ||
2504 | sys_sigpending(old_sigset_t __user *set) | ||
2505 | { | ||
2506 | return do_sigpending(set, sizeof(*set)); | ||
2507 | } | ||
2508 | |||
2509 | #endif | ||
2510 | |||
2511 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | ||
2512 | /* Some platforms have their own version with special arguments; others | ||
2513 | support only sys_rt_sigprocmask. */ | ||
2514 | |||
2515 | asmlinkage long | ||
2516 | sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset) | ||
2517 | { | ||
2518 | int error; | ||
2519 | old_sigset_t old_set, new_set; | ||
2520 | |||
2521 | if (set) { | ||
2522 | error = -EFAULT; | ||
2523 | if (copy_from_user(&new_set, set, sizeof(*set))) | ||
2524 | goto out; | ||
2525 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2526 | |||
2527 | spin_lock_irq(¤t->sighand->siglock); | ||
2528 | old_set = current->blocked.sig[0]; | ||
2529 | |||
2530 | error = 0; | ||
2531 | switch (how) { | ||
2532 | default: | ||
2533 | error = -EINVAL; | ||
2534 | break; | ||
2535 | case SIG_BLOCK: | ||
2536 | sigaddsetmask(¤t->blocked, new_set); | ||
2537 | break; | ||
2538 | case SIG_UNBLOCK: | ||
2539 | sigdelsetmask(¤t->blocked, new_set); | ||
2540 | break; | ||
2541 | case SIG_SETMASK: | ||
2542 | current->blocked.sig[0] = new_set; | ||
2543 | break; | ||
2544 | } | ||
2545 | |||
2546 | recalc_sigpending(); | ||
2547 | spin_unlock_irq(¤t->sighand->siglock); | ||
2548 | if (error) | ||
2549 | goto out; | ||
2550 | if (oset) | ||
2551 | goto set_old; | ||
2552 | } else if (oset) { | ||
2553 | old_set = current->blocked.sig[0]; | ||
2554 | set_old: | ||
2555 | error = -EFAULT; | ||
2556 | if (copy_to_user(oset, &old_set, sizeof(*oset))) | ||
2557 | goto out; | ||
2558 | } | ||
2559 | error = 0; | ||
2560 | out: | ||
2561 | return error; | ||
2562 | } | ||
2563 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ | ||
2564 | |||
2565 | #ifdef __ARCH_WANT_SYS_RT_SIGACTION | ||
2566 | asmlinkage long | ||
2567 | sys_rt_sigaction(int sig, | ||
2568 | const struct sigaction __user *act, | ||
2569 | struct sigaction __user *oact, | ||
2570 | size_t sigsetsize) | ||
2571 | { | ||
2572 | struct k_sigaction new_sa, old_sa; | ||
2573 | int ret = -EINVAL; | ||
2574 | |||
2575 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
2576 | if (sigsetsize != sizeof(sigset_t)) | ||
2577 | goto out; | ||
2578 | |||
2579 | if (act) { | ||
2580 | if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) | ||
2581 | return -EFAULT; | ||
2582 | } | ||
2583 | |||
2584 | ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); | ||
2585 | |||
2586 | if (!ret && oact) { | ||
2587 | if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) | ||
2588 | return -EFAULT; | ||
2589 | } | ||
2590 | out: | ||
2591 | return ret; | ||
2592 | } | ||
2593 | #endif /* __ARCH_WANT_SYS_RT_SIGACTION */ | ||
2594 | |||
2595 | #ifdef __ARCH_WANT_SYS_SGETMASK | ||
2596 | |||
2597 | /* | ||
2598 | * For backwards compatibility. Functionality superseded by sigprocmask. | ||
2599 | */ | ||
2600 | asmlinkage long | ||
2601 | sys_sgetmask(void) | ||
2602 | { | ||
2603 | /* SMP safe */ | ||
2604 | return current->blocked.sig[0]; | ||
2605 | } | ||
2606 | |||
2607 | asmlinkage long | ||
2608 | sys_ssetmask(int newmask) | ||
2609 | { | ||
2610 | int old; | ||
2611 | |||
2612 | spin_lock_irq(¤t->sighand->siglock); | ||
2613 | old = current->blocked.sig[0]; | ||
2614 | |||
2615 | siginitset(¤t->blocked, newmask & ~(sigmask(SIGKILL)| | ||
2616 | sigmask(SIGSTOP))); | ||
2617 | recalc_sigpending(); | ||
2618 | spin_unlock_irq(¤t->sighand->siglock); | ||
2619 | |||
2620 | return old; | ||
2621 | } | ||
2622 | #endif /* __ARCH_WANT_SYS_SGETMASK */ | ||
2623 | |||
2624 | #ifdef __ARCH_WANT_SYS_SIGNAL | ||
2625 | /* | ||
2626 | * For backwards compatibility. Functionality superseded by sigaction. | ||
2627 | */ | ||
2628 | asmlinkage unsigned long | ||
2629 | sys_signal(int sig, __sighandler_t handler) | ||
2630 | { | ||
2631 | struct k_sigaction new_sa, old_sa; | ||
2632 | int ret; | ||
2633 | |||
2634 | new_sa.sa.sa_handler = handler; | ||
2635 | new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; | ||
2636 | |||
2637 | ret = do_sigaction(sig, &new_sa, &old_sa); | ||
2638 | |||
2639 | return ret ? ret : (unsigned long)old_sa.sa.sa_handler; | ||
2640 | } | ||
2641 | #endif /* __ARCH_WANT_SYS_SIGNAL */ | ||
2642 | |||
2643 | #ifdef __ARCH_WANT_SYS_PAUSE | ||
2644 | |||
2645 | asmlinkage long | ||
2646 | sys_pause(void) | ||
2647 | { | ||
2648 | current->state = TASK_INTERRUPTIBLE; | ||
2649 | schedule(); | ||
2650 | return -ERESTARTNOHAND; | ||
2651 | } | ||
2652 | |||
2653 | #endif | ||
2654 | |||
2655 | void __init signals_init(void) | ||
2656 | { | ||
2657 | sigqueue_cachep = | ||
2658 | kmem_cache_create("sigqueue", | ||
2659 | sizeof(struct sigqueue), | ||
2660 | __alignof__(struct sigqueue), | ||
2661 | SLAB_PANIC, NULL, NULL); | ||
2662 | } | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c new file mode 100644 index 000000000000..b4ab6af1dea8 --- /dev/null +++ b/kernel/softirq.c | |||
@@ -0,0 +1,496 @@ | |||
1 | /* | ||
2 | * linux/kernel/softirq.c | ||
3 | * | ||
4 | * Copyright (C) 1992 Linus Torvalds | ||
5 | * | ||
6 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | ||
7 | */ | ||
8 | |||
9 | #include <linux/module.h> | ||
10 | #include <linux/kernel_stat.h> | ||
11 | #include <linux/interrupt.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/notifier.h> | ||
15 | #include <linux/percpu.h> | ||
16 | #include <linux/cpu.h> | ||
17 | #include <linux/kthread.h> | ||
18 | #include <linux/rcupdate.h> | ||
19 | |||
20 | #include <asm/irq.h> | ||
21 | /* | ||
22 | - No shared variables, all the data are CPU local. | ||
23 | - If a softirq needs serialization, let it serialize itself | ||
24 | by its own spinlocks. | ||
25 | - Even if softirq is serialized, only local cpu is marked for | ||
26 | execution. Hence, we get something sort of weak cpu binding. | ||
27 | Though it is still not clear, will it result in better locality | ||
28 | or will not. | ||
29 | |||
30 | Examples: | ||
31 | - NET RX softirq. It is multithreaded and does not require | ||
32 | any global serialization. | ||
33 | - NET TX softirq. It kicks software netdevice queues, hence | ||
34 | it is logically serialized per device, but this serialization | ||
35 | is invisible to common code. | ||
36 | - Tasklets: serialized wrt itself. | ||
37 | */ | ||
38 | |||
39 | #ifndef __ARCH_IRQ_STAT | ||
40 | irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; | ||
41 | EXPORT_SYMBOL(irq_stat); | ||
42 | #endif | ||
43 | |||
44 | static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; | ||
45 | |||
46 | static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | ||
47 | |||
48 | /* | ||
49 | * we cannot loop indefinitely here to avoid userspace starvation, | ||
50 | * but we also don't want to introduce a worst case 1/HZ latency | ||
51 | * to the pending events, so let the scheduler balance | ||
52 | * the softirq load for us. | ||
53 | */ | ||
54 | static inline void wakeup_softirqd(void) | ||
55 | { | ||
56 | /* Interrupts are disabled: no need to stop preemption */ | ||
57 | struct task_struct *tsk = __get_cpu_var(ksoftirqd); | ||
58 | |||
59 | if (tsk && tsk->state != TASK_RUNNING) | ||
60 | wake_up_process(tsk); | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, | ||
65 | * and we fall back to softirqd after that. | ||
66 | * | ||
67 | * This number has been established via experimentation. | ||
68 | * The two things to balance are latency and fairness - | ||
69 | * we want to handle softirqs as soon as possible, but they | ||
70 | * should not be able to lock up the box. | ||
71 | */ | ||
72 | #define MAX_SOFTIRQ_RESTART 10 | ||
73 | |||
74 | asmlinkage void __do_softirq(void) | ||
75 | { | ||
76 | struct softirq_action *h; | ||
77 | __u32 pending; | ||
78 | int max_restart = MAX_SOFTIRQ_RESTART; | ||
79 | int cpu; | ||
80 | |||
81 | pending = local_softirq_pending(); | ||
82 | |||
83 | local_bh_disable(); | ||
84 | cpu = smp_processor_id(); | ||
85 | restart: | ||
86 | /* Reset the pending bitmask before enabling irqs */ | ||
87 | local_softirq_pending() = 0; | ||
88 | |||
89 | local_irq_enable(); | ||
90 | |||
91 | h = softirq_vec; | ||
92 | |||
93 | do { | ||
94 | if (pending & 1) { | ||
95 | h->action(h); | ||
96 | rcu_bh_qsctr_inc(cpu); | ||
97 | } | ||
98 | h++; | ||
99 | pending >>= 1; | ||
100 | } while (pending); | ||
101 | |||
102 | local_irq_disable(); | ||
103 | |||
104 | pending = local_softirq_pending(); | ||
105 | if (pending && --max_restart) | ||
106 | goto restart; | ||
107 | |||
108 | if (pending) | ||
109 | wakeup_softirqd(); | ||
110 | |||
111 | __local_bh_enable(); | ||
112 | } | ||
113 | |||
114 | #ifndef __ARCH_HAS_DO_SOFTIRQ | ||
115 | |||
116 | asmlinkage void do_softirq(void) | ||
117 | { | ||
118 | __u32 pending; | ||
119 | unsigned long flags; | ||
120 | |||
121 | if (in_interrupt()) | ||
122 | return; | ||
123 | |||
124 | local_irq_save(flags); | ||
125 | |||
126 | pending = local_softirq_pending(); | ||
127 | |||
128 | if (pending) | ||
129 | __do_softirq(); | ||
130 | |||
131 | local_irq_restore(flags); | ||
132 | } | ||
133 | |||
134 | EXPORT_SYMBOL(do_softirq); | ||
135 | |||
136 | #endif | ||
137 | |||
138 | void local_bh_enable(void) | ||
139 | { | ||
140 | WARN_ON(irqs_disabled()); | ||
141 | /* | ||
142 | * Keep preemption disabled until we are done with | ||
143 | * softirq processing: | ||
144 | */ | ||
145 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | ||
146 | |||
147 | if (unlikely(!in_interrupt() && local_softirq_pending())) | ||
148 | do_softirq(); | ||
149 | |||
150 | dec_preempt_count(); | ||
151 | preempt_check_resched(); | ||
152 | } | ||
153 | EXPORT_SYMBOL(local_bh_enable); | ||
154 | |||
155 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | ||
156 | # define invoke_softirq() __do_softirq() | ||
157 | #else | ||
158 | # define invoke_softirq() do_softirq() | ||
159 | #endif | ||
160 | |||
161 | /* | ||
162 | * Exit an interrupt context. Process softirqs if needed and possible: | ||
163 | */ | ||
164 | void irq_exit(void) | ||
165 | { | ||
166 | account_system_vtime(current); | ||
167 | sub_preempt_count(IRQ_EXIT_OFFSET); | ||
168 | if (!in_interrupt() && local_softirq_pending()) | ||
169 | invoke_softirq(); | ||
170 | preempt_enable_no_resched(); | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * This function must run with irqs disabled! | ||
175 | */ | ||
176 | inline fastcall void raise_softirq_irqoff(unsigned int nr) | ||
177 | { | ||
178 | __raise_softirq_irqoff(nr); | ||
179 | |||
180 | /* | ||
181 | * If we're in an interrupt or softirq, we're done | ||
182 | * (this also catches softirq-disabled code). We will | ||
183 | * actually run the softirq once we return from | ||
184 | * the irq or softirq. | ||
185 | * | ||
186 | * Otherwise we wake up ksoftirqd to make sure we | ||
187 | * schedule the softirq soon. | ||
188 | */ | ||
189 | if (!in_interrupt()) | ||
190 | wakeup_softirqd(); | ||
191 | } | ||
192 | |||
193 | EXPORT_SYMBOL(raise_softirq_irqoff); | ||
194 | |||
195 | void fastcall raise_softirq(unsigned int nr) | ||
196 | { | ||
197 | unsigned long flags; | ||
198 | |||
199 | local_irq_save(flags); | ||
200 | raise_softirq_irqoff(nr); | ||
201 | local_irq_restore(flags); | ||
202 | } | ||
203 | |||
204 | void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) | ||
205 | { | ||
206 | softirq_vec[nr].data = data; | ||
207 | softirq_vec[nr].action = action; | ||
208 | } | ||
209 | |||
210 | EXPORT_SYMBOL(open_softirq); | ||
211 | |||
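A minimal sketch (not part of this patch) of how the open_softirq()/raise_softirq() pair above is used. MY_SOFTIRQ is a hypothetical vector number: softirq_vec is a fixed-size table, so real callers use one of the fixed enum entries (HI_SOFTIRQ, NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, TASKLET_SOFTIRQ, ...) rather than allocating a slot:

        #include <linux/init.h>
        #include <linux/interrupt.h>

        /* MY_SOFTIRQ is hypothetical: it stands in for an entry of the softirq enum. */

        static void my_softirq_action(struct softirq_action *a)
        {
                /* Runs from __do_softirq(): interrupts enabled, bottom halves
                 * disabled, on the CPU that raised it; must not sleep.
                 * a->data is the cookie passed to open_softirq(). */
        }

        static int __init my_subsys_init(void)
        {
                open_softirq(MY_SOFTIRQ, my_softirq_action, NULL);
                return 0;
        }

        /* Typically called from a hardware interrupt handler: */
        static void my_irq_path(void)
        {
                raise_softirq(MY_SOFTIRQ);      /* mark pending; runs at irq_exit() or in ksoftirqd */
        }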
212 | /* Tasklets */ | ||
213 | struct tasklet_head | ||
214 | { | ||
215 | struct tasklet_struct *list; | ||
216 | }; | ||
217 | |||
218 | /* Some compilers disobey section attribute on statics when not | ||
219 | initialized -- RR */ | ||
220 | static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; | ||
221 | static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; | ||
222 | |||
223 | void fastcall __tasklet_schedule(struct tasklet_struct *t) | ||
224 | { | ||
225 | unsigned long flags; | ||
226 | |||
227 | local_irq_save(flags); | ||
228 | t->next = __get_cpu_var(tasklet_vec).list; | ||
229 | __get_cpu_var(tasklet_vec).list = t; | ||
230 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | ||
231 | local_irq_restore(flags); | ||
232 | } | ||
233 | |||
234 | EXPORT_SYMBOL(__tasklet_schedule); | ||
235 | |||
236 | void fastcall __tasklet_hi_schedule(struct tasklet_struct *t) | ||
237 | { | ||
238 | unsigned long flags; | ||
239 | |||
240 | local_irq_save(flags); | ||
241 | t->next = __get_cpu_var(tasklet_hi_vec).list; | ||
242 | __get_cpu_var(tasklet_hi_vec).list = t; | ||
243 | raise_softirq_irqoff(HI_SOFTIRQ); | ||
244 | local_irq_restore(flags); | ||
245 | } | ||
246 | |||
247 | EXPORT_SYMBOL(__tasklet_hi_schedule); | ||
248 | |||
249 | static void tasklet_action(struct softirq_action *a) | ||
250 | { | ||
251 | struct tasklet_struct *list; | ||
252 | |||
253 | local_irq_disable(); | ||
254 | list = __get_cpu_var(tasklet_vec).list; | ||
255 | __get_cpu_var(tasklet_vec).list = NULL; | ||
256 | local_irq_enable(); | ||
257 | |||
258 | while (list) { | ||
259 | struct tasklet_struct *t = list; | ||
260 | |||
261 | list = list->next; | ||
262 | |||
263 | if (tasklet_trylock(t)) { | ||
264 | if (!atomic_read(&t->count)) { | ||
265 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) | ||
266 | BUG(); | ||
267 | t->func(t->data); | ||
268 | tasklet_unlock(t); | ||
269 | continue; | ||
270 | } | ||
271 | tasklet_unlock(t); | ||
272 | } | ||
273 | |||
274 | local_irq_disable(); | ||
275 | t->next = __get_cpu_var(tasklet_vec).list; | ||
276 | __get_cpu_var(tasklet_vec).list = t; | ||
277 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | ||
278 | local_irq_enable(); | ||
279 | } | ||
280 | } | ||
281 | |||
282 | static void tasklet_hi_action(struct softirq_action *a) | ||
283 | { | ||
284 | struct tasklet_struct *list; | ||
285 | |||
286 | local_irq_disable(); | ||
287 | list = __get_cpu_var(tasklet_hi_vec).list; | ||
288 | __get_cpu_var(tasklet_hi_vec).list = NULL; | ||
289 | local_irq_enable(); | ||
290 | |||
291 | while (list) { | ||
292 | struct tasklet_struct *t = list; | ||
293 | |||
294 | list = list->next; | ||
295 | |||
296 | if (tasklet_trylock(t)) { | ||
297 | if (!atomic_read(&t->count)) { | ||
298 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) | ||
299 | BUG(); | ||
300 | t->func(t->data); | ||
301 | tasklet_unlock(t); | ||
302 | continue; | ||
303 | } | ||
304 | tasklet_unlock(t); | ||
305 | } | ||
306 | |||
307 | local_irq_disable(); | ||
308 | t->next = __get_cpu_var(tasklet_hi_vec).list; | ||
309 | __get_cpu_var(tasklet_hi_vec).list = t; | ||
310 | __raise_softirq_irqoff(HI_SOFTIRQ); | ||
311 | local_irq_enable(); | ||
312 | } | ||
313 | } | ||
314 | |||
315 | |||
316 | void tasklet_init(struct tasklet_struct *t, | ||
317 | void (*func)(unsigned long), unsigned long data) | ||
318 | { | ||
319 | t->next = NULL; | ||
320 | t->state = 0; | ||
321 | atomic_set(&t->count, 0); | ||
322 | t->func = func; | ||
323 | t->data = data; | ||
324 | } | ||
325 | |||
326 | EXPORT_SYMBOL(tasklet_init); | ||
327 | |||
328 | void tasklet_kill(struct tasklet_struct *t) | ||
329 | { | ||
330 | if (in_interrupt()) | ||
331 | printk("Attempt to kill tasklet from interrupt\n"); | ||
332 | |||
333 | while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { | ||
334 | do | ||
335 | yield(); | ||
336 | while (test_bit(TASKLET_STATE_SCHED, &t->state)); | ||
337 | } | ||
338 | tasklet_unlock_wait(t); | ||
339 | clear_bit(TASKLET_STATE_SCHED, &t->state); | ||
340 | } | ||
341 | |||
342 | EXPORT_SYMBOL(tasklet_kill); | ||
343 | |||
344 | void __init softirq_init(void) | ||
345 | { | ||
346 | open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); | ||
347 | open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); | ||
348 | } | ||
349 | |||
350 | static int ksoftirqd(void * __bind_cpu) | ||
351 | { | ||
352 | set_user_nice(current, 19); | ||
353 | current->flags |= PF_NOFREEZE; | ||
354 | |||
355 | set_current_state(TASK_INTERRUPTIBLE); | ||
356 | |||
357 | while (!kthread_should_stop()) { | ||
358 | preempt_disable(); | ||
359 | if (!local_softirq_pending()) { | ||
360 | preempt_enable_no_resched(); | ||
361 | schedule(); | ||
362 | preempt_disable(); | ||
363 | } | ||
364 | |||
365 | __set_current_state(TASK_RUNNING); | ||
366 | |||
367 | while (local_softirq_pending()) { | ||
368 | /* Preempt disable stops the cpu going offline. | ||
369 | If already offline, we'll be on the wrong CPU: | ||
370 | don't process */ | ||
371 | if (cpu_is_offline((long)__bind_cpu)) | ||
372 | goto wait_to_die; | ||
373 | do_softirq(); | ||
374 | preempt_enable_no_resched(); | ||
375 | cond_resched(); | ||
376 | preempt_disable(); | ||
377 | } | ||
378 | preempt_enable(); | ||
379 | set_current_state(TASK_INTERRUPTIBLE); | ||
380 | } | ||
381 | __set_current_state(TASK_RUNNING); | ||
382 | return 0; | ||
383 | |||
384 | wait_to_die: | ||
385 | preempt_enable(); | ||
386 | /* Wait for kthread_stop */ | ||
387 | set_current_state(TASK_INTERRUPTIBLE); | ||
388 | while (!kthread_should_stop()) { | ||
389 | schedule(); | ||
390 | set_current_state(TASK_INTERRUPTIBLE); | ||
391 | } | ||
392 | __set_current_state(TASK_RUNNING); | ||
393 | return 0; | ||
394 | } | ||
395 | |||
396 | #ifdef CONFIG_HOTPLUG_CPU | ||
397 | /* | ||
398 | * tasklet_kill_immediate is called to remove a tasklet which can already be | ||
399 | * scheduled for execution on @cpu. | ||
400 | * | ||
401 | * Unlike tasklet_kill, this function removes the tasklet | ||
402 | * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. | ||
403 | * | ||
404 | * When this function is called, @cpu must be in the CPU_DEAD state. | ||
405 | */ | ||
406 | void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | ||
407 | { | ||
408 | struct tasklet_struct **i; | ||
409 | |||
410 | BUG_ON(cpu_online(cpu)); | ||
411 | BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); | ||
412 | |||
413 | if (!test_bit(TASKLET_STATE_SCHED, &t->state)) | ||
414 | return; | ||
415 | |||
416 | /* CPU is dead, so no lock needed. */ | ||
417 | for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { | ||
418 | if (*i == t) { | ||
419 | *i = t->next; | ||
420 | return; | ||
421 | } | ||
422 | } | ||
423 | BUG(); | ||
424 | } | ||
425 | |||
426 | static void takeover_tasklets(unsigned int cpu) | ||
427 | { | ||
428 | struct tasklet_struct **i; | ||
429 | |||
430 | /* CPU is dead, so no lock needed. */ | ||
431 | local_irq_disable(); | ||
432 | |||
433 | /* Find end, append list for that CPU. */ | ||
434 | for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); | ||
435 | *i = per_cpu(tasklet_vec, cpu).list; | ||
436 | per_cpu(tasklet_vec, cpu).list = NULL; | ||
437 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | ||
438 | |||
439 | for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); | ||
440 | *i = per_cpu(tasklet_hi_vec, cpu).list; | ||
441 | per_cpu(tasklet_hi_vec, cpu).list = NULL; | ||
442 | raise_softirq_irqoff(HI_SOFTIRQ); | ||
443 | |||
444 | local_irq_enable(); | ||
445 | } | ||
446 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
447 | |||
448 | static int __devinit cpu_callback(struct notifier_block *nfb, | ||
449 | unsigned long action, | ||
450 | void *hcpu) | ||
451 | { | ||
452 | int hotcpu = (unsigned long)hcpu; | ||
453 | struct task_struct *p; | ||
454 | |||
455 | switch (action) { | ||
456 | case CPU_UP_PREPARE: | ||
457 | BUG_ON(per_cpu(tasklet_vec, hotcpu).list); | ||
458 | BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); | ||
459 | p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); | ||
460 | if (IS_ERR(p)) { | ||
461 | printk("ksoftirqd for %i failed\n", hotcpu); | ||
462 | return NOTIFY_BAD; | ||
463 | } | ||
464 | kthread_bind(p, hotcpu); | ||
465 | per_cpu(ksoftirqd, hotcpu) = p; | ||
466 | break; | ||
467 | case CPU_ONLINE: | ||
468 | wake_up_process(per_cpu(ksoftirqd, hotcpu)); | ||
469 | break; | ||
470 | #ifdef CONFIG_HOTPLUG_CPU | ||
471 | case CPU_UP_CANCELED: | ||
472 | /* Unbind so it can run. Fall thru. */ | ||
473 | kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id()); | ||
474 | case CPU_DEAD: | ||
475 | p = per_cpu(ksoftirqd, hotcpu); | ||
476 | per_cpu(ksoftirqd, hotcpu) = NULL; | ||
477 | kthread_stop(p); | ||
478 | takeover_tasklets(hotcpu); | ||
479 | break; | ||
480 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
481 | } | ||
482 | return NOTIFY_OK; | ||
483 | } | ||
484 | |||
485 | static struct notifier_block __devinitdata cpu_nfb = { | ||
486 | .notifier_call = cpu_callback | ||
487 | }; | ||
488 | |||
489 | __init int spawn_ksoftirqd(void) | ||
490 | { | ||
491 | void *cpu = (void *)(long)smp_processor_id(); | ||
492 | cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
493 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
494 | register_cpu_notifier(&cpu_nfb); | ||
495 | return 0; | ||
496 | } | ||
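A minimal driver-style sketch (not part of this patch) of the tasklet API defined in this file; my_tasklet_fn, my_irq_handler and my_teardown are hypothetical names, and the interrupt handler uses the three-argument signature of this kernel era:

        #include <linux/interrupt.h>

        static void my_tasklet_fn(unsigned long data)
        {
                /* Deferred work: runs in softirq context via tasklet_action()
                 * above, serialized against itself, must not sleep. */
        }

        static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

        static irqreturn_t my_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
        {
                /* Do the minimum in hard-irq context and defer the rest. */
                tasklet_schedule(&my_tasklet);          /* ends up in __tasklet_schedule() */
                return IRQ_HANDLED;
        }

        static void my_teardown(void)
        {
                tasklet_kill(&my_tasklet);              /* wait for any scheduled run to finish */
        }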
diff --git a/kernel/spinlock.c b/kernel/spinlock.c new file mode 100644 index 000000000000..e15ed17863f1 --- /dev/null +++ b/kernel/spinlock.c | |||
@@ -0,0 +1,371 @@ | |||
1 | /* | ||
2 | * Copyright (2004) Linus Torvalds | ||
3 | * | ||
4 | * Author: Zwane Mwaikambo <zwane@fsmlabs.com> | ||
5 | * | ||
6 | * Copyright (2004) Ingo Molnar | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/linkage.h> | ||
11 | #include <linux/preempt.h> | ||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/module.h> | ||
15 | |||
16 | /* | ||
17 | * Generic declaration of the raw read_trylock() function, | ||
18 | * architectures are supposed to optimize this: | ||
19 | */ | ||
20 | int __lockfunc generic_raw_read_trylock(rwlock_t *lock) | ||
21 | { | ||
22 | _raw_read_lock(lock); | ||
23 | return 1; | ||
24 | } | ||
25 | EXPORT_SYMBOL(generic_raw_read_trylock); | ||
26 | |||
27 | int __lockfunc _spin_trylock(spinlock_t *lock) | ||
28 | { | ||
29 | preempt_disable(); | ||
30 | if (_raw_spin_trylock(lock)) | ||
31 | return 1; | ||
32 | |||
33 | preempt_enable(); | ||
34 | return 0; | ||
35 | } | ||
36 | EXPORT_SYMBOL(_spin_trylock); | ||
37 | |||
38 | int __lockfunc _read_trylock(rwlock_t *lock) | ||
39 | { | ||
40 | preempt_disable(); | ||
41 | if (_raw_read_trylock(lock)) | ||
42 | return 1; | ||
43 | |||
44 | preempt_enable(); | ||
45 | return 0; | ||
46 | } | ||
47 | EXPORT_SYMBOL(_read_trylock); | ||
48 | |||
49 | int __lockfunc _write_trylock(rwlock_t *lock) | ||
50 | { | ||
51 | preempt_disable(); | ||
52 | if (_raw_write_trylock(lock)) | ||
53 | return 1; | ||
54 | |||
55 | preempt_enable(); | ||
56 | return 0; | ||
57 | } | ||
58 | EXPORT_SYMBOL(_write_trylock); | ||
59 | |||
60 | #ifndef CONFIG_PREEMPT | ||
61 | |||
62 | void __lockfunc _read_lock(rwlock_t *lock) | ||
63 | { | ||
64 | preempt_disable(); | ||
65 | _raw_read_lock(lock); | ||
66 | } | ||
67 | EXPORT_SYMBOL(_read_lock); | ||
68 | |||
69 | unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) | ||
70 | { | ||
71 | unsigned long flags; | ||
72 | |||
73 | local_irq_save(flags); | ||
74 | preempt_disable(); | ||
75 | _raw_spin_lock_flags(lock, flags); | ||
76 | return flags; | ||
77 | } | ||
78 | EXPORT_SYMBOL(_spin_lock_irqsave); | ||
79 | |||
80 | void __lockfunc _spin_lock_irq(spinlock_t *lock) | ||
81 | { | ||
82 | local_irq_disable(); | ||
83 | preempt_disable(); | ||
84 | _raw_spin_lock(lock); | ||
85 | } | ||
86 | EXPORT_SYMBOL(_spin_lock_irq); | ||
87 | |||
88 | void __lockfunc _spin_lock_bh(spinlock_t *lock) | ||
89 | { | ||
90 | local_bh_disable(); | ||
91 | preempt_disable(); | ||
92 | _raw_spin_lock(lock); | ||
93 | } | ||
94 | EXPORT_SYMBOL(_spin_lock_bh); | ||
95 | |||
96 | unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) | ||
97 | { | ||
98 | unsigned long flags; | ||
99 | |||
100 | local_irq_save(flags); | ||
101 | preempt_disable(); | ||
102 | _raw_read_lock(lock); | ||
103 | return flags; | ||
104 | } | ||
105 | EXPORT_SYMBOL(_read_lock_irqsave); | ||
106 | |||
107 | void __lockfunc _read_lock_irq(rwlock_t *lock) | ||
108 | { | ||
109 | local_irq_disable(); | ||
110 | preempt_disable(); | ||
111 | _raw_read_lock(lock); | ||
112 | } | ||
113 | EXPORT_SYMBOL(_read_lock_irq); | ||
114 | |||
115 | void __lockfunc _read_lock_bh(rwlock_t *lock) | ||
116 | { | ||
117 | local_bh_disable(); | ||
118 | preempt_disable(); | ||
119 | _raw_read_lock(lock); | ||
120 | } | ||
121 | EXPORT_SYMBOL(_read_lock_bh); | ||
122 | |||
123 | unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) | ||
124 | { | ||
125 | unsigned long flags; | ||
126 | |||
127 | local_irq_save(flags); | ||
128 | preempt_disable(); | ||
129 | _raw_write_lock(lock); | ||
130 | return flags; | ||
131 | } | ||
132 | EXPORT_SYMBOL(_write_lock_irqsave); | ||
133 | |||
134 | void __lockfunc _write_lock_irq(rwlock_t *lock) | ||
135 | { | ||
136 | local_irq_disable(); | ||
137 | preempt_disable(); | ||
138 | _raw_write_lock(lock); | ||
139 | } | ||
140 | EXPORT_SYMBOL(_write_lock_irq); | ||
141 | |||
142 | void __lockfunc _write_lock_bh(rwlock_t *lock) | ||
143 | { | ||
144 | local_bh_disable(); | ||
145 | preempt_disable(); | ||
146 | _raw_write_lock(lock); | ||
147 | } | ||
148 | EXPORT_SYMBOL(_write_lock_bh); | ||
149 | |||
150 | void __lockfunc _spin_lock(spinlock_t *lock) | ||
151 | { | ||
152 | preempt_disable(); | ||
153 | _raw_spin_lock(lock); | ||
154 | } | ||
155 | |||
156 | EXPORT_SYMBOL(_spin_lock); | ||
157 | |||
158 | void __lockfunc _write_lock(rwlock_t *lock) | ||
159 | { | ||
160 | preempt_disable(); | ||
161 | _raw_write_lock(lock); | ||
162 | } | ||
163 | |||
164 | EXPORT_SYMBOL(_write_lock); | ||
165 | |||
166 | #else /* CONFIG_PREEMPT: */ | ||
167 | |||
168 | /* | ||
169 | * This could be a long-held lock. We both prepare to spin for a long | ||
170 | * time (making _this_ CPU preemptable if possible), and we also signal | ||
171 | * towards that other CPU that it should break the lock ASAP. | ||
172 | * | ||
173 | * (We do this in a function because inlining it would be excessive.) | ||
174 | */ | ||
175 | |||
176 | #define BUILD_LOCK_OPS(op, locktype) \ | ||
177 | void __lockfunc _##op##_lock(locktype##_t *lock) \ | ||
178 | { \ | ||
179 | preempt_disable(); \ | ||
180 | for (;;) { \ | ||
181 | if (likely(_raw_##op##_trylock(lock))) \ | ||
182 | break; \ | ||
183 | preempt_enable(); \ | ||
184 | if (!(lock)->break_lock) \ | ||
185 | (lock)->break_lock = 1; \ | ||
186 | while (!op##_can_lock(lock) && (lock)->break_lock) \ | ||
187 | cpu_relax(); \ | ||
188 | preempt_disable(); \ | ||
189 | } \ | ||
190 | (lock)->break_lock = 0; \ | ||
191 | } \ | ||
192 | \ | ||
193 | EXPORT_SYMBOL(_##op##_lock); \ | ||
194 | \ | ||
195 | unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ | ||
196 | { \ | ||
197 | unsigned long flags; \ | ||
198 | \ | ||
199 | preempt_disable(); \ | ||
200 | for (;;) { \ | ||
201 | local_irq_save(flags); \ | ||
202 | if (likely(_raw_##op##_trylock(lock))) \ | ||
203 | break; \ | ||
204 | local_irq_restore(flags); \ | ||
205 | \ | ||
206 | preempt_enable(); \ | ||
207 | if (!(lock)->break_lock) \ | ||
208 | (lock)->break_lock = 1; \ | ||
209 | while (!op##_can_lock(lock) && (lock)->break_lock) \ | ||
210 | cpu_relax(); \ | ||
211 | preempt_disable(); \ | ||
212 | } \ | ||
213 | (lock)->break_lock = 0; \ | ||
214 | return flags; \ | ||
215 | } \ | ||
216 | \ | ||
217 | EXPORT_SYMBOL(_##op##_lock_irqsave); \ | ||
218 | \ | ||
219 | void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ | ||
220 | { \ | ||
221 | _##op##_lock_irqsave(lock); \ | ||
222 | } \ | ||
223 | \ | ||
224 | EXPORT_SYMBOL(_##op##_lock_irq); \ | ||
225 | \ | ||
226 | void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ | ||
227 | { \ | ||
228 | unsigned long flags; \ | ||
229 | \ | ||
230 | /* */ \ | ||
231 | /* Careful: we must exclude softirqs too, hence the */ \ | ||
232 | /* irq-disabling. We use the generic preemption-aware */ \ | ||
233 | /* function: */ \ | ||
234 | /**/ \ | ||
235 | flags = _##op##_lock_irqsave(lock); \ | ||
236 | local_bh_disable(); \ | ||
237 | local_irq_restore(flags); \ | ||
238 | } \ | ||
239 | \ | ||
240 | EXPORT_SYMBOL(_##op##_lock_bh) | ||
241 | |||
242 | /* | ||
243 | * Build preemption-friendly versions of the following | ||
244 | * lock-spinning functions: | ||
245 | * | ||
246 | * _[spin|read|write]_lock() | ||
247 | * _[spin|read|write]_lock_irq() | ||
248 | * _[spin|read|write]_lock_irqsave() | ||
249 | * _[spin|read|write]_lock_bh() | ||
250 | */ | ||
251 | BUILD_LOCK_OPS(spin, spinlock); | ||
252 | BUILD_LOCK_OPS(read, rwlock); | ||
253 | BUILD_LOCK_OPS(write, rwlock); | ||
254 | |||
255 | #endif /* CONFIG_PREEMPT */ | ||
256 | |||
257 | void __lockfunc _spin_unlock(spinlock_t *lock) | ||
258 | { | ||
259 | _raw_spin_unlock(lock); | ||
260 | preempt_enable(); | ||
261 | } | ||
262 | EXPORT_SYMBOL(_spin_unlock); | ||
263 | |||
264 | void __lockfunc _write_unlock(rwlock_t *lock) | ||
265 | { | ||
266 | _raw_write_unlock(lock); | ||
267 | preempt_enable(); | ||
268 | } | ||
269 | EXPORT_SYMBOL(_write_unlock); | ||
270 | |||
271 | void __lockfunc _read_unlock(rwlock_t *lock) | ||
272 | { | ||
273 | _raw_read_unlock(lock); | ||
274 | preempt_enable(); | ||
275 | } | ||
276 | EXPORT_SYMBOL(_read_unlock); | ||
277 | |||
278 | void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) | ||
279 | { | ||
280 | _raw_spin_unlock(lock); | ||
281 | local_irq_restore(flags); | ||
282 | preempt_enable(); | ||
283 | } | ||
284 | EXPORT_SYMBOL(_spin_unlock_irqrestore); | ||
285 | |||
286 | void __lockfunc _spin_unlock_irq(spinlock_t *lock) | ||
287 | { | ||
288 | _raw_spin_unlock(lock); | ||
289 | local_irq_enable(); | ||
290 | preempt_enable(); | ||
291 | } | ||
292 | EXPORT_SYMBOL(_spin_unlock_irq); | ||
293 | |||
294 | void __lockfunc _spin_unlock_bh(spinlock_t *lock) | ||
295 | { | ||
296 | _raw_spin_unlock(lock); | ||
297 | preempt_enable(); | ||
298 | local_bh_enable(); | ||
299 | } | ||
300 | EXPORT_SYMBOL(_spin_unlock_bh); | ||
301 | |||
302 | void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) | ||
303 | { | ||
304 | _raw_read_unlock(lock); | ||
305 | local_irq_restore(flags); | ||
306 | preempt_enable(); | ||
307 | } | ||
308 | EXPORT_SYMBOL(_read_unlock_irqrestore); | ||
309 | |||
310 | void __lockfunc _read_unlock_irq(rwlock_t *lock) | ||
311 | { | ||
312 | _raw_read_unlock(lock); | ||
313 | local_irq_enable(); | ||
314 | preempt_enable(); | ||
315 | } | ||
316 | EXPORT_SYMBOL(_read_unlock_irq); | ||
317 | |||
318 | void __lockfunc _read_unlock_bh(rwlock_t *lock) | ||
319 | { | ||
320 | _raw_read_unlock(lock); | ||
321 | preempt_enable(); | ||
322 | local_bh_enable(); | ||
323 | } | ||
324 | EXPORT_SYMBOL(_read_unlock_bh); | ||
325 | |||
326 | void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) | ||
327 | { | ||
328 | _raw_write_unlock(lock); | ||
329 | local_irq_restore(flags); | ||
330 | preempt_enable(); | ||
331 | } | ||
332 | EXPORT_SYMBOL(_write_unlock_irqrestore); | ||
333 | |||
334 | void __lockfunc _write_unlock_irq(rwlock_t *lock) | ||
335 | { | ||
336 | _raw_write_unlock(lock); | ||
337 | local_irq_enable(); | ||
338 | preempt_enable(); | ||
339 | } | ||
340 | EXPORT_SYMBOL(_write_unlock_irq); | ||
341 | |||
342 | void __lockfunc _write_unlock_bh(rwlock_t *lock) | ||
343 | { | ||
344 | _raw_write_unlock(lock); | ||
345 | preempt_enable(); | ||
346 | local_bh_enable(); | ||
347 | } | ||
348 | EXPORT_SYMBOL(_write_unlock_bh); | ||
349 | |||
350 | int __lockfunc _spin_trylock_bh(spinlock_t *lock) | ||
351 | { | ||
352 | local_bh_disable(); | ||
353 | preempt_disable(); | ||
354 | if (_raw_spin_trylock(lock)) | ||
355 | return 1; | ||
356 | |||
357 | preempt_enable(); | ||
358 | local_bh_enable(); | ||
359 | return 0; | ||
360 | } | ||
361 | EXPORT_SYMBOL(_spin_trylock_bh); | ||
362 | |||
363 | int in_lock_functions(unsigned long addr) | ||
364 | { | ||
365 | /* Linker adds these: start and end of __lockfunc functions */ | ||
366 | extern char __lock_text_start[], __lock_text_end[]; | ||
367 | |||
368 | return addr >= (unsigned long)__lock_text_start | ||
369 | && addr < (unsigned long)__lock_text_end; | ||
370 | } | ||
371 | EXPORT_SYMBOL(in_lock_functions); | ||
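A minimal caller sketch (not part of this patch) for the lock/unlock pairs built in this file; my_lock and my_counter are hypothetical, and SPIN_LOCK_UNLOCKED is the static initializer of this era:

        #include <linux/spinlock.h>

        static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;
        static unsigned long my_counter;

        static void my_update(void)
        {
                unsigned long flags;

                spin_lock_irqsave(&my_lock, flags);      /* save+disable irqs, disable preemption, take the lock */
                my_counter++;
                spin_unlock_irqrestore(&my_lock, flags); /* drop the lock, restore the saved irq state */
        }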
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c new file mode 100644 index 000000000000..c39ed70af174 --- /dev/null +++ b/kernel/stop_machine.c | |||
@@ -0,0 +1,212 @@ | |||
1 | #include <linux/stop_machine.h> | ||
2 | #include <linux/kthread.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/cpu.h> | ||
5 | #include <linux/err.h> | ||
6 | #include <linux/syscalls.h> | ||
7 | #include <asm/atomic.h> | ||
8 | #include <asm/semaphore.h> | ||
9 | #include <asm/uaccess.h> | ||
10 | |||
11 | /* Since we affect priority and affinity (both of which are visible | ||
12 | * to, and settable by, outside processes) we do indirection via a | ||
13 | * kthread. */ | ||
14 | |||
15 | /* Thread to stop each CPU in user context. */ | ||
16 | enum stopmachine_state { | ||
17 | STOPMACHINE_WAIT, | ||
18 | STOPMACHINE_PREPARE, | ||
19 | STOPMACHINE_DISABLE_IRQ, | ||
20 | STOPMACHINE_EXIT, | ||
21 | }; | ||
22 | |||
23 | static enum stopmachine_state stopmachine_state; | ||
24 | static unsigned int stopmachine_num_threads; | ||
25 | static atomic_t stopmachine_thread_ack; | ||
26 | static DECLARE_MUTEX(stopmachine_mutex); | ||
27 | |||
28 | static int stopmachine(void *cpu) | ||
29 | { | ||
30 | int irqs_disabled = 0; | ||
31 | int prepared = 0; | ||
32 | |||
33 | set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); | ||
34 | |||
35 | /* Ack: we are alive */ | ||
36 | mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ | ||
37 | atomic_inc(&stopmachine_thread_ack); | ||
38 | |||
39 | /* Simple state machine */ | ||
40 | while (stopmachine_state != STOPMACHINE_EXIT) { | ||
41 | if (stopmachine_state == STOPMACHINE_DISABLE_IRQ | ||
42 | && !irqs_disabled) { | ||
43 | local_irq_disable(); | ||
44 | irqs_disabled = 1; | ||
45 | /* Ack: irqs disabled. */ | ||
46 | mb(); /* Must read state first. */ | ||
47 | atomic_inc(&stopmachine_thread_ack); | ||
48 | } else if (stopmachine_state == STOPMACHINE_PREPARE | ||
49 | && !prepared) { | ||
50 | /* Everyone is in place, hold CPU. */ | ||
51 | preempt_disable(); | ||
52 | prepared = 1; | ||
53 | mb(); /* Must read state first. */ | ||
54 | atomic_inc(&stopmachine_thread_ack); | ||
55 | } | ||
56 | /* Yield in first stage: migration threads need to | ||
57 | * help our sisters onto their CPUs. */ | ||
58 | if (!prepared && !irqs_disabled) | ||
59 | yield(); | ||
60 | else | ||
61 | cpu_relax(); | ||
62 | } | ||
63 | |||
64 | /* Ack: we are exiting. */ | ||
65 | mb(); /* Must read state first. */ | ||
66 | atomic_inc(&stopmachine_thread_ack); | ||
67 | |||
68 | if (irqs_disabled) | ||
69 | local_irq_enable(); | ||
70 | if (prepared) | ||
71 | preempt_enable(); | ||
72 | |||
73 | return 0; | ||
74 | } | ||
75 | |||
76 | /* Change the thread state */ | ||
77 | static void stopmachine_set_state(enum stopmachine_state state) | ||
78 | { | ||
79 | atomic_set(&stopmachine_thread_ack, 0); | ||
80 | wmb(); | ||
81 | stopmachine_state = state; | ||
82 | while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) | ||
83 | cpu_relax(); | ||
84 | } | ||
85 | |||
86 | static int stop_machine(void) | ||
87 | { | ||
88 | int i, ret = 0; | ||
89 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
90 | mm_segment_t old_fs = get_fs(); | ||
91 | |||
92 | /* One high-prio thread per cpu. We'll do this one. */ | ||
93 | set_fs(KERNEL_DS); | ||
94 | sys_sched_setscheduler(current->pid, SCHED_FIFO, | ||
95 | (struct sched_param __user *)&param); | ||
96 | set_fs(old_fs); | ||
97 | |||
98 | atomic_set(&stopmachine_thread_ack, 0); | ||
99 | stopmachine_num_threads = 0; | ||
100 | stopmachine_state = STOPMACHINE_WAIT; | ||
101 | |||
102 | for_each_online_cpu(i) { | ||
103 | if (i == _smp_processor_id()) | ||
104 | continue; | ||
105 | ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); | ||
106 | if (ret < 0) | ||
107 | break; | ||
108 | stopmachine_num_threads++; | ||
109 | } | ||
110 | |||
111 | /* Wait for them all to come to life. */ | ||
112 | while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) | ||
113 | yield(); | ||
114 | |||
115 | /* If some failed, kill them all. */ | ||
116 | if (ret < 0) { | ||
117 | stopmachine_set_state(STOPMACHINE_EXIT); | ||
118 | up(&stopmachine_mutex); | ||
119 | return ret; | ||
120 | } | ||
121 | |||
122 | /* Don't schedule us away at this point, please. */ | ||
123 | local_irq_disable(); | ||
124 | |||
125 | /* Now they are all started, make them hold the CPUs, ready. */ | ||
126 | stopmachine_set_state(STOPMACHINE_PREPARE); | ||
127 | |||
128 | /* Make them disable irqs. */ | ||
129 | stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); | ||
130 | |||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | static void restart_machine(void) | ||
135 | { | ||
136 | stopmachine_set_state(STOPMACHINE_EXIT); | ||
137 | local_irq_enable(); | ||
138 | } | ||
139 | |||
140 | struct stop_machine_data | ||
141 | { | ||
142 | int (*fn)(void *); | ||
143 | void *data; | ||
144 | struct completion done; | ||
145 | }; | ||
146 | |||
147 | static int do_stop(void *_smdata) | ||
148 | { | ||
149 | struct stop_machine_data *smdata = _smdata; | ||
150 | int ret; | ||
151 | |||
152 | ret = stop_machine(); | ||
153 | if (ret == 0) { | ||
154 | ret = smdata->fn(smdata->data); | ||
155 | restart_machine(); | ||
156 | } | ||
157 | |||
158 | /* We're done: you can kthread_stop us now */ | ||
159 | complete(&smdata->done); | ||
160 | |||
161 | /* Wait for kthread_stop */ | ||
162 | set_current_state(TASK_INTERRUPTIBLE); | ||
163 | while (!kthread_should_stop()) { | ||
164 | schedule(); | ||
165 | set_current_state(TASK_INTERRUPTIBLE); | ||
166 | } | ||
167 | __set_current_state(TASK_RUNNING); | ||
168 | return ret; | ||
169 | } | ||
170 | |||
171 | struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, | ||
172 | unsigned int cpu) | ||
173 | { | ||
174 | struct stop_machine_data smdata; | ||
175 | struct task_struct *p; | ||
176 | |||
177 | smdata.fn = fn; | ||
178 | smdata.data = data; | ||
179 | init_completion(&smdata.done); | ||
180 | |||
181 | down(&stopmachine_mutex); | ||
182 | |||
183 | /* If they don't care which CPU fn runs on, bind to any online one. */ | ||
184 | if (cpu == NR_CPUS) | ||
185 | cpu = _smp_processor_id(); | ||
186 | |||
187 | p = kthread_create(do_stop, &smdata, "kstopmachine"); | ||
188 | if (!IS_ERR(p)) { | ||
189 | kthread_bind(p, cpu); | ||
190 | wake_up_process(p); | ||
191 | wait_for_completion(&smdata.done); | ||
192 | } | ||
193 | up(&stopmachine_mutex); | ||
194 | return p; | ||
195 | } | ||
196 | |||
197 | int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) | ||
198 | { | ||
199 | struct task_struct *p; | ||
200 | int ret; | ||
201 | |||
202 | /* No CPUs can come up or down during this. */ | ||
203 | lock_cpu_hotplug(); | ||
204 | p = __stop_machine_run(fn, data, cpu); | ||
205 | if (!IS_ERR(p)) | ||
206 | ret = kthread_stop(p); | ||
207 | else | ||
208 | ret = PTR_ERR(p); | ||
209 | unlock_cpu_hotplug(); | ||
210 | |||
211 | return ret; | ||
212 | } | ||
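A minimal caller sketch (not part of this patch) for stop_machine_run(); my_patch_fn and my_do_patch are hypothetical, and NR_CPUS means "no CPU preference", matching the check in __stop_machine_run() above:

        #include <linux/stop_machine.h>
        #include <linux/threads.h>

        static int my_patch_fn(void *arg)
        {
                /* Runs via do_stop(): every other online CPU is spinning with
                 * interrupts disabled, so nothing else observes the
                 * intermediate state. Must not sleep. */
                return 0;
        }

        static int my_do_patch(void *arg)
        {
                return stop_machine_run(my_patch_fn, arg, NR_CPUS);
        }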
diff --git a/kernel/sys.c b/kernel/sys.c new file mode 100644 index 000000000000..462d78d55895 --- /dev/null +++ b/kernel/sys.c | |||
@@ -0,0 +1,1725 @@ | |||
1 | /* | ||
2 | * linux/kernel/sys.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | #include <linux/config.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/utsname.h> | ||
11 | #include <linux/mman.h> | ||
12 | #include <linux/smp_lock.h> | ||
13 | #include <linux/notifier.h> | ||
14 | #include <linux/reboot.h> | ||
15 | #include <linux/prctl.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/highuid.h> | ||
18 | #include <linux/fs.h> | ||
19 | #include <linux/workqueue.h> | ||
20 | #include <linux/device.h> | ||
21 | #include <linux/key.h> | ||
22 | #include <linux/times.h> | ||
23 | #include <linux/posix-timers.h> | ||
24 | #include <linux/security.h> | ||
25 | #include <linux/dcookies.h> | ||
26 | #include <linux/suspend.h> | ||
27 | #include <linux/tty.h> | ||
28 | |||
29 | #include <linux/compat.h> | ||
30 | #include <linux/syscalls.h> | ||
31 | |||
32 | #include <asm/uaccess.h> | ||
33 | #include <asm/io.h> | ||
34 | #include <asm/unistd.h> | ||
35 | |||
36 | #ifndef SET_UNALIGN_CTL | ||
37 | # define SET_UNALIGN_CTL(a,b) (-EINVAL) | ||
38 | #endif | ||
39 | #ifndef GET_UNALIGN_CTL | ||
40 | # define GET_UNALIGN_CTL(a,b) (-EINVAL) | ||
41 | #endif | ||
42 | #ifndef SET_FPEMU_CTL | ||
43 | # define SET_FPEMU_CTL(a,b) (-EINVAL) | ||
44 | #endif | ||
45 | #ifndef GET_FPEMU_CTL | ||
46 | # define GET_FPEMU_CTL(a,b) (-EINVAL) | ||
47 | #endif | ||
48 | #ifndef SET_FPEXC_CTL | ||
49 | # define SET_FPEXC_CTL(a,b) (-EINVAL) | ||
50 | #endif | ||
51 | #ifndef GET_FPEXC_CTL | ||
52 | # define GET_FPEXC_CTL(a,b) (-EINVAL) | ||
53 | #endif | ||
54 | |||
55 | /* | ||
56 | * this is where the system-wide overflow UID and GID are defined, for | ||
57 | * architectures that now have 32-bit UID/GID but didn't in the past | ||
58 | */ | ||
59 | |||
60 | int overflowuid = DEFAULT_OVERFLOWUID; | ||
61 | int overflowgid = DEFAULT_OVERFLOWGID; | ||
62 | |||
63 | #ifdef CONFIG_UID16 | ||
64 | EXPORT_SYMBOL(overflowuid); | ||
65 | EXPORT_SYMBOL(overflowgid); | ||
66 | #endif | ||
67 | |||
68 | /* | ||
69 | * the same as above, but for filesystems which can only store a 16-bit | ||
70 | * UID and GID. As such, this is needed on all architectures | ||
71 | */ | ||
72 | |||
73 | int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; | ||
74 | int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; | ||
75 | |||
76 | EXPORT_SYMBOL(fs_overflowuid); | ||
77 | EXPORT_SYMBOL(fs_overflowgid); | ||
78 | |||
79 | /* | ||
80 | * this indicates whether you can reboot with ctrl-alt-del: the default is yes | ||
81 | */ | ||
82 | |||
83 | int C_A_D = 1; | ||
84 | int cad_pid = 1; | ||
85 | |||
86 | /* | ||
87 | * Notifier list for kernel code which wants to be called | ||
88 | * at shutdown. This is used to stop any idling DMA operations | ||
89 | * and the like. | ||
90 | */ | ||
91 | |||
92 | static struct notifier_block *reboot_notifier_list; | ||
93 | static DEFINE_RWLOCK(notifier_lock); | ||
94 | |||
95 | /** | ||
96 | * notifier_chain_register - Add notifier to a notifier chain | ||
97 | * @list: Pointer to root list pointer | ||
98 | * @n: New entry in notifier chain | ||
99 | * | ||
100 | * Adds a notifier to a notifier chain. | ||
101 | * | ||
102 | * Currently always returns zero. | ||
103 | */ | ||
104 | |||
105 | int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) | ||
106 | { | ||
107 | write_lock(&notifier_lock); | ||
108 | while(*list) | ||
109 | { | ||
110 | if(n->priority > (*list)->priority) | ||
111 | break; | ||
112 | list= &((*list)->next); | ||
113 | } | ||
114 | n->next = *list; | ||
115 | *list=n; | ||
116 | write_unlock(&notifier_lock); | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | EXPORT_SYMBOL(notifier_chain_register); | ||
121 | |||
122 | /** | ||
123 | * notifier_chain_unregister - Remove notifier from a notifier chain | ||
124 | * @nl: Pointer to root list pointer | ||
125 | * @n: New entry in notifier chain | ||
126 | * | ||
127 | * Removes a notifier from a notifier chain. | ||
128 | * | ||
129 | * Returns zero on success, or %-ENOENT on failure. | ||
130 | */ | ||
131 | |||
132 | int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) | ||
133 | { | ||
134 | write_lock(&notifier_lock); | ||
135 | while((*nl)!=NULL) | ||
136 | { | ||
137 | if((*nl)==n) | ||
138 | { | ||
139 | *nl=n->next; | ||
140 | write_unlock(&notifier_lock); | ||
141 | return 0; | ||
142 | } | ||
143 | nl=&((*nl)->next); | ||
144 | } | ||
145 | write_unlock(&notifier_lock); | ||
146 | return -ENOENT; | ||
147 | } | ||
148 | |||
149 | EXPORT_SYMBOL(notifier_chain_unregister); | ||
150 | |||
151 | /** | ||
152 | * notifier_call_chain - Call functions in a notifier chain | ||
153 | * @n: Pointer to root pointer of notifier chain | ||
154 | * @val: Value passed unmodified to notifier function | ||
155 | * @v: Pointer passed unmodified to notifier function | ||
156 | * | ||
157 | * Calls each function in a notifier chain in turn. | ||
158 | * | ||
159 | * If the return value of the notifier can be and'd | ||
160 | * with %NOTIFY_STOP_MASK, then notifier_call_chain | ||
161 | * will return immediately, with the return value of | ||
162 | * the notifier function which halted execution. | ||
163 | * Otherwise, the return value is the return value | ||
164 | * of the last notifier function called. | ||
165 | */ | ||
166 | |||
167 | int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) | ||
168 | { | ||
169 | int ret=NOTIFY_DONE; | ||
170 | struct notifier_block *nb = *n; | ||
171 | |||
172 | while(nb) | ||
173 | { | ||
174 | ret=nb->notifier_call(nb,val,v); | ||
175 | if(ret&NOTIFY_STOP_MASK) | ||
176 | { | ||
177 | return ret; | ||
178 | } | ||
179 | nb=nb->next; | ||
180 | } | ||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | EXPORT_SYMBOL(notifier_call_chain); | ||
185 | |||
186 | /** | ||
187 | * register_reboot_notifier - Register function to be called at reboot time | ||
188 | * @nb: Info about notifier function to be called | ||
189 | * | ||
190 | * Registers a function with the list of functions | ||
191 | * to be called at reboot time. | ||
192 | * | ||
193 | * Currently always returns zero, as notifier_chain_register | ||
194 | * always returns zero. | ||
195 | */ | ||
196 | |||
197 | int register_reboot_notifier(struct notifier_block * nb) | ||
198 | { | ||
199 | return notifier_chain_register(&reboot_notifier_list, nb); | ||
200 | } | ||
201 | |||
202 | EXPORT_SYMBOL(register_reboot_notifier); | ||
203 | |||
204 | /** | ||
205 | * unregister_reboot_notifier - Unregister previously registered reboot notifier | ||
206 | * @nb: Hook to be unregistered | ||
207 | * | ||
208 | * Unregisters a previously registered reboot | ||
209 | * notifier function. | ||
210 | * | ||
211 | * Returns zero on success, or %-ENOENT on failure. | ||
212 | */ | ||
213 | |||
214 | int unregister_reboot_notifier(struct notifier_block * nb) | ||
215 | { | ||
216 | return notifier_chain_unregister(&reboot_notifier_list, nb); | ||
217 | } | ||
218 | |||
219 | EXPORT_SYMBOL(unregister_reboot_notifier); | ||
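A minimal module-style sketch (not part of this patch) of the reboot-notifier API above; my_reboot_event and my_reboot_nb are hypothetical names:

        #include <linux/init.h>
        #include <linux/notifier.h>
        #include <linux/reboot.h>

        static int my_reboot_event(struct notifier_block *nb, unsigned long code, void *cmd)
        {
                /* code is SYS_RESTART, SYS_HALT or SYS_POWER_OFF; cmd is the
                 * optional command string from LINUX_REBOOT_CMD_RESTART2. */
                return NOTIFY_DONE;
        }

        static struct notifier_block my_reboot_nb = {
                .notifier_call = my_reboot_event,
                .priority      = 0,     /* higher priority runs earlier, per notifier_chain_register() */
        };

        static int __init my_init(void)
        {
                return register_reboot_notifier(&my_reboot_nb);
        }

        static void __exit my_exit(void)
        {
                unregister_reboot_notifier(&my_reboot_nb);
        }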
220 | |||
221 | static int set_one_prio(struct task_struct *p, int niceval, int error) | ||
222 | { | ||
223 | int no_nice; | ||
224 | |||
225 | if (p->uid != current->euid && | ||
226 | p->euid != current->euid && !capable(CAP_SYS_NICE)) { | ||
227 | error = -EPERM; | ||
228 | goto out; | ||
229 | } | ||
230 | if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) { | ||
231 | error = -EACCES; | ||
232 | goto out; | ||
233 | } | ||
234 | no_nice = security_task_setnice(p, niceval); | ||
235 | if (no_nice) { | ||
236 | error = no_nice; | ||
237 | goto out; | ||
238 | } | ||
239 | if (error == -ESRCH) | ||
240 | error = 0; | ||
241 | set_user_nice(p, niceval); | ||
242 | out: | ||
243 | return error; | ||
244 | } | ||
245 | |||
246 | asmlinkage long sys_setpriority(int which, int who, int niceval) | ||
247 | { | ||
248 | struct task_struct *g, *p; | ||
249 | struct user_struct *user; | ||
250 | int error = -EINVAL; | ||
251 | |||
252 | if (which > 2 || which < 0) | ||
253 | goto out; | ||
254 | |||
255 | /* normalize: avoid signed division (rounding problems) */ | ||
256 | error = -ESRCH; | ||
257 | if (niceval < -20) | ||
258 | niceval = -20; | ||
259 | if (niceval > 19) | ||
260 | niceval = 19; | ||
261 | |||
262 | read_lock(&tasklist_lock); | ||
263 | switch (which) { | ||
264 | case PRIO_PROCESS: | ||
265 | if (!who) | ||
266 | who = current->pid; | ||
267 | p = find_task_by_pid(who); | ||
268 | if (p) | ||
269 | error = set_one_prio(p, niceval, error); | ||
270 | break; | ||
271 | case PRIO_PGRP: | ||
272 | if (!who) | ||
273 | who = process_group(current); | ||
274 | do_each_task_pid(who, PIDTYPE_PGID, p) { | ||
275 | error = set_one_prio(p, niceval, error); | ||
276 | } while_each_task_pid(who, PIDTYPE_PGID, p); | ||
277 | break; | ||
278 | case PRIO_USER: | ||
279 | user = current->user; | ||
280 | if (!who) | ||
281 | who = current->uid; | ||
282 | else | ||
283 | if ((who != current->uid) && !(user = find_user(who))) | ||
284 | goto out_unlock; /* No processes for this user */ | ||
285 | |||
286 | do_each_thread(g, p) | ||
287 | if (p->uid == who) | ||
288 | error = set_one_prio(p, niceval, error); | ||
289 | while_each_thread(g, p); | ||
290 | if (who != current->uid) | ||
291 | free_uid(user); /* For find_user() */ | ||
292 | break; | ||
293 | } | ||
294 | out_unlock: | ||
295 | read_unlock(&tasklist_lock); | ||
296 | out: | ||
297 | return error; | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * Ugh. To avoid negative return values, "getpriority()" will | ||
302 | * not return the normal nice-value, but a negated value that | ||
303 | * has been offset by 20 (ie it returns 40..1 instead of -20..19) | ||
304 | * to stay compatible. | ||
305 | */ | ||
306 | asmlinkage long sys_getpriority(int which, int who) | ||
307 | { | ||
308 | struct task_struct *g, *p; | ||
309 | struct user_struct *user; | ||
310 | long niceval, retval = -ESRCH; | ||
311 | |||
312 | if (which > 2 || which < 0) | ||
313 | return -EINVAL; | ||
314 | |||
315 | read_lock(&tasklist_lock); | ||
316 | switch (which) { | ||
317 | case PRIO_PROCESS: | ||
318 | if (!who) | ||
319 | who = current->pid; | ||
320 | p = find_task_by_pid(who); | ||
321 | if (p) { | ||
322 | niceval = 20 - task_nice(p); | ||
323 | if (niceval > retval) | ||
324 | retval = niceval; | ||
325 | } | ||
326 | break; | ||
327 | case PRIO_PGRP: | ||
328 | if (!who) | ||
329 | who = process_group(current); | ||
330 | do_each_task_pid(who, PIDTYPE_PGID, p) { | ||
331 | niceval = 20 - task_nice(p); | ||
332 | if (niceval > retval) | ||
333 | retval = niceval; | ||
334 | } while_each_task_pid(who, PIDTYPE_PGID, p); | ||
335 | break; | ||
336 | case PRIO_USER: | ||
337 | user = current->user; | ||
338 | if (!who) | ||
339 | who = current->uid; | ||
340 | else | ||
341 | if ((who != current->uid) && !(user = find_user(who))) | ||
342 | goto out_unlock; /* No processes for this user */ | ||
343 | |||
344 | do_each_thread(g, p) | ||
345 | if (p->uid == who) { | ||
346 | niceval = 20 - task_nice(p); | ||
347 | if (niceval > retval) | ||
348 | retval = niceval; | ||
349 | } | ||
350 | while_each_thread(g, p); | ||
351 | if (who != current->uid) | ||
352 | free_uid(user); /* for find_user() */ | ||
353 | break; | ||
354 | } | ||
355 | out_unlock: | ||
356 | read_unlock(&tasklist_lock); | ||
357 | |||
358 | return retval; | ||
359 | } | ||
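The offset above means a nice value of -20 comes back as 40, 0 as 20, and 19 as 1, so every successful return is positive and -1 stays unambiguous as an error. A minimal userspace sketch (not kernel code) of undoing the offset through the raw syscall; the glibc getpriority() wrapper already converts the result back to the -20..19 range:

        #include <stdio.h>
        #include <sys/resource.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        int main(void)
        {
                long ret = syscall(SYS_getpriority, PRIO_PROCESS, 0);   /* 1..40, or -1 on error */

                if (ret < 0)
                        return 1;
                printf("raw=%ld nice=%ld\n", ret, 20 - ret);    /* e.g. raw 40 -> nice -20, raw 1 -> nice 19 */
                return 0;
        }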
360 | |||
361 | |||
362 | /* | ||
363 | * Reboot system call: for obvious reasons only root may call it, | ||
364 | * and even root needs to set up some magic numbers in the registers | ||
365 | * so that some mistake won't make this reboot the whole machine. | ||
366 | * You can also set the meaning of the ctrl-alt-del-key here. | ||
367 | * | ||
368 | * reboot doesn't sync: do that yourself before calling this. | ||
369 | */ | ||
370 | asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) | ||
371 | { | ||
372 | char buffer[256]; | ||
373 | |||
374 | /* We only trust the superuser with rebooting the system. */ | ||
375 | if (!capable(CAP_SYS_BOOT)) | ||
376 | return -EPERM; | ||
377 | |||
378 | /* For safety, we require "magic" arguments. */ | ||
379 | if (magic1 != LINUX_REBOOT_MAGIC1 || | ||
380 | (magic2 != LINUX_REBOOT_MAGIC2 && | ||
381 | magic2 != LINUX_REBOOT_MAGIC2A && | ||
382 | magic2 != LINUX_REBOOT_MAGIC2B && | ||
383 | magic2 != LINUX_REBOOT_MAGIC2C)) | ||
384 | return -EINVAL; | ||
385 | |||
386 | lock_kernel(); | ||
387 | switch (cmd) { | ||
388 | case LINUX_REBOOT_CMD_RESTART: | ||
389 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | ||
390 | system_state = SYSTEM_RESTART; | ||
391 | device_shutdown(); | ||
392 | printk(KERN_EMERG "Restarting system.\n"); | ||
393 | machine_restart(NULL); | ||
394 | break; | ||
395 | |||
396 | case LINUX_REBOOT_CMD_CAD_ON: | ||
397 | C_A_D = 1; | ||
398 | break; | ||
399 | |||
400 | case LINUX_REBOOT_CMD_CAD_OFF: | ||
401 | C_A_D = 0; | ||
402 | break; | ||
403 | |||
404 | case LINUX_REBOOT_CMD_HALT: | ||
405 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); | ||
406 | system_state = SYSTEM_HALT; | ||
407 | device_shutdown(); | ||
408 | printk(KERN_EMERG "System halted.\n"); | ||
409 | machine_halt(); | ||
410 | unlock_kernel(); | ||
411 | do_exit(0); | ||
412 | break; | ||
413 | |||
414 | case LINUX_REBOOT_CMD_POWER_OFF: | ||
415 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); | ||
416 | system_state = SYSTEM_POWER_OFF; | ||
417 | device_shutdown(); | ||
418 | printk(KERN_EMERG "Power down.\n"); | ||
419 | machine_power_off(); | ||
420 | unlock_kernel(); | ||
421 | do_exit(0); | ||
422 | break; | ||
423 | |||
424 | case LINUX_REBOOT_CMD_RESTART2: | ||
425 | if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { | ||
426 | unlock_kernel(); | ||
427 | return -EFAULT; | ||
428 | } | ||
429 | buffer[sizeof(buffer) - 1] = '\0'; | ||
430 | |||
431 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); | ||
432 | system_state = SYSTEM_RESTART; | ||
433 | device_shutdown(); | ||
434 | printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); | ||
435 | machine_restart(buffer); | ||
436 | break; | ||
437 | |||
438 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
439 | case LINUX_REBOOT_CMD_SW_SUSPEND: | ||
440 | { | ||
441 | int ret = software_suspend(); | ||
442 | unlock_kernel(); | ||
443 | return ret; | ||
444 | } | ||
445 | #endif | ||
446 | |||
447 | default: | ||
448 | unlock_kernel(); | ||
449 | return -EINVAL; | ||
450 | } | ||
451 | unlock_kernel(); | ||
452 | return 0; | ||
453 | } | ||
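A minimal userspace sketch (not kernel code) of what the magic-number check above requires from a raw caller; the glibc reboot() wrapper supplies the magic values itself and takes only the command. This powers the machine off and needs CAP_SYS_BOOT:

        #include <linux/reboot.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        int main(void)
        {
                sync();         /* sys_reboot() does not sync; the comment above says to do it yourself */
                return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                               LINUX_REBOOT_CMD_POWER_OFF, NULL);
        }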
454 | |||
455 | static void deferred_cad(void *dummy) | ||
456 | { | ||
457 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | ||
458 | machine_restart(NULL); | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * This function gets called by ctrl-alt-del - ie the keyboard interrupt. | ||
463 | * As it's called within an interrupt, it may NOT sync: the only choice | ||
464 | * is whether to reboot at once, or just ignore the ctrl-alt-del. | ||
465 | */ | ||
466 | void ctrl_alt_del(void) | ||
467 | { | ||
468 | static DECLARE_WORK(cad_work, deferred_cad, NULL); | ||
469 | |||
470 | if (C_A_D) | ||
471 | schedule_work(&cad_work); | ||
472 | else | ||
473 | kill_proc(cad_pid, SIGINT, 1); | ||
474 | } | ||
475 | |||
476 | |||
477 | /* | ||
478 | * Unprivileged users may change the real gid to the effective gid | ||
479 | * or vice versa. (BSD-style) | ||
480 | * | ||
481 | * If you set the real gid at all, or set the effective gid to a value not | ||
482 | * equal to the real gid, then the saved gid is set to the new effective gid. | ||
483 | * | ||
484 | * This makes it possible for a setgid program to completely drop its | ||
485 | * privileges, which is often a useful assertion to make when you are doing | ||
486 | * a security audit over a program. | ||
487 | * | ||
488 | * The general idea is that a program which uses just setregid() will be | ||
489 | * 100% compatible with BSD. A program which uses just setgid() will be | ||
490 | * 100% compatible with POSIX with saved IDs. | ||
491 | * | ||
492 | * SMP: There are no races, the GIDs are checked only by filesystem | ||
493 | * operations (as far as semantic preservation is concerned). | ||
494 | */ | ||
495 | asmlinkage long sys_setregid(gid_t rgid, gid_t egid) | ||
496 | { | ||
497 | int old_rgid = current->gid; | ||
498 | int old_egid = current->egid; | ||
499 | int new_rgid = old_rgid; | ||
500 | int new_egid = old_egid; | ||
501 | int retval; | ||
502 | |||
503 | retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); | ||
504 | if (retval) | ||
505 | return retval; | ||
506 | |||
507 | if (rgid != (gid_t) -1) { | ||
508 | if ((old_rgid == rgid) || | ||
509 | (current->egid==rgid) || | ||
510 | capable(CAP_SETGID)) | ||
511 | new_rgid = rgid; | ||
512 | else | ||
513 | return -EPERM; | ||
514 | } | ||
515 | if (egid != (gid_t) -1) { | ||
516 | if ((old_rgid == egid) || | ||
517 | (current->egid == egid) || | ||
518 | (current->sgid == egid) || | ||
519 | capable(CAP_SETGID)) | ||
520 | new_egid = egid; | ||
521 | else { | ||
522 | return -EPERM; | ||
523 | } | ||
524 | } | ||
525 | if (new_egid != old_egid) | ||
526 | { | ||
527 | current->mm->dumpable = 0; | ||
528 | wmb(); | ||
529 | } | ||
530 | if (rgid != (gid_t) -1 || | ||
531 | (egid != (gid_t) -1 && egid != old_rgid)) | ||
532 | current->sgid = new_egid; | ||
533 | current->fsgid = new_egid; | ||
534 | current->egid = new_egid; | ||
535 | current->gid = new_rgid; | ||
536 | key_fsgid_changed(current); | ||
537 | return 0; | ||
538 | } | ||
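A minimal userspace sketch (not kernel code) of the "setgid program completely drops its privileges" pattern described in the comment above: because the real gid is set, the saved gid is overwritten with the new effective gid and the original group cannot be regained:

        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                /* Setting both the real and effective gid to the real gid also
                 * rewrites the saved gid, so the setgid group is gone for good. */
                if (setregid(getgid(), getgid()) < 0) {
                        perror("setregid");
                        return 1;
                }
                return 0;
        }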
539 | |||
540 | /* | ||
541 | * setgid() is implemented like SysV w/ SAVED_IDS | ||
542 | * | ||
543 | * SMP: Same implicit races as above. | ||
544 | */ | ||
545 | asmlinkage long sys_setgid(gid_t gid) | ||
546 | { | ||
547 | int old_egid = current->egid; | ||
548 | int retval; | ||
549 | |||
550 | retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); | ||
551 | if (retval) | ||
552 | return retval; | ||
553 | |||
554 | if (capable(CAP_SETGID)) | ||
555 | { | ||
556 | if(old_egid != gid) | ||
557 | { | ||
558 | current->mm->dumpable=0; | ||
559 | wmb(); | ||
560 | } | ||
561 | current->gid = current->egid = current->sgid = current->fsgid = gid; | ||
562 | } | ||
563 | else if ((gid == current->gid) || (gid == current->sgid)) | ||
564 | { | ||
565 | if(old_egid != gid) | ||
566 | { | ||
567 | current->mm->dumpable=0; | ||
568 | wmb(); | ||
569 | } | ||
570 | current->egid = current->fsgid = gid; | ||
571 | } | ||
572 | else | ||
573 | return -EPERM; | ||
574 | |||
575 | key_fsgid_changed(current); | ||
576 | return 0; | ||
577 | } | ||
578 | |||
579 | static int set_user(uid_t new_ruid, int dumpclear) | ||
580 | { | ||
581 | struct user_struct *new_user; | ||
582 | |||
583 | new_user = alloc_uid(new_ruid); | ||
584 | if (!new_user) | ||
585 | return -EAGAIN; | ||
586 | |||
587 | if (atomic_read(&new_user->processes) >= | ||
588 | current->signal->rlim[RLIMIT_NPROC].rlim_cur && | ||
589 | new_user != &root_user) { | ||
590 | free_uid(new_user); | ||
591 | return -EAGAIN; | ||
592 | } | ||
593 | |||
594 | switch_uid(new_user); | ||
595 | |||
596 | if(dumpclear) | ||
597 | { | ||
598 | current->mm->dumpable = 0; | ||
599 | wmb(); | ||
600 | } | ||
601 | current->uid = new_ruid; | ||
602 | return 0; | ||
603 | } | ||
604 | |||
605 | /* | ||
606 | * Unprivileged users may change the real uid to the effective uid | ||
607 | * or vice versa. (BSD-style) | ||
608 | * | ||
609 | * If you set the real uid at all, or set the effective uid to a value not | ||
610 | * equal to the real uid, then the saved uid is set to the new effective uid. | ||
611 | * | ||
612 | * This makes it possible for a setuid program to completely drop its | ||
613 | * privileges, which is often a useful assertion to make when you are doing | ||
614 | * a security audit over a program. | ||
615 | * | ||
616 | * The general idea is that a program which uses just setreuid() will be | ||
617 | * 100% compatible with BSD. A program which uses just setuid() will be | ||
618 | * 100% compatible with POSIX with saved IDs. | ||
619 | */ | ||
620 | asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | ||
621 | { | ||
622 | int old_ruid, old_euid, old_suid, new_ruid, new_euid; | ||
623 | int retval; | ||
624 | |||
625 | retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); | ||
626 | if (retval) | ||
627 | return retval; | ||
628 | |||
629 | new_ruid = old_ruid = current->uid; | ||
630 | new_euid = old_euid = current->euid; | ||
631 | old_suid = current->suid; | ||
632 | |||
633 | if (ruid != (uid_t) -1) { | ||
634 | new_ruid = ruid; | ||
635 | if ((old_ruid != ruid) && | ||
636 | (current->euid != ruid) && | ||
637 | !capable(CAP_SETUID)) | ||
638 | return -EPERM; | ||
639 | } | ||
640 | |||
641 | if (euid != (uid_t) -1) { | ||
642 | new_euid = euid; | ||
643 | if ((old_ruid != euid) && | ||
644 | (current->euid != euid) && | ||
645 | (current->suid != euid) && | ||
646 | !capable(CAP_SETUID)) | ||
647 | return -EPERM; | ||
648 | } | ||
649 | |||
650 | if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) | ||
651 | return -EAGAIN; | ||
652 | |||
653 | if (new_euid != old_euid) | ||
654 | { | ||
655 | current->mm->dumpable=0; | ||
656 | wmb(); | ||
657 | } | ||
658 | current->fsuid = current->euid = new_euid; | ||
659 | if (ruid != (uid_t) -1 || | ||
660 | (euid != (uid_t) -1 && euid != old_ruid)) | ||
661 | current->suid = current->euid; | ||
662 | current->fsuid = current->euid; | ||
663 | |||
664 | key_fsuid_changed(current); | ||
665 | |||
666 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); | ||
667 | } | ||
668 | |||
669 | |||
670 | |||
671 | /* | ||
672 | * setuid() is implemented like SysV with SAVED_IDS | ||
673 | * | ||
674 | * Note that SAVED_IDS is deficient in that a setuid root program | ||
675 | * like sendmail, for example, cannot set its uid to be a normal | ||
676 | * user and then switch back, because if you're root, setuid() sets | ||
677 | * the saved uid too. If you don't like this, blame the bright people | ||
678 | * in the POSIX committee and/or USG. Note that the BSD-style setreuid() | ||
679 | * will allow a root program to temporarily drop privileges and be able to | ||
680 | * regain them by swapping the real and effective uid. | ||
681 | */ | ||
682 | asmlinkage long sys_setuid(uid_t uid) | ||
683 | { | ||
684 | int old_euid = current->euid; | ||
685 | int old_ruid, old_suid, new_ruid, new_suid; | ||
686 | int retval; | ||
687 | |||
688 | retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); | ||
689 | if (retval) | ||
690 | return retval; | ||
691 | |||
692 | old_ruid = new_ruid = current->uid; | ||
693 | old_suid = current->suid; | ||
694 | new_suid = old_suid; | ||
695 | |||
696 | if (capable(CAP_SETUID)) { | ||
697 | if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) | ||
698 | return -EAGAIN; | ||
699 | new_suid = uid; | ||
700 | } else if ((uid != current->uid) && (uid != new_suid)) | ||
701 | return -EPERM; | ||
702 | |||
703 | if (old_euid != uid) | ||
704 | { | ||
705 | current->mm->dumpable = 0; | ||
706 | wmb(); | ||
707 | } | ||
708 | current->fsuid = current->euid = uid; | ||
709 | current->suid = new_suid; | ||
710 | |||
711 | key_fsuid_changed(current); | ||
712 | |||
713 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); | ||
714 | } | ||
715 | |||
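To contrast with the comment above: once a root process calls setuid() to an unprivileged uid, the saved uid is overwritten as well, so the drop is permanent. A hedged sketch (uid 1000 is a hypothetical unprivileged account):

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>

int main(void)
{
	uid_t target = 1000;	/* hypothetical unprivileged uid */

	if (geteuid() != 0) {
		fprintf(stderr, "run as root to observe the effect\n");
		return 1;
	}
	/* For root, setuid() sets real, effective and saved uid... */
	if (setuid(target) < 0) {
		perror("setuid");
		return 1;
	}
	/* ...so trying to come back fails with EPERM, unlike the
	 * setreuid() swap shown earlier. */
	if (setuid(0) == 0)
		printf("unexpected: regained root\n");
	else
		perror("setuid(0) after drop (expected to fail)");
	return 0;
}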
716 | |||
717 | /* | ||
718 | * This function implements a generic ability to update ruid, euid, | ||
719 | * and suid. This allows you to implement the 4.4 compatible seteuid(). | ||
720 | */ | ||
721 | asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) | ||
722 | { | ||
723 | int old_ruid = current->uid; | ||
724 | int old_euid = current->euid; | ||
725 | int old_suid = current->suid; | ||
726 | int retval; | ||
727 | |||
728 | retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); | ||
729 | if (retval) | ||
730 | return retval; | ||
731 | |||
732 | if (!capable(CAP_SETUID)) { | ||
733 | if ((ruid != (uid_t) -1) && (ruid != current->uid) && | ||
734 | (ruid != current->euid) && (ruid != current->suid)) | ||
735 | return -EPERM; | ||
736 | if ((euid != (uid_t) -1) && (euid != current->uid) && | ||
737 | (euid != current->euid) && (euid != current->suid)) | ||
738 | return -EPERM; | ||
739 | if ((suid != (uid_t) -1) && (suid != current->uid) && | ||
740 | (suid != current->euid) && (suid != current->suid)) | ||
741 | return -EPERM; | ||
742 | } | ||
743 | if (ruid != (uid_t) -1) { | ||
744 | if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) | ||
745 | return -EAGAIN; | ||
746 | } | ||
747 | if (euid != (uid_t) -1) { | ||
748 | if (euid != current->euid) | ||
749 | { | ||
750 | current->mm->dumpable = 0; | ||
751 | wmb(); | ||
752 | } | ||
753 | current->euid = euid; | ||
754 | } | ||
755 | current->fsuid = current->euid; | ||
756 | if (suid != (uid_t) -1) | ||
757 | current->suid = suid; | ||
758 | |||
759 | key_fsuid_changed(current); | ||
760 | |||
761 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); | ||
762 | } | ||
763 | |||
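The 4.4BSD-compatible seteuid() that the comment refers to maps onto this call as setresuid(-1, euid, -1). A small sketch, assuming glibc's getresuid()/setresuid() wrappers (which need _GNU_SOURCE):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>

int main(void)
{
	uid_t r, e, s;

	getresuid(&r, &e, &s);
	printf("before: ruid=%d euid=%d suid=%d\n", (int)r, (int)e, (int)s);

	/* seteuid(x) == setresuid(-1, x, -1): only the effective uid moves.
	 * Per the checks above, switching euid to the saved uid is allowed
	 * even without CAP_SETUID. */
	if (setresuid((uid_t)-1, s, (uid_t)-1) < 0)
		perror("setresuid");

	getresuid(&r, &e, &s);
	printf("after:  ruid=%d euid=%d suid=%d\n", (int)r, (int)e, (int)s);
	return 0;
}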
764 | asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) | ||
765 | { | ||
766 | int retval; | ||
767 | |||
768 | if (!(retval = put_user(current->uid, ruid)) && | ||
769 | !(retval = put_user(current->euid, euid))) | ||
770 | retval = put_user(current->suid, suid); | ||
771 | |||
772 | return retval; | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * Same as above, but for rgid, egid, sgid. | ||
777 | */ | ||
778 | asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) | ||
779 | { | ||
780 | int retval; | ||
781 | |||
782 | retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); | ||
783 | if (retval) | ||
784 | return retval; | ||
785 | |||
786 | if (!capable(CAP_SETGID)) { | ||
787 | if ((rgid != (gid_t) -1) && (rgid != current->gid) && | ||
788 | (rgid != current->egid) && (rgid != current->sgid)) | ||
789 | return -EPERM; | ||
790 | if ((egid != (gid_t) -1) && (egid != current->gid) && | ||
791 | (egid != current->egid) && (egid != current->sgid)) | ||
792 | return -EPERM; | ||
793 | if ((sgid != (gid_t) -1) && (sgid != current->gid) && | ||
794 | (sgid != current->egid) && (sgid != current->sgid)) | ||
795 | return -EPERM; | ||
796 | } | ||
797 | if (egid != (gid_t) -1) { | ||
798 | if (egid != current->egid) | ||
799 | { | ||
800 | current->mm->dumpable = 0; | ||
801 | wmb(); | ||
802 | } | ||
803 | current->egid = egid; | ||
804 | } | ||
805 | current->fsgid = current->egid; | ||
806 | if (rgid != (gid_t) -1) | ||
807 | current->gid = rgid; | ||
808 | if (sgid != (gid_t) -1) | ||
809 | current->sgid = sgid; | ||
810 | |||
811 | key_fsgid_changed(current); | ||
812 | return 0; | ||
813 | } | ||
814 | |||
815 | asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) | ||
816 | { | ||
817 | int retval; | ||
818 | |||
819 | if (!(retval = put_user(current->gid, rgid)) && | ||
820 | !(retval = put_user(current->egid, egid))) | ||
821 | retval = put_user(current->sgid, sgid); | ||
822 | |||
823 | return retval; | ||
824 | } | ||
825 | |||
826 | |||
827 | /* | ||
828 | * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This | ||
829 | * is used for "access()" and for the NFS daemon (letting nfsd stay at | ||
830 | * whatever uid it wants to). It normally shadows "euid", except when | ||
831 | * explicitly set by setfsuid() or for access.. | ||
832 | */ | ||
833 | asmlinkage long sys_setfsuid(uid_t uid) | ||
834 | { | ||
835 | int old_fsuid; | ||
836 | |||
837 | old_fsuid = current->fsuid; | ||
838 | if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) | ||
839 | return old_fsuid; | ||
840 | |||
841 | if (uid == current->uid || uid == current->euid || | ||
842 | uid == current->suid || uid == current->fsuid || | ||
843 | capable(CAP_SETUID)) | ||
844 | { | ||
845 | if (uid != old_fsuid) | ||
846 | { | ||
847 | current->mm->dumpable = 0; | ||
848 | wmb(); | ||
849 | } | ||
850 | current->fsuid = uid; | ||
851 | } | ||
852 | |||
853 | key_fsuid_changed(current); | ||
854 | |||
855 | security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); | ||
856 | |||
857 | return old_fsuid; | ||
858 | } | ||
859 | |||
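The per-request credential switch that nfsd relies on looks roughly like the fragment below. It is a sketch only: open_as() and client_uid are hypothetical names, and switching to an arbitrary fsuid requires CAP_SETUID.

#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/fsuid.h>

/* Open a file on behalf of a client without touching the process's
 * real or effective uid -- only the uid used for permission checks. */
static int open_as(uid_t client_uid, const char *path)
{
	int old = setfsuid(client_uid);	/* returns the previous fsuid */
	int fd  = open(path, O_RDONLY);	/* checked against client_uid */

	setfsuid(old);			/* restore our own fsuid */
	return fd;
}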
860 | /* | ||
861 | * Samma på svenska.. ("same thing in Swedish": the fsgid counterpart of setfsuid() above) | ||
862 | */ | ||
863 | asmlinkage long sys_setfsgid(gid_t gid) | ||
864 | { | ||
865 | int old_fsgid; | ||
866 | |||
867 | old_fsgid = current->fsgid; | ||
868 | if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) | ||
869 | return old_fsgid; | ||
870 | |||
871 | if (gid == current->gid || gid == current->egid || | ||
872 | gid == current->sgid || gid == current->fsgid || | ||
873 | capable(CAP_SETGID)) | ||
874 | { | ||
875 | if (gid != old_fsgid) | ||
876 | { | ||
877 | current->mm->dumpable = 0; | ||
878 | wmb(); | ||
879 | } | ||
880 | current->fsgid = gid; | ||
881 | key_fsgid_changed(current); | ||
882 | } | ||
883 | return old_fsgid; | ||
884 | } | ||
885 | |||
886 | asmlinkage long sys_times(struct tms __user * tbuf) | ||
887 | { | ||
888 | /* | ||
889 | * In the SMP world we might just be unlucky and have one of | ||
890 | * the times increment as we use it. Since the value is an | ||
891 | * atomically safe type this is just fine. Conceptually it's | ||
892 | * as if the syscall took an instant longer to occur. | ||
893 | */ | ||
894 | if (tbuf) { | ||
895 | struct tms tmp; | ||
896 | struct task_struct *tsk = current; | ||
897 | struct task_struct *t; | ||
898 | cputime_t utime, stime, cutime, cstime; | ||
899 | |||
900 | read_lock(&tasklist_lock); | ||
901 | utime = tsk->signal->utime; | ||
902 | stime = tsk->signal->stime; | ||
903 | t = tsk; | ||
904 | do { | ||
905 | utime = cputime_add(utime, t->utime); | ||
906 | stime = cputime_add(stime, t->stime); | ||
907 | t = next_thread(t); | ||
908 | } while (t != tsk); | ||
909 | |||
910 | /* | ||
911 | * While we have tasklist_lock read-locked, no dying thread | ||
912 | * can be updating current->signal->[us]time. Instead, | ||
913 | * we got their counts included in the live thread loop. | ||
914 | * However, another thread can come in right now and | ||
915 | * do a wait call that updates current->signal->c[us]time. | ||
916 | * To make sure we always see that pair updated atomically, | ||
917 | * we take the siglock around fetching them. | ||
918 | */ | ||
919 | spin_lock_irq(&tsk->sighand->siglock); | ||
920 | cutime = tsk->signal->cutime; | ||
921 | cstime = tsk->signal->cstime; | ||
922 | spin_unlock_irq(&tsk->sighand->siglock); | ||
923 | read_unlock(&tasklist_lock); | ||
924 | |||
925 | tmp.tms_utime = cputime_to_clock_t(utime); | ||
926 | tmp.tms_stime = cputime_to_clock_t(stime); | ||
927 | tmp.tms_cutime = cputime_to_clock_t(cutime); | ||
928 | tmp.tms_cstime = cputime_to_clock_t(cstime); | ||
929 | if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) | ||
930 | return -EFAULT; | ||
931 | } | ||
932 | return (long) jiffies_64_to_clock_t(get_jiffies_64()); | ||
933 | } | ||
934 | |||
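From userspace the accumulated times come back in clock ticks; dividing by sysconf(_SC_CLK_TCK) converts them to seconds. A minimal sketch:

#include <stdio.h>
#include <unistd.h>
#include <sys/times.h>

int main(void)
{
	struct tms t;
	long hz = sysconf(_SC_CLK_TCK);		/* ticks per second */
	clock_t now = times(&t);		/* also returns elapsed ticks */

	printf("user     %.2fs\n", (double)t.tms_utime / hz);
	printf("system   %.2fs\n", (double)t.tms_stime / hz);
	printf("children %.2fs\n",
	       (double)(t.tms_cutime + t.tms_cstime) / hz);
	printf("ticks since an arbitrary point: %ld\n", (long)now);
	return 0;
}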
935 | /* | ||
936 | * This needs some heavy checking ... | ||
937 | * I just haven't the stomach for it. I also don't fully | ||
938 | * understand sessions/pgrp etc. Let somebody who does explain it. | ||
939 | * | ||
940 | * OK, I think I have the protection semantics right.... this is really | ||
941 | * only important on a multi-user system anyway, to make sure one user | ||
942 | * can't send a signal to a process owned by another. -TYT, 12/12/91 | ||
943 | * | ||
944 | * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. | ||
945 | * LBT 04.03.94 | ||
946 | */ | ||
947 | |||
948 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | ||
949 | { | ||
950 | struct task_struct *p; | ||
951 | int err = -EINVAL; | ||
952 | |||
953 | if (!pid) | ||
954 | pid = current->pid; | ||
955 | if (!pgid) | ||
956 | pgid = pid; | ||
957 | if (pgid < 0) | ||
958 | return -EINVAL; | ||
959 | |||
960 | /* From this point forward we keep holding onto the tasklist lock | ||
961 | * so that our parent does not change from under us. -DaveM | ||
962 | */ | ||
963 | write_lock_irq(&tasklist_lock); | ||
964 | |||
965 | err = -ESRCH; | ||
966 | p = find_task_by_pid(pid); | ||
967 | if (!p) | ||
968 | goto out; | ||
969 | |||
970 | err = -EINVAL; | ||
971 | if (!thread_group_leader(p)) | ||
972 | goto out; | ||
973 | |||
974 | if (p->parent == current || p->real_parent == current) { | ||
975 | err = -EPERM; | ||
976 | if (p->signal->session != current->signal->session) | ||
977 | goto out; | ||
978 | err = -EACCES; | ||
979 | if (p->did_exec) | ||
980 | goto out; | ||
981 | } else { | ||
982 | err = -ESRCH; | ||
983 | if (p != current) | ||
984 | goto out; | ||
985 | } | ||
986 | |||
987 | err = -EPERM; | ||
988 | if (p->signal->leader) | ||
989 | goto out; | ||
990 | |||
991 | if (pgid != pid) { | ||
992 | struct task_struct *p; | ||
993 | |||
994 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { | ||
995 | if (p->signal->session == current->signal->session) | ||
996 | goto ok_pgid; | ||
997 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); | ||
998 | goto out; | ||
999 | } | ||
1000 | |||
1001 | ok_pgid: | ||
1002 | err = security_task_setpgid(p, pgid); | ||
1003 | if (err) | ||
1004 | goto out; | ||
1005 | |||
1006 | if (process_group(p) != pgid) { | ||
1007 | detach_pid(p, PIDTYPE_PGID); | ||
1008 | p->signal->pgrp = pgid; | ||
1009 | attach_pid(p, PIDTYPE_PGID, pgid); | ||
1010 | } | ||
1011 | |||
1012 | err = 0; | ||
1013 | out: | ||
1014 | /* All paths lead to here, thus we are safe. -DaveM */ | ||
1015 | write_unlock_irq(&tasklist_lock); | ||
1016 | return err; | ||
1017 | } | ||
1018 | |||
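The did_exec/-EACCES rule above is exactly what a job-control shell runs into: both parent and child call setpgid() to close the race with exec(), and the parent simply tolerates EACCES if the child has already exec'd. A minimal sketch:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();

	if (pid < 0)
		return 1;
	if (pid == 0) {
		setpgid(0, 0);		/* new process group, pgid == pid */
		execlp("sleep", "sleep", "1", (char *)NULL);
		_exit(127);
	}
	/* Parent makes the same call; once the child has exec'd the
	 * kernel refuses with EACCES, which is harmless here. */
	if (setpgid(pid, pid) < 0 && errno != EACCES)
		perror("setpgid");
	waitpid(pid, NULL, 0);
	return 0;
}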
1019 | asmlinkage long sys_getpgid(pid_t pid) | ||
1020 | { | ||
1021 | if (!pid) { | ||
1022 | return process_group(current); | ||
1023 | } else { | ||
1024 | int retval; | ||
1025 | struct task_struct *p; | ||
1026 | |||
1027 | read_lock(&tasklist_lock); | ||
1028 | p = find_task_by_pid(pid); | ||
1029 | |||
1030 | retval = -ESRCH; | ||
1031 | if (p) { | ||
1032 | retval = security_task_getpgid(p); | ||
1033 | if (!retval) | ||
1034 | retval = process_group(p); | ||
1035 | } | ||
1036 | read_unlock(&tasklist_lock); | ||
1037 | return retval; | ||
1038 | } | ||
1039 | } | ||
1040 | |||
1041 | #ifdef __ARCH_WANT_SYS_GETPGRP | ||
1042 | |||
1043 | asmlinkage long sys_getpgrp(void) | ||
1044 | { | ||
1045 | /* SMP - assuming writes are word atomic this is fine */ | ||
1046 | return process_group(current); | ||
1047 | } | ||
1048 | |||
1049 | #endif | ||
1050 | |||
1051 | asmlinkage long sys_getsid(pid_t pid) | ||
1052 | { | ||
1053 | if (!pid) { | ||
1054 | return current->signal->session; | ||
1055 | } else { | ||
1056 | int retval; | ||
1057 | struct task_struct *p; | ||
1058 | |||
1059 | read_lock(&tasklist_lock); | ||
1060 | p = find_task_by_pid(pid); | ||
1061 | |||
1062 | retval = -ESRCH; | ||
1063 | if(p) { | ||
1064 | retval = security_task_getsid(p); | ||
1065 | if (!retval) | ||
1066 | retval = p->signal->session; | ||
1067 | } | ||
1068 | read_unlock(&tasklist_lock); | ||
1069 | return retval; | ||
1070 | } | ||
1071 | } | ||
1072 | |||
1073 | asmlinkage long sys_setsid(void) | ||
1074 | { | ||
1075 | struct pid *pid; | ||
1076 | int err = -EPERM; | ||
1077 | |||
1078 | if (!thread_group_leader(current)) | ||
1079 | return -EINVAL; | ||
1080 | |||
1081 | down(&tty_sem); | ||
1082 | write_lock_irq(&tasklist_lock); | ||
1083 | |||
1084 | pid = find_pid(PIDTYPE_PGID, current->pid); | ||
1085 | if (pid) | ||
1086 | goto out; | ||
1087 | |||
1088 | current->signal->leader = 1; | ||
1089 | __set_special_pids(current->pid, current->pid); | ||
1090 | current->signal->tty = NULL; | ||
1091 | current->signal->tty_old_pgrp = 0; | ||
1092 | err = process_group(current); | ||
1093 | out: | ||
1094 | write_unlock_irq(&tasklist_lock); | ||
1095 | up(&tty_sem); | ||
1096 | return err; | ||
1097 | } | ||
1098 | |||
1099 | /* | ||
1100 | * Supplementary group IDs | ||
1101 | */ | ||
1102 | |||
1103 | /* init to 2 - one for init_task, one to ensure it is never freed */ | ||
1104 | struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; | ||
1105 | |||
1106 | struct group_info *groups_alloc(int gidsetsize) | ||
1107 | { | ||
1108 | struct group_info *group_info; | ||
1109 | int nblocks; | ||
1110 | int i; | ||
1111 | |||
1112 | nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; | ||
1113 | /* Make sure we always allocate at least one indirect block pointer */ | ||
1114 | nblocks = nblocks ? : 1; | ||
1115 | group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); | ||
1116 | if (!group_info) | ||
1117 | return NULL; | ||
1118 | group_info->ngroups = gidsetsize; | ||
1119 | group_info->nblocks = nblocks; | ||
1120 | atomic_set(&group_info->usage, 1); | ||
1121 | |||
1122 | if (gidsetsize <= NGROUPS_SMALL) { | ||
1123 | group_info->blocks[0] = group_info->small_block; | ||
1124 | } else { | ||
1125 | for (i = 0; i < nblocks; i++) { | ||
1126 | gid_t *b; | ||
1127 | b = (void *)__get_free_page(GFP_USER); | ||
1128 | if (!b) | ||
1129 | goto out_undo_partial_alloc; | ||
1130 | group_info->blocks[i] = b; | ||
1131 | } | ||
1132 | } | ||
1133 | return group_info; | ||
1134 | |||
1135 | out_undo_partial_alloc: | ||
1136 | while (--i >= 0) { | ||
1137 | free_page((unsigned long)group_info->blocks[i]); | ||
1138 | } | ||
1139 | kfree(group_info); | ||
1140 | return NULL; | ||
1141 | } | ||
1142 | |||
1143 | EXPORT_SYMBOL(groups_alloc); | ||
1144 | |||
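The group list is stored as an array of page-sized blocks of gid_t (with an inline small_block for short lists), and entry i lives at blocks[i / NGROUPS_PER_BLOCK][i % NGROUPS_PER_BLOCK]. A simplified model of that lookup, with hypothetical names, just to make the indexing explicit:

typedef unsigned int xgid_t;		/* stand-in for gid_t */

struct group_info_model {
	int	ngroups;
	int	nblocks;
	xgid_t	*blocks[1];		/* really nblocks pointers */
};

/* Rough equivalent of the kernel's GROUP_AT(gi, i) for a given block size. */
static xgid_t group_at(struct group_info_model *gi, int i, int per_block)
{
	return gi->blocks[i / per_block][i % per_block];
}

Assuming NGROUPS_PER_BLOCK is PAGE_SIZE/sizeof(gid_t), a 4 KB page of 32-bit gids holds 1024 entries, so entry 1200 of a large set sits at blocks[1][176].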
1145 | void groups_free(struct group_info *group_info) | ||
1146 | { | ||
1147 | if (group_info->blocks[0] != group_info->small_block) { | ||
1148 | int i; | ||
1149 | for (i = 0; i < group_info->nblocks; i++) | ||
1150 | free_page((unsigned long)group_info->blocks[i]); | ||
1151 | } | ||
1152 | kfree(group_info); | ||
1153 | } | ||
1154 | |||
1155 | EXPORT_SYMBOL(groups_free); | ||
1156 | |||
1157 | /* export the group_info to a user-space array */ | ||
1158 | static int groups_to_user(gid_t __user *grouplist, | ||
1159 | struct group_info *group_info) | ||
1160 | { | ||
1161 | int i; | ||
1162 | int count = group_info->ngroups; | ||
1163 | |||
1164 | for (i = 0; i < group_info->nblocks; i++) { | ||
1165 | int cp_count = min(NGROUPS_PER_BLOCK, count); | ||
1166 | int off = i * NGROUPS_PER_BLOCK; | ||
1167 | int len = cp_count * sizeof(*grouplist); | ||
1168 | |||
1169 | if (copy_to_user(grouplist+off, group_info->blocks[i], len)) | ||
1170 | return -EFAULT; | ||
1171 | |||
1172 | count -= cp_count; | ||
1173 | } | ||
1174 | return 0; | ||
1175 | } | ||
1176 | |||
1177 | /* fill a group_info from a user-space array - it must be allocated already */ | ||
1178 | static int groups_from_user(struct group_info *group_info, | ||
1179 | gid_t __user *grouplist) | ||
1180 | { | ||
1181 | int i; | ||
1182 | int count = group_info->ngroups; | ||
1183 | |||
1184 | for (i = 0; i < group_info->nblocks; i++) { | ||
1185 | int cp_count = min(NGROUPS_PER_BLOCK, count); | ||
1186 | int off = i * NGROUPS_PER_BLOCK; | ||
1187 | int len = cp_count * sizeof(*grouplist); | ||
1188 | |||
1189 | if (copy_from_user(group_info->blocks[i], grouplist+off, len)) | ||
1190 | return -EFAULT; | ||
1191 | |||
1192 | count -= cp_count; | ||
1193 | } | ||
1194 | return 0; | ||
1195 | } | ||
1196 | |||
1197 | /* a simple shell-metzner sort */ | ||
1198 | static void groups_sort(struct group_info *group_info) | ||
1199 | { | ||
1200 | int base, max, stride; | ||
1201 | int gidsetsize = group_info->ngroups; | ||
1202 | |||
1203 | for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) | ||
1204 | ; /* nothing */ | ||
1205 | stride /= 3; | ||
1206 | |||
1207 | while (stride) { | ||
1208 | max = gidsetsize - stride; | ||
1209 | for (base = 0; base < max; base++) { | ||
1210 | int left = base; | ||
1211 | int right = left + stride; | ||
1212 | gid_t tmp = GROUP_AT(group_info, right); | ||
1213 | |||
1214 | while (left >= 0 && GROUP_AT(group_info, left) > tmp) { | ||
1215 | GROUP_AT(group_info, right) = | ||
1216 | GROUP_AT(group_info, left); | ||
1217 | right = left; | ||
1218 | left -= stride; | ||
1219 | } | ||
1220 | GROUP_AT(group_info, right) = tmp; | ||
1221 | } | ||
1222 | stride /= 3; | ||
1223 | } | ||
1224 | } | ||
1225 | |||
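The same Shell sort, with the same 3h+1 stride sequence, transplanted onto a plain int array purely for illustration:

#include <stdio.h>

static void shell_sort(int *a, int n)
{
	int stride, base, left, right, tmp;

	for (stride = 1; stride < n; stride = 3 * stride + 1)
		;			/* grow: 1, 4, 13, 40, ... */
	stride /= 3;			/* back off to the largest useful stride */

	while (stride) {
		for (base = 0; base < n - stride; base++) {
			left = base;
			right = left + stride;
			tmp = a[right];
			while (left >= 0 && a[left] > tmp) {
				a[right] = a[left];	/* shift up by one stride */
				right = left;
				left -= stride;
			}
			a[right] = tmp;
		}
		stride /= 3;
	}
}

int main(void)
{
	int v[] = { 42, 7, 19, 3, 3, 100, 1 };
	int i, n = sizeof(v) / sizeof(v[0]);

	shell_sort(v, n);
	for (i = 0; i < n; i++)
		printf("%d ", v[i]);
	printf("\n");
	return 0;
}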
1226 | /* a simple bsearch */ | ||
1227 | static int groups_search(struct group_info *group_info, gid_t grp) | ||
1228 | { | ||
1229 | int left, right; | ||
1230 | |||
1231 | if (!group_info) | ||
1232 | return 0; | ||
1233 | |||
1234 | left = 0; | ||
1235 | right = group_info->ngroups; | ||
1236 | while (left < right) { | ||
1237 | int mid = (left+right)/2; | ||
1238 | int cmp = grp - GROUP_AT(group_info, mid); | ||
1239 | if (cmp > 0) | ||
1240 | left = mid + 1; | ||
1241 | else if (cmp < 0) | ||
1242 | right = mid; | ||
1243 | else | ||
1244 | return 1; | ||
1245 | } | ||
1246 | return 0; | ||
1247 | } | ||
1248 | |||
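And the same binary search written against a flat sorted array (the kernel version walks the block array via GROUP_AT instead):

#include <stdio.h>

/* Returns 1 if grp is present in the sorted array, 0 otherwise. */
static int gid_search(const unsigned int *groups, int ngroups, unsigned int grp)
{
	int left = 0, right = ngroups;

	while (left < right) {
		int mid = (left + right) / 2;

		if (groups[mid] < grp)
			left = mid + 1;
		else if (groups[mid] > grp)
			right = mid;
		else
			return 1;
	}
	return 0;
}

int main(void)
{
	unsigned int g[] = { 4, 20, 24, 27, 100 };

	printf("%d %d\n", gid_search(g, 5, 24), gid_search(g, 5, 25));
	return 0;
}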
1249 | /* validate and set current->group_info */ | ||
1250 | int set_current_groups(struct group_info *group_info) | ||
1251 | { | ||
1252 | int retval; | ||
1253 | struct group_info *old_info; | ||
1254 | |||
1255 | retval = security_task_setgroups(group_info); | ||
1256 | if (retval) | ||
1257 | return retval; | ||
1258 | |||
1259 | groups_sort(group_info); | ||
1260 | get_group_info(group_info); | ||
1261 | |||
1262 | task_lock(current); | ||
1263 | old_info = current->group_info; | ||
1264 | current->group_info = group_info; | ||
1265 | task_unlock(current); | ||
1266 | |||
1267 | put_group_info(old_info); | ||
1268 | |||
1269 | return 0; | ||
1270 | } | ||
1271 | |||
1272 | EXPORT_SYMBOL(set_current_groups); | ||
1273 | |||
1274 | asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) | ||
1275 | { | ||
1276 | int i = 0; | ||
1277 | |||
1278 | /* | ||
1279 | * SMP: Nobody else can change our grouplist. Thus we are | ||
1280 | * safe. | ||
1281 | */ | ||
1282 | |||
1283 | if (gidsetsize < 0) | ||
1284 | return -EINVAL; | ||
1285 | |||
1286 | /* no need to grab task_lock here; it cannot change */ | ||
1287 | get_group_info(current->group_info); | ||
1288 | i = current->group_info->ngroups; | ||
1289 | if (gidsetsize) { | ||
1290 | if (i > gidsetsize) { | ||
1291 | i = -EINVAL; | ||
1292 | goto out; | ||
1293 | } | ||
1294 | if (groups_to_user(grouplist, current->group_info)) { | ||
1295 | i = -EFAULT; | ||
1296 | goto out; | ||
1297 | } | ||
1298 | } | ||
1299 | out: | ||
1300 | put_group_info(current->group_info); | ||
1301 | return i; | ||
1302 | } | ||
1303 | |||
1304 | /* | ||
1305 | * SMP: Our groups are copy-on-write. We can set them safely | ||
1306 | * without another task interfering. | ||
1307 | */ | ||
1308 | |||
1309 | asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) | ||
1310 | { | ||
1311 | struct group_info *group_info; | ||
1312 | int retval; | ||
1313 | |||
1314 | if (!capable(CAP_SETGID)) | ||
1315 | return -EPERM; | ||
1316 | if ((unsigned)gidsetsize > NGROUPS_MAX) | ||
1317 | return -EINVAL; | ||
1318 | |||
1319 | group_info = groups_alloc(gidsetsize); | ||
1320 | if (!group_info) | ||
1321 | return -ENOMEM; | ||
1322 | retval = groups_from_user(group_info, grouplist); | ||
1323 | if (retval) { | ||
1324 | put_group_info(group_info); | ||
1325 | return retval; | ||
1326 | } | ||
1327 | |||
1328 | retval = set_current_groups(group_info); | ||
1329 | put_group_info(group_info); | ||
1330 | |||
1331 | return retval; | ||
1332 | } | ||
1333 | |||
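From userspace the corresponding pair is getgroups()/setgroups(); dropping the supplementary list is the usual first step of privilege shedding and, matching the check above, requires CAP_SETGID. A minimal sketch:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <grp.h>

int main(void)
{
	int n = getgroups(0, NULL);	/* size 0 asks only for the count */

	printf("in %d supplementary groups\n", n);

	if (setgroups(0, NULL) < 0)	/* drop them all; needs CAP_SETGID */
		perror("setgroups");
	return 0;
}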
1334 | /* | ||
1335 | * Check whether we're fsgid/egid or in the supplemental group.. | ||
1336 | */ | ||
1337 | int in_group_p(gid_t grp) | ||
1338 | { | ||
1339 | int retval = 1; | ||
1340 | if (grp != current->fsgid) { | ||
1341 | get_group_info(current->group_info); | ||
1342 | retval = groups_search(current->group_info, grp); | ||
1343 | put_group_info(current->group_info); | ||
1344 | } | ||
1345 | return retval; | ||
1346 | } | ||
1347 | |||
1348 | EXPORT_SYMBOL(in_group_p); | ||
1349 | |||
1350 | int in_egroup_p(gid_t grp) | ||
1351 | { | ||
1352 | int retval = 1; | ||
1353 | if (grp != current->egid) { | ||
1354 | get_group_info(current->group_info); | ||
1355 | retval = groups_search(current->group_info, grp); | ||
1356 | put_group_info(current->group_info); | ||
1357 | } | ||
1358 | return retval; | ||
1359 | } | ||
1360 | |||
1361 | EXPORT_SYMBOL(in_egroup_p); | ||
1362 | |||
1363 | DECLARE_RWSEM(uts_sem); | ||
1364 | |||
1365 | EXPORT_SYMBOL(uts_sem); | ||
1366 | |||
1367 | asmlinkage long sys_newuname(struct new_utsname __user * name) | ||
1368 | { | ||
1369 | int errno = 0; | ||
1370 | |||
1371 | down_read(&uts_sem); | ||
1372 | if (copy_to_user(name,&system_utsname,sizeof *name)) | ||
1373 | errno = -EFAULT; | ||
1374 | up_read(&uts_sem); | ||
1375 | return errno; | ||
1376 | } | ||
1377 | |||
1378 | asmlinkage long sys_sethostname(char __user *name, int len) | ||
1379 | { | ||
1380 | int errno; | ||
1381 | char tmp[__NEW_UTS_LEN]; | ||
1382 | |||
1383 | if (!capable(CAP_SYS_ADMIN)) | ||
1384 | return -EPERM; | ||
1385 | if (len < 0 || len > __NEW_UTS_LEN) | ||
1386 | return -EINVAL; | ||
1387 | down_write(&uts_sem); | ||
1388 | errno = -EFAULT; | ||
1389 | if (!copy_from_user(tmp, name, len)) { | ||
1390 | memcpy(system_utsname.nodename, tmp, len); | ||
1391 | system_utsname.nodename[len] = 0; | ||
1392 | errno = 0; | ||
1393 | } | ||
1394 | up_write(&uts_sem); | ||
1395 | return errno; | ||
1396 | } | ||
1397 | |||
1398 | #ifdef __ARCH_WANT_SYS_GETHOSTNAME | ||
1399 | |||
1400 | asmlinkage long sys_gethostname(char __user *name, int len) | ||
1401 | { | ||
1402 | int i, errno; | ||
1403 | |||
1404 | if (len < 0) | ||
1405 | return -EINVAL; | ||
1406 | down_read(&uts_sem); | ||
1407 | i = 1 + strlen(system_utsname.nodename); | ||
1408 | if (i > len) | ||
1409 | i = len; | ||
1410 | errno = 0; | ||
1411 | if (copy_to_user(name, system_utsname.nodename, i)) | ||
1412 | errno = -EFAULT; | ||
1413 | up_read(&uts_sem); | ||
1414 | return errno; | ||
1415 | } | ||
1416 | |||
1417 | #endif | ||
1418 | |||
1419 | /* | ||
1420 | * Only setdomainname; getdomainname can be implemented by calling | ||
1421 | * uname() | ||
1422 | */ | ||
1423 | asmlinkage long sys_setdomainname(char __user *name, int len) | ||
1424 | { | ||
1425 | int errno; | ||
1426 | char tmp[__NEW_UTS_LEN]; | ||
1427 | |||
1428 | if (!capable(CAP_SYS_ADMIN)) | ||
1429 | return -EPERM; | ||
1430 | if (len < 0 || len > __NEW_UTS_LEN) | ||
1431 | return -EINVAL; | ||
1432 | |||
1433 | down_write(&uts_sem); | ||
1434 | errno = -EFAULT; | ||
1435 | if (!copy_from_user(tmp, name, len)) { | ||
1436 | memcpy(system_utsname.domainname, tmp, len); | ||
1437 | system_utsname.domainname[len] = 0; | ||
1438 | errno = 0; | ||
1439 | } | ||
1440 | up_write(&uts_sem); | ||
1441 | return errno; | ||
1442 | } | ||
1443 | |||
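As the comment notes, no getdomainname() syscall is needed: uname() returns the whole utsname, domain included. A sketch assuming glibc, where the domainname field is exposed under _GNU_SOURCE:

#define _GNU_SOURCE		/* for the non-POSIX domainname field */
#include <stdio.h>
#include <sys/utsname.h>

int main(void)
{
	struct utsname u;

	if (uname(&u) < 0) {
		perror("uname");
		return 1;
	}
	printf("nodename:   %s\n", u.nodename);
	printf("domainname: %s\n", u.domainname);
	return 0;
}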
1444 | asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) | ||
1445 | { | ||
1446 | if (resource >= RLIM_NLIMITS) | ||
1447 | return -EINVAL; | ||
1448 | else { | ||
1449 | struct rlimit value; | ||
1450 | task_lock(current->group_leader); | ||
1451 | value = current->signal->rlim[resource]; | ||
1452 | task_unlock(current->group_leader); | ||
1453 | return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; | ||
1454 | } | ||
1455 | } | ||
1456 | |||
1457 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT | ||
1458 | |||
1459 | /* | ||
1460 | * Back compatibility for getrlimit. Needed for some apps. | ||
1461 | */ | ||
1462 | |||
1463 | asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim) | ||
1464 | { | ||
1465 | struct rlimit x; | ||
1466 | if (resource >= RLIM_NLIMITS) | ||
1467 | return -EINVAL; | ||
1468 | |||
1469 | task_lock(current->group_leader); | ||
1470 | x = current->signal->rlim[resource]; | ||
1471 | task_unlock(current->group_leader); | ||
1472 | if(x.rlim_cur > 0x7FFFFFFF) | ||
1473 | x.rlim_cur = 0x7FFFFFFF; | ||
1474 | if(x.rlim_max > 0x7FFFFFFF) | ||
1475 | x.rlim_max = 0x7FFFFFFF; | ||
1476 | return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; | ||
1477 | } | ||
1478 | |||
1479 | #endif | ||
1480 | |||
1481 | asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | ||
1482 | { | ||
1483 | struct rlimit new_rlim, *old_rlim; | ||
1484 | int retval; | ||
1485 | |||
1486 | if (resource >= RLIM_NLIMITS) | ||
1487 | return -EINVAL; | ||
1488 | if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) | ||
1489 | return -EFAULT; | ||
1490 | if (new_rlim.rlim_cur > new_rlim.rlim_max) | ||
1491 | return -EINVAL; | ||
1492 | old_rlim = current->signal->rlim + resource; | ||
1493 | if ((new_rlim.rlim_max > old_rlim->rlim_max) && | ||
1494 | !capable(CAP_SYS_RESOURCE)) | ||
1495 | return -EPERM; | ||
1496 | if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) | ||
1497 | return -EPERM; | ||
1498 | |||
1499 | retval = security_task_setrlimit(resource, &new_rlim); | ||
1500 | if (retval) | ||
1501 | return retval; | ||
1502 | |||
1503 | task_lock(current->group_leader); | ||
1504 | *old_rlim = new_rlim; | ||
1505 | task_unlock(current->group_leader); | ||
1506 | |||
1507 | if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && | ||
1508 | (cputime_eq(current->signal->it_prof_expires, cputime_zero) || | ||
1509 | new_rlim.rlim_cur <= cputime_to_secs( | ||
1510 | current->signal->it_prof_expires))) { | ||
1511 | cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); | ||
1512 | read_lock(&tasklist_lock); | ||
1513 | spin_lock_irq(&current->sighand->siglock); | ||
1514 | set_process_cpu_timer(current, CPUCLOCK_PROF, | ||
1515 | &cputime, NULL); | ||
1516 | spin_unlock_irq(&current->sighand->siglock); | ||
1517 | read_unlock(&tasklist_lock); | ||
1518 | } | ||
1519 | |||
1520 | return 0; | ||
1521 | } | ||
1522 | |||
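The checks above (soft limit never above the hard limit, raising the hard limit needs CAP_SYS_RESOURCE) frame the usual userspace pattern: read the current pair, raise the soft limit up to the hard limit, write it back. A minimal sketch for RLIMIT_NOFILE:

#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NOFILE, &rl) < 0)
		return 1;
	/* An unprivileged process may move rlim_cur anywhere up to
	 * rlim_max; going past rlim_max would need CAP_SYS_RESOURCE. */
	rl.rlim_cur = rl.rlim_max;
	if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
		perror("setrlimit");
	printf("open files: soft=%lu hard=%lu\n",
	       (unsigned long)rl.rlim_cur, (unsigned long)rl.rlim_max);
	return 0;
}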
1523 | /* | ||
1524 | * It would make sense to put struct rusage in the task_struct, | ||
1525 | * except that would make the task_struct be *really big*. After | ||
1526 | * task_struct gets moved into malloc'ed memory, it would | ||
1527 | * make sense to do this. It will make moving the rest of the information | ||
1528 | * a lot simpler! (Which we're not doing right now because we're not | ||
1529 | * measuring them yet). | ||
1530 | * | ||
1531 | * This expects to be called with tasklist_lock read-locked or better, | ||
1532 | * and the siglock not locked. It may momentarily take the siglock. | ||
1533 | * | ||
1534 | * When sampling multiple threads for RUSAGE_SELF, under SMP we might have | ||
1535 | * races with threads incrementing their own counters. But since word | ||
1536 | * reads are atomic, we either get new values or old values and we don't | ||
1537 | * care which for the sums. We always take the siglock to protect reading | ||
1538 | * the c* fields from p->signal from races with exit.c updating those | ||
1539 | * fields when reaping, so a sample either gets all the additions of a | ||
1540 | * given child after it's reaped, or none so this sample is before reaping. | ||
1541 | */ | ||
1542 | |||
1543 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | ||
1544 | { | ||
1545 | struct task_struct *t; | ||
1546 | unsigned long flags; | ||
1547 | cputime_t utime, stime; | ||
1548 | |||
1549 | memset((char *) r, 0, sizeof *r); | ||
1550 | |||
1551 | if (unlikely(!p->signal)) | ||
1552 | return; | ||
1553 | |||
1554 | switch (who) { | ||
1555 | case RUSAGE_CHILDREN: | ||
1556 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1557 | utime = p->signal->cutime; | ||
1558 | stime = p->signal->cstime; | ||
1559 | r->ru_nvcsw = p->signal->cnvcsw; | ||
1560 | r->ru_nivcsw = p->signal->cnivcsw; | ||
1561 | r->ru_minflt = p->signal->cmin_flt; | ||
1562 | r->ru_majflt = p->signal->cmaj_flt; | ||
1563 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1564 | cputime_to_timeval(utime, &r->ru_utime); | ||
1565 | cputime_to_timeval(stime, &r->ru_stime); | ||
1566 | break; | ||
1567 | case RUSAGE_SELF: | ||
1568 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1569 | utime = stime = cputime_zero; | ||
1570 | goto sum_group; | ||
1571 | case RUSAGE_BOTH: | ||
1572 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1573 | utime = p->signal->cutime; | ||
1574 | stime = p->signal->cstime; | ||
1575 | r->ru_nvcsw = p->signal->cnvcsw; | ||
1576 | r->ru_nivcsw = p->signal->cnivcsw; | ||
1577 | r->ru_minflt = p->signal->cmin_flt; | ||
1578 | r->ru_majflt = p->signal->cmaj_flt; | ||
1579 | sum_group: | ||
1580 | utime = cputime_add(utime, p->signal->utime); | ||
1581 | stime = cputime_add(stime, p->signal->stime); | ||
1582 | r->ru_nvcsw += p->signal->nvcsw; | ||
1583 | r->ru_nivcsw += p->signal->nivcsw; | ||
1584 | r->ru_minflt += p->signal->min_flt; | ||
1585 | r->ru_majflt += p->signal->maj_flt; | ||
1586 | t = p; | ||
1587 | do { | ||
1588 | utime = cputime_add(utime, t->utime); | ||
1589 | stime = cputime_add(stime, t->stime); | ||
1590 | r->ru_nvcsw += t->nvcsw; | ||
1591 | r->ru_nivcsw += t->nivcsw; | ||
1592 | r->ru_minflt += t->min_flt; | ||
1593 | r->ru_majflt += t->maj_flt; | ||
1594 | t = next_thread(t); | ||
1595 | } while (t != p); | ||
1596 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1597 | cputime_to_timeval(utime, &r->ru_utime); | ||
1598 | cputime_to_timeval(stime, &r->ru_stime); | ||
1599 | break; | ||
1600 | default: | ||
1601 | BUG(); | ||
1602 | } | ||
1603 | } | ||
1604 | |||
1605 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | ||
1606 | { | ||
1607 | struct rusage r; | ||
1608 | read_lock(&tasklist_lock); | ||
1609 | k_getrusage(p, who, &r); | ||
1610 | read_unlock(&tasklist_lock); | ||
1611 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | ||
1612 | } | ||
1613 | |||
1614 | asmlinkage long sys_getrusage(int who, struct rusage __user *ru) | ||
1615 | { | ||
1616 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) | ||
1617 | return -EINVAL; | ||
1618 | return getrusage(current, who, ru); | ||
1619 | } | ||
1620 | |||
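The user-visible side of the accounting described above, as a minimal sketch:

#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>

int main(void)
{
	struct rusage r;

	if (getrusage(RUSAGE_SELF, &r) < 0) {
		perror("getrusage");
		return 1;
	}
	printf("user   %ld.%06lds\n",
	       (long)r.ru_utime.tv_sec, (long)r.ru_utime.tv_usec);
	printf("system %ld.%06lds\n",
	       (long)r.ru_stime.tv_sec, (long)r.ru_stime.tv_usec);
	printf("faults minor=%ld major=%ld\n", r.ru_minflt, r.ru_majflt);
	return 0;
}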
1621 | asmlinkage long sys_umask(int mask) | ||
1622 | { | ||
1623 | mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); | ||
1624 | return mask; | ||
1625 | } | ||
1626 | |||
1627 | asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | ||
1628 | unsigned long arg4, unsigned long arg5) | ||
1629 | { | ||
1630 | long error; | ||
1631 | int sig; | ||
1632 | |||
1633 | error = security_task_prctl(option, arg2, arg3, arg4, arg5); | ||
1634 | if (error) | ||
1635 | return error; | ||
1636 | |||
1637 | switch (option) { | ||
1638 | case PR_SET_PDEATHSIG: | ||
1639 | sig = arg2; | ||
1640 | if (sig < 0 || sig > _NSIG) { | ||
1641 | error = -EINVAL; | ||
1642 | break; | ||
1643 | } | ||
1644 | current->pdeath_signal = sig; | ||
1645 | break; | ||
1646 | case PR_GET_PDEATHSIG: | ||
1647 | error = put_user(current->pdeath_signal, (int __user *)arg2); | ||
1648 | break; | ||
1649 | case PR_GET_DUMPABLE: | ||
1650 | if (current->mm->dumpable) | ||
1651 | error = 1; | ||
1652 | break; | ||
1653 | case PR_SET_DUMPABLE: | ||
1654 | if (arg2 != 0 && arg2 != 1) { | ||
1655 | error = -EINVAL; | ||
1656 | break; | ||
1657 | } | ||
1658 | current->mm->dumpable = arg2; | ||
1659 | break; | ||
1660 | |||
1661 | case PR_SET_UNALIGN: | ||
1662 | error = SET_UNALIGN_CTL(current, arg2); | ||
1663 | break; | ||
1664 | case PR_GET_UNALIGN: | ||
1665 | error = GET_UNALIGN_CTL(current, arg2); | ||
1666 | break; | ||
1667 | case PR_SET_FPEMU: | ||
1668 | error = SET_FPEMU_CTL(current, arg2); | ||
1669 | break; | ||
1670 | case PR_GET_FPEMU: | ||
1671 | error = GET_FPEMU_CTL(current, arg2); | ||
1672 | break; | ||
1673 | case PR_SET_FPEXC: | ||
1674 | error = SET_FPEXC_CTL(current, arg2); | ||
1675 | break; | ||
1676 | case PR_GET_FPEXC: | ||
1677 | error = GET_FPEXC_CTL(current, arg2); | ||
1678 | break; | ||
1679 | case PR_GET_TIMING: | ||
1680 | error = PR_TIMING_STATISTICAL; | ||
1681 | break; | ||
1682 | case PR_SET_TIMING: | ||
1683 | if (arg2 == PR_TIMING_STATISTICAL) | ||
1684 | error = 0; | ||
1685 | else | ||
1686 | error = -EINVAL; | ||
1687 | break; | ||
1688 | |||
1689 | case PR_GET_KEEPCAPS: | ||
1690 | if (current->keep_capabilities) | ||
1691 | error = 1; | ||
1692 | break; | ||
1693 | case PR_SET_KEEPCAPS: | ||
1694 | if (arg2 != 0 && arg2 != 1) { | ||
1695 | error = -EINVAL; | ||
1696 | break; | ||
1697 | } | ||
1698 | current->keep_capabilities = arg2; | ||
1699 | break; | ||
1700 | case PR_SET_NAME: { | ||
1701 | struct task_struct *me = current; | ||
1702 | unsigned char ncomm[sizeof(me->comm)]; | ||
1703 | |||
1704 | ncomm[sizeof(me->comm)-1] = 0; | ||
1705 | if (strncpy_from_user(ncomm, (char __user *)arg2, | ||
1706 | sizeof(me->comm)-1) < 0) | ||
1707 | return -EFAULT; | ||
1708 | set_task_comm(me, ncomm); | ||
1709 | return 0; | ||
1710 | } | ||
1711 | case PR_GET_NAME: { | ||
1712 | struct task_struct *me = current; | ||
1713 | unsigned char tcomm[sizeof(me->comm)]; | ||
1714 | |||
1715 | get_task_comm(tcomm, me); | ||
1716 | if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) | ||
1717 | return -EFAULT; | ||
1718 | return 0; | ||
1719 | } | ||
1720 | default: | ||
1721 | error = -EINVAL; | ||
1722 | break; | ||
1723 | } | ||
1724 | return error; | ||
1725 | } | ||
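A couple of the options handled above are easy to demonstrate from userspace. The sketch below (the name "worker" is just an example) sets the task's comm via PR_SET_NAME, reads it back with PR_GET_NAME into a buffer sized for the 16-byte comm field, and requests a parent-death signal with PR_SET_PDEATHSIG.

#include <stdio.h>
#include <signal.h>
#include <sys/prctl.h>

int main(void)
{
	char name[16];		/* matches sizeof(task_struct.comm) */

	prctl(PR_SET_NAME, "worker", 0, 0, 0);
	prctl(PR_GET_NAME, name, 0, 0, 0);
	printf("comm: %s\n", name);

	/* Deliver SIGTERM to this process when its parent exits. */
	prctl(PR_SET_PDEATHSIG, SIGTERM, 0, 0, 0);
	return 0;
}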
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c new file mode 100644 index 000000000000..1802a311dd3f --- /dev/null +++ b/kernel/sys_ni.c | |||
@@ -0,0 +1,86 @@ | |||
1 | |||
2 | #include <linux/linkage.h> | ||
3 | #include <linux/errno.h> | ||
4 | |||
5 | #include <asm/unistd.h> | ||
6 | |||
7 | /* | ||
8 | * Non-implemented system calls get redirected here. | ||
9 | */ | ||
10 | asmlinkage long sys_ni_syscall(void) | ||
11 | { | ||
12 | return -ENOSYS; | ||
13 | } | ||
14 | |||
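cond_syscall() makes each of the entries below a weak alias for sys_ni_syscall, so the syscall table always links even when the real implementation is configured out. A hedged, userspace-compilable sketch of the weak-alias idea (illustrative names only, not the kernel's actual macro text):

#include <errno.h>
#include <stdio.h>

long ni_syscall_demo(void)
{
	return -ENOSYS;
}

/* If nothing else defines this symbol, the weak alias stands in. */
long quotactl_demo(void) __attribute__((weak, alias("ni_syscall_demo")));

int main(void)
{
	printf("quotactl_demo() = %ld\n", quotactl_demo());
	return 0;
}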
15 | cond_syscall(sys_nfsservctl); | ||
16 | cond_syscall(sys_quotactl); | ||
17 | cond_syscall(sys_acct); | ||
18 | cond_syscall(sys_lookup_dcookie); | ||
19 | cond_syscall(sys_swapon); | ||
20 | cond_syscall(sys_swapoff); | ||
21 | cond_syscall(sys_init_module); | ||
22 | cond_syscall(sys_delete_module); | ||
23 | cond_syscall(sys_socketpair); | ||
24 | cond_syscall(sys_bind); | ||
25 | cond_syscall(sys_listen); | ||
26 | cond_syscall(sys_accept); | ||
27 | cond_syscall(sys_connect); | ||
28 | cond_syscall(sys_getsockname); | ||
29 | cond_syscall(sys_getpeername); | ||
30 | cond_syscall(sys_sendto); | ||
31 | cond_syscall(sys_send); | ||
32 | cond_syscall(sys_recvfrom); | ||
33 | cond_syscall(sys_recv); | ||
34 | cond_syscall(sys_socket); | ||
35 | cond_syscall(sys_setsockopt); | ||
36 | cond_syscall(sys_getsockopt); | ||
37 | cond_syscall(sys_shutdown); | ||
38 | cond_syscall(sys_sendmsg); | ||
39 | cond_syscall(sys_recvmsg); | ||
40 | cond_syscall(sys_socketcall); | ||
41 | cond_syscall(sys_futex); | ||
42 | cond_syscall(compat_sys_futex); | ||
43 | cond_syscall(sys_epoll_create); | ||
44 | cond_syscall(sys_epoll_ctl); | ||
45 | cond_syscall(sys_epoll_wait); | ||
46 | cond_syscall(sys_semget); | ||
47 | cond_syscall(sys_semop); | ||
48 | cond_syscall(sys_semtimedop); | ||
49 | cond_syscall(sys_semctl); | ||
50 | cond_syscall(sys_msgget); | ||
51 | cond_syscall(sys_msgsnd); | ||
52 | cond_syscall(sys_msgrcv); | ||
53 | cond_syscall(sys_msgctl); | ||
54 | cond_syscall(sys_shmget); | ||
55 | cond_syscall(sys_shmdt); | ||
56 | cond_syscall(sys_shmctl); | ||
57 | cond_syscall(sys_mq_open); | ||
58 | cond_syscall(sys_mq_unlink); | ||
59 | cond_syscall(sys_mq_timedsend); | ||
60 | cond_syscall(sys_mq_timedreceive); | ||
61 | cond_syscall(sys_mq_notify); | ||
62 | cond_syscall(sys_mq_getsetattr); | ||
63 | cond_syscall(compat_sys_mq_open); | ||
64 | cond_syscall(compat_sys_mq_timedsend); | ||
65 | cond_syscall(compat_sys_mq_timedreceive); | ||
66 | cond_syscall(compat_sys_mq_notify); | ||
67 | cond_syscall(compat_sys_mq_getsetattr); | ||
68 | cond_syscall(sys_mbind); | ||
69 | cond_syscall(sys_get_mempolicy); | ||
70 | cond_syscall(sys_set_mempolicy); | ||
71 | cond_syscall(compat_sys_mbind); | ||
72 | cond_syscall(compat_sys_get_mempolicy); | ||
73 | cond_syscall(compat_sys_set_mempolicy); | ||
74 | cond_syscall(sys_add_key); | ||
75 | cond_syscall(sys_request_key); | ||
76 | cond_syscall(sys_keyctl); | ||
77 | cond_syscall(compat_sys_keyctl); | ||
78 | cond_syscall(compat_sys_socketcall); | ||
79 | |||
80 | /* arch-specific weak syscall entries */ | ||
81 | cond_syscall(sys_pciconfig_read); | ||
82 | cond_syscall(sys_pciconfig_write); | ||
83 | cond_syscall(sys_pciconfig_iobase); | ||
84 | cond_syscall(sys32_ipc); | ||
85 | cond_syscall(sys32_sysctl); | ||
86 | cond_syscall(ppc_rtas); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c new file mode 100644 index 000000000000..79dbd93bd697 --- /dev/null +++ b/kernel/sysctl.c | |||
@@ -0,0 +1,2337 @@ | |||
1 | /* | ||
2 | * sysctl.c: General linux system control interface | ||
3 | * | ||
4 | * Begun 24 March 1995, Stephen Tweedie | ||
5 | * Added /proc support, Dec 1995 | ||
6 | * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. | ||
7 | * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. | ||
8 | * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. | ||
9 | * Dynamic registration fixes, Stephen Tweedie. | ||
10 | * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. | ||
11 | * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris | ||
12 | * Horn. | ||
13 | * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. | ||
14 | * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. | ||
15 | * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill | ||
16 | * Wendling. | ||
17 | * The list_for_each() macro wasn't appropriate for the sysctl loop. | ||
18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling | ||
19 | */ | ||
20 | |||
21 | #include <linux/config.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/swap.h> | ||
25 | #include <linux/slab.h> | ||
26 | #include <linux/sysctl.h> | ||
27 | #include <linux/proc_fs.h> | ||
28 | #include <linux/ctype.h> | ||
29 | #include <linux/utsname.h> | ||
30 | #include <linux/capability.h> | ||
31 | #include <linux/smp_lock.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/sysrq.h> | ||
35 | #include <linux/highuid.h> | ||
36 | #include <linux/writeback.h> | ||
37 | #include <linux/hugetlb.h> | ||
38 | #include <linux/security.h> | ||
39 | #include <linux/initrd.h> | ||
40 | #include <linux/times.h> | ||
41 | #include <linux/limits.h> | ||
42 | #include <linux/dcache.h> | ||
43 | #include <linux/syscalls.h> | ||
44 | |||
45 | #include <asm/uaccess.h> | ||
46 | #include <asm/processor.h> | ||
47 | |||
48 | #ifdef CONFIG_ROOT_NFS | ||
49 | #include <linux/nfs_fs.h> | ||
50 | #endif | ||
51 | |||
52 | #if defined(CONFIG_SYSCTL) | ||
53 | |||
54 | /* External variables not in a header file. */ | ||
55 | extern int C_A_D; | ||
56 | extern int sysctl_overcommit_memory; | ||
57 | extern int sysctl_overcommit_ratio; | ||
58 | extern int max_threads; | ||
59 | extern int sysrq_enabled; | ||
60 | extern int core_uses_pid; | ||
61 | extern char core_pattern[]; | ||
62 | extern int cad_pid; | ||
63 | extern int pid_max; | ||
64 | extern int min_free_kbytes; | ||
65 | extern int printk_ratelimit_jiffies; | ||
66 | extern int printk_ratelimit_burst; | ||
67 | extern int pid_max_min, pid_max_max; | ||
68 | |||
69 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
70 | int unknown_nmi_panic; | ||
71 | extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, | ||
72 | void __user *, size_t *, loff_t *); | ||
73 | #endif | ||
74 | |||
75 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | ||
76 | static int maxolduid = 65535; | ||
77 | static int minolduid; | ||
78 | |||
79 | static int ngroups_max = NGROUPS_MAX; | ||
80 | |||
81 | #ifdef CONFIG_KMOD | ||
82 | extern char modprobe_path[]; | ||
83 | #endif | ||
84 | #ifdef CONFIG_HOTPLUG | ||
85 | extern char hotplug_path[]; | ||
86 | #endif | ||
87 | #ifdef CONFIG_CHR_DEV_SG | ||
88 | extern int sg_big_buff; | ||
89 | #endif | ||
90 | #ifdef CONFIG_SYSVIPC | ||
91 | extern size_t shm_ctlmax; | ||
92 | extern size_t shm_ctlall; | ||
93 | extern int shm_ctlmni; | ||
94 | extern int msg_ctlmax; | ||
95 | extern int msg_ctlmnb; | ||
96 | extern int msg_ctlmni; | ||
97 | extern int sem_ctls[]; | ||
98 | #endif | ||
99 | |||
100 | #ifdef __sparc__ | ||
101 | extern char reboot_command []; | ||
102 | extern int stop_a_enabled; | ||
103 | extern int scons_pwroff; | ||
104 | #endif | ||
105 | |||
106 | #ifdef __hppa__ | ||
107 | extern int pwrsw_enabled; | ||
108 | extern int unaligned_enabled; | ||
109 | #endif | ||
110 | |||
111 | #ifdef CONFIG_ARCH_S390 | ||
112 | #ifdef CONFIG_MATHEMU | ||
113 | extern int sysctl_ieee_emulation_warnings; | ||
114 | #endif | ||
115 | extern int sysctl_userprocess_debug; | ||
116 | #endif | ||
117 | |||
118 | extern int sysctl_hz_timer; | ||
119 | |||
120 | #ifdef CONFIG_BSD_PROCESS_ACCT | ||
121 | extern int acct_parm[]; | ||
122 | #endif | ||
123 | |||
124 | int randomize_va_space = 1; | ||
125 | |||
126 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, | ||
127 | ctl_table *, void **); | ||
128 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, | ||
129 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
130 | |||
131 | static ctl_table root_table[]; | ||
132 | static struct ctl_table_header root_table_header = | ||
133 | { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; | ||
134 | |||
135 | static ctl_table kern_table[]; | ||
136 | static ctl_table vm_table[]; | ||
137 | #ifdef CONFIG_NET | ||
138 | extern ctl_table net_table[]; | ||
139 | #endif | ||
140 | static ctl_table proc_table[]; | ||
141 | static ctl_table fs_table[]; | ||
142 | static ctl_table debug_table[]; | ||
143 | static ctl_table dev_table[]; | ||
144 | extern ctl_table random_table[]; | ||
145 | #ifdef CONFIG_UNIX98_PTYS | ||
146 | extern ctl_table pty_table[]; | ||
147 | #endif | ||
148 | |||
149 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | ||
150 | int sysctl_legacy_va_layout; | ||
151 | #endif | ||
152 | |||
153 | /* /proc declarations: */ | ||
154 | |||
155 | #ifdef CONFIG_PROC_FS | ||
156 | |||
157 | static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); | ||
158 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); | ||
159 | static int proc_opensys(struct inode *, struct file *); | ||
160 | |||
161 | struct file_operations proc_sys_file_operations = { | ||
162 | .open = proc_opensys, | ||
163 | .read = proc_readsys, | ||
164 | .write = proc_writesys, | ||
165 | }; | ||
166 | |||
167 | extern struct proc_dir_entry *proc_sys_root; | ||
168 | |||
169 | static void register_proc_table(ctl_table *, struct proc_dir_entry *); | ||
170 | static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); | ||
171 | #endif | ||
172 | |||
173 | /* The default sysctl tables: */ | ||
174 | |||
175 | static ctl_table root_table[] = { | ||
176 | { | ||
177 | .ctl_name = CTL_KERN, | ||
178 | .procname = "kernel", | ||
179 | .mode = 0555, | ||
180 | .child = kern_table, | ||
181 | }, | ||
182 | { | ||
183 | .ctl_name = CTL_VM, | ||
184 | .procname = "vm", | ||
185 | .mode = 0555, | ||
186 | .child = vm_table, | ||
187 | }, | ||
188 | #ifdef CONFIG_NET | ||
189 | { | ||
190 | .ctl_name = CTL_NET, | ||
191 | .procname = "net", | ||
192 | .mode = 0555, | ||
193 | .child = net_table, | ||
194 | }, | ||
195 | #endif | ||
196 | { | ||
197 | .ctl_name = CTL_PROC, | ||
198 | .procname = "proc", | ||
199 | .mode = 0555, | ||
200 | .child = proc_table, | ||
201 | }, | ||
202 | { | ||
203 | .ctl_name = CTL_FS, | ||
204 | .procname = "fs", | ||
205 | .mode = 0555, | ||
206 | .child = fs_table, | ||
207 | }, | ||
208 | { | ||
209 | .ctl_name = CTL_DEBUG, | ||
210 | .procname = "debug", | ||
211 | .mode = 0555, | ||
212 | .child = debug_table, | ||
213 | }, | ||
214 | { | ||
215 | .ctl_name = CTL_DEV, | ||
216 | .procname = "dev", | ||
217 | .mode = 0555, | ||
218 | .child = dev_table, | ||
219 | }, | ||
220 | { .ctl_name = 0 } | ||
221 | }; | ||
222 | |||
223 | static ctl_table kern_table[] = { | ||
224 | { | ||
225 | .ctl_name = KERN_OSTYPE, | ||
226 | .procname = "ostype", | ||
227 | .data = system_utsname.sysname, | ||
228 | .maxlen = sizeof(system_utsname.sysname), | ||
229 | .mode = 0444, | ||
230 | .proc_handler = &proc_doutsstring, | ||
231 | .strategy = &sysctl_string, | ||
232 | }, | ||
233 | { | ||
234 | .ctl_name = KERN_OSRELEASE, | ||
235 | .procname = "osrelease", | ||
236 | .data = system_utsname.release, | ||
237 | .maxlen = sizeof(system_utsname.release), | ||
238 | .mode = 0444, | ||
239 | .proc_handler = &proc_doutsstring, | ||
240 | .strategy = &sysctl_string, | ||
241 | }, | ||
242 | { | ||
243 | .ctl_name = KERN_VERSION, | ||
244 | .procname = "version", | ||
245 | .data = system_utsname.version, | ||
246 | .maxlen = sizeof(system_utsname.version), | ||
247 | .mode = 0444, | ||
248 | .proc_handler = &proc_doutsstring, | ||
249 | .strategy = &sysctl_string, | ||
250 | }, | ||
251 | { | ||
252 | .ctl_name = KERN_NODENAME, | ||
253 | .procname = "hostname", | ||
254 | .data = system_utsname.nodename, | ||
255 | .maxlen = sizeof(system_utsname.nodename), | ||
256 | .mode = 0644, | ||
257 | .proc_handler = &proc_doutsstring, | ||
258 | .strategy = &sysctl_string, | ||
259 | }, | ||
260 | { | ||
261 | .ctl_name = KERN_DOMAINNAME, | ||
262 | .procname = "domainname", | ||
263 | .data = system_utsname.domainname, | ||
264 | .maxlen = sizeof(system_utsname.domainname), | ||
265 | .mode = 0644, | ||
266 | .proc_handler = &proc_doutsstring, | ||
267 | .strategy = &sysctl_string, | ||
268 | }, | ||
269 | { | ||
270 | .ctl_name = KERN_PANIC, | ||
271 | .procname = "panic", | ||
272 | .data = &panic_timeout, | ||
273 | .maxlen = sizeof(int), | ||
274 | .mode = 0644, | ||
275 | .proc_handler = &proc_dointvec, | ||
276 | }, | ||
277 | { | ||
278 | .ctl_name = KERN_CORE_USES_PID, | ||
279 | .procname = "core_uses_pid", | ||
280 | .data = &core_uses_pid, | ||
281 | .maxlen = sizeof(int), | ||
282 | .mode = 0644, | ||
283 | .proc_handler = &proc_dointvec, | ||
284 | }, | ||
285 | { | ||
286 | .ctl_name = KERN_CORE_PATTERN, | ||
287 | .procname = "core_pattern", | ||
288 | .data = core_pattern, | ||
289 | .maxlen = 64, | ||
290 | .mode = 0644, | ||
291 | .proc_handler = &proc_dostring, | ||
292 | .strategy = &sysctl_string, | ||
293 | }, | ||
294 | { | ||
295 | .ctl_name = KERN_TAINTED, | ||
296 | .procname = "tainted", | ||
297 | .data = &tainted, | ||
298 | .maxlen = sizeof(int), | ||
299 | .mode = 0444, | ||
300 | .proc_handler = &proc_dointvec, | ||
301 | }, | ||
302 | { | ||
303 | .ctl_name = KERN_CAP_BSET, | ||
304 | .procname = "cap-bound", | ||
305 | .data = &cap_bset, | ||
306 | .maxlen = sizeof(kernel_cap_t), | ||
307 | .mode = 0600, | ||
308 | .proc_handler = &proc_dointvec_bset, | ||
309 | }, | ||
310 | #ifdef CONFIG_BLK_DEV_INITRD | ||
311 | { | ||
312 | .ctl_name = KERN_REALROOTDEV, | ||
313 | .procname = "real-root-dev", | ||
314 | .data = &real_root_dev, | ||
315 | .maxlen = sizeof(int), | ||
316 | .mode = 0644, | ||
317 | .proc_handler = &proc_dointvec, | ||
318 | }, | ||
319 | #endif | ||
320 | #ifdef __sparc__ | ||
321 | { | ||
322 | .ctl_name = KERN_SPARC_REBOOT, | ||
323 | .procname = "reboot-cmd", | ||
324 | .data = reboot_command, | ||
325 | .maxlen = 256, | ||
326 | .mode = 0644, | ||
327 | .proc_handler = &proc_dostring, | ||
328 | .strategy = &sysctl_string, | ||
329 | }, | ||
330 | { | ||
331 | .ctl_name = KERN_SPARC_STOP_A, | ||
332 | .procname = "stop-a", | ||
333 | .data = &stop_a_enabled, | ||
334 | .maxlen = sizeof (int), | ||
335 | .mode = 0644, | ||
336 | .proc_handler = &proc_dointvec, | ||
337 | }, | ||
338 | { | ||
339 | .ctl_name = KERN_SPARC_SCONS_PWROFF, | ||
340 | .procname = "scons-poweroff", | ||
341 | .data = &scons_pwroff, | ||
342 | .maxlen = sizeof (int), | ||
343 | .mode = 0644, | ||
344 | .proc_handler = &proc_dointvec, | ||
345 | }, | ||
346 | #endif | ||
347 | #ifdef __hppa__ | ||
348 | { | ||
349 | .ctl_name = KERN_HPPA_PWRSW, | ||
350 | .procname = "soft-power", | ||
351 | .data = &pwrsw_enabled, | ||
352 | .maxlen = sizeof (int), | ||
353 | .mode = 0644, | ||
354 | .proc_handler = &proc_dointvec, | ||
355 | }, | ||
356 | { | ||
357 | .ctl_name = KERN_HPPA_UNALIGNED, | ||
358 | .procname = "unaligned-trap", | ||
359 | .data = &unaligned_enabled, | ||
360 | .maxlen = sizeof (int), | ||
361 | .mode = 0644, | ||
362 | .proc_handler = &proc_dointvec, | ||
363 | }, | ||
364 | #endif | ||
365 | { | ||
366 | .ctl_name = KERN_CTLALTDEL, | ||
367 | .procname = "ctrl-alt-del", | ||
368 | .data = &C_A_D, | ||
369 | .maxlen = sizeof(int), | ||
370 | .mode = 0644, | ||
371 | .proc_handler = &proc_dointvec, | ||
372 | }, | ||
373 | { | ||
374 | .ctl_name = KERN_PRINTK, | ||
375 | .procname = "printk", | ||
376 | .data = &console_loglevel, | ||
377 | .maxlen = 4*sizeof(int), | ||
378 | .mode = 0644, | ||
379 | .proc_handler = &proc_dointvec, | ||
380 | }, | ||
381 | #ifdef CONFIG_KMOD | ||
382 | { | ||
383 | .ctl_name = KERN_MODPROBE, | ||
384 | .procname = "modprobe", | ||
385 | .data = &modprobe_path, | ||
386 | .maxlen = KMOD_PATH_LEN, | ||
387 | .mode = 0644, | ||
388 | .proc_handler = &proc_dostring, | ||
389 | .strategy = &sysctl_string, | ||
390 | }, | ||
391 | #endif | ||
392 | #ifdef CONFIG_HOTPLUG | ||
393 | { | ||
394 | .ctl_name = KERN_HOTPLUG, | ||
395 | .procname = "hotplug", | ||
396 | .data = &hotplug_path, | ||
397 | .maxlen = HOTPLUG_PATH_LEN, | ||
398 | .mode = 0644, | ||
399 | .proc_handler = &proc_dostring, | ||
400 | .strategy = &sysctl_string, | ||
401 | }, | ||
402 | #endif | ||
403 | #ifdef CONFIG_CHR_DEV_SG | ||
404 | { | ||
405 | .ctl_name = KERN_SG_BIG_BUFF, | ||
406 | .procname = "sg-big-buff", | ||
407 | .data = &sg_big_buff, | ||
408 | .maxlen = sizeof (int), | ||
409 | .mode = 0444, | ||
410 | .proc_handler = &proc_dointvec, | ||
411 | }, | ||
412 | #endif | ||
413 | #ifdef CONFIG_BSD_PROCESS_ACCT | ||
414 | { | ||
415 | .ctl_name = KERN_ACCT, | ||
416 | .procname = "acct", | ||
417 | .data = &acct_parm, | ||
418 | .maxlen = 3*sizeof(int), | ||
419 | .mode = 0644, | ||
420 | .proc_handler = &proc_dointvec, | ||
421 | }, | ||
422 | #endif | ||
423 | #ifdef CONFIG_SYSVIPC | ||
424 | { | ||
425 | .ctl_name = KERN_SHMMAX, | ||
426 | .procname = "shmmax", | ||
427 | .data = &shm_ctlmax, | ||
428 | .maxlen = sizeof (size_t), | ||
429 | .mode = 0644, | ||
430 | .proc_handler = &proc_doulongvec_minmax, | ||
431 | }, | ||
432 | { | ||
433 | .ctl_name = KERN_SHMALL, | ||
434 | .procname = "shmall", | ||
435 | .data = &shm_ctlall, | ||
436 | .maxlen = sizeof (size_t), | ||
437 | .mode = 0644, | ||
438 | .proc_handler = &proc_doulongvec_minmax, | ||
439 | }, | ||
440 | { | ||
441 | .ctl_name = KERN_SHMMNI, | ||
442 | .procname = "shmmni", | ||
443 | .data = &shm_ctlmni, | ||
444 | .maxlen = sizeof (int), | ||
445 | .mode = 0644, | ||
446 | .proc_handler = &proc_dointvec, | ||
447 | }, | ||
448 | { | ||
449 | .ctl_name = KERN_MSGMAX, | ||
450 | .procname = "msgmax", | ||
451 | .data = &msg_ctlmax, | ||
452 | .maxlen = sizeof (int), | ||
453 | .mode = 0644, | ||
454 | .proc_handler = &proc_dointvec, | ||
455 | }, | ||
456 | { | ||
457 | .ctl_name = KERN_MSGMNI, | ||
458 | .procname = "msgmni", | ||
459 | .data = &msg_ctlmni, | ||
460 | .maxlen = sizeof (int), | ||
461 | .mode = 0644, | ||
462 | .proc_handler = &proc_dointvec, | ||
463 | }, | ||
464 | { | ||
465 | .ctl_name = KERN_MSGMNB, | ||
466 | .procname = "msgmnb", | ||
467 | .data = &msg_ctlmnb, | ||
468 | .maxlen = sizeof (int), | ||
469 | .mode = 0644, | ||
470 | .proc_handler = &proc_dointvec, | ||
471 | }, | ||
472 | { | ||
473 | .ctl_name = KERN_SEM, | ||
474 | .procname = "sem", | ||
475 | .data = &sem_ctls, | ||
476 | .maxlen = 4*sizeof (int), | ||
477 | .mode = 0644, | ||
478 | .proc_handler = &proc_dointvec, | ||
479 | }, | ||
480 | #endif | ||
481 | #ifdef CONFIG_MAGIC_SYSRQ | ||
482 | { | ||
483 | .ctl_name = KERN_SYSRQ, | ||
484 | .procname = "sysrq", | ||
485 | .data = &sysrq_enabled, | ||
486 | .maxlen = sizeof (int), | ||
487 | .mode = 0644, | ||
488 | .proc_handler = &proc_dointvec, | ||
489 | }, | ||
490 | #endif | ||
491 | { | ||
492 | .ctl_name = KERN_CADPID, | ||
493 | .procname = "cad_pid", | ||
494 | .data = &cad_pid, | ||
495 | .maxlen = sizeof (int), | ||
496 | .mode = 0600, | ||
497 | .proc_handler = &proc_dointvec, | ||
498 | }, | ||
499 | { | ||
500 | .ctl_name = KERN_MAX_THREADS, | ||
501 | .procname = "threads-max", | ||
502 | .data = &max_threads, | ||
503 | .maxlen = sizeof(int), | ||
504 | .mode = 0644, | ||
505 | .proc_handler = &proc_dointvec, | ||
506 | }, | ||
507 | { | ||
508 | .ctl_name = KERN_RANDOM, | ||
509 | .procname = "random", | ||
510 | .mode = 0555, | ||
511 | .child = random_table, | ||
512 | }, | ||
513 | #ifdef CONFIG_UNIX98_PTYS | ||
514 | { | ||
515 | .ctl_name = KERN_PTY, | ||
516 | .procname = "pty", | ||
517 | .mode = 0555, | ||
518 | .child = pty_table, | ||
519 | }, | ||
520 | #endif | ||
521 | { | ||
522 | .ctl_name = KERN_OVERFLOWUID, | ||
523 | .procname = "overflowuid", | ||
524 | .data = &overflowuid, | ||
525 | .maxlen = sizeof(int), | ||
526 | .mode = 0644, | ||
527 | .proc_handler = &proc_dointvec_minmax, | ||
528 | .strategy = &sysctl_intvec, | ||
529 | .extra1 = &minolduid, | ||
530 | .extra2 = &maxolduid, | ||
531 | }, | ||
532 | { | ||
533 | .ctl_name = KERN_OVERFLOWGID, | ||
534 | .procname = "overflowgid", | ||
535 | .data = &overflowgid, | ||
536 | .maxlen = sizeof(int), | ||
537 | .mode = 0644, | ||
538 | .proc_handler = &proc_dointvec_minmax, | ||
539 | .strategy = &sysctl_intvec, | ||
540 | .extra1 = &minolduid, | ||
541 | .extra2 = &maxolduid, | ||
542 | }, | ||
543 | #ifdef CONFIG_ARCH_S390 | ||
544 | #ifdef CONFIG_MATHEMU | ||
545 | { | ||
546 | .ctl_name = KERN_IEEE_EMULATION_WARNINGS, | ||
547 | .procname = "ieee_emulation_warnings", | ||
548 | .data = &sysctl_ieee_emulation_warnings, | ||
549 | .maxlen = sizeof(int), | ||
550 | .mode = 0644, | ||
551 | .proc_handler = &proc_dointvec, | ||
552 | }, | ||
553 | #endif | ||
554 | #ifdef CONFIG_NO_IDLE_HZ | ||
555 | { | ||
556 | .ctl_name = KERN_HZ_TIMER, | ||
557 | .procname = "hz_timer", | ||
558 | .data = &sysctl_hz_timer, | ||
559 | .maxlen = sizeof(int), | ||
560 | .mode = 0644, | ||
561 | .proc_handler = &proc_dointvec, | ||
562 | }, | ||
563 | #endif | ||
564 | { | ||
565 | .ctl_name = KERN_S390_USER_DEBUG_LOGGING, | ||
566 | .procname = "userprocess_debug", | ||
567 | .data = &sysctl_userprocess_debug, | ||
568 | .maxlen = sizeof(int), | ||
569 | .mode = 0644, | ||
570 | .proc_handler = &proc_dointvec, | ||
571 | }, | ||
572 | #endif | ||
573 | { | ||
574 | .ctl_name = KERN_PIDMAX, | ||
575 | .procname = "pid_max", | ||
576 | .data = &pid_max, | ||
577 | .maxlen = sizeof (int), | ||
578 | .mode = 0644, | ||
579 | .proc_handler = &proc_dointvec_minmax, | ||
580 | .strategy = sysctl_intvec, | ||
581 | .extra1 = &pid_max_min, | ||
582 | .extra2 = &pid_max_max, | ||
583 | }, | ||
584 | { | ||
585 | .ctl_name = KERN_PANIC_ON_OOPS, | ||
586 | .procname = "panic_on_oops", | ||
587 | .data = &panic_on_oops, | ||
588 | .maxlen = sizeof(int), | ||
589 | .mode = 0644, | ||
590 | .proc_handler = &proc_dointvec, | ||
591 | }, | ||
592 | { | ||
593 | .ctl_name = KERN_PRINTK_RATELIMIT, | ||
594 | .procname = "printk_ratelimit", | ||
595 | .data = &printk_ratelimit_jiffies, | ||
596 | .maxlen = sizeof(int), | ||
597 | .mode = 0644, | ||
598 | .proc_handler = &proc_dointvec_jiffies, | ||
599 | .strategy = &sysctl_jiffies, | ||
600 | }, | ||
601 | { | ||
602 | .ctl_name = KERN_PRINTK_RATELIMIT_BURST, | ||
603 | .procname = "printk_ratelimit_burst", | ||
604 | .data = &printk_ratelimit_burst, | ||
605 | .maxlen = sizeof(int), | ||
606 | .mode = 0644, | ||
607 | .proc_handler = &proc_dointvec, | ||
608 | }, | ||
609 | { | ||
610 | .ctl_name = KERN_NGROUPS_MAX, | ||
611 | .procname = "ngroups_max", | ||
612 | .data = &ngroups_max, | ||
613 | .maxlen = sizeof (int), | ||
614 | .mode = 0444, | ||
615 | .proc_handler = &proc_dointvec, | ||
616 | }, | ||
617 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
618 | { | ||
619 | .ctl_name = KERN_UNKNOWN_NMI_PANIC, | ||
620 | .procname = "unknown_nmi_panic", | ||
621 | .data = &unknown_nmi_panic, | ||
622 | .maxlen = sizeof (int), | ||
623 | .mode = 0644, | ||
624 | .proc_handler = &proc_unknown_nmi_panic, | ||
625 | }, | ||
626 | #endif | ||
627 | #if defined(CONFIG_X86) | ||
628 | { | ||
629 | .ctl_name = KERN_BOOTLOADER_TYPE, | ||
630 | .procname = "bootloader_type", | ||
631 | .data = &bootloader_type, | ||
632 | .maxlen = sizeof (int), | ||
633 | .mode = 0444, | ||
634 | .proc_handler = &proc_dointvec, | ||
635 | }, | ||
636 | #endif | ||
637 | { | ||
638 | .ctl_name = KERN_RANDOMIZE, | ||
639 | .procname = "randomize_va_space", | ||
640 | .data = &randomize_va_space, | ||
641 | .maxlen = sizeof(int), | ||
642 | .mode = 0644, | ||
643 | .proc_handler = &proc_dointvec, | ||
644 | }, | ||
645 | |||
646 | { .ctl_name = 0 } | ||
647 | }; | ||
648 | |||
649 | /* Constants for minimum and maximum testing in vm_table. | ||
650 | We use these as one-element integer vectors. */ | ||
651 | static int zero; | ||
652 | static int one_hundred = 100; | ||
653 | |||
654 | |||
655 | static ctl_table vm_table[] = { | ||
656 | { | ||
657 | .ctl_name = VM_OVERCOMMIT_MEMORY, | ||
658 | .procname = "overcommit_memory", | ||
659 | .data = &sysctl_overcommit_memory, | ||
660 | .maxlen = sizeof(sysctl_overcommit_memory), | ||
661 | .mode = 0644, | ||
662 | .proc_handler = &proc_dointvec, | ||
663 | }, | ||
664 | { | ||
665 | .ctl_name = VM_OVERCOMMIT_RATIO, | ||
666 | .procname = "overcommit_ratio", | ||
667 | .data = &sysctl_overcommit_ratio, | ||
668 | .maxlen = sizeof(sysctl_overcommit_ratio), | ||
669 | .mode = 0644, | ||
670 | .proc_handler = &proc_dointvec, | ||
671 | }, | ||
672 | { | ||
673 | .ctl_name = VM_PAGE_CLUSTER, | ||
674 | .procname = "page-cluster", | ||
675 | .data = &page_cluster, | ||
676 | .maxlen = sizeof(int), | ||
677 | .mode = 0644, | ||
678 | .proc_handler = &proc_dointvec, | ||
679 | }, | ||
680 | { | ||
681 | .ctl_name = VM_DIRTY_BACKGROUND, | ||
682 | .procname = "dirty_background_ratio", | ||
683 | .data = &dirty_background_ratio, | ||
684 | .maxlen = sizeof(dirty_background_ratio), | ||
685 | .mode = 0644, | ||
686 | .proc_handler = &proc_dointvec_minmax, | ||
687 | .strategy = &sysctl_intvec, | ||
688 | .extra1 = &zero, | ||
689 | .extra2 = &one_hundred, | ||
690 | }, | ||
691 | { | ||
692 | .ctl_name = VM_DIRTY_RATIO, | ||
693 | .procname = "dirty_ratio", | ||
694 | .data = &vm_dirty_ratio, | ||
695 | .maxlen = sizeof(vm_dirty_ratio), | ||
696 | .mode = 0644, | ||
697 | .proc_handler = &proc_dointvec_minmax, | ||
698 | .strategy = &sysctl_intvec, | ||
699 | .extra1 = &zero, | ||
700 | .extra2 = &one_hundred, | ||
701 | }, | ||
702 | { | ||
703 | .ctl_name = VM_DIRTY_WB_CS, | ||
704 | .procname = "dirty_writeback_centisecs", | ||
705 | .data = &dirty_writeback_centisecs, | ||
706 | .maxlen = sizeof(dirty_writeback_centisecs), | ||
707 | .mode = 0644, | ||
708 | .proc_handler = &dirty_writeback_centisecs_handler, | ||
709 | }, | ||
710 | { | ||
711 | .ctl_name = VM_DIRTY_EXPIRE_CS, | ||
712 | .procname = "dirty_expire_centisecs", | ||
713 | .data = &dirty_expire_centisecs, | ||
714 | .maxlen = sizeof(dirty_expire_centisecs), | ||
715 | .mode = 0644, | ||
716 | .proc_handler = &proc_dointvec, | ||
717 | }, | ||
718 | { | ||
719 | .ctl_name = VM_NR_PDFLUSH_THREADS, | ||
720 | .procname = "nr_pdflush_threads", | ||
721 | .data = &nr_pdflush_threads, | ||
722 | .maxlen = sizeof nr_pdflush_threads, | ||
723 | .mode = 0444 /* read-only */, | ||
724 | .proc_handler = &proc_dointvec, | ||
725 | }, | ||
726 | { | ||
727 | .ctl_name = VM_SWAPPINESS, | ||
728 | .procname = "swappiness", | ||
729 | .data = &vm_swappiness, | ||
730 | .maxlen = sizeof(vm_swappiness), | ||
731 | .mode = 0644, | ||
732 | .proc_handler = &proc_dointvec_minmax, | ||
733 | .strategy = &sysctl_intvec, | ||
734 | .extra1 = &zero, | ||
735 | .extra2 = &one_hundred, | ||
736 | }, | ||
737 | #ifdef CONFIG_HUGETLB_PAGE | ||
738 | { | ||
739 | .ctl_name = VM_HUGETLB_PAGES, | ||
740 | .procname = "nr_hugepages", | ||
741 | .data = &max_huge_pages, | ||
742 | .maxlen = sizeof(unsigned long), | ||
743 | .mode = 0644, | ||
744 | .proc_handler = &hugetlb_sysctl_handler, | ||
745 | .extra1 = (void *)&hugetlb_zero, | ||
746 | .extra2 = (void *)&hugetlb_infinity, | ||
747 | }, | ||
748 | { | ||
749 | .ctl_name = VM_HUGETLB_GROUP, | ||
750 | .procname = "hugetlb_shm_group", | ||
751 | .data = &sysctl_hugetlb_shm_group, | ||
752 | .maxlen = sizeof(gid_t), | ||
753 | .mode = 0644, | ||
754 | .proc_handler = &proc_dointvec, | ||
755 | }, | ||
756 | #endif | ||
757 | { | ||
758 | .ctl_name = VM_LOWMEM_RESERVE_RATIO, | ||
759 | .procname = "lowmem_reserve_ratio", | ||
760 | .data = &sysctl_lowmem_reserve_ratio, | ||
761 | .maxlen = sizeof(sysctl_lowmem_reserve_ratio), | ||
762 | .mode = 0644, | ||
763 | .proc_handler = &lowmem_reserve_ratio_sysctl_handler, | ||
764 | .strategy = &sysctl_intvec, | ||
765 | }, | ||
766 | { | ||
767 | .ctl_name = VM_MIN_FREE_KBYTES, | ||
768 | .procname = "min_free_kbytes", | ||
769 | .data = &min_free_kbytes, | ||
770 | .maxlen = sizeof(min_free_kbytes), | ||
771 | .mode = 0644, | ||
772 | .proc_handler = &min_free_kbytes_sysctl_handler, | ||
773 | .strategy = &sysctl_intvec, | ||
774 | .extra1 = &zero, | ||
775 | }, | ||
776 | #ifdef CONFIG_MMU | ||
777 | { | ||
778 | .ctl_name = VM_MAX_MAP_COUNT, | ||
779 | .procname = "max_map_count", | ||
780 | .data = &sysctl_max_map_count, | ||
781 | .maxlen = sizeof(sysctl_max_map_count), | ||
782 | .mode = 0644, | ||
783 | .proc_handler = &proc_dointvec | ||
784 | }, | ||
785 | #endif | ||
786 | { | ||
787 | .ctl_name = VM_LAPTOP_MODE, | ||
788 | .procname = "laptop_mode", | ||
789 | .data = &laptop_mode, | ||
790 | .maxlen = sizeof(laptop_mode), | ||
791 | .mode = 0644, | ||
792 | .proc_handler = &proc_dointvec, | ||
793 | .strategy = &sysctl_intvec, | ||
794 | .extra1 = &zero, | ||
795 | }, | ||
796 | { | ||
797 | .ctl_name = VM_BLOCK_DUMP, | ||
798 | .procname = "block_dump", | ||
799 | .data = &block_dump, | ||
800 | .maxlen = sizeof(block_dump), | ||
801 | .mode = 0644, | ||
802 | .proc_handler = &proc_dointvec, | ||
803 | .strategy = &sysctl_intvec, | ||
804 | .extra1 = &zero, | ||
805 | }, | ||
806 | { | ||
807 | .ctl_name = VM_VFS_CACHE_PRESSURE, | ||
808 | .procname = "vfs_cache_pressure", | ||
809 | .data = &sysctl_vfs_cache_pressure, | ||
810 | .maxlen = sizeof(sysctl_vfs_cache_pressure), | ||
811 | .mode = 0644, | ||
812 | .proc_handler = &proc_dointvec, | ||
813 | .strategy = &sysctl_intvec, | ||
814 | .extra1 = &zero, | ||
815 | }, | ||
816 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | ||
817 | { | ||
818 | .ctl_name = VM_LEGACY_VA_LAYOUT, | ||
819 | .procname = "legacy_va_layout", | ||
820 | .data = &sysctl_legacy_va_layout, | ||
821 | .maxlen = sizeof(sysctl_legacy_va_layout), | ||
822 | .mode = 0644, | ||
823 | .proc_handler = &proc_dointvec, | ||
824 | .strategy = &sysctl_intvec, | ||
825 | .extra1 = &zero, | ||
826 | }, | ||
827 | #endif | ||
828 | #ifdef CONFIG_SWAP | ||
829 | { | ||
830 | .ctl_name = VM_SWAP_TOKEN_TIMEOUT, | ||
831 | .procname = "swap_token_timeout", | ||
832 | .data = &swap_token_default_timeout, | ||
833 | .maxlen = sizeof(swap_token_default_timeout), | ||
834 | .mode = 0644, | ||
835 | .proc_handler = &proc_dointvec_jiffies, | ||
836 | .strategy = &sysctl_jiffies, | ||
837 | }, | ||
838 | #endif | ||
839 | { .ctl_name = 0 } | ||
840 | }; | ||
841 | |||
842 | static ctl_table proc_table[] = { | ||
843 | { .ctl_name = 0 } | ||
844 | }; | ||
845 | |||
846 | static ctl_table fs_table[] = { | ||
847 | { | ||
848 | .ctl_name = FS_NRINODE, | ||
849 | .procname = "inode-nr", | ||
850 | .data = &inodes_stat, | ||
851 | .maxlen = 2*sizeof(int), | ||
852 | .mode = 0444, | ||
853 | .proc_handler = &proc_dointvec, | ||
854 | }, | ||
855 | { | ||
856 | .ctl_name = FS_STATINODE, | ||
857 | .procname = "inode-state", | ||
858 | .data = &inodes_stat, | ||
859 | .maxlen = 7*sizeof(int), | ||
860 | .mode = 0444, | ||
861 | .proc_handler = &proc_dointvec, | ||
862 | }, | ||
863 | { | ||
864 | .ctl_name = FS_NRFILE, | ||
865 | .procname = "file-nr", | ||
866 | .data = &files_stat, | ||
867 | .maxlen = 3*sizeof(int), | ||
868 | .mode = 0444, | ||
869 | .proc_handler = &proc_dointvec, | ||
870 | }, | ||
871 | { | ||
872 | .ctl_name = FS_MAXFILE, | ||
873 | .procname = "file-max", | ||
874 | .data = &files_stat.max_files, | ||
875 | .maxlen = sizeof(int), | ||
876 | .mode = 0644, | ||
877 | .proc_handler = &proc_dointvec, | ||
878 | }, | ||
879 | { | ||
880 | .ctl_name = FS_DENTRY, | ||
881 | .procname = "dentry-state", | ||
882 | .data = &dentry_stat, | ||
883 | .maxlen = 6*sizeof(int), | ||
884 | .mode = 0444, | ||
885 | .proc_handler = &proc_dointvec, | ||
886 | }, | ||
887 | { | ||
888 | .ctl_name = FS_OVERFLOWUID, | ||
889 | .procname = "overflowuid", | ||
890 | .data = &fs_overflowuid, | ||
891 | .maxlen = sizeof(int), | ||
892 | .mode = 0644, | ||
893 | .proc_handler = &proc_dointvec_minmax, | ||
894 | .strategy = &sysctl_intvec, | ||
895 | .extra1 = &minolduid, | ||
896 | .extra2 = &maxolduid, | ||
897 | }, | ||
898 | { | ||
899 | .ctl_name = FS_OVERFLOWGID, | ||
900 | .procname = "overflowgid", | ||
901 | .data = &fs_overflowgid, | ||
902 | .maxlen = sizeof(int), | ||
903 | .mode = 0644, | ||
904 | .proc_handler = &proc_dointvec_minmax, | ||
905 | .strategy = &sysctl_intvec, | ||
906 | .extra1 = &minolduid, | ||
907 | .extra2 = &maxolduid, | ||
908 | }, | ||
909 | { | ||
910 | .ctl_name = FS_LEASES, | ||
911 | .procname = "leases-enable", | ||
912 | .data = &leases_enable, | ||
913 | .maxlen = sizeof(int), | ||
914 | .mode = 0644, | ||
915 | .proc_handler = &proc_dointvec, | ||
916 | }, | ||
917 | #ifdef CONFIG_DNOTIFY | ||
918 | { | ||
919 | .ctl_name = FS_DIR_NOTIFY, | ||
920 | .procname = "dir-notify-enable", | ||
921 | .data = &dir_notify_enable, | ||
922 | .maxlen = sizeof(int), | ||
923 | .mode = 0644, | ||
924 | .proc_handler = &proc_dointvec, | ||
925 | }, | ||
926 | #endif | ||
927 | #ifdef CONFIG_MMU | ||
928 | { | ||
929 | .ctl_name = FS_LEASE_TIME, | ||
930 | .procname = "lease-break-time", | ||
931 | .data = &lease_break_time, | ||
932 | .maxlen = sizeof(int), | ||
933 | .mode = 0644, | ||
934 | .proc_handler = &proc_dointvec, | ||
935 | }, | ||
936 | { | ||
937 | .ctl_name = FS_AIO_NR, | ||
938 | .procname = "aio-nr", | ||
939 | .data = &aio_nr, | ||
940 | .maxlen = sizeof(aio_nr), | ||
941 | .mode = 0444, | ||
942 | .proc_handler = &proc_dointvec, | ||
943 | }, | ||
944 | { | ||
945 | .ctl_name = FS_AIO_MAX_NR, | ||
946 | .procname = "aio-max-nr", | ||
947 | .data = &aio_max_nr, | ||
948 | .maxlen = sizeof(aio_max_nr), | ||
949 | .mode = 0644, | ||
950 | .proc_handler = &proc_dointvec, | ||
951 | }, | ||
952 | #endif | ||
953 | { .ctl_name = 0 } | ||
954 | }; | ||
955 | |||
956 | static ctl_table debug_table[] = { | ||
957 | { .ctl_name = 0 } | ||
958 | }; | ||
959 | |||
960 | static ctl_table dev_table[] = { | ||
961 | { .ctl_name = 0 } | ||
962 | }; | ||
963 | |||
964 | extern void init_irq_proc (void); | ||
965 | |||
966 | void __init sysctl_init(void) | ||
967 | { | ||
968 | #ifdef CONFIG_PROC_FS | ||
969 | register_proc_table(root_table, proc_sys_root); | ||
970 | init_irq_proc(); | ||
971 | #endif | ||
972 | } | ||
973 | |||
974 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | ||
975 | void __user *newval, size_t newlen) | ||
976 | { | ||
977 | struct list_head *tmp; | ||
978 | |||
979 | if (nlen <= 0 || nlen >= CTL_MAXNAME) | ||
980 | return -ENOTDIR; | ||
981 | if (oldval) { | ||
982 | int old_len; | ||
983 | if (!oldlenp || get_user(old_len, oldlenp)) | ||
984 | return -EFAULT; | ||
985 | } | ||
986 | tmp = &root_table_header.ctl_entry; | ||
987 | do { | ||
988 | struct ctl_table_header *head = | ||
989 | list_entry(tmp, struct ctl_table_header, ctl_entry); | ||
990 | void *context = NULL; | ||
991 | int error = parse_table(name, nlen, oldval, oldlenp, | ||
992 | newval, newlen, head->ctl_table, | ||
993 | &context); | ||
994 | if (context) | ||
995 | kfree(context); | ||
996 | if (error != -ENOTDIR) | ||
997 | return error; | ||
998 | tmp = tmp->next; | ||
999 | } while (tmp != &root_table_header.ctl_entry); | ||
1000 | return -ENOTDIR; | ||
1001 | } | ||
1002 | |||
1003 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | ||
1004 | { | ||
1005 | struct __sysctl_args tmp; | ||
1006 | int error; | ||
1007 | |||
1008 | if (copy_from_user(&tmp, args, sizeof(tmp))) | ||
1009 | return -EFAULT; | ||
1010 | |||
1011 | lock_kernel(); | ||
1012 | error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, | ||
1013 | tmp.newval, tmp.newlen); | ||
1014 | unlock_kernel(); | ||
1015 | return error; | ||
1016 | } | ||
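For reference, userspace reaches the code above by filling in the same struct __sysctl_args fields that sys_sysctl() copies in. A minimal sketch, assuming CTL_KERN and KERN_NGROUPS_MAX from <linux/sysctl.h> and the SYS__sysctl number from <sys/syscall.h>; error handling is trimmed:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sysctl.h>

/* Read kernel.ngroups_max through the binary sysctl(2) interface. */
int read_ngroups_max(void)
{
        int name[] = { CTL_KERN, KERN_NGROUPS_MAX };
        int value = 0;
        size_t len = sizeof(value);
        struct __sysctl_args args = {
                .name    = name,
                .nlen    = 2,
                .oldval  = &value,
                .oldlenp = &len,
        };

        /* do_sysctl()/parse_table() walk the name vector to the leaf. */
        if (syscall(SYS__sysctl, &args) < 0)
                return -1;
        return value;
}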
1017 | |||
1018 | /* | ||
1019 | * ctl_perm does NOT grant the superuser all rights automatically, because | ||
1020 | * some sysctl variables are readonly even to root. | ||
1021 | */ | ||
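/*
 * Worked example of the check below: with .mode = 0644, a process with
 * euid 0 tests the owner bits (0644 >> 6 == 06), so both read (004) and
 * write (002) succeed; a process whose egid is 0 tests 0644 >> 3 == 064,
 * which is read-only; everyone else tests the "other" bits, so
 * (0644 & 004 & 0007) allows reads while (0644 & 002 & 0007) == 0
 * yields -EACCES on write.
 */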
1022 | |||
1023 | static int test_perm(int mode, int op) | ||
1024 | { | ||
1025 | if (!current->euid) | ||
1026 | mode >>= 6; | ||
1027 | else if (in_egroup_p(0)) | ||
1028 | mode >>= 3; | ||
1029 | if ((mode & op & 0007) == op) | ||
1030 | return 0; | ||
1031 | return -EACCES; | ||
1032 | } | ||
1033 | |||
1034 | static inline int ctl_perm(ctl_table *table, int op) | ||
1035 | { | ||
1036 | int error; | ||
1037 | error = security_sysctl(table, op); | ||
1038 | if (error) | ||
1039 | return error; | ||
1040 | return test_perm(table->mode, op); | ||
1041 | } | ||
1042 | |||
1043 | static int parse_table(int __user *name, int nlen, | ||
1044 | void __user *oldval, size_t __user *oldlenp, | ||
1045 | void __user *newval, size_t newlen, | ||
1046 | ctl_table *table, void **context) | ||
1047 | { | ||
1048 | int n; | ||
1049 | repeat: | ||
1050 | if (!nlen) | ||
1051 | return -ENOTDIR; | ||
1052 | if (get_user(n, name)) | ||
1053 | return -EFAULT; | ||
1054 | for ( ; table->ctl_name; table++) { | ||
1055 | if (n == table->ctl_name || table->ctl_name == CTL_ANY) { | ||
1056 | int error; | ||
1057 | if (table->child) { | ||
1058 | if (ctl_perm(table, 001)) | ||
1059 | return -EPERM; | ||
1060 | if (table->strategy) { | ||
1061 | error = table->strategy( | ||
1062 | table, name, nlen, | ||
1063 | oldval, oldlenp, | ||
1064 | newval, newlen, context); | ||
1065 | if (error) | ||
1066 | return error; | ||
1067 | } | ||
1068 | name++; | ||
1069 | nlen--; | ||
1070 | table = table->child; | ||
1071 | goto repeat; | ||
1072 | } | ||
1073 | error = do_sysctl_strategy(table, name, nlen, | ||
1074 | oldval, oldlenp, | ||
1075 | newval, newlen, context); | ||
1076 | return error; | ||
1077 | } | ||
1078 | } | ||
1079 | return -ENOTDIR; | ||
1080 | } | ||
1081 | |||
1082 | /* Perform the actual read/write of a sysctl table entry. */ | ||
1083 | int do_sysctl_strategy (ctl_table *table, | ||
1084 | int __user *name, int nlen, | ||
1085 | void __user *oldval, size_t __user *oldlenp, | ||
1086 | void __user *newval, size_t newlen, void **context) | ||
1087 | { | ||
1088 | int op = 0, rc; | ||
1089 | size_t len; | ||
1090 | |||
1091 | if (oldval) | ||
1092 | op |= 004; | ||
1093 | if (newval) | ||
1094 | op |= 002; | ||
1095 | if (ctl_perm(table, op)) | ||
1096 | return -EPERM; | ||
1097 | |||
1098 | if (table->strategy) { | ||
1099 | rc = table->strategy(table, name, nlen, oldval, oldlenp, | ||
1100 | newval, newlen, context); | ||
1101 | if (rc < 0) | ||
1102 | return rc; | ||
1103 | if (rc > 0) | ||
1104 | return 0; | ||
1105 | } | ||
1106 | |||
1107 | /* If there is no strategy routine, or if the strategy returns | ||
1108 | * zero, proceed with automatic r/w */ | ||
1109 | if (table->data && table->maxlen) { | ||
1110 | if (oldval && oldlenp) { | ||
1111 | if (get_user(len, oldlenp)) | ||
1112 | return -EFAULT; | ||
1113 | if (len) { | ||
1114 | if (len > table->maxlen) | ||
1115 | len = table->maxlen; | ||
1116 | if(copy_to_user(oldval, table->data, len)) | ||
1117 | return -EFAULT; | ||
1118 | if(put_user(len, oldlenp)) | ||
1119 | return -EFAULT; | ||
1120 | } | ||
1121 | } | ||
1122 | if (newval && newlen) { | ||
1123 | len = newlen; | ||
1124 | if (len > table->maxlen) | ||
1125 | len = table->maxlen; | ||
1126 | if(copy_from_user(table->data, newval, len)) | ||
1127 | return -EFAULT; | ||
1128 | } | ||
1129 | } | ||
1130 | return 0; | ||
1131 | } | ||
1132 | |||
1133 | /** | ||
1134 | * register_sysctl_table - register a sysctl hierarchy | ||
1135 | * @table: the top-level table structure | ||
1136 | * @insert_at_head: whether the entry should be inserted in front or at the end | ||
1137 | * | ||
1138 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1139 | * array. An entry with a ctl_name of 0 terminates the table. | ||
1140 | * | ||
1141 | * The members of the &ctl_table structure are used as follows: | ||
1142 | * | ||
1143 | * ctl_name - This is the numeric sysctl value used by sysctl(2). The number | ||
1144 | * must be unique within that level of sysctl | ||
1145 | * | ||
1146 | * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not | ||
1147 | * enter a sysctl file | ||
1148 | * | ||
1149 | * data - a pointer to data for use by proc_handler | ||
1150 | * | ||
1151 | * maxlen - the maximum size in bytes of the data | ||
1152 | * | ||
1153 | * mode - the file permissions for the /proc/sys file, and for sysctl(2) | ||
1154 | * | ||
1155 | * child - a pointer to the child sysctl table if this entry is a directory, or | ||
1156 | * %NULL. | ||
1157 | * | ||
1158 | * proc_handler - the text handler routine (described below) | ||
1159 | * | ||
1160 | * strategy - the strategy routine (described below) | ||
1161 | * | ||
1162 | * de - for internal use by the sysctl routines | ||
1163 | * | ||
1164 | * extra1, extra2 - extra pointers usable by the proc handler routines | ||
1165 | * | ||
1166 | * Leaf nodes in the sysctl tree will be represented by a single file | ||
1167 | * under /proc/sys; non-leaf nodes will be represented by directories. | ||
1168 | * | ||
1169 | * sysctl(2) can automatically manage read and write requests through | ||
1170 | * the sysctl table. The data and maxlen fields of the ctl_table | ||
1171 | * struct enable minimal validation of the values being written, and | ||
1172 | * the mode field allows minimal authentication. | ||
1173 | * | ||
1174 | * More sophisticated management can be enabled by the provision of a | ||
1175 | * strategy routine with the table entry. This will be called before | ||
1176 | * any automatic read or write of the data is performed. | ||
1177 | * | ||
1178 | * The strategy routine may return | ||
1179 | * | ||
1180 | * < 0 - Error occurred (error is passed to user process) | ||
1181 | * | ||
1182 | * 0 - OK - proceed with automatic read or write. | ||
1183 | * | ||
1184 | * > 0 - OK - read or write has been done by the strategy routine, so | ||
1185 | * return immediately. | ||
1186 | * | ||
1187 | * There must be a proc_handler routine for any terminal nodes | ||
1188 | * mirrored under /proc/sys (non-terminals are handled by a built-in | ||
1189 | * directory handler). Several default handlers are available to | ||
1190 | * cover common cases - | ||
1191 | * | ||
1192 | * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), | ||
1193 | * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), | ||
1194 | * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() | ||
1195 | * | ||
1196 | * It is the handler's job to read the input buffer from user memory | ||
1197 | * and process it. The handler should return 0 on success. | ||
1198 | * | ||
1199 | * This routine returns %NULL on a failure to register, and a pointer | ||
1200 | * to the table header on success. | ||
1201 | */ | ||
1202 | struct ctl_table_header *register_sysctl_table(ctl_table * table, | ||
1203 | int insert_at_head) | ||
1204 | { | ||
1205 | struct ctl_table_header *tmp; | ||
1206 | tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); | ||
1207 | if (!tmp) | ||
1208 | return NULL; | ||
1209 | tmp->ctl_table = table; | ||
1210 | INIT_LIST_HEAD(&tmp->ctl_entry); | ||
1211 | if (insert_at_head) | ||
1212 | list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); | ||
1213 | else | ||
1214 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | ||
1215 | #ifdef CONFIG_PROC_FS | ||
1216 | register_proc_table(table, proc_sys_root); | ||
1217 | #endif | ||
1218 | return tmp; | ||
1219 | } | ||
1220 | |||
1221 | /** | ||
1222 | * unregister_sysctl_table - unregister a sysctl table hierarchy | ||
1223 | * @header: the header returned from register_sysctl_table | ||
1224 | * | ||
1225 | * Unregisters the sysctl table and all children. proc entries may not | ||
1226 | * actually be removed until they are no longer used by anyone. | ||
1227 | */ | ||
1228 | void unregister_sysctl_table(struct ctl_table_header * header) | ||
1229 | { | ||
1230 | list_del(&header->ctl_entry); | ||
1231 | #ifdef CONFIG_PROC_FS | ||
1232 | unregister_proc_table(header->ctl_table, proc_sys_root); | ||
1233 | #endif | ||
1234 | kfree(header); | ||
1235 | } | ||
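register_sysctl_table() and unregister_sysctl_table() bracket the lifetime of a hierarchy. A minimal sketch of a caller, with a made-up ctl_name value and identifiers that are illustrative only, not part of this file:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/errno.h>

static int my_tunable = 10;

static ctl_table my_table[] = {
        {
                .ctl_name       = 99,   /* illustrative; must be unique at this level */
                .procname       = "my_tunable",
                .data           = &my_tunable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }               /* terminator */
};

static struct ctl_table_header *my_header;

static int __init my_sysctl_init(void)
{
        /* insert_at_head == 0: append after the existing tables */
        my_header = register_sysctl_table(my_table, 0);
        return my_header ? 0 : -ENOMEM;
}

static void __exit my_sysctl_exit(void)
{
        unregister_sysctl_table(my_header);
}

module_init(my_sysctl_init);
module_exit(my_sysctl_exit);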
1236 | |||
1237 | /* | ||
1238 | * /proc/sys support | ||
1239 | */ | ||
1240 | |||
1241 | #ifdef CONFIG_PROC_FS | ||
1242 | |||
1243 | /* Scan the sysctl entries in table and add them all into /proc */ | ||
1244 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) | ||
1245 | { | ||
1246 | struct proc_dir_entry *de; | ||
1247 | int len; | ||
1248 | mode_t mode; | ||
1249 | |||
1250 | for (; table->ctl_name; table++) { | ||
1251 | /* Can't do anything without a proc name. */ | ||
1252 | if (!table->procname) | ||
1253 | continue; | ||
1254 | /* Maybe we can't do anything with it... */ | ||
1255 | if (!table->proc_handler && !table->child) { | ||
1256 | printk(KERN_WARNING "SYSCTL: Can't register %s\n", | ||
1257 | table->procname); | ||
1258 | continue; | ||
1259 | } | ||
1260 | |||
1261 | len = strlen(table->procname); | ||
1262 | mode = table->mode; | ||
1263 | |||
1264 | de = NULL; | ||
1265 | if (table->proc_handler) | ||
1266 | mode |= S_IFREG; | ||
1267 | else { | ||
1268 | mode |= S_IFDIR; | ||
1269 | for (de = root->subdir; de; de = de->next) { | ||
1270 | if (proc_match(len, table->procname, de)) | ||
1271 | break; | ||
1272 | } | ||
1273 | /* If the subdir exists already, de is non-NULL */ | ||
1274 | } | ||
1275 | |||
1276 | if (!de) { | ||
1277 | de = create_proc_entry(table->procname, mode, root); | ||
1278 | if (!de) | ||
1279 | continue; | ||
1280 | de->data = (void *) table; | ||
1281 | if (table->proc_handler) | ||
1282 | de->proc_fops = &proc_sys_file_operations; | ||
1283 | } | ||
1284 | table->de = de; | ||
1285 | if (de->mode & S_IFDIR) | ||
1286 | register_proc_table(table->child, de); | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | /* | ||
1291 | * Unregister a /proc sysctl table and any subdirectories. | ||
1292 | */ | ||
1293 | static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) | ||
1294 | { | ||
1295 | struct proc_dir_entry *de; | ||
1296 | for (; table->ctl_name; table++) { | ||
1297 | if (!(de = table->de)) | ||
1298 | continue; | ||
1299 | if (de->mode & S_IFDIR) { | ||
1300 | if (!table->child) { | ||
1301 | printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); | ||
1302 | continue; | ||
1303 | } | ||
1304 | unregister_proc_table(table->child, de); | ||
1305 | |||
1306 | /* Don't unregister directories which still have entries.. */ | ||
1307 | if (de->subdir) | ||
1308 | continue; | ||
1309 | } | ||
1310 | |||
1311 | /* Don't unregister proc entries that are still being used.. */ | ||
1312 | if (atomic_read(&de->count)) | ||
1313 | continue; | ||
1314 | |||
1315 | table->de = NULL; | ||
1316 | remove_proc_entry(table->procname, root); | ||
1317 | } | ||
1318 | } | ||
1319 | |||
1320 | static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | ||
1321 | size_t count, loff_t *ppos) | ||
1322 | { | ||
1323 | int op; | ||
1324 | struct proc_dir_entry *de; | ||
1325 | struct ctl_table *table; | ||
1326 | size_t res; | ||
1327 | ssize_t error; | ||
1328 | |||
1329 | de = PDE(file->f_dentry->d_inode); | ||
1330 | if (!de || !de->data) | ||
1331 | return -ENOTDIR; | ||
1332 | table = (struct ctl_table *) de->data; | ||
1333 | if (!table || !table->proc_handler) | ||
1334 | return -ENOTDIR; | ||
1335 | op = (write ? 002 : 004); | ||
1336 | if (ctl_perm(table, op)) | ||
1337 | return -EPERM; | ||
1338 | |||
1339 | res = count; | ||
1340 | |||
1341 | error = (*table->proc_handler) (table, write, file, buf, &res, ppos); | ||
1342 | if (error) | ||
1343 | return error; | ||
1344 | return res; | ||
1345 | } | ||
1346 | |||
1347 | static int proc_opensys(struct inode *inode, struct file *file) | ||
1348 | { | ||
1349 | if (file->f_mode & FMODE_WRITE) { | ||
1350 | /* | ||
1351 | * sysctl entries that are not writable | ||
1352 | * are _NOT_ writable, capabilities or not. | ||
1353 | */ | ||
1354 | if (!(inode->i_mode & S_IWUSR)) | ||
1355 | return -EPERM; | ||
1356 | } | ||
1357 | |||
1358 | return 0; | ||
1359 | } | ||
1360 | |||
1361 | static ssize_t proc_readsys(struct file * file, char __user * buf, | ||
1362 | size_t count, loff_t *ppos) | ||
1363 | { | ||
1364 | return do_rw_proc(0, file, buf, count, ppos); | ||
1365 | } | ||
1366 | |||
1367 | static ssize_t proc_writesys(struct file * file, const char __user * buf, | ||
1368 | size_t count, loff_t *ppos) | ||
1369 | { | ||
1370 | return do_rw_proc(1, file, (char __user *) buf, count, ppos); | ||
1371 | } | ||
1372 | |||
1373 | /** | ||
1374 | * proc_dostring - read a string sysctl | ||
1375 | * @table: the sysctl table | ||
1376 | * @write: %TRUE if this is a write to the sysctl file | ||
1377 | * @filp: the file structure | ||
1378 | * @buffer: the user buffer | ||
1379 | * @lenp: the size of the user buffer | ||
1380 | * @ppos: file position | ||
1381 | * | ||
1382 | * Reads/writes a string from/to the user buffer. If the kernel | ||
1383 | * buffer provided is not large enough to hold the string, the | ||
1384 | * string is truncated. The copied string is NUL-terminated. | ||
1385 | * If the string is being read by the user process, it is copied | ||
1386 | * and a newline '\n' is added. It is truncated if the buffer is | ||
1387 | * not large enough. | ||
1388 | * | ||
1389 | * Returns 0 on success. | ||
1390 | */ | ||
1391 | int proc_dostring(ctl_table *table, int write, struct file *filp, | ||
1392 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1393 | { | ||
1394 | size_t len; | ||
1395 | char __user *p; | ||
1396 | char c; | ||
1397 | |||
1398 | if (!table->data || !table->maxlen || !*lenp || | ||
1399 | (*ppos && !write)) { | ||
1400 | *lenp = 0; | ||
1401 | return 0; | ||
1402 | } | ||
1403 | |||
1404 | if (write) { | ||
1405 | len = 0; | ||
1406 | p = buffer; | ||
1407 | while (len < *lenp) { | ||
1408 | if (get_user(c, p++)) | ||
1409 | return -EFAULT; | ||
1410 | if (c == 0 || c == '\n') | ||
1411 | break; | ||
1412 | len++; | ||
1413 | } | ||
1414 | if (len >= table->maxlen) | ||
1415 | len = table->maxlen-1; | ||
1416 | if(copy_from_user(table->data, buffer, len)) | ||
1417 | return -EFAULT; | ||
1418 | ((char *) table->data)[len] = 0; | ||
1419 | *ppos += *lenp; | ||
1420 | } else { | ||
1421 | len = strlen(table->data); | ||
1422 | if (len > table->maxlen) | ||
1423 | len = table->maxlen; | ||
1424 | if (len > *lenp) | ||
1425 | len = *lenp; | ||
1426 | if (len) | ||
1427 | if(copy_to_user(buffer, table->data, len)) | ||
1428 | return -EFAULT; | ||
1429 | if (len < *lenp) { | ||
1430 | if(put_user('\n', ((char __user *) buffer) + len)) | ||
1431 | return -EFAULT; | ||
1432 | len++; | ||
1433 | } | ||
1434 | *lenp = len; | ||
1435 | *ppos += len; | ||
1436 | } | ||
1437 | return 0; | ||
1438 | } | ||
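proc_dostring() is normally wired to a fixed-size kernel buffer and, for the binary sysctl(2) path, paired with the sysctl_string() strategy routine defined later in this file. A hypothetical entry, names and ctl_name value illustrative:

static char my_label[64] = "default";

static ctl_table my_string_entry[] = {
        {
                .ctl_name       = 98,   /* illustrative */
                .procname       = "my_label",
                .data           = my_label,
                .maxlen         = sizeof(my_label),
                .mode           = 0644,
                .proc_handler   = &proc_dostring,
                .strategy       = &sysctl_string,
        },
        { .ctl_name = 0 }
};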
1439 | |||
1440 | /* | ||
1441 | * Special case of dostring for the UTS structure. This has locks | ||
1442 | * to observe. Should this be in kernel/sys.c? | ||
1443 | */ | ||
1444 | |||
1445 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, | ||
1446 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1447 | { | ||
1448 | int r; | ||
1449 | |||
1450 | if (!write) { | ||
1451 | down_read(&uts_sem); | ||
1452 | r=proc_dostring(table,0,filp,buffer,lenp, ppos); | ||
1453 | up_read(&uts_sem); | ||
1454 | } else { | ||
1455 | down_write(&uts_sem); | ||
1456 | r=proc_dostring(table,1,filp,buffer,lenp, ppos); | ||
1457 | up_write(&uts_sem); | ||
1458 | } | ||
1459 | return r; | ||
1460 | } | ||
1461 | |||
1462 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, | ||
1463 | int *valp, | ||
1464 | int write, void *data) | ||
1465 | { | ||
1466 | if (write) { | ||
1467 | *valp = *negp ? -*lvalp : *lvalp; | ||
1468 | } else { | ||
1469 | int val = *valp; | ||
1470 | if (val < 0) { | ||
1471 | *negp = -1; | ||
1472 | *lvalp = (unsigned long)-val; | ||
1473 | } else { | ||
1474 | *negp = 0; | ||
1475 | *lvalp = (unsigned long)val; | ||
1476 | } | ||
1477 | } | ||
1478 | return 0; | ||
1479 | } | ||
1480 | |||
1481 | static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, | ||
1482 | void __user *buffer, size_t *lenp, loff_t *ppos, | ||
1483 | int (*conv)(int *negp, unsigned long *lvalp, int *valp, | ||
1484 | int write, void *data), | ||
1485 | void *data) | ||
1486 | { | ||
1487 | #define TMPBUFLEN 21 | ||
1488 | int *i, vleft, first=1, neg, val; | ||
1489 | unsigned long lval; | ||
1490 | size_t left, len; | ||
1491 | |||
1492 | char buf[TMPBUFLEN], *p; | ||
1493 | char __user *s = buffer; | ||
1494 | |||
1495 | if (!table->data || !table->maxlen || !*lenp || | ||
1496 | (*ppos && !write)) { | ||
1497 | *lenp = 0; | ||
1498 | return 0; | ||
1499 | } | ||
1500 | |||
1501 | i = (int *) table->data; | ||
1502 | vleft = table->maxlen / sizeof(*i); | ||
1503 | left = *lenp; | ||
1504 | |||
1505 | if (!conv) | ||
1506 | conv = do_proc_dointvec_conv; | ||
1507 | |||
1508 | for (; left && vleft--; i++, first=0) { | ||
1509 | if (write) { | ||
1510 | while (left) { | ||
1511 | char c; | ||
1512 | if (get_user(c, s)) | ||
1513 | return -EFAULT; | ||
1514 | if (!isspace(c)) | ||
1515 | break; | ||
1516 | left--; | ||
1517 | s++; | ||
1518 | } | ||
1519 | if (!left) | ||
1520 | break; | ||
1521 | neg = 0; | ||
1522 | len = left; | ||
1523 | if (len > sizeof(buf) - 1) | ||
1524 | len = sizeof(buf) - 1; | ||
1525 | if (copy_from_user(buf, s, len)) | ||
1526 | return -EFAULT; | ||
1527 | buf[len] = 0; | ||
1528 | p = buf; | ||
1529 | if (*p == '-' && left > 1) { | ||
1530 | neg = 1; | ||
1531 | left--, p++; | ||
1532 | } | ||
1533 | if (*p < '0' || *p > '9') | ||
1534 | break; | ||
1535 | |||
1536 | lval = simple_strtoul(p, &p, 0); | ||
1537 | |||
1538 | len = p-buf; | ||
1539 | if ((len < left) && *p && !isspace(*p)) | ||
1540 | break; | ||
1541 | if (neg) | ||
1542 | val = -val; | ||
1543 | s += len; | ||
1544 | left -= len; | ||
1545 | |||
1546 | if (conv(&neg, &lval, i, 1, data)) | ||
1547 | break; | ||
1548 | } else { | ||
1549 | p = buf; | ||
1550 | if (!first) | ||
1551 | *p++ = '\t'; | ||
1552 | |||
1553 | if (conv(&neg, &lval, i, 0, data)) | ||
1554 | break; | ||
1555 | |||
1556 | sprintf(p, "%s%lu", neg ? "-" : "", lval); | ||
1557 | len = strlen(buf); | ||
1558 | if (len > left) | ||
1559 | len = left; | ||
1560 | if(copy_to_user(s, buf, len)) | ||
1561 | return -EFAULT; | ||
1562 | left -= len; | ||
1563 | s += len; | ||
1564 | } | ||
1565 | } | ||
1566 | |||
1567 | if (!write && !first && left) { | ||
1568 | if(put_user('\n', s)) | ||
1569 | return -EFAULT; | ||
1570 | left--, s++; | ||
1571 | } | ||
1572 | if (write) { | ||
1573 | while (left) { | ||
1574 | char c; | ||
1575 | if (get_user(c, s++)) | ||
1576 | return -EFAULT; | ||
1577 | if (!isspace(c)) | ||
1578 | break; | ||
1579 | left--; | ||
1580 | } | ||
1581 | } | ||
1582 | if (write && first) | ||
1583 | return -EINVAL; | ||
1584 | *lenp -= left; | ||
1585 | *ppos += *lenp; | ||
1586 | return 0; | ||
1587 | #undef TMPBUFLEN | ||
1588 | } | ||
1589 | |||
1590 | /** | ||
1591 | * proc_dointvec - read a vector of integers | ||
1592 | * @table: the sysctl table | ||
1593 | * @write: %TRUE if this is a write to the sysctl file | ||
1594 | * @filp: the file structure | ||
1595 | * @buffer: the user buffer | ||
1596 | * @lenp: the size of the user buffer | ||
1597 | * @ppos: file position | ||
1598 | * | ||
1599 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | ||
1600 | * values from/to the user buffer, treated as an ASCII string. | ||
1601 | * | ||
1602 | * Returns 0 on success. | ||
1603 | */ | ||
1604 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | ||
1605 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1606 | { | ||
1607 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | ||
1608 | NULL,NULL); | ||
1609 | } | ||
1610 | |||
1611 | #define OP_SET 0 | ||
1612 | #define OP_AND 1 | ||
1613 | #define OP_OR 2 | ||
1614 | #define OP_MAX 3 | ||
1615 | #define OP_MIN 4 | ||
1616 | |||
1617 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | ||
1618 | int *valp, | ||
1619 | int write, void *data) | ||
1620 | { | ||
1621 | int op = *(int *)data; | ||
1622 | if (write) { | ||
1623 | int val = *negp ? -*lvalp : *lvalp; | ||
1624 | switch(op) { | ||
1625 | case OP_SET: *valp = val; break; | ||
1626 | case OP_AND: *valp &= val; break; | ||
1627 | case OP_OR: *valp |= val; break; | ||
1628 | case OP_MAX: if(*valp < val) | ||
1629 | *valp = val; | ||
1630 | break; | ||
1631 | case OP_MIN: if(*valp > val) | ||
1632 | *valp = val; | ||
1633 | break; | ||
1634 | } | ||
1635 | } else { | ||
1636 | int val = *valp; | ||
1637 | if (val < 0) { | ||
1638 | *negp = -1; | ||
1639 | *lvalp = (unsigned long)-val; | ||
1640 | } else { | ||
1641 | *negp = 0; | ||
1642 | *lvalp = (unsigned long)val; | ||
1643 | } | ||
1644 | } | ||
1645 | return 0; | ||
1646 | } | ||
1647 | |||
1648 | /* | ||
1649 | * init may raise the set. | ||
1650 | */ | ||
1651 | |||
1652 | int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, | ||
1653 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1654 | { | ||
1655 | int op; | ||
1656 | |||
1657 | if (!capable(CAP_SYS_MODULE)) { | ||
1658 | return -EPERM; | ||
1659 | } | ||
1660 | |||
1661 | op = (current->pid == 1) ? OP_SET : OP_AND; | ||
1662 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | ||
1663 | do_proc_dointvec_bset_conv,&op); | ||
1664 | } | ||
1665 | |||
1666 | struct do_proc_dointvec_minmax_conv_param { | ||
1667 | int *min; | ||
1668 | int *max; | ||
1669 | }; | ||
1670 | |||
1671 | static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, | ||
1672 | int *valp, | ||
1673 | int write, void *data) | ||
1674 | { | ||
1675 | struct do_proc_dointvec_minmax_conv_param *param = data; | ||
1676 | if (write) { | ||
1677 | int val = *negp ? -*lvalp : *lvalp; | ||
1678 | if ((param->min && *param->min > val) || | ||
1679 | (param->max && *param->max < val)) | ||
1680 | return -EINVAL; | ||
1681 | *valp = val; | ||
1682 | } else { | ||
1683 | int val = *valp; | ||
1684 | if (val < 0) { | ||
1685 | *negp = -1; | ||
1686 | *lvalp = (unsigned long)-val; | ||
1687 | } else { | ||
1688 | *negp = 0; | ||
1689 | *lvalp = (unsigned long)val; | ||
1690 | } | ||
1691 | } | ||
1692 | return 0; | ||
1693 | } | ||
1694 | |||
1695 | /** | ||
1696 | * proc_dointvec_minmax - read a vector of integers with min/max values | ||
1697 | * @table: the sysctl table | ||
1698 | * @write: %TRUE if this is a write to the sysctl file | ||
1699 | * @filp: the file structure | ||
1700 | * @buffer: the user buffer | ||
1701 | * @lenp: the size of the user buffer | ||
1702 | * @ppos: file position | ||
1703 | * | ||
1704 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | ||
1705 | * values from/to the user buffer, treated as an ASCII string. | ||
1706 | * | ||
1707 | * This routine will ensure the values are within the range specified by | ||
1708 | * table->extra1 (min) and table->extra2 (max). | ||
1709 | * | ||
1710 | * Returns 0 on success. | ||
1711 | */ | ||
1712 | int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, | ||
1713 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1714 | { | ||
1715 | struct do_proc_dointvec_minmax_conv_param param = { | ||
1716 | .min = (int *) table->extra1, | ||
1717 | .max = (int *) table->extra2, | ||
1718 | }; | ||
1719 | return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, | ||
1720 | do_proc_dointvec_minmax_conv, ¶m); | ||
1721 | } | ||
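For instance, the dirty_ratio entry in vm_table above points extra1/extra2 at the static zero and one_hundred bounds, so writing an out-of-range value such as "150" is refused by do_proc_dointvec_minmax_conv() and the write returns -EINVAL, leaving vm_dirty_ratio unchanged.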
1722 | |||
1723 | static int do_proc_doulongvec_minmax(ctl_table *table, int write, | ||
1724 | struct file *filp, | ||
1725 | void __user *buffer, | ||
1726 | size_t *lenp, loff_t *ppos, | ||
1727 | unsigned long convmul, | ||
1728 | unsigned long convdiv) | ||
1729 | { | ||
1730 | #define TMPBUFLEN 21 | ||
1731 | unsigned long *i, *min, *max, val; | ||
1732 | int vleft, first=1, neg; | ||
1733 | size_t len, left; | ||
1734 | char buf[TMPBUFLEN], *p; | ||
1735 | char __user *s = buffer; | ||
1736 | |||
1737 | if (!table->data || !table->maxlen || !*lenp || | ||
1738 | (*ppos && !write)) { | ||
1739 | *lenp = 0; | ||
1740 | return 0; | ||
1741 | } | ||
1742 | |||
1743 | i = (unsigned long *) table->data; | ||
1744 | min = (unsigned long *) table->extra1; | ||
1745 | max = (unsigned long *) table->extra2; | ||
1746 | vleft = table->maxlen / sizeof(unsigned long); | ||
1747 | left = *lenp; | ||
1748 | |||
1749 | for (; left && vleft--; i++, min++, max++, first=0) { | ||
1750 | if (write) { | ||
1751 | while (left) { | ||
1752 | char c; | ||
1753 | if (get_user(c, s)) | ||
1754 | return -EFAULT; | ||
1755 | if (!isspace(c)) | ||
1756 | break; | ||
1757 | left--; | ||
1758 | s++; | ||
1759 | } | ||
1760 | if (!left) | ||
1761 | break; | ||
1762 | neg = 0; | ||
1763 | len = left; | ||
1764 | if (len > TMPBUFLEN-1) | ||
1765 | len = TMPBUFLEN-1; | ||
1766 | if (copy_from_user(buf, s, len)) | ||
1767 | return -EFAULT; | ||
1768 | buf[len] = 0; | ||
1769 | p = buf; | ||
1770 | if (*p == '-' && left > 1) { | ||
1771 | neg = 1; | ||
1772 | left--, p++; | ||
1773 | } | ||
1774 | if (*p < '0' || *p > '9') | ||
1775 | break; | ||
1776 | val = simple_strtoul(p, &p, 0) * convmul / convdiv; | ||
1777 | len = p-buf; | ||
1778 | if ((len < left) && *p && !isspace(*p)) | ||
1779 | break; | ||
1780 | if (neg) | ||
1781 | val = -val; | ||
1782 | s += len; | ||
1783 | left -= len; | ||
1784 | |||
1785 | if(neg) | ||
1786 | continue; | ||
1787 | if ((min && val < *min) || (max && val > *max)) | ||
1788 | continue; | ||
1789 | *i = val; | ||
1790 | } else { | ||
1791 | p = buf; | ||
1792 | if (!first) | ||
1793 | *p++ = '\t'; | ||
1794 | sprintf(p, "%lu", convdiv * (*i) / convmul); | ||
1795 | len = strlen(buf); | ||
1796 | if (len > left) | ||
1797 | len = left; | ||
1798 | if(copy_to_user(s, buf, len)) | ||
1799 | return -EFAULT; | ||
1800 | left -= len; | ||
1801 | s += len; | ||
1802 | } | ||
1803 | } | ||
1804 | |||
1805 | if (!write && !first && left) { | ||
1806 | if(put_user('\n', s)) | ||
1807 | return -EFAULT; | ||
1808 | left--, s++; | ||
1809 | } | ||
1810 | if (write) { | ||
1811 | while (left) { | ||
1812 | char c; | ||
1813 | if (get_user(c, s++)) | ||
1814 | return -EFAULT; | ||
1815 | if (!isspace(c)) | ||
1816 | break; | ||
1817 | left--; | ||
1818 | } | ||
1819 | } | ||
1820 | if (write && first) | ||
1821 | return -EINVAL; | ||
1822 | *lenp -= left; | ||
1823 | *ppos += *lenp; | ||
1824 | return 0; | ||
1825 | #undef TMPBUFLEN | ||
1826 | } | ||
1827 | |||
1828 | /** | ||
1829 | * proc_doulongvec_minmax - read a vector of long integers with min/max values | ||
1830 | * @table: the sysctl table | ||
1831 | * @write: %TRUE if this is a write to the sysctl file | ||
1832 | * @filp: the file structure | ||
1833 | * @buffer: the user buffer | ||
1834 | * @lenp: the size of the user buffer | ||
1835 | * @ppos: file position | ||
1836 | * | ||
1837 | * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long | ||
1838 | * values from/to the user buffer, treated as an ASCII string. | ||
1839 | * | ||
1840 | * This routine will ensure the values are within the range specified by | ||
1841 | * table->extra1 (min) and table->extra2 (max). | ||
1842 | * | ||
1843 | * Returns 0 on success. | ||
1844 | */ | ||
1845 | int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
1846 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1847 | { | ||
1848 | return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); | ||
1849 | } | ||
1850 | |||
1851 | /** | ||
1852 | * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values | ||
1853 | * @table: the sysctl table | ||
1854 | * @write: %TRUE if this is a write to the sysctl file | ||
1855 | * @filp: the file structure | ||
1856 | * @buffer: the user buffer | ||
1857 | * @lenp: the size of the user buffer | ||
1858 | * @ppos: file position | ||
1859 | * | ||
1860 | * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long | ||
1861 | * values from/to the user buffer, treated as an ASCII string. The values | ||
1862 | * are treated as milliseconds, and converted to jiffies when they are stored. | ||
1863 | * | ||
1864 | * This routine will ensure the values are within the range specified by | ||
1865 | * table->extra1 (min) and table->extra2 (max). | ||
1866 | * | ||
1867 | * Returns 0 on success. | ||
1868 | */ | ||
1869 | int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | ||
1870 | struct file *filp, | ||
1871 | void __user *buffer, | ||
1872 | size_t *lenp, loff_t *ppos) | ||
1873 | { | ||
1874 | return do_proc_doulongvec_minmax(table, write, filp, buffer, | ||
1875 | lenp, ppos, HZ, 1000l); | ||
1876 | } | ||
1877 | |||
1878 | |||
1879 | static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, | ||
1880 | int *valp, | ||
1881 | int write, void *data) | ||
1882 | { | ||
1883 | if (write) { | ||
1884 | *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); | ||
1885 | } else { | ||
1886 | int val = *valp; | ||
1887 | unsigned long lval; | ||
1888 | if (val < 0) { | ||
1889 | *negp = -1; | ||
1890 | lval = (unsigned long)-val; | ||
1891 | } else { | ||
1892 | *negp = 0; | ||
1893 | lval = (unsigned long)val; | ||
1894 | } | ||
1895 | *lvalp = lval / HZ; | ||
1896 | } | ||
1897 | return 0; | ||
1898 | } | ||
1899 | |||
1900 | static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, | ||
1901 | int *valp, | ||
1902 | int write, void *data) | ||
1903 | { | ||
1904 | if (write) { | ||
1905 | *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); | ||
1906 | } else { | ||
1907 | int val = *valp; | ||
1908 | unsigned long lval; | ||
1909 | if (val < 0) { | ||
1910 | *negp = -1; | ||
1911 | lval = (unsigned long)-val; | ||
1912 | } else { | ||
1913 | *negp = 0; | ||
1914 | lval = (unsigned long)val; | ||
1915 | } | ||
1916 | *lvalp = jiffies_to_clock_t(lval); | ||
1917 | } | ||
1918 | return 0; | ||
1919 | } | ||
1920 | |||
1921 | static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, | ||
1922 | int *valp, | ||
1923 | int write, void *data) | ||
1924 | { | ||
1925 | if (write) { | ||
1926 | *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); | ||
1927 | } else { | ||
1928 | int val = *valp; | ||
1929 | unsigned long lval; | ||
1930 | if (val < 0) { | ||
1931 | *negp = -1; | ||
1932 | lval = (unsigned long)-val; | ||
1933 | } else { | ||
1934 | *negp = 0; | ||
1935 | lval = (unsigned long)val; | ||
1936 | } | ||
1937 | *lvalp = jiffies_to_msecs(lval); | ||
1938 | } | ||
1939 | return 0; | ||
1940 | } | ||
1941 | |||
1942 | /** | ||
1943 | * proc_dointvec_jiffies - read a vector of integers as seconds | ||
1944 | * @table: the sysctl table | ||
1945 | * @write: %TRUE if this is a write to the sysctl file | ||
1946 | * @filp: the file structure | ||
1947 | * @buffer: the user buffer | ||
1948 | * @lenp: the size of the user buffer | ||
1949 | * @ppos: file position | ||
1950 | * | ||
1951 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | ||
1952 | * values from/to the user buffer, treated as an ASCII string. | ||
1953 | * The values read are assumed to be in seconds, and are converted into | ||
1954 | * jiffies. | ||
1955 | * | ||
1956 | * Returns 0 on success. | ||
1957 | */ | ||
1958 | int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, | ||
1959 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1960 | { | ||
1961 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | ||
1962 | do_proc_dointvec_jiffies_conv,NULL); | ||
1963 | } | ||
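As a concrete trace, assuming HZ=1000 only for the arithmetic: writing "5" to /proc/sys/kernel/printk_ratelimit (registered above with this handler) makes do_proc_dointvec_jiffies_conv() store 5*HZ = 5000 in printk_ratelimit_jiffies, and a subsequent read divides by HZ and prints 5 again.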
1964 | |||
1965 | /** | ||
1966 | * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds | ||
1967 | * @table: the sysctl table | ||
1968 | * @write: %TRUE if this is a write to the sysctl file | ||
1969 | * @filp: the file structure | ||
1970 | * @buffer: the user buffer | ||
1971 | * @lenp: the size of the user buffer | ||
1972 | * | ||
1973 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | ||
1974 | * values from/to the user buffer, treated as an ASCII string. | ||
1975 | * The values read are assumed to be in 1/USER_HZ seconds, and | ||
1976 | * are converted into jiffies. | ||
1977 | * | ||
1978 | * Returns 0 on success. | ||
1979 | */ | ||
1980 | int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, | ||
1981 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1982 | { | ||
1983 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | ||
1984 | do_proc_dointvec_userhz_jiffies_conv,NULL); | ||
1985 | } | ||
1986 | |||
1987 | /** | ||
1988 | * proc_dointvec_ms_jiffies - read a vector of integers as milliseconds | ||
1989 | * @table: the sysctl table | ||
1990 | * @write: %TRUE if this is a write to the sysctl file | ||
1991 | * @filp: the file structure | ||
1992 | * @buffer: the user buffer | ||
1993 | * @lenp: the size of the user buffer | ||
1994 | * | ||
1995 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | ||
1996 | * values from/to the user buffer, treated as an ASCII string. | ||
1997 | * The values read are assumed to be in 1/1000 seconds, and | ||
1998 | * are converted into jiffies. | ||
1999 | * | ||
2000 | * Returns 0 on success. | ||
2001 | */ | ||
2002 | int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | ||
2003 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2004 | { | ||
2005 | return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, | ||
2006 | do_proc_dointvec_ms_jiffies_conv, NULL); | ||
2007 | } | ||
2008 | |||
2009 | #else /* CONFIG_PROC_FS */ | ||
2010 | |||
2011 | int proc_dostring(ctl_table *table, int write, struct file *filp, | ||
2012 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2013 | { | ||
2014 | return -ENOSYS; | ||
2015 | } | ||
2016 | |||
2017 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, | ||
2018 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2019 | { | ||
2020 | return -ENOSYS; | ||
2021 | } | ||
2022 | |||
2023 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | ||
2024 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2025 | { | ||
2026 | return -ENOSYS; | ||
2027 | } | ||
2028 | |||
2029 | int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, | ||
2030 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2031 | { | ||
2032 | return -ENOSYS; | ||
2033 | } | ||
2034 | |||
2035 | int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, | ||
2036 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2037 | { | ||
2038 | return -ENOSYS; | ||
2039 | } | ||
2040 | |||
2041 | int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, | ||
2042 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2043 | { | ||
2044 | return -ENOSYS; | ||
2045 | } | ||
2046 | |||
2047 | int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, | ||
2048 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2049 | { | ||
2050 | return -ENOSYS; | ||
2051 | } | ||
2052 | |||
2053 | int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | ||
2054 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2055 | { | ||
2056 | return -ENOSYS; | ||
2057 | } | ||
2058 | |||
2059 | int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
2060 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2061 | { | ||
2062 | return -ENOSYS; | ||
2063 | } | ||
2064 | |||
2065 | int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | ||
2066 | struct file *filp, | ||
2067 | void __user *buffer, | ||
2068 | size_t *lenp, loff_t *ppos) | ||
2069 | { | ||
2070 | return -ENOSYS; | ||
2071 | } | ||
2072 | |||
2073 | |||
2074 | #endif /* CONFIG_PROC_FS */ | ||
2075 | |||
2076 | |||
2077 | /* | ||
2078 | * General sysctl support routines | ||
2079 | */ | ||
2080 | |||
2081 | /* The generic string strategy routine: */ | ||
2082 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | ||
2083 | void __user *oldval, size_t __user *oldlenp, | ||
2084 | void __user *newval, size_t newlen, void **context) | ||
2085 | { | ||
2086 | size_t l, len; | ||
2087 | |||
2088 | if (!table->data || !table->maxlen) | ||
2089 | return -ENOTDIR; | ||
2090 | |||
2091 | if (oldval && oldlenp) { | ||
2092 | if (get_user(len, oldlenp)) | ||
2093 | return -EFAULT; | ||
2094 | if (len) { | ||
2095 | l = strlen(table->data); | ||
2096 | if (len > l) len = l; | ||
2097 | if (len >= table->maxlen) | ||
2098 | len = table->maxlen; | ||
2099 | if(copy_to_user(oldval, table->data, len)) | ||
2100 | return -EFAULT; | ||
2101 | if(put_user(0, ((char __user *) oldval) + len)) | ||
2102 | return -EFAULT; | ||
2103 | if(put_user(len, oldlenp)) | ||
2104 | return -EFAULT; | ||
2105 | } | ||
2106 | } | ||
2107 | if (newval && newlen) { | ||
2108 | len = newlen; | ||
2109 | if (len > table->maxlen) | ||
2110 | len = table->maxlen; | ||
2111 | if(copy_from_user(table->data, newval, len)) | ||
2112 | return -EFAULT; | ||
2113 | if (len == table->maxlen) | ||
2114 | len--; | ||
2115 | ((char *) table->data)[len] = 0; | ||
2116 | } | ||
2117 | return 0; | ||
2118 | } | ||
2119 | |||
2120 | /* | ||
2121 | * This function makes sure that all of the integers in the vector | ||
2122 | * are between the minimum and maximum values given in the arrays | ||
2123 | * table->extra1 and table->extra2, respectively. | ||
2124 | */ | ||
2125 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | ||
2126 | void __user *oldval, size_t __user *oldlenp, | ||
2127 | void __user *newval, size_t newlen, void **context) | ||
2128 | { | ||
2129 | |||
2130 | if (newval && newlen) { | ||
2131 | int __user *vec = (int __user *) newval; | ||
2132 | int *min = (int *) table->extra1; | ||
2133 | int *max = (int *) table->extra2; | ||
2134 | size_t length; | ||
2135 | int i; | ||
2136 | |||
2137 | if (newlen % sizeof(int) != 0) | ||
2138 | return -EINVAL; | ||
2139 | |||
2140 | if (!table->extra1 && !table->extra2) | ||
2141 | return 0; | ||
2142 | |||
2143 | if (newlen > table->maxlen) | ||
2144 | newlen = table->maxlen; | ||
2145 | length = newlen / sizeof(int); | ||
2146 | |||
2147 | for (i = 0; i < length; i++) { | ||
2148 | int value; | ||
2149 | if (get_user(value, vec + i)) | ||
2150 | return -EFAULT; | ||
2151 | if (min && value < min[i]) | ||
2152 | return -EINVAL; | ||
2153 | if (max && value > max[i]) | ||
2154 | return -EINVAL; | ||
2155 | } | ||
2156 | } | ||
2157 | return 0; | ||
2158 | } | ||
2159 | |||
2160 | /* Strategy function to convert jiffies to seconds */ | ||
2161 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | ||
2162 | void __user *oldval, size_t __user *oldlenp, | ||
2163 | void __user *newval, size_t newlen, void **context) | ||
2164 | { | ||
2165 | if (oldval) { | ||
2166 | size_t olen; | ||
2167 | if (oldlenp) { | ||
2168 | if (get_user(olen, oldlenp)) | ||
2169 | return -EFAULT; | ||
2170 | if (olen!=sizeof(int)) | ||
2171 | return -EINVAL; | ||
2172 | } | ||
2173 | if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) || | ||
2174 | (oldlenp && put_user(sizeof(int),oldlenp))) | ||
2175 | return -EFAULT; | ||
2176 | } | ||
2177 | if (newval && newlen) { | ||
2178 | int new; | ||
2179 | if (newlen != sizeof(int)) | ||
2180 | return -EINVAL; | ||
2181 | if (get_user(new, (int __user *)newval)) | ||
2182 | return -EFAULT; | ||
2183 | *(int *)(table->data) = new*HZ; | ||
2184 | } | ||
2185 | return 1; | ||
2186 | } | ||
2187 | |||
2188 | /* Strategy function to convert jiffies to seconds */ | ||
2189 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | ||
2190 | void __user *oldval, size_t __user *oldlenp, | ||
2191 | void __user *newval, size_t newlen, void **context) | ||
2192 | { | ||
2193 | if (oldval) { | ||
2194 | size_t olen; | ||
2195 | if (oldlenp) { | ||
2196 | if (get_user(olen, oldlenp)) | ||
2197 | return -EFAULT; | ||
2198 | if (olen!=sizeof(int)) | ||
2199 | return -EINVAL; | ||
2200 | } | ||
2201 | if (put_user(jiffies_to_msecs(*(int *)(table->data)), (int __user *)oldval) || | ||
2202 | (oldlenp && put_user(sizeof(int),oldlenp))) | ||
2203 | return -EFAULT; | ||
2204 | } | ||
2205 | if (newval && newlen) { | ||
2206 | int new; | ||
2207 | if (newlen != sizeof(int)) | ||
2208 | return -EINVAL; | ||
2209 | if (get_user(new, (int __user *)newval)) | ||
2210 | return -EFAULT; | ||
2211 | *(int *)(table->data) = msecs_to_jiffies(new); | ||
2212 | } | ||
2213 | return 1; | ||
2214 | } | ||
2215 | |||
2216 | #else /* CONFIG_SYSCTL */ | ||
2217 | |||
2218 | |||
2219 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | ||
2220 | { | ||
2221 | return -ENOSYS; | ||
2222 | } | ||
2223 | |||
2224 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | ||
2225 | void __user *oldval, size_t __user *oldlenp, | ||
2226 | void __user *newval, size_t newlen, void **context) | ||
2227 | { | ||
2228 | return -ENOSYS; | ||
2229 | } | ||
2230 | |||
2231 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | ||
2232 | void __user *oldval, size_t __user *oldlenp, | ||
2233 | void __user *newval, size_t newlen, void **context) | ||
2234 | { | ||
2235 | return -ENOSYS; | ||
2236 | } | ||
2237 | |||
2238 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | ||
2239 | void __user *oldval, size_t __user *oldlenp, | ||
2240 | void __user *newval, size_t newlen, void **context) | ||
2241 | { | ||
2242 | return -ENOSYS; | ||
2243 | } | ||
2244 | |||
2245 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | ||
2246 | void __user *oldval, size_t __user *oldlenp, | ||
2247 | void __user *newval, size_t newlen, void **context) | ||
2248 | { | ||
2249 | return -ENOSYS; | ||
2250 | } | ||
2251 | |||
2252 | int proc_dostring(ctl_table *table, int write, struct file *filp, | ||
2253 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2254 | { | ||
2255 | return -ENOSYS; | ||
2256 | } | ||
2257 | |||
2258 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | ||
2259 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2260 | { | ||
2261 | return -ENOSYS; | ||
2262 | } | ||
2263 | |||
2264 | int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, | ||
2265 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2266 | { | ||
2267 | return -ENOSYS; | ||
2268 | } | ||
2269 | |||
2270 | int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, | ||
2271 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2272 | { | ||
2273 | return -ENOSYS; | ||
2274 | } | ||
2275 | |||
2276 | int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, | ||
2277 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2278 | { | ||
2279 | return -ENOSYS; | ||
2280 | } | ||
2281 | |||
2282 | int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, | ||
2283 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2284 | { | ||
2285 | return -ENOSYS; | ||
2286 | } | ||
2287 | |||
2288 | int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | ||
2289 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2290 | { | ||
2291 | return -ENOSYS; | ||
2292 | } | ||
2293 | |||
2294 | int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
2295 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2296 | { | ||
2297 | return -ENOSYS; | ||
2298 | } | ||
2299 | |||
2300 | int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | ||
2301 | struct file *filp, | ||
2302 | void __user *buffer, | ||
2303 | size_t *lenp, loff_t *ppos) | ||
2304 | { | ||
2305 | return -ENOSYS; | ||
2306 | } | ||
2307 | |||
2308 | struct ctl_table_header * register_sysctl_table(ctl_table * table, | ||
2309 | int insert_at_head) | ||
2310 | { | ||
2311 | return NULL; | ||
2312 | } | ||
2313 | |||
2314 | void unregister_sysctl_table(struct ctl_table_header * table) | ||
2315 | { | ||
2316 | } | ||
2317 | |||
2318 | #endif /* CONFIG_SYSCTL */ | ||
2319 | |||
2320 | /* | ||
2321 | * No sense putting this after each symbol definition, twice, | ||
2322 | * exception granted :-) | ||
2323 | */ | ||
2324 | EXPORT_SYMBOL(proc_dointvec); | ||
2325 | EXPORT_SYMBOL(proc_dointvec_jiffies); | ||
2326 | EXPORT_SYMBOL(proc_dointvec_minmax); | ||
2327 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); | ||
2328 | EXPORT_SYMBOL(proc_dointvec_ms_jiffies); | ||
2329 | EXPORT_SYMBOL(proc_dostring); | ||
2330 | EXPORT_SYMBOL(proc_doulongvec_minmax); | ||
2331 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); | ||
2332 | EXPORT_SYMBOL(register_sysctl_table); | ||
2333 | EXPORT_SYMBOL(sysctl_intvec); | ||
2334 | EXPORT_SYMBOL(sysctl_jiffies); | ||
2335 | EXPORT_SYMBOL(sysctl_ms_jiffies); | ||
2336 | EXPORT_SYMBOL(sysctl_string); | ||
2337 | EXPORT_SYMBOL(unregister_sysctl_table); | ||
diff --git a/kernel/time.c b/kernel/time.c new file mode 100644 index 000000000000..96fd0f499631 --- /dev/null +++ b/kernel/time.c | |||
@@ -0,0 +1,599 @@ | |||
1 | /* | ||
2 | * linux/kernel/time.c | ||
3 | * | ||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
5 | * | ||
6 | * This file contains the interface functions for the various | ||
7 | * time related system calls: time, stime, gettimeofday, settimeofday, | ||
8 | * adjtime | ||
9 | */ | ||
10 | /* | ||
11 | * Modification history kernel/time.c | ||
12 | * | ||
13 | * 1993-09-02 Philip Gladstone | ||
14 | * Created file with time related functions from sched.c and adjtimex() | ||
15 | * 1993-10-08 Torsten Duwe | ||
16 | * adjtime interface update and CMOS clock write code | ||
17 | * 1995-08-13 Torsten Duwe | ||
18 | * kernel PLL updated to 1994-12-13 specs (rfc-1589) | ||
19 | * 1999-01-16 Ulrich Windl | ||
20 | * Introduced error checking for many cases in adjtimex(). | ||
21 | * Updated NTP code according to technical memorandum Jan '96 | ||
22 | * "A Kernel Model for Precision Timekeeping" by Dave Mills | ||
23 | * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) | ||
24 | * (Even though the technical memorandum forbids it) | ||
25 | * 2004-07-14 Christoph Lameter | ||
26 | * Added getnstimeofday to allow the posix timer functions to return | ||
27 | * with nanosecond accuracy | ||
28 | */ | ||
29 | |||
30 | #include <linux/module.h> | ||
31 | #include <linux/timex.h> | ||
32 | #include <linux/errno.h> | ||
33 | #include <linux/smp_lock.h> | ||
34 | #include <linux/syscalls.h> | ||
35 | #include <linux/security.h> | ||
36 | #include <linux/fs.h> | ||
37 | |||
38 | |||
39 | #include <asm/uaccess.h> | ||
40 | #include <asm/unistd.h> | ||
41 | |||
42 | /* | ||
43 | * The timezone where the local system is located. Used as a default by some | ||
44 | * programs that obtain this value by using gettimeofday. | ||
45 | */ | ||
46 | struct timezone sys_tz; | ||
47 | |||
48 | EXPORT_SYMBOL(sys_tz); | ||
49 | |||
50 | #ifdef __ARCH_WANT_SYS_TIME | ||
51 | |||
52 | /* | ||
53 | * sys_time() can be implemented in user-level using | ||
54 | * sys_gettimeofday(). Is this for backwards compatibility? If so, | ||
55 | * why not move it into the appropriate arch directory (for those | ||
56 | * architectures that need it). | ||
57 | */ | ||
58 | asmlinkage long sys_time(time_t __user * tloc) | ||
59 | { | ||
60 | time_t i; | ||
61 | struct timeval tv; | ||
62 | |||
63 | do_gettimeofday(&tv); | ||
64 | i = tv.tv_sec; | ||
65 | |||
66 | if (tloc) { | ||
67 | if (put_user(i,tloc)) | ||
68 | i = -EFAULT; | ||
69 | } | ||
70 | return i; | ||
71 | } | ||
72 | |||
73 | /* | ||
74 | * sys_stime() can be implemented in user-level using | ||
75 | * sys_settimeofday(). Is this for backwards compatibility? If so, | ||
76 | * why not move it into the appropriate arch directory (for those | ||
77 | * architectures that need it). | ||
78 | */ | ||
79 | |||
80 | asmlinkage long sys_stime(time_t __user *tptr) | ||
81 | { | ||
82 | struct timespec tv; | ||
83 | int err; | ||
84 | |||
85 | if (get_user(tv.tv_sec, tptr)) | ||
86 | return -EFAULT; | ||
87 | |||
88 | tv.tv_nsec = 0; | ||
89 | |||
90 | err = security_settime(&tv, NULL); | ||
91 | if (err) | ||
92 | return err; | ||
93 | |||
94 | do_settimeofday(&tv); | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | #endif /* __ARCH_WANT_SYS_TIME */ | ||
99 | |||
100 | asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) | ||
101 | { | ||
102 | if (likely(tv != NULL)) { | ||
103 | struct timeval ktv; | ||
104 | do_gettimeofday(&ktv); | ||
105 | if (copy_to_user(tv, &ktv, sizeof(ktv))) | ||
106 | return -EFAULT; | ||
107 | } | ||
108 | if (unlikely(tz != NULL)) { | ||
109 | if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) | ||
110 | return -EFAULT; | ||
111 | } | ||
112 | return 0; | ||
113 | } | ||
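For a concrete picture of the pair of copies above, a minimal user-space sketch that goes through the usual libc wrapper (not part of this file; either pointer may also be NULL, exactly as the kernel code allows):

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
	struct timeval tv;
	struct timezone tz;

	if (gettimeofday(&tv, &tz))		/* filled from do_gettimeofday() and sys_tz */
		return 1;
	printf("%ld.%06ld s since the Epoch, tz_minuteswest=%d\n",
	       (long) tv.tv_sec, (long) tv.tv_usec, tz.tz_minuteswest);
	return 0;
}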
114 | |||
115 | /* | ||
116 | * Adjust the time obtained from the CMOS to be UTC time instead of | ||
117 | * local time. | ||
118 | * | ||
119 | * This is ugly, but preferable to the alternatives. Otherwise we | ||
120 | * would either need to write a program to do it in /etc/rc (and risk | ||
121 | * confusion if the program gets run more than once; it would also be | ||
122 | * hard to make the program warp the clock precisely n hours) or | ||
123 | * compile in the timezone information into the kernel. Bad, bad.... | ||
124 | * | ||
125 | * - TYT, 1992-01-01 | ||
126 | * | ||
127 | * The best thing to do is to keep the CMOS clock in universal time (UTC) | ||
128 | * as real UNIX machines always do it. This avoids all headaches about | ||
129 | * daylight saving times and warping kernel clocks. | ||
130 | */ | ||
131 | static inline void warp_clock(void) | ||
132 | { | ||
133 | write_seqlock_irq(&xtime_lock); | ||
134 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; | ||
135 | xtime.tv_sec += sys_tz.tz_minuteswest * 60; | ||
136 | time_interpolator_reset(); | ||
137 | write_sequnlock_irq(&xtime_lock); | ||
138 | clock_was_set(); | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * If for some reason the CMOS clock has not been running in UTC but in | ||
143 | * some local time, then the first time we set the timezone we will warp | ||
144 | * the clock so that it is ticking UTC time instead of local time. | ||
145 | * Presumably, if someone is setting the timezone then we | ||
146 | * are running in an environment where the programs understand about | ||
147 | * timezones. This should be done at boot time in the /etc/rc script, | ||
148 | * as soon as possible, so that the clock can be set right. Otherwise, | ||
149 | * various programs will get confused when the clock gets warped. | ||
150 | */ | ||
151 | |||
152 | int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) | ||
153 | { | ||
154 | static int firsttime = 1; | ||
155 | int error = 0; | ||
156 | |||
157 | error = security_settime(tv, tz); | ||
158 | if (error) | ||
159 | return error; | ||
160 | |||
161 | if (tz) { | ||
162 | /* SMP safe, global irq locking makes it work. */ | ||
163 | sys_tz = *tz; | ||
164 | if (firsttime) { | ||
165 | firsttime = 0; | ||
166 | if (!tv) | ||
167 | warp_clock(); | ||
168 | } | ||
169 | } | ||
170 | if (tv) | ||
171 | { | ||
172 | /* SMP safe, again the code in arch/foo/time.c should | ||
173 | * globally block out interrupts when it runs. | ||
174 | */ | ||
175 | return do_settimeofday(tv); | ||
176 | } | ||
177 | return 0; | ||
178 | } | ||
179 | |||
180 | asmlinkage long sys_settimeofday(struct timeval __user *tv, | ||
181 | struct timezone __user *tz) | ||
182 | { | ||
183 | struct timeval user_tv; | ||
184 | struct timespec new_ts; | ||
185 | struct timezone new_tz; | ||
186 | |||
187 | if (tv) { | ||
188 | if (copy_from_user(&user_tv, tv, sizeof(*tv))) | ||
189 | return -EFAULT; | ||
190 | new_ts.tv_sec = user_tv.tv_sec; | ||
191 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; | ||
192 | } | ||
193 | if (tz) { | ||
194 | if (copy_from_user(&new_tz, tz, sizeof(*tz))) | ||
195 | return -EFAULT; | ||
196 | } | ||
197 | |||
198 | return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); | ||
199 | } | ||
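Note the corner case above: warp_clock() runs only on the very first settimeofday() call that supplies a timezone and no time value. A hedged user-space sketch of exactly that call (the helper name is invented; CAP_SYS_TIME is still required by security_settime()):

#include <sys/time.h>

/* hypothetical boot-time helper: tell the kernel the RTC's offset from UTC */
int set_kernel_tz(int minutes_west)
{
	struct timezone tz = { .tz_minuteswest = minutes_west, .tz_dsttime = 0 };

	/* tv == NULL on the first such call makes do_sys_settimeofday() warp the clock */
	return settimeofday(NULL, &tz);
}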
200 | |||
201 | long pps_offset; /* pps time offset (us) */ | ||
202 | long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */ | ||
203 | |||
204 | long pps_freq; /* frequency offset (scaled ppm) */ | ||
205 | long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ | ||
206 | |||
207 | long pps_valid = PPS_VALID; /* pps signal watchdog counter */ | ||
208 | |||
209 | int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ | ||
210 | |||
211 | long pps_jitcnt; /* jitter limit exceeded */ | ||
212 | long pps_calcnt; /* calibration intervals */ | ||
213 | long pps_errcnt; /* calibration errors */ | ||
214 | long pps_stbcnt; /* stability limit exceeded */ | ||
215 | |||
216 | /* hook for a loadable hardpps kernel module */ | ||
217 | void (*hardpps_ptr)(struct timeval *); | ||
218 | |||
219 | /* we call this to notify the arch when the clock is being | ||
220 | * controlled. If no such arch routine, do nothing. | ||
221 | */ | ||
222 | void __attribute__ ((weak)) notify_arch_cmos_timer(void) | ||
223 | { | ||
224 | return; | ||
225 | } | ||
226 | |||
227 | /* adjtimex mainly allows reading (and writing, if superuser) of | ||
228 | * kernel time-keeping variables. Used by xntpd. | ||
229 | */ | ||
230 | int do_adjtimex(struct timex *txc) | ||
231 | { | ||
232 | long ltemp, mtemp, save_adjust; | ||
233 | int result; | ||
234 | |||
235 | /* In order to modify anything, you gotta be super-user! */ | ||
236 | if (txc->modes && !capable(CAP_SYS_TIME)) | ||
237 | return -EPERM; | ||
238 | |||
239 | /* Now we validate the data before disabling interrupts */ | ||
240 | |||
241 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | ||
242 | /* singleshot must not be used with any other mode bits */ | ||
243 | if (txc->modes != ADJ_OFFSET_SINGLESHOT) | ||
244 | return -EINVAL; | ||
245 | |||
246 | if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) | ||
247 | /* adjustment Offset limited to +- .512 seconds */ | ||
248 | if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) | ||
249 | return -EINVAL; | ||
250 | |||
251 | /* if the quartz is off by more than 10% something is VERY wrong ! */ | ||
252 | if (txc->modes & ADJ_TICK) | ||
253 | if (txc->tick < 900000/USER_HZ || | ||
254 | txc->tick > 1100000/USER_HZ) | ||
255 | return -EINVAL; | ||
256 | |||
257 | write_seqlock_irq(&xtime_lock); | ||
258 | result = time_state; /* mostly `TIME_OK' */ | ||
259 | |||
260 | /* Save for later - semantics of adjtime is to return old value */ | ||
261 | save_adjust = time_next_adjust ? time_next_adjust : time_adjust; | ||
262 | |||
263 | #if 0 /* STA_CLOCKERR is never set yet */ | ||
264 | time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ | ||
265 | #endif | ||
266 | /* If there are input parameters, then process them */ | ||
267 | if (txc->modes) | ||
268 | { | ||
269 | if (txc->modes & ADJ_STATUS) /* only set allowed bits */ | ||
270 | time_status = (txc->status & ~STA_RONLY) | | ||
271 | (time_status & STA_RONLY); | ||
272 | |||
273 | if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ | ||
274 | if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { | ||
275 | result = -EINVAL; | ||
276 | goto leave; | ||
277 | } | ||
278 | time_freq = txc->freq - pps_freq; | ||
279 | } | ||
280 | |||
281 | if (txc->modes & ADJ_MAXERROR) { | ||
282 | if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { | ||
283 | result = -EINVAL; | ||
284 | goto leave; | ||
285 | } | ||
286 | time_maxerror = txc->maxerror; | ||
287 | } | ||
288 | |||
289 | if (txc->modes & ADJ_ESTERROR) { | ||
290 | if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { | ||
291 | result = -EINVAL; | ||
292 | goto leave; | ||
293 | } | ||
294 | time_esterror = txc->esterror; | ||
295 | } | ||
296 | |||
297 | if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ | ||
298 | if (txc->constant < 0) { /* NTP v4 uses values > 6 */ | ||
299 | result = -EINVAL; | ||
300 | goto leave; | ||
301 | } | ||
302 | time_constant = txc->constant; | ||
303 | } | ||
304 | |||
305 | if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ | ||
306 | if (txc->modes == ADJ_OFFSET_SINGLESHOT) { | ||
307 | /* adjtime() is independent from ntp_adjtime() */ | ||
308 | if ((time_next_adjust = txc->offset) == 0) | ||
309 | time_adjust = 0; | ||
310 | } | ||
311 | else if ( time_status & (STA_PLL | STA_PPSTIME) ) { | ||
312 | ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == | ||
313 | (STA_PPSTIME | STA_PPSSIGNAL) ? | ||
314 | pps_offset : txc->offset; | ||
315 | |||
316 | /* | ||
317 | * Scale the phase adjustment and | ||
318 | * clamp to the operating range. | ||
319 | */ | ||
320 | if (ltemp > MAXPHASE) | ||
321 | time_offset = MAXPHASE << SHIFT_UPDATE; | ||
322 | else if (ltemp < -MAXPHASE) | ||
323 | time_offset = -(MAXPHASE << SHIFT_UPDATE); | ||
324 | else | ||
325 | time_offset = ltemp << SHIFT_UPDATE; | ||
326 | |||
327 | /* | ||
328 | * Select whether the frequency is to be controlled | ||
329 | * and in which mode (PLL or FLL). Clamp to the operating | ||
330 | * range. Ugly multiply/divide should be replaced someday. | ||
331 | */ | ||
332 | |||
333 | if (time_status & STA_FREQHOLD || time_reftime == 0) | ||
334 | time_reftime = xtime.tv_sec; | ||
335 | mtemp = xtime.tv_sec - time_reftime; | ||
336 | time_reftime = xtime.tv_sec; | ||
337 | if (time_status & STA_FLL) { | ||
338 | if (mtemp >= MINSEC) { | ||
339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - | ||
340 | SHIFT_UPDATE); | ||
341 | if (ltemp < 0) | ||
342 | time_freq -= -ltemp >> SHIFT_KH; | ||
343 | else | ||
344 | time_freq += ltemp >> SHIFT_KH; | ||
345 | } else /* calibration interval too short (p. 12) */ | ||
346 | result = TIME_ERROR; | ||
347 | } else { /* PLL mode */ | ||
348 | if (mtemp < MAXSEC) { | ||
349 | ltemp *= mtemp; | ||
350 | if (ltemp < 0) | ||
351 | time_freq -= -ltemp >> (time_constant + | ||
352 | time_constant + | ||
353 | SHIFT_KF - SHIFT_USEC); | ||
354 | else | ||
355 | time_freq += ltemp >> (time_constant + | ||
356 | time_constant + | ||
357 | SHIFT_KF - SHIFT_USEC); | ||
358 | } else /* calibration interval too long (p. 12) */ | ||
359 | result = TIME_ERROR; | ||
360 | } | ||
361 | if (time_freq > time_tolerance) | ||
362 | time_freq = time_tolerance; | ||
363 | else if (time_freq < -time_tolerance) | ||
364 | time_freq = -time_tolerance; | ||
365 | } /* STA_PLL || STA_PPSTIME */ | ||
366 | } /* txc->modes & ADJ_OFFSET */ | ||
367 | if (txc->modes & ADJ_TICK) { | ||
368 | tick_usec = txc->tick; | ||
369 | tick_nsec = TICK_USEC_TO_NSEC(tick_usec); | ||
370 | } | ||
371 | } /* txc->modes */ | ||
372 | leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 | ||
373 | || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0 | ||
374 | && (time_status & STA_PPSSIGNAL) == 0) | ||
375 | /* p. 24, (b) */ | ||
376 | || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) | ||
377 | == (STA_PPSTIME|STA_PPSJITTER)) | ||
378 | /* p. 24, (c) */ | ||
379 | || ((time_status & STA_PPSFREQ) != 0 | ||
380 | && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0)) | ||
381 | /* p. 24, (d) */ | ||
382 | result = TIME_ERROR; | ||
383 | |||
384 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | ||
385 | txc->offset = save_adjust; | ||
386 | else { | ||
387 | if (time_offset < 0) | ||
388 | txc->offset = -(-time_offset >> SHIFT_UPDATE); | ||
389 | else | ||
390 | txc->offset = time_offset >> SHIFT_UPDATE; | ||
391 | } | ||
392 | txc->freq = time_freq + pps_freq; | ||
393 | txc->maxerror = time_maxerror; | ||
394 | txc->esterror = time_esterror; | ||
395 | txc->status = time_status; | ||
396 | txc->constant = time_constant; | ||
397 | txc->precision = time_precision; | ||
398 | txc->tolerance = time_tolerance; | ||
399 | txc->tick = tick_usec; | ||
400 | txc->ppsfreq = pps_freq; | ||
401 | txc->jitter = pps_jitter >> PPS_AVG; | ||
402 | txc->shift = pps_shift; | ||
403 | txc->stabil = pps_stabil; | ||
404 | txc->jitcnt = pps_jitcnt; | ||
405 | txc->calcnt = pps_calcnt; | ||
406 | txc->errcnt = pps_errcnt; | ||
407 | txc->stbcnt = pps_stbcnt; | ||
408 | write_sequnlock_irq(&xtime_lock); | ||
409 | do_gettimeofday(&txc->time); | ||
410 | notify_arch_cmos_timer(); | ||
411 | return(result); | ||
412 | } | ||
413 | |||
414 | asmlinkage long sys_adjtimex(struct timex __user *txc_p) | ||
415 | { | ||
416 | struct timex txc; /* Local copy of parameter */ | ||
417 | int ret; | ||
418 | |||
419 | /* Copy the user data space into the kernel copy | ||
420 | * structure. But bear in mind that the structures | ||
421 | * may change | ||
422 | */ | ||
423 | if(copy_from_user(&txc, txc_p, sizeof(struct timex))) | ||
424 | return -EFAULT; | ||
425 | ret = do_adjtimex(&txc); | ||
426 | return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; | ||
427 | } | ||
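Since do_adjtimex() only insists on CAP_SYS_TIME when txc->modes is non-zero, a plain read of the timekeeping state needs no privilege at all. A small illustrative user-space reader (not part of this file):

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* read-only query */
	int state = adjtimex(&tx);		/* returns TIME_OK, TIME_ERROR, ... */

	if (state < 0)
		return 1;
	printf("state=%d offset=%ld us freq=%ld maxerror=%ld us status=0x%x\n",
	       state, tx.offset, tx.freq, tx.maxerror, tx.status);
	return 0;
}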
428 | |||
429 | inline struct timespec current_kernel_time(void) | ||
430 | { | ||
431 | struct timespec now; | ||
432 | unsigned long seq; | ||
433 | |||
434 | do { | ||
435 | seq = read_seqbegin(&xtime_lock); | ||
436 | |||
437 | now = xtime; | ||
438 | } while (read_seqretry(&xtime_lock, seq)); | ||
439 | |||
440 | return now; | ||
441 | } | ||
442 | |||
443 | EXPORT_SYMBOL(current_kernel_time); | ||
444 | |||
445 | /** | ||
446 | * current_fs_time - Return FS time | ||
447 | * @sb: Superblock. | ||
448 | * | ||
449 | * Return the current time truncated to the time granularity supported by | ||
450 | * the fs. | ||
451 | */ | ||
452 | struct timespec current_fs_time(struct super_block *sb) | ||
453 | { | ||
454 | struct timespec now = current_kernel_time(); | ||
455 | return timespec_trunc(now, sb->s_time_gran); | ||
456 | } | ||
457 | EXPORT_SYMBOL(current_fs_time); | ||
458 | |||
459 | /** | ||
460 | * timespec_trunc - Truncate timespec to a granularity | ||
461 | * @t: Timespec | ||
462 | * @gran: Granularity in ns. | ||
463 | * | ||
464 | * Truncate a timespec to a granularity. gran must be smaller than a second. | ||
465 | * Always rounds down. | ||
466 | * | ||
467 | * This function should be only used for timestamps returned by | ||
468 | * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because | ||
469 | * it doesn't handle the better resolution of the latter. | ||
470 | */ | ||
471 | struct timespec timespec_trunc(struct timespec t, unsigned gran) | ||
472 | { | ||
473 | /* | ||
474 | * Division is pretty slow so avoid it for common cases. | ||
475 | * Currently current_kernel_time() never returns better than | ||
476 | * jiffies resolution. Exploit that. | ||
477 | */ | ||
478 | if (gran <= jiffies_to_usecs(1) * 1000) { | ||
479 | /* nothing */ | ||
480 | } else if (gran == 1000000000) { | ||
481 | t.tv_nsec = 0; | ||
482 | } else { | ||
483 | t.tv_nsec -= t.tv_nsec % gran; | ||
484 | } | ||
485 | return t; | ||
486 | } | ||
487 | EXPORT_SYMBOL(timespec_trunc); | ||
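To make the branches of timespec_trunc() concrete, a small worked example with invented values:

/*
 *	ts = { .tv_sec = 1000, .tv_nsec = 123456789 }
 *
 *	timespec_trunc(ts, 1000000000) -> { 1000, 0 }		(1 s granularity)
 *	timespec_trunc(ts, 1)          -> { 1000, 123456789 }	(<= jiffy resolution, untouched)
 */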
488 | |||
489 | #ifdef CONFIG_TIME_INTERPOLATION | ||
490 | void getnstimeofday (struct timespec *tv) | ||
491 | { | ||
492 | unsigned long seq,sec,nsec; | ||
493 | |||
494 | do { | ||
495 | seq = read_seqbegin(&xtime_lock); | ||
496 | sec = xtime.tv_sec; | ||
497 | nsec = xtime.tv_nsec+time_interpolator_get_offset(); | ||
498 | } while (unlikely(read_seqretry(&xtime_lock, seq))); | ||
499 | |||
500 | while (unlikely(nsec >= NSEC_PER_SEC)) { | ||
501 | nsec -= NSEC_PER_SEC; | ||
502 | ++sec; | ||
503 | } | ||
504 | tv->tv_sec = sec; | ||
505 | tv->tv_nsec = nsec; | ||
506 | } | ||
507 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
508 | |||
509 | int do_settimeofday (struct timespec *tv) | ||
510 | { | ||
511 | time_t wtm_sec, sec = tv->tv_sec; | ||
512 | long wtm_nsec, nsec = tv->tv_nsec; | ||
513 | |||
514 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
515 | return -EINVAL; | ||
516 | |||
517 | write_seqlock_irq(&xtime_lock); | ||
518 | { | ||
519 | /* | ||
520 | * This is revolting. We need to set "xtime" correctly. However, the value | ||
521 | * in this location is the value at the most recent update of wall time. | ||
522 | * Discover what correction gettimeofday would have done, and then undo | ||
523 | * it! | ||
524 | */ | ||
525 | nsec -= time_interpolator_get_offset(); | ||
526 | |||
527 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
528 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
529 | |||
530 | set_normalized_timespec(&xtime, sec, nsec); | ||
531 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
532 | |||
533 | time_adjust = 0; /* stop active adjtime() */ | ||
534 | time_status |= STA_UNSYNC; | ||
535 | time_maxerror = NTP_PHASE_LIMIT; | ||
536 | time_esterror = NTP_PHASE_LIMIT; | ||
537 | time_interpolator_reset(); | ||
538 | } | ||
539 | write_sequnlock_irq(&xtime_lock); | ||
540 | clock_was_set(); | ||
541 | return 0; | ||
542 | } | ||
543 | |||
544 | void do_gettimeofday (struct timeval *tv) | ||
545 | { | ||
546 | unsigned long seq, nsec, usec, sec, offset; | ||
547 | do { | ||
548 | seq = read_seqbegin(&xtime_lock); | ||
549 | offset = time_interpolator_get_offset(); | ||
550 | sec = xtime.tv_sec; | ||
551 | nsec = xtime.tv_nsec; | ||
552 | } while (unlikely(read_seqretry(&xtime_lock, seq))); | ||
553 | |||
554 | usec = (nsec + offset) / 1000; | ||
555 | |||
556 | while (unlikely(usec >= USEC_PER_SEC)) { | ||
557 | usec -= USEC_PER_SEC; | ||
558 | ++sec; | ||
559 | } | ||
560 | |||
561 | tv->tv_sec = sec; | ||
562 | tv->tv_usec = usec; | ||
563 | } | ||
564 | |||
565 | EXPORT_SYMBOL(do_gettimeofday); | ||
566 | |||
567 | |||
568 | #else | ||
569 | /* | ||
570 | * Simulate getnstimeofday() using do_gettimeofday(), which only allows a | ||
571 | * timeval and therefore only yields usec accuracy | ||
572 | */ | ||
573 | void getnstimeofday(struct timespec *tv) | ||
574 | { | ||
575 | struct timeval x; | ||
576 | |||
577 | do_gettimeofday(&x); | ||
578 | tv->tv_sec = x.tv_sec; | ||
579 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; | ||
580 | } | ||
581 | #endif | ||
582 | |||
583 | #if (BITS_PER_LONG < 64) | ||
584 | u64 get_jiffies_64(void) | ||
585 | { | ||
586 | unsigned long seq; | ||
587 | u64 ret; | ||
588 | |||
589 | do { | ||
590 | seq = read_seqbegin(&xtime_lock); | ||
591 | ret = jiffies_64; | ||
592 | } while (read_seqretry(&xtime_lock, seq)); | ||
593 | return ret; | ||
594 | } | ||
595 | |||
596 | EXPORT_SYMBOL(get_jiffies_64); | ||
597 | #endif | ||
598 | |||
599 | EXPORT_SYMBOL(jiffies); | ||
diff --git a/kernel/timer.c b/kernel/timer.c new file mode 100644 index 000000000000..ecb3d67c0e14 --- /dev/null +++ b/kernel/timer.c | |||
@@ -0,0 +1,1611 @@ | |||
1 | /* | ||
2 | * linux/kernel/timer.c | ||
3 | * | ||
4 | * Kernel internal timers, kernel timekeeping, basic process system calls | ||
5 | * | ||
6 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
7 | * | ||
8 | * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. | ||
9 | * | ||
10 | * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 | ||
11 | * "A Kernel Model for Precision Timekeeping" by Dave Mills | ||
12 | * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to | ||
13 | * serialize accesses to xtime/lost_ticks). | ||
14 | * Copyright (C) 1998 Andrea Arcangeli | ||
15 | * 1999-03-10 Improved NTP compatibility by Ulrich Windl | ||
16 | * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love | ||
17 | * 2000-10-05 Implemented scalable SMP per-CPU timer handling. | ||
18 | * Copyright (C) 2000, 2001, 2002 Ingo Molnar | ||
19 | * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel_stat.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/percpu.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/swap.h> | ||
29 | #include <linux/notifier.h> | ||
30 | #include <linux/thread_info.h> | ||
31 | #include <linux/time.h> | ||
32 | #include <linux/jiffies.h> | ||
33 | #include <linux/posix-timers.h> | ||
34 | #include <linux/cpu.h> | ||
35 | #include <linux/syscalls.h> | ||
36 | |||
37 | #include <asm/uaccess.h> | ||
38 | #include <asm/unistd.h> | ||
39 | #include <asm/div64.h> | ||
40 | #include <asm/timex.h> | ||
41 | #include <asm/io.h> | ||
42 | |||
43 | #ifdef CONFIG_TIME_INTERPOLATION | ||
44 | static void time_interpolator_update(long delta_nsec); | ||
45 | #else | ||
46 | #define time_interpolator_update(x) | ||
47 | #endif | ||
48 | |||
49 | /* | ||
50 | * per-CPU timer vector definitions: | ||
51 | */ | ||
52 | |||
53 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) | ||
54 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) | ||
55 | #define TVN_SIZE (1 << TVN_BITS) | ||
56 | #define TVR_SIZE (1 << TVR_BITS) | ||
57 | #define TVN_MASK (TVN_SIZE - 1) | ||
58 | #define TVR_MASK (TVR_SIZE - 1) | ||
59 | |||
60 | typedef struct tvec_s { | ||
61 | struct list_head vec[TVN_SIZE]; | ||
62 | } tvec_t; | ||
63 | |||
64 | typedef struct tvec_root_s { | ||
65 | struct list_head vec[TVR_SIZE]; | ||
66 | } tvec_root_t; | ||
67 | |||
68 | struct tvec_t_base_s { | ||
69 | spinlock_t lock; | ||
70 | unsigned long timer_jiffies; | ||
71 | struct timer_list *running_timer; | ||
72 | tvec_root_t tv1; | ||
73 | tvec_t tv2; | ||
74 | tvec_t tv3; | ||
75 | tvec_t tv4; | ||
76 | tvec_t tv5; | ||
77 | } ____cacheline_aligned_in_smp; | ||
78 | |||
79 | typedef struct tvec_t_base_s tvec_base_t; | ||
80 | |||
81 | static inline void set_running_timer(tvec_base_t *base, | ||
82 | struct timer_list *timer) | ||
83 | { | ||
84 | #ifdef CONFIG_SMP | ||
85 | base->running_timer = timer; | ||
86 | #endif | ||
87 | } | ||
88 | |||
89 | /* Fake initialization */ | ||
90 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; | ||
91 | |||
92 | static void check_timer_failed(struct timer_list *timer) | ||
93 | { | ||
94 | static int whine_count; | ||
95 | if (whine_count < 16) { | ||
96 | whine_count++; | ||
97 | printk("Uninitialised timer!\n"); | ||
98 | printk("This is just a warning. Your computer is OK\n"); | ||
99 | printk("function=0x%p, data=0x%lx\n", | ||
100 | timer->function, timer->data); | ||
101 | dump_stack(); | ||
102 | } | ||
103 | /* | ||
104 | * Now fix it up | ||
105 | */ | ||
106 | spin_lock_init(&timer->lock); | ||
107 | timer->magic = TIMER_MAGIC; | ||
108 | } | ||
109 | |||
110 | static inline void check_timer(struct timer_list *timer) | ||
111 | { | ||
112 | if (timer->magic != TIMER_MAGIC) | ||
113 | check_timer_failed(timer); | ||
114 | } | ||
115 | |||
116 | |||
117 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | ||
118 | { | ||
119 | unsigned long expires = timer->expires; | ||
120 | unsigned long idx = expires - base->timer_jiffies; | ||
121 | struct list_head *vec; | ||
122 | |||
123 | if (idx < TVR_SIZE) { | ||
124 | int i = expires & TVR_MASK; | ||
125 | vec = base->tv1.vec + i; | ||
126 | } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { | ||
127 | int i = (expires >> TVR_BITS) & TVN_MASK; | ||
128 | vec = base->tv2.vec + i; | ||
129 | } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { | ||
130 | int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; | ||
131 | vec = base->tv3.vec + i; | ||
132 | } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { | ||
133 | int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; | ||
134 | vec = base->tv4.vec + i; | ||
135 | } else if ((signed long) idx < 0) { | ||
136 | /* | ||
137 | * Can happen if you add a timer with expires == jiffies, | ||
138 | * or you set a timer to go off in the past | ||
139 | */ | ||
140 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); | ||
141 | } else { | ||
142 | int i; | ||
143 | /* If the timeout is larger than 0xffffffff on 64-bit | ||
144 | * architectures then we use the maximum timeout: | ||
145 | */ | ||
146 | if (idx > 0xffffffffUL) { | ||
147 | idx = 0xffffffffUL; | ||
148 | expires = idx + base->timer_jiffies; | ||
149 | } | ||
150 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; | ||
151 | vec = base->tv5.vec + i; | ||
152 | } | ||
153 | /* | ||
154 | * Timers are FIFO: | ||
155 | */ | ||
156 | list_add_tail(&timer->entry, vec); | ||
157 | } | ||
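A worked example of the bucketing above, using the non-CONFIG_BASE_SMALL values (TVR_BITS = 8, TVN_BITS = 6) and assuming base->timer_jiffies == 1000; the numbers are purely illustrative:

/*
 *	expires = 1100  -> idx = 100    (< 256)       -> tv1, slot 1100 & 255        = 76
 *	expires = 11000 -> idx = 10000  (< 16384)     -> tv2, slot (11000 >> 8) & 63 = 42
 *	expires = 900   -> idx = -100   (in the past) -> tv1, slot 1000 & 255        = 232
 */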
158 | |||
159 | int __mod_timer(struct timer_list *timer, unsigned long expires) | ||
160 | { | ||
161 | tvec_base_t *old_base, *new_base; | ||
162 | unsigned long flags; | ||
163 | int ret = 0; | ||
164 | |||
165 | BUG_ON(!timer->function); | ||
166 | |||
167 | check_timer(timer); | ||
168 | |||
169 | spin_lock_irqsave(&timer->lock, flags); | ||
170 | new_base = &__get_cpu_var(tvec_bases); | ||
171 | repeat: | ||
172 | old_base = timer->base; | ||
173 | |||
174 | /* | ||
175 | * Prevent deadlocks via ordering by old_base < new_base. | ||
176 | */ | ||
177 | if (old_base && (new_base != old_base)) { | ||
178 | if (old_base < new_base) { | ||
179 | spin_lock(&new_base->lock); | ||
180 | spin_lock(&old_base->lock); | ||
181 | } else { | ||
182 | spin_lock(&old_base->lock); | ||
183 | spin_lock(&new_base->lock); | ||
184 | } | ||
185 | /* | ||
186 | * The timer base might have been cancelled while we were | ||
187 | * trying to take the lock(s): | ||
188 | */ | ||
189 | if (timer->base != old_base) { | ||
190 | spin_unlock(&new_base->lock); | ||
191 | spin_unlock(&old_base->lock); | ||
192 | goto repeat; | ||
193 | } | ||
194 | } else { | ||
195 | spin_lock(&new_base->lock); | ||
196 | if (timer->base != old_base) { | ||
197 | spin_unlock(&new_base->lock); | ||
198 | goto repeat; | ||
199 | } | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * Delete the previous timeout (if there was any), and install | ||
204 | * the new one: | ||
205 | */ | ||
206 | if (old_base) { | ||
207 | list_del(&timer->entry); | ||
208 | ret = 1; | ||
209 | } | ||
210 | timer->expires = expires; | ||
211 | internal_add_timer(new_base, timer); | ||
212 | timer->base = new_base; | ||
213 | |||
214 | if (old_base && (new_base != old_base)) | ||
215 | spin_unlock(&old_base->lock); | ||
216 | spin_unlock(&new_base->lock); | ||
217 | spin_unlock_irqrestore(&timer->lock, flags); | ||
218 | |||
219 | return ret; | ||
220 | } | ||
221 | |||
222 | EXPORT_SYMBOL(__mod_timer); | ||
223 | |||
224 | /*** | ||
225 | * add_timer_on - start a timer on a particular CPU | ||
226 | * @timer: the timer to be added | ||
227 | * @cpu: the CPU to start it on | ||
228 | * | ||
229 | * This is not very scalable on SMP. Double adds are not possible. | ||
230 | */ | ||
231 | void add_timer_on(struct timer_list *timer, int cpu) | ||
232 | { | ||
233 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); | ||
234 | unsigned long flags; | ||
235 | |||
236 | BUG_ON(timer_pending(timer) || !timer->function); | ||
237 | |||
238 | check_timer(timer); | ||
239 | |||
240 | spin_lock_irqsave(&base->lock, flags); | ||
241 | internal_add_timer(base, timer); | ||
242 | timer->base = base; | ||
243 | spin_unlock_irqrestore(&base->lock, flags); | ||
244 | } | ||
245 | |||
246 | |||
247 | /*** | ||
248 | * mod_timer - modify a timer's timeout | ||
249 | * @timer: the timer to be modified | ||
250 | * | ||
251 | * mod_timer is a more efficient way to update the expire field of an | ||
252 | * active timer (if the timer is inactive it will be activated) | ||
253 | * | ||
254 | * mod_timer(timer, expires) is equivalent to: | ||
255 | * | ||
256 | * del_timer(timer); timer->expires = expires; add_timer(timer); | ||
257 | * | ||
258 | * Note that if there are multiple unserialized concurrent users of the | ||
259 | * same timer, then mod_timer() is the only safe way to modify the timeout, | ||
260 | * since add_timer() cannot modify an already running timer. | ||
261 | * | ||
262 | * The function returns whether it has modified a pending timer or not. | ||
263 | * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an | ||
264 | * active timer returns 1.) | ||
265 | */ | ||
266 | int mod_timer(struct timer_list *timer, unsigned long expires) | ||
267 | { | ||
268 | BUG_ON(!timer->function); | ||
269 | |||
270 | check_timer(timer); | ||
271 | |||
272 | /* | ||
273 | * This is a common optimization triggered by the | ||
274 | * networking code - if the timer is re-modified | ||
275 | * to be the same thing then just return: | ||
276 | */ | ||
277 | if (timer->expires == expires && timer_pending(timer)) | ||
278 | return 1; | ||
279 | |||
280 | return __mod_timer(timer, expires); | ||
281 | } | ||
282 | |||
283 | EXPORT_SYMBOL(mod_timer); | ||
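Tying __mod_timer()/mod_timer() to the rest of this file's API, a hedged driver-style sketch of the 2.6.12-era interface (init_timer() and the timer_list fields come from the timer header; every name below is hypothetical and normal kernel-module context is assumed):

static struct timer_list my_timer;

static void my_timeout(unsigned long data)
{
	printk(KERN_INFO "my_timer fired, data=%lu\n", data);
}

static void my_start(void)
{
	init_timer(&my_timer);
	my_timer.function = my_timeout;
	my_timer.data     = 42;
	mod_timer(&my_timer, jiffies + HZ);	/* activates the timer, ~1 s from now */
}

static void my_stop(void)
{
	del_timer_sync(&my_timer);		/* also waits for a running handler */
}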
284 | |||
285 | /*** | ||
286 | * del_timer - deactivate a timer. | ||
287 | * @timer: the timer to be deactivated | ||
288 | * | ||
289 | * del_timer() deactivates a timer - this works on both active and inactive | ||
290 | * timers. | ||
291 | * | ||
292 | * The function returns whether it has deactivated a pending timer or not. | ||
293 | * (ie. del_timer() of an inactive timer returns 0, del_timer() of an | ||
294 | * active timer returns 1.) | ||
295 | */ | ||
296 | int del_timer(struct timer_list *timer) | ||
297 | { | ||
298 | unsigned long flags; | ||
299 | tvec_base_t *base; | ||
300 | |||
301 | check_timer(timer); | ||
302 | |||
303 | repeat: | ||
304 | base = timer->base; | ||
305 | if (!base) | ||
306 | return 0; | ||
307 | spin_lock_irqsave(&base->lock, flags); | ||
308 | if (base != timer->base) { | ||
309 | spin_unlock_irqrestore(&base->lock, flags); | ||
310 | goto repeat; | ||
311 | } | ||
312 | list_del(&timer->entry); | ||
313 | /* Need to make sure that anybody who sees a NULL base also sees the list ops */ | ||
314 | smp_wmb(); | ||
315 | timer->base = NULL; | ||
316 | spin_unlock_irqrestore(&base->lock, flags); | ||
317 | |||
318 | return 1; | ||
319 | } | ||
320 | |||
321 | EXPORT_SYMBOL(del_timer); | ||
322 | |||
323 | #ifdef CONFIG_SMP | ||
324 | /*** | ||
325 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | ||
326 | * @timer: the timer to be deactivated | ||
327 | * | ||
328 | * This function only differs from del_timer() on SMP: besides deactivating | ||
329 | * the timer it also makes sure the handler has finished executing on other | ||
330 | * CPUs. | ||
331 | * | ||
332 | * Synchronization rules: callers must prevent restarting of the timer, | ||
333 | * otherwise this function is meaningless. It must not be called from | ||
334 | * interrupt contexts. The caller must not hold locks which would prevent | ||
335 | * completion of the timer's handler. Upon exit the timer is not queued and | ||
336 | * the handler is not running on any CPU. | ||
337 | * | ||
338 | * The function returns whether it has deactivated a pending timer or not. | ||
339 | * | ||
340 | * del_timer_sync() is slow and complicated because it copes with timer | ||
341 | * handlers which re-arm the timer (periodic timers). If the timer handler | ||
342 | * is known to not do this (a single shot timer) then use | ||
343 | * del_singleshot_timer_sync() instead. | ||
344 | */ | ||
345 | int del_timer_sync(struct timer_list *timer) | ||
346 | { | ||
347 | tvec_base_t *base; | ||
348 | int i, ret = 0; | ||
349 | |||
350 | check_timer(timer); | ||
351 | |||
352 | del_again: | ||
353 | ret += del_timer(timer); | ||
354 | |||
355 | for_each_online_cpu(i) { | ||
356 | base = &per_cpu(tvec_bases, i); | ||
357 | if (base->running_timer == timer) { | ||
358 | while (base->running_timer == timer) { | ||
359 | cpu_relax(); | ||
360 | preempt_check_resched(); | ||
361 | } | ||
362 | break; | ||
363 | } | ||
364 | } | ||
365 | smp_rmb(); | ||
366 | if (timer_pending(timer)) | ||
367 | goto del_again; | ||
368 | |||
369 | return ret; | ||
370 | } | ||
371 | EXPORT_SYMBOL(del_timer_sync); | ||
372 | |||
373 | /*** | ||
374 | * del_singleshot_timer_sync - deactivate a non-recursive timer | ||
375 | * @timer: the timer to be deactivated | ||
376 | * | ||
377 | * This function is an optimization of del_timer_sync for the case where the | ||
378 | * caller can guarantee the timer does not reschedule itself in its timer | ||
379 | * function. | ||
380 | * | ||
381 | * Synchronization rules: callers must prevent restarting of the timer, | ||
382 | * otherwise this function is meaningless. It must not be called from | ||
383 | * interrupt contexts. The caller must not hold locks which would prevent | ||
384 | * completion of the timer's handler. Upon exit the timer is not queued and | ||
385 | * the handler is not running on any CPU. | ||
386 | * | ||
387 | * The function returns whether it has deactivated a pending timer or not. | ||
388 | */ | ||
389 | int del_singleshot_timer_sync(struct timer_list *timer) | ||
390 | { | ||
391 | int ret = del_timer(timer); | ||
392 | |||
393 | if (!ret) { | ||
394 | ret = del_timer_sync(timer); | ||
395 | BUG_ON(ret); | ||
396 | } | ||
397 | |||
398 | return ret; | ||
399 | } | ||
400 | EXPORT_SYMBOL(del_singleshot_timer_sync); | ||
401 | #endif | ||
402 | |||
403 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | ||
404 | { | ||
405 | /* cascade all the timers from tv up one level */ | ||
406 | struct list_head *head, *curr; | ||
407 | |||
408 | head = tv->vec + index; | ||
409 | curr = head->next; | ||
410 | /* | ||
411 | * We are removing _all_ timers from the list, so we don't have to | ||
412 | * detach them individually, just clear the list afterwards. | ||
413 | */ | ||
414 | while (curr != head) { | ||
415 | struct timer_list *tmp; | ||
416 | |||
417 | tmp = list_entry(curr, struct timer_list, entry); | ||
418 | BUG_ON(tmp->base != base); | ||
419 | curr = curr->next; | ||
420 | internal_add_timer(base, tmp); | ||
421 | } | ||
422 | INIT_LIST_HEAD(head); | ||
423 | |||
424 | return index; | ||
425 | } | ||
426 | |||
427 | /*** | ||
428 | * __run_timers - run all expired timers (if any) on this CPU. | ||
429 | * @base: the timer vector to be processed. | ||
430 | * | ||
431 | * This function cascades all vectors and executes all expired timer | ||
432 | * vectors. | ||
433 | */ | ||
434 | #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) | ||
435 | |||
436 | static inline void __run_timers(tvec_base_t *base) | ||
437 | { | ||
438 | struct timer_list *timer; | ||
439 | |||
440 | spin_lock_irq(&base->lock); | ||
441 | while (time_after_eq(jiffies, base->timer_jiffies)) { | ||
442 | struct list_head work_list = LIST_HEAD_INIT(work_list); | ||
443 | struct list_head *head = &work_list; | ||
444 | int index = base->timer_jiffies & TVR_MASK; | ||
445 | |||
446 | /* | ||
447 | * Cascade timers: | ||
448 | */ | ||
449 | if (!index && | ||
450 | (!cascade(base, &base->tv2, INDEX(0))) && | ||
451 | (!cascade(base, &base->tv3, INDEX(1))) && | ||
452 | !cascade(base, &base->tv4, INDEX(2))) | ||
453 | cascade(base, &base->tv5, INDEX(3)); | ||
454 | ++base->timer_jiffies; | ||
455 | list_splice_init(base->tv1.vec + index, &work_list); | ||
456 | repeat: | ||
457 | if (!list_empty(head)) { | ||
458 | void (*fn)(unsigned long); | ||
459 | unsigned long data; | ||
460 | |||
461 | timer = list_entry(head->next,struct timer_list,entry); | ||
462 | fn = timer->function; | ||
463 | data = timer->data; | ||
464 | |||
465 | list_del(&timer->entry); | ||
466 | set_running_timer(base, timer); | ||
467 | smp_wmb(); | ||
468 | timer->base = NULL; | ||
469 | spin_unlock_irq(&base->lock); | ||
470 | { | ||
471 | u32 preempt_count = preempt_count(); | ||
472 | fn(data); | ||
473 | if (preempt_count != preempt_count()) { | ||
474 | printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); | ||
475 | BUG(); | ||
476 | } | ||
477 | } | ||
478 | spin_lock_irq(&base->lock); | ||
479 | goto repeat; | ||
480 | } | ||
481 | } | ||
482 | set_running_timer(base, NULL); | ||
483 | spin_unlock_irq(&base->lock); | ||
484 | } | ||
485 | |||
486 | #ifdef CONFIG_NO_IDLE_HZ | ||
487 | /* | ||
488 | * Find out when the next timer event is due to happen. This | ||
489 | * is used on S/390 to stop all activity when a cpu is idle. | ||
490 | * This function needs to be called with interrupts disabled. | ||
491 | */ | ||
492 | unsigned long next_timer_interrupt(void) | ||
493 | { | ||
494 | tvec_base_t *base; | ||
495 | struct list_head *list; | ||
496 | struct timer_list *nte; | ||
497 | unsigned long expires; | ||
498 | tvec_t *varray[4]; | ||
499 | int i, j; | ||
500 | |||
501 | base = &__get_cpu_var(tvec_bases); | ||
502 | spin_lock(&base->lock); | ||
503 | expires = base->timer_jiffies + (LONG_MAX >> 1); | ||
504 | list = NULL; | ||
505 | |||
506 | /* Look for timer events in tv1. */ | ||
507 | j = base->timer_jiffies & TVR_MASK; | ||
508 | do { | ||
509 | list_for_each_entry(nte, base->tv1.vec + j, entry) { | ||
510 | expires = nte->expires; | ||
511 | if (j < (base->timer_jiffies & TVR_MASK)) | ||
512 | list = base->tv2.vec + (INDEX(0)); | ||
513 | goto found; | ||
514 | } | ||
515 | j = (j + 1) & TVR_MASK; | ||
516 | } while (j != (base->timer_jiffies & TVR_MASK)); | ||
517 | |||
518 | /* Check tv2-tv5. */ | ||
519 | varray[0] = &base->tv2; | ||
520 | varray[1] = &base->tv3; | ||
521 | varray[2] = &base->tv4; | ||
522 | varray[3] = &base->tv5; | ||
523 | for (i = 0; i < 4; i++) { | ||
524 | j = INDEX(i); | ||
525 | do { | ||
526 | if (list_empty(varray[i]->vec + j)) { | ||
527 | j = (j + 1) & TVN_MASK; | ||
528 | continue; | ||
529 | } | ||
530 | list_for_each_entry(nte, varray[i]->vec + j, entry) | ||
531 | if (time_before(nte->expires, expires)) | ||
532 | expires = nte->expires; | ||
533 | if (j < (INDEX(i)) && i < 3) | ||
534 | list = varray[i + 1]->vec + (INDEX(i + 1)); | ||
535 | goto found; | ||
536 | } while (j != (INDEX(i))); | ||
537 | } | ||
538 | found: | ||
539 | if (list) { | ||
540 | /* | ||
541 | * The search wrapped. We need to look at the next list | ||
542 | * from the next tv element that would cascade into the tv | ||
543 | * element where we found the timer element. | ||
544 | */ | ||
545 | list_for_each_entry(nte, list, entry) { | ||
546 | if (time_before(nte->expires, expires)) | ||
547 | expires = nte->expires; | ||
548 | } | ||
549 | } | ||
550 | spin_unlock(&base->lock); | ||
551 | return expires; | ||
552 | } | ||
553 | #endif | ||
554 | |||
555 | /******************************************************************/ | ||
556 | |||
557 | /* | ||
558 | * Timekeeping variables | ||
559 | */ | ||
560 | unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ | ||
561 | unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ | ||
562 | |||
563 | /* | ||
564 | * The current time | ||
565 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | ||
566 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | ||
567 | * at zero at system boot time, so wall_to_monotonic will be negative, | ||
568 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | ||
569 | * the usual normalization. | ||
570 | */ | ||
571 | struct timespec xtime __attribute__ ((aligned (16))); | ||
572 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | ||
573 | |||
574 | EXPORT_SYMBOL(xtime); | ||
575 | |||
576 | /* Don't completely fail for HZ > 500. */ | ||
577 | int tickadj = 500/HZ ? : 1; /* microsecs */ | ||
578 | |||
579 | |||
580 | /* | ||
581 | * phase-lock loop variables | ||
582 | */ | ||
583 | /* TIME_ERROR prevents overwriting the CMOS clock */ | ||
584 | int time_state = TIME_OK; /* clock synchronization status */ | ||
585 | int time_status = STA_UNSYNC; /* clock status bits */ | ||
586 | long time_offset; /* time adjustment (us) */ | ||
587 | long time_constant = 2; /* pll time constant */ | ||
588 | long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ | ||
589 | long time_precision = 1; /* clock precision (us) */ | ||
590 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ | ||
591 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | ||
592 | static long time_phase; /* phase offset (scaled us) */ | ||
593 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; | ||
594 | /* frequency offset (scaled ppm)*/ | ||
595 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ | ||
596 | long time_reftime; /* time at last adjustment (s) */ | ||
597 | long time_adjust; | ||
598 | long time_next_adjust; | ||
599 | |||
600 | /* | ||
601 | * this routine handles the overflow of the microsecond field | ||
602 | * | ||
603 | * The tricky bits of code to handle the accurate clock support | ||
604 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
605 | * They were originally developed for SUN and DEC kernels. | ||
606 | * All the kudos should go to Dave for this stuff. | ||
607 | * | ||
608 | */ | ||
609 | static void second_overflow(void) | ||
610 | { | ||
611 | long ltemp; | ||
612 | |||
613 | /* Bump the maxerror field */ | ||
614 | time_maxerror += time_tolerance >> SHIFT_USEC; | ||
615 | if ( time_maxerror > NTP_PHASE_LIMIT ) { | ||
616 | time_maxerror = NTP_PHASE_LIMIT; | ||
617 | time_status |= STA_UNSYNC; | ||
618 | } | ||
619 | |||
620 | /* | ||
621 | * Leap second processing. If in leap-insert state at | ||
622 | * the end of the day, the system clock is set back one | ||
623 | * second; if in leap-delete state, the system clock is | ||
624 | * set ahead one second. The microtime() routine or | ||
625 | * external clock driver will ensure that reported time | ||
626 | * is always monotonic. The ugly divides should be | ||
627 | * replaced. | ||
628 | */ | ||
629 | switch (time_state) { | ||
630 | |||
631 | case TIME_OK: | ||
632 | if (time_status & STA_INS) | ||
633 | time_state = TIME_INS; | ||
634 | else if (time_status & STA_DEL) | ||
635 | time_state = TIME_DEL; | ||
636 | break; | ||
637 | |||
638 | case TIME_INS: | ||
639 | if (xtime.tv_sec % 86400 == 0) { | ||
640 | xtime.tv_sec--; | ||
641 | wall_to_monotonic.tv_sec++; | ||
642 | /* The time interpolator will make the time change gradually instead | ||
643 | * of an immediate jump by one second. | ||
644 | */ | ||
645 | time_interpolator_update(-NSEC_PER_SEC); | ||
646 | time_state = TIME_OOP; | ||
647 | clock_was_set(); | ||
648 | printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); | ||
649 | } | ||
650 | break; | ||
651 | |||
652 | case TIME_DEL: | ||
653 | if ((xtime.tv_sec + 1) % 86400 == 0) { | ||
654 | xtime.tv_sec++; | ||
655 | wall_to_monotonic.tv_sec--; | ||
656 | /* Use of time interpolator for a gradual change of time */ | ||
657 | time_interpolator_update(NSEC_PER_SEC); | ||
658 | time_state = TIME_WAIT; | ||
659 | clock_was_set(); | ||
660 | printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); | ||
661 | } | ||
662 | break; | ||
663 | |||
664 | case TIME_OOP: | ||
665 | time_state = TIME_WAIT; | ||
666 | break; | ||
667 | |||
668 | case TIME_WAIT: | ||
669 | if (!(time_status & (STA_INS | STA_DEL))) | ||
670 | time_state = TIME_OK; | ||
671 | } | ||
672 | |||
673 | /* | ||
674 | * Compute the phase adjustment for the next second. In | ||
675 | * PLL mode, the offset is reduced by a fixed factor | ||
676 | * times the time constant. In FLL mode the offset is | ||
677 | * used directly. In either mode, the maximum phase | ||
678 | * adjustment for each second is clamped so as to spread | ||
679 | * the adjustment over not more than the number of | ||
680 | * seconds between updates. | ||
681 | */ | ||
682 | if (time_offset < 0) { | ||
683 | ltemp = -time_offset; | ||
684 | if (!(time_status & STA_FLL)) | ||
685 | ltemp >>= SHIFT_KG + time_constant; | ||
686 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | ||
687 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | ||
688 | time_offset += ltemp; | ||
689 | time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | ||
690 | } else { | ||
691 | ltemp = time_offset; | ||
692 | if (!(time_status & STA_FLL)) | ||
693 | ltemp >>= SHIFT_KG + time_constant; | ||
694 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | ||
695 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | ||
696 | time_offset -= ltemp; | ||
697 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | ||
698 | } | ||
699 | |||
700 | /* | ||
701 | * Compute the frequency estimate and additional phase | ||
702 | * adjustment due to frequency error for the next | ||
703 | * second. When the PPS signal is engaged, gnaw on the | ||
704 | * watchdog counter and update the frequency computed by | ||
705 | * the pll and the PPS signal. | ||
706 | */ | ||
707 | pps_valid++; | ||
708 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ | ||
709 | pps_jitter = MAXTIME; | ||
710 | pps_stabil = MAXFREQ; | ||
711 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | ||
712 | STA_PPSWANDER | STA_PPSERROR); | ||
713 | } | ||
714 | ltemp = time_freq + pps_freq; | ||
715 | if (ltemp < 0) | ||
716 | time_adj -= -ltemp >> | ||
717 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
718 | else | ||
719 | time_adj += ltemp >> | ||
720 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
721 | |||
722 | #if HZ == 100 | ||
723 | /* Compensate for (HZ==100) != (1 << SHIFT_HZ). | ||
724 | * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) | ||
725 | */ | ||
726 | if (time_adj < 0) | ||
727 | time_adj -= (-time_adj >> 2) + (-time_adj >> 5); | ||
728 | else | ||
729 | time_adj += (time_adj >> 2) + (time_adj >> 5); | ||
730 | #endif | ||
731 | #if HZ == 1000 | ||
732 | /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). | ||
733 | * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) | ||
734 | */ | ||
735 | if (time_adj < 0) | ||
736 | time_adj -= (-time_adj >> 6) + (-time_adj >> 7); | ||
737 | else | ||
738 | time_adj += (time_adj >> 6) + (time_adj >> 7); | ||
739 | #endif | ||
740 | } | ||
741 | |||
742 | /* in the NTP reference this is called "hardclock()" */ | ||
743 | static void update_wall_time_one_tick(void) | ||
744 | { | ||
745 | long time_adjust_step, delta_nsec; | ||
746 | |||
747 | if ( (time_adjust_step = time_adjust) != 0 ) { | ||
748 | /* We are doing an adjtime thing. | ||
749 | * | ||
750 | * Prepare time_adjust_step to be within bounds. | ||
751 | * Note that a positive time_adjust means we want the clock | ||
752 | * to run faster. | ||
753 | * | ||
754 | * Limit the amount of the step to be in the range | ||
755 | * -tickadj .. +tickadj | ||
756 | */ | ||
757 | if (time_adjust > tickadj) | ||
758 | time_adjust_step = tickadj; | ||
759 | else if (time_adjust < -tickadj) | ||
760 | time_adjust_step = -tickadj; | ||
761 | |||
762 | /* Reduce by this step the amount of time left */ | ||
763 | time_adjust -= time_adjust_step; | ||
764 | } | ||
765 | delta_nsec = tick_nsec + time_adjust_step * 1000; | ||
766 | /* | ||
767 | * Advance the phase; once it gets to one microsecond, then | ||
768 | * advance the tick more. | ||
769 | */ | ||
770 | time_phase += time_adj; | ||
771 | if (time_phase <= -FINENSEC) { | ||
772 | long ltemp = -time_phase >> (SHIFT_SCALE - 10); | ||
773 | time_phase += ltemp << (SHIFT_SCALE - 10); | ||
774 | delta_nsec -= ltemp; | ||
775 | } | ||
776 | else if (time_phase >= FINENSEC) { | ||
777 | long ltemp = time_phase >> (SHIFT_SCALE - 10); | ||
778 | time_phase -= ltemp << (SHIFT_SCALE - 10); | ||
779 | delta_nsec += ltemp; | ||
780 | } | ||
781 | xtime.tv_nsec += delta_nsec; | ||
782 | time_interpolator_update(delta_nsec); | ||
783 | |||
784 | /* Changes by adjtime() do not take effect till next tick. */ | ||
785 | if (time_next_adjust != 0) { | ||
786 | time_adjust = time_next_adjust; | ||
787 | time_next_adjust = 0; | ||
788 | } | ||
789 | } | ||
790 | |||
791 | /* | ||
792 | * Using a loop looks inefficient, but "ticks" is | ||
793 | * usually just one (we shouldn't be losing ticks, | ||
794 | * we're doing it this way mainly for interrupt | ||
795 | * latency reasons, not because we think we'll | ||
796 | * have lots of lost timer ticks) | ||
797 | */ | ||
798 | static void update_wall_time(unsigned long ticks) | ||
799 | { | ||
800 | do { | ||
801 | ticks--; | ||
802 | update_wall_time_one_tick(); | ||
803 | if (xtime.tv_nsec >= 1000000000) { | ||
804 | xtime.tv_nsec -= 1000000000; | ||
805 | xtime.tv_sec++; | ||
806 | second_overflow(); | ||
807 | } | ||
808 | } while (ticks); | ||
809 | } | ||
810 | |||
811 | /* | ||
812 | * Called from the timer interrupt handler to charge one tick to the current | ||
813 | * process. user_tick is 1 if the tick is user time, 0 for system. | ||
814 | */ | ||
815 | void update_process_times(int user_tick) | ||
816 | { | ||
817 | struct task_struct *p = current; | ||
818 | int cpu = smp_processor_id(); | ||
819 | |||
820 | /* Note: this timer irq context must be accounted for as well. */ | ||
821 | if (user_tick) | ||
822 | account_user_time(p, jiffies_to_cputime(1)); | ||
823 | else | ||
824 | account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); | ||
825 | run_local_timers(); | ||
826 | if (rcu_pending(cpu)) | ||
827 | rcu_check_callbacks(cpu, user_tick); | ||
828 | scheduler_tick(); | ||
829 | run_posix_cpu_timers(p); | ||
830 | } | ||
831 | |||
832 | /* | ||
833 | * Nr of active tasks - counted in fixed-point numbers | ||
834 | */ | ||
835 | static unsigned long count_active_tasks(void) | ||
836 | { | ||
837 | return (nr_running() + nr_uninterruptible()) * FIXED_1; | ||
838 | } | ||
839 | |||
840 | /* | ||
841 | * Hmm.. Changed this, as the GNU make sources (load.c) seem to | ||
842 | * imply that avenrun[] is the standard name for this kind of thing. | ||
843 | * Nothing else seems to be standardized: the fractional size etc | ||
844 | * all seem to differ on different machines. | ||
845 | * | ||
846 | * Requires xtime_lock to access. | ||
847 | */ | ||
848 | unsigned long avenrun[3]; | ||
849 | |||
850 | EXPORT_SYMBOL(avenrun); | ||
851 | |||
852 | /* | ||
853 | * calc_load - given tick count, update the avenrun load estimates. | ||
854 | * This is called while holding a write_lock on xtime_lock. | ||
855 | */ | ||
856 | static inline void calc_load(unsigned long ticks) | ||
857 | { | ||
858 | unsigned long active_tasks; /* fixed-point */ | ||
859 | static int count = LOAD_FREQ; | ||
860 | |||
861 | count -= ticks; | ||
862 | if (count < 0) { | ||
863 | count += LOAD_FREQ; | ||
864 | active_tasks = count_active_tasks(); | ||
865 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); | ||
866 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | ||
867 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | ||
868 | } | ||
869 | } | ||
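For reference, CALC_LOAD() keeps avenrun[] in FSHIFT-bit (11-bit) fixed point scaled by FIXED_1. The sketch below gives the shape of the macro and of the unpacking done for /proc/loadavg; it is reconstructed from memory of include/linux/sched.h, so treat it as an approximation rather than a quotation:

/*
 *	#define CALC_LOAD(load, exp, n) \
 *		load *= exp; \
 *		load += n * (FIXED_1 - exp); \
 *		load >>= FSHIFT;
 *
 *	display:  int_part  = avenrun[0] >> FSHIFT;
 *	          frac_part = ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT;
 */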
870 | |||
871 | /* jiffies at the most recent update of wall time */ | ||
872 | unsigned long wall_jiffies = INITIAL_JIFFIES; | ||
873 | |||
874 | /* | ||
875 | * This seqlock protects us from races in SMP while | ||
876 | * playing with xtime and avenrun. | ||
877 | */ | ||
878 | #ifndef ARCH_HAVE_XTIME_LOCK | ||
879 | seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; | ||
880 | |||
881 | EXPORT_SYMBOL(xtime_lock); | ||
882 | #endif | ||
883 | |||
884 | /* | ||
885 | * This function runs timers and the timer-tq in bottom half context. | ||
886 | */ | ||
887 | static void run_timer_softirq(struct softirq_action *h) | ||
888 | { | ||
889 | tvec_base_t *base = &__get_cpu_var(tvec_bases); | ||
890 | |||
891 | if (time_after_eq(jiffies, base->timer_jiffies)) | ||
892 | __run_timers(base); | ||
893 | } | ||
894 | |||
895 | /* | ||
896 | * Called by the local, per-CPU timer interrupt on SMP. | ||
897 | */ | ||
898 | void run_local_timers(void) | ||
899 | { | ||
900 | raise_softirq(TIMER_SOFTIRQ); | ||
901 | } | ||
902 | |||
903 | /* | ||
904 | * Called by the timer interrupt. xtime_lock must already be taken | ||
905 | * by the timer IRQ! | ||
906 | */ | ||
907 | static inline void update_times(void) | ||
908 | { | ||
909 | unsigned long ticks; | ||
910 | |||
911 | ticks = jiffies - wall_jiffies; | ||
912 | if (ticks) { | ||
913 | wall_jiffies += ticks; | ||
914 | update_wall_time(ticks); | ||
915 | } | ||
916 | calc_load(ticks); | ||
917 | } | ||
918 | |||
919 | /* | ||
920 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | ||
921 | * without sampling the sequence number in xtime_lock. | ||
922 | * jiffies is defined in the linker script... | ||
923 | */ | ||
924 | |||
925 | void do_timer(struct pt_regs *regs) | ||
926 | { | ||
927 | jiffies_64++; | ||
928 | update_times(); | ||
929 | } | ||
930 | |||
931 | #ifdef __ARCH_WANT_SYS_ALARM | ||
932 | |||
933 | /* | ||
934 | * For backwards compatibility? This can be done in libc so Alpha | ||
935 | * and all newer ports shouldn't need it. | ||
936 | */ | ||
937 | asmlinkage unsigned long sys_alarm(unsigned int seconds) | ||
938 | { | ||
939 | struct itimerval it_new, it_old; | ||
940 | unsigned int oldalarm; | ||
941 | |||
942 | it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; | ||
943 | it_new.it_value.tv_sec = seconds; | ||
944 | it_new.it_value.tv_usec = 0; | ||
945 | do_setitimer(ITIMER_REAL, &it_new, &it_old); | ||
946 | oldalarm = it_old.it_value.tv_sec; | ||
947 | /* ehhh.. We can't return 0 if we have an alarm pending.. */ | ||
948 | /* And we'd better return too much rather than too little anyway */ | ||
949 | if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) | ||
950 | oldalarm++; | ||
951 | return oldalarm; | ||
952 | } | ||
953 | |||
954 | #endif | ||
955 | |||
956 | #ifndef __alpha__ | ||
957 | |||
958 | /* | ||
959 | * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this | ||
960 | * should be moved into arch/i386 instead? | ||
961 | */ | ||
962 | |||
963 | /** | ||
964 | * sys_getpid - return the thread group id of the current process | ||
965 | * | ||
966 | * Note, despite the name, this returns the tgid not the pid. The tgid and | ||
967 | * the pid are identical unless CLONE_THREAD was specified on clone() in | ||
968 | * which case the tgid is the same in all threads of the same group. | ||
969 | * | ||
970 | * This is SMP safe as current->tgid does not change. | ||
971 | */ | ||
972 | asmlinkage long sys_getpid(void) | ||
973 | { | ||
974 | return current->tgid; | ||
975 | } | ||
976 | |||
977 | /* | ||
978 | * Accessing ->group_leader->real_parent is not SMP-safe, it could | ||
979 | * change from under us. However, rather than getting any lock | ||
980 | * we can use an optimistic algorithm: get the parent | ||
981 | * pid, and go back and check that the parent is still | ||
982 | * the same. If it has changed (which is extremely unlikely | ||
983 | * indeed), we just try again.. | ||
984 | * | ||
985 | * NOTE! This depends on the fact that even if we _do_ | ||
986 | * get an old value of "parent", we can happily dereference | ||
987 | * the pointer (it was and remains a dereferenceable kernel pointer | ||
988 | * no matter what): we just can't necessarily trust the result | ||
989 | * until we know that the parent pointer is valid. | ||
990 | * | ||
991 | * NOTE2: ->group_leader never changes from under us. | ||
992 | */ | ||
993 | asmlinkage long sys_getppid(void) | ||
994 | { | ||
995 | int pid; | ||
996 | struct task_struct *me = current; | ||
997 | struct task_struct *parent; | ||
998 | |||
999 | parent = me->group_leader->real_parent; | ||
1000 | for (;;) { | ||
1001 | pid = parent->tgid; | ||
1002 | #ifdef CONFIG_SMP | ||
1003 | { | ||
1004 | struct task_struct *old = parent; | ||
1005 | |||
1006 | /* | ||
1007 | * Make sure we read the pid before re-reading the | ||
1008 | * parent pointer: | ||
1009 | */ | ||
1010 | rmb(); | ||
1011 | parent = me->group_leader->real_parent; | ||
1012 | if (old != parent) | ||
1013 | continue; | ||
1014 | } | ||
1015 | #endif | ||
1016 | break; | ||
1017 | } | ||
1018 | return pid; | ||
1019 | } | ||
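The retry loop above is a general optimistic-read pattern: read the interesting field, issue a read barrier, re-read the pointer, and retry only if the pointer moved. Below is a hedged userspace C11 sketch of the same idea; struct cfg, current_cfg and read_cfg_version() are hypothetical names, current_cfg is assumed to be initialized before use, and published objects are assumed immutable and never freed (the analogue of the dereferenceable-parent note above).

/* Optimistic read-and-reverify, in the style of sys_getppid() above. */
#include <stdatomic.h>

struct cfg {
        int version;
};

static _Atomic(struct cfg *) current_cfg;       /* set elsewhere before use */

int read_cfg_version(void)
{
        struct cfg *p, *again;
        int version;

        p = atomic_load(&current_cfg);
        for (;;) {
                version = p->version;
                /* Full fence plays the role of the rmb() above: the field
                 * read must complete before the pointer is re-read. */
                atomic_thread_fence(memory_order_seq_cst);
                again = atomic_load(&current_cfg);
                if (again == p)
                        break;          /* pointer unchanged: value is usable */
                p = again;              /* it moved under us: try again */
        }
        return version;
}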
1020 | |||
1021 | asmlinkage long sys_getuid(void) | ||
1022 | { | ||
1023 | /* Only we change this so SMP safe */ | ||
1024 | return current->uid; | ||
1025 | } | ||
1026 | |||
1027 | asmlinkage long sys_geteuid(void) | ||
1028 | { | ||
1029 | /* Only we change this so SMP safe */ | ||
1030 | return current->euid; | ||
1031 | } | ||
1032 | |||
1033 | asmlinkage long sys_getgid(void) | ||
1034 | { | ||
1035 | /* Only we change this so SMP safe */ | ||
1036 | return current->gid; | ||
1037 | } | ||
1038 | |||
1039 | asmlinkage long sys_getegid(void) | ||
1040 | { | ||
1041 | /* Only we change this so SMP safe */ | ||
1042 | return current->egid; | ||
1043 | } | ||
1044 | |||
1045 | #endif | ||
1046 | |||
1047 | static void process_timeout(unsigned long __data) | ||
1048 | { | ||
1049 | wake_up_process((task_t *)__data); | ||
1050 | } | ||
1051 | |||
1052 | /** | ||
1053 | * schedule_timeout - sleep until timeout | ||
1054 | * @timeout: timeout value in jiffies | ||
1055 | * | ||
1056 | * Make the current task sleep until @timeout jiffies have | ||
1057 | * elapsed. The routine will return immediately unless | ||
1058 | * the current task state has been set (see set_current_state()). | ||
1059 | * | ||
1060 | * You can set the task state as follows - | ||
1061 | * | ||
1062 | * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | ||
1063 | * pass before the routine returns. The routine will return 0 | ||
1064 | * | ||
1065 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | ||
1066 | * delivered to the current task. In this case the remaining time | ||
1067 | * in jiffies will be returned, or 0 if the timer expired in time | ||
1068 | * | ||
1069 | * The current task state is guaranteed to be TASK_RUNNING when this | ||
1070 | * routine returns. | ||
1071 | * | ||
1072 | * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | ||
1073 | * the CPU away without a bound on the timeout. In this case the return | ||
1074 | * value will be %MAX_SCHEDULE_TIMEOUT. | ||
1075 | * | ||
1076 | * In all cases the return value is guaranteed to be non-negative. | ||
1077 | */ | ||
1078 | fastcall signed long __sched schedule_timeout(signed long timeout) | ||
1079 | { | ||
1080 | struct timer_list timer; | ||
1081 | unsigned long expire; | ||
1082 | |||
1083 | switch (timeout) | ||
1084 | { | ||
1085 | case MAX_SCHEDULE_TIMEOUT: | ||
1086 | /* | ||
1087 | * These two special cases are useful to make life easier | ||
1088 | * for the caller. Nothing more. We could take | ||
1089 | * MAX_SCHEDULE_TIMEOUT from one of the negative values, | ||
1090 | * but I'd like to return a valid offset (>=0) to allow | ||
1091 | * the caller to do whatever it wants with the retval. | ||
1092 | */ | ||
1093 | schedule(); | ||
1094 | goto out; | ||
1095 | default: | ||
1096 | /* | ||
1097 | * Another bit of paranoia. Note that the retval will be | ||
1098 | * 0, since no piece of the kernel is supposed to check | ||
1099 | * for a negative retval of schedule_timeout() (it | ||
1100 | * should never happen anyway). You just have the printk() | ||
1101 | * that will tell you if something has gone wrong, and where. | ||
1102 | */ | ||
1103 | if (timeout < 0) | ||
1104 | { | ||
1105 | printk(KERN_ERR "schedule_timeout: wrong timeout " | ||
1106 | "value %lx from %p\n", timeout, | ||
1107 | __builtin_return_address(0)); | ||
1108 | current->state = TASK_RUNNING; | ||
1109 | goto out; | ||
1110 | } | ||
1111 | } | ||
1112 | |||
1113 | expire = timeout + jiffies; | ||
1114 | |||
1115 | init_timer(&timer); | ||
1116 | timer.expires = expire; | ||
1117 | timer.data = (unsigned long) current; | ||
1118 | timer.function = process_timeout; | ||
1119 | |||
1120 | add_timer(&timer); | ||
1121 | schedule(); | ||
1122 | del_singleshot_timer_sync(&timer); | ||
1123 | |||
1124 | timeout = expire - jiffies; | ||
1125 | |||
1126 | out: | ||
1127 | return timeout < 0 ? 0 : timeout; | ||
1128 | } | ||
1129 | |||
1130 | EXPORT_SYMBOL(schedule_timeout); | ||
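A minimal usage sketch, following the rules in the comment block above: set the task state first, then call schedule_timeout(). The helper name and the 100 ms figure are illustrative only, and the usual <linux/sched.h> and <linux/jiffies.h> includes are assumed.

/* Illustrative caller of schedule_timeout(). */
static signed long example_wait_100ms(void)
{
        set_current_state(TASK_INTERRUPTIBLE);
        /* A non-zero return means a signal woke us with jiffies to spare. */
        return schedule_timeout(msecs_to_jiffies(100));
}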
1131 | |||
1132 | /* Thread ID - the internal kernel "pid" */ | ||
1133 | asmlinkage long sys_gettid(void) | ||
1134 | { | ||
1135 | return current->pid; | ||
1136 | } | ||
1137 | |||
1138 | static long __sched nanosleep_restart(struct restart_block *restart) | ||
1139 | { | ||
1140 | unsigned long expire = restart->arg0, now = jiffies; | ||
1141 | struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; | ||
1142 | long ret; | ||
1143 | |||
1144 | /* Did it expire while we handled signals? */ | ||
1145 | if (!time_after(expire, now)) | ||
1146 | return 0; | ||
1147 | |||
1148 | current->state = TASK_INTERRUPTIBLE; | ||
1149 | expire = schedule_timeout(expire - now); | ||
1150 | |||
1151 | ret = 0; | ||
1152 | if (expire) { | ||
1153 | struct timespec t; | ||
1154 | jiffies_to_timespec(expire, &t); | ||
1155 | |||
1156 | ret = -ERESTART_RESTARTBLOCK; | ||
1157 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | ||
1158 | ret = -EFAULT; | ||
1159 | /* The 'restart' block is already filled in */ | ||
1160 | } | ||
1161 | return ret; | ||
1162 | } | ||
1163 | |||
1164 | asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | ||
1165 | { | ||
1166 | struct timespec t; | ||
1167 | unsigned long expire; | ||
1168 | long ret; | ||
1169 | |||
1170 | if (copy_from_user(&t, rqtp, sizeof(t))) | ||
1171 | return -EFAULT; | ||
1172 | |||
1173 | if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) | ||
1174 | return -EINVAL; | ||
1175 | |||
1176 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | ||
1177 | current->state = TASK_INTERRUPTIBLE; | ||
1178 | expire = schedule_timeout(expire); | ||
1179 | |||
1180 | ret = 0; | ||
1181 | if (expire) { | ||
1182 | struct restart_block *restart; | ||
1183 | jiffies_to_timespec(expire, &t); | ||
1184 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | ||
1185 | return -EFAULT; | ||
1186 | |||
1187 | restart = ¤t_thread_info()->restart_block; | ||
1188 | restart->fn = nanosleep_restart; | ||
1189 | restart->arg0 = jiffies + expire; | ||
1190 | restart->arg1 = (unsigned long) rmtp; | ||
1191 | ret = -ERESTART_RESTARTBLOCK; | ||
1192 | } | ||
1193 | return ret; | ||
1194 | } | ||
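Seen from userspace, the rmtp value filled in above is what allows an interrupted sleep to be resumed. A small, standard POSIX example of the usual retry loop (not part of the kernel sources):

/* Restart an interrupted nanosleep() with whatever time was left. */
#include <time.h>
#include <errno.h>

void sleep_fully(struct timespec req)
{
        struct timespec rem;

        while (nanosleep(&req, &rem) == -1 && errno == EINTR)
                req = rem;      /* a signal interrupted us: sleep the rest */
}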
1195 | |||
1196 | /* | ||
1197 | * sys_sysinfo - fill in sysinfo struct | ||
1198 | */ | ||
1199 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) | ||
1200 | { | ||
1201 | struct sysinfo val; | ||
1202 | unsigned long mem_total, sav_total; | ||
1203 | unsigned int mem_unit, bitcount; | ||
1204 | unsigned long seq; | ||
1205 | |||
1206 | memset((char *)&val, 0, sizeof(struct sysinfo)); | ||
1207 | |||
1208 | do { | ||
1209 | struct timespec tp; | ||
1210 | seq = read_seqbegin(&xtime_lock); | ||
1211 | |||
1212 | /* | ||
1213 | * This is annoying. The code below does the same thing | ||
1214 | * posix_get_clock_monotonic() does, but that function wants | ||
1215 | * to take xtime_lock itself, whereas we want one locked | ||
1216 | * section that also covers the load averages. | ||
1217 | */ | ||
1218 | |||
1219 | getnstimeofday(&tp); | ||
1220 | tp.tv_sec += wall_to_monotonic.tv_sec; | ||
1221 | tp.tv_nsec += wall_to_monotonic.tv_nsec; | ||
1222 | if (tp.tv_nsec - NSEC_PER_SEC >= 0) { | ||
1223 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; | ||
1224 | tp.tv_sec++; | ||
1225 | } | ||
1226 | val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); | ||
1227 | |||
1228 | val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); | ||
1229 | val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); | ||
1230 | val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); | ||
1231 | |||
1232 | val.procs = nr_threads; | ||
1233 | } while (read_seqretry(&xtime_lock, seq)); | ||
1234 | |||
1235 | si_meminfo(&val); | ||
1236 | si_swapinfo(&val); | ||
1237 | |||
1238 | /* | ||
1239 | * If the sum of all the available memory (i.e. ram + swap) | ||
1240 | * is less than can be stored in a 32 bit unsigned long then | ||
1241 | * we can be binary compatible with 2.2.x kernels. If not, | ||
1242 | * well, in that case 2.2.x was broken anyway... | ||
1243 | * | ||
1244 | * -Erik Andersen <andersee@debian.org> | ||
1245 | */ | ||
1246 | |||
1247 | mem_total = val.totalram + val.totalswap; | ||
1248 | if (mem_total < val.totalram || mem_total < val.totalswap) | ||
1249 | goto out; | ||
1250 | bitcount = 0; | ||
1251 | mem_unit = val.mem_unit; | ||
1252 | while (mem_unit > 1) { | ||
1253 | bitcount++; | ||
1254 | mem_unit >>= 1; | ||
1255 | sav_total = mem_total; | ||
1256 | mem_total <<= 1; | ||
1257 | if (mem_total < sav_total) | ||
1258 | goto out; | ||
1259 | } | ||
1260 | |||
1261 | /* | ||
1262 | * If mem_total did not overflow, multiply all memory values by | ||
1263 | * val.mem_unit and set it to 1. This leaves things compatible | ||
1264 | * with 2.2.x, and also retains compatibility with earlier 2.4.x | ||
1265 | * kernels... | ||
1266 | */ | ||
1267 | |||
1268 | val.mem_unit = 1; | ||
1269 | val.totalram <<= bitcount; | ||
1270 | val.freeram <<= bitcount; | ||
1271 | val.sharedram <<= bitcount; | ||
1272 | val.bufferram <<= bitcount; | ||
1273 | val.totalswap <<= bitcount; | ||
1274 | val.freeswap <<= bitcount; | ||
1275 | val.totalhigh <<= bitcount; | ||
1276 | val.freehigh <<= bitcount; | ||
1277 | |||
1278 | out: | ||
1279 | if (copy_to_user(info, &val, sizeof(struct sysinfo))) | ||
1280 | return -EFAULT; | ||
1281 | |||
1282 | return 0; | ||
1283 | } | ||
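The bitcount loop above merely normalizes mem_unit to 1 by scaling every counter up to plain bytes. A toy, standalone rerun of that arithmetic with assumed numbers (4 KiB units, 64 Ki blocks of RAM); the overflow test on mem_total is reduced to a comment:

/* Worked example of the mem_unit normalization above. */
#include <stdio.h>

int main(void)
{
        unsigned int mem_unit = 4096, bitcount = 0;   /* 4 KiB units */
        unsigned long totalram = 65536;               /* RAM in mem_unit blocks */
        unsigned long mem_total = totalram;           /* would be ram + swap */

        while (mem_unit > 1) {                        /* 4096 = 2^12 */
                bitcount++;
                mem_unit >>= 1;
                mem_total <<= 1;                      /* kernel checks overflow here */
        }
        totalram <<= bitcount;                        /* 65536 << 12 = bytes */
        printf("bitcount=%u, totalram=%lu bytes\n", bitcount, totalram);
        return 0;
}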
1284 | |||
1285 | static void __devinit init_timers_cpu(int cpu) | ||
1286 | { | ||
1287 | int j; | ||
1288 | tvec_base_t *base; | ||
1289 | |||
1290 | base = &per_cpu(tvec_bases, cpu); | ||
1291 | spin_lock_init(&base->lock); | ||
1292 | for (j = 0; j < TVN_SIZE; j++) { | ||
1293 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
1294 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
1295 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
1296 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
1297 | } | ||
1298 | for (j = 0; j < TVR_SIZE; j++) | ||
1299 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
1300 | |||
1301 | base->timer_jiffies = jiffies; | ||
1302 | } | ||
1303 | |||
1304 | #ifdef CONFIG_HOTPLUG_CPU | ||
1305 | static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | ||
1306 | { | ||
1307 | struct timer_list *timer; | ||
1308 | |||
1309 | while (!list_empty(head)) { | ||
1310 | timer = list_entry(head->next, struct timer_list, entry); | ||
1311 | /* We're locking backwards from __mod_timer order here, | ||
1312 | beware deadlock. */ | ||
1313 | if (!spin_trylock(&timer->lock)) | ||
1314 | return 0; | ||
1315 | list_del(&timer->entry); | ||
1316 | internal_add_timer(new_base, timer); | ||
1317 | timer->base = new_base; | ||
1318 | spin_unlock(&timer->lock); | ||
1319 | } | ||
1320 | return 1; | ||
1321 | } | ||
1322 | |||
1323 | static void __devinit migrate_timers(int cpu) | ||
1324 | { | ||
1325 | tvec_base_t *old_base; | ||
1326 | tvec_base_t *new_base; | ||
1327 | int i; | ||
1328 | |||
1329 | BUG_ON(cpu_online(cpu)); | ||
1330 | old_base = &per_cpu(tvec_bases, cpu); | ||
1331 | new_base = &get_cpu_var(tvec_bases); | ||
1332 | |||
1333 | local_irq_disable(); | ||
1334 | again: | ||
1335 | /* Prevent deadlocks via ordering by old_base < new_base. */ | ||
1336 | if (old_base < new_base) { | ||
1337 | spin_lock(&new_base->lock); | ||
1338 | spin_lock(&old_base->lock); | ||
1339 | } else { | ||
1340 | spin_lock(&old_base->lock); | ||
1341 | spin_lock(&new_base->lock); | ||
1342 | } | ||
1343 | |||
1344 | if (old_base->running_timer) | ||
1345 | BUG(); | ||
1346 | for (i = 0; i < TVR_SIZE; i++) | ||
1347 | if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) | ||
1348 | goto unlock_again; | ||
1349 | for (i = 0; i < TVN_SIZE; i++) | ||
1350 | if (!migrate_timer_list(new_base, old_base->tv2.vec + i) | ||
1351 | || !migrate_timer_list(new_base, old_base->tv3.vec + i) | ||
1352 | || !migrate_timer_list(new_base, old_base->tv4.vec + i) | ||
1353 | || !migrate_timer_list(new_base, old_base->tv5.vec + i)) | ||
1354 | goto unlock_again; | ||
1355 | spin_unlock(&old_base->lock); | ||
1356 | spin_unlock(&new_base->lock); | ||
1357 | local_irq_enable(); | ||
1358 | put_cpu_var(tvec_bases); | ||
1359 | return; | ||
1360 | |||
1361 | unlock_again: | ||
1362 | /* Avoid deadlock with __mod_timer, by backing off. */ | ||
1363 | spin_unlock(&old_base->lock); | ||
1364 | spin_unlock(&new_base->lock); | ||
1365 | cpu_relax(); | ||
1366 | goto again; | ||
1367 | } | ||
1368 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
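migrate_timers() dodges the ABBA deadlock by always taking the two base locks in address order. The same idiom in isolation, as a hedged sketch: lock_pair() is an illustrative helper rather than a kernel API, the two locks are assumed to be distinct, and <linux/spinlock.h> is assumed to be included.

/* Take two spinlocks in a globally consistent (address) order. */
static void lock_pair(spinlock_t *a, spinlock_t *b)
{
        if (a < b) {
                spin_lock(a);
                spin_lock(b);
        } else {
                spin_lock(b);
                spin_lock(a);
        }
        /* unlock in any order later; only the acquisition order matters */
}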
1369 | |||
1370 | static int __devinit timer_cpu_notify(struct notifier_block *self, | ||
1371 | unsigned long action, void *hcpu) | ||
1372 | { | ||
1373 | long cpu = (long)hcpu; | ||
1374 | switch(action) { | ||
1375 | case CPU_UP_PREPARE: | ||
1376 | init_timers_cpu(cpu); | ||
1377 | break; | ||
1378 | #ifdef CONFIG_HOTPLUG_CPU | ||
1379 | case CPU_DEAD: | ||
1380 | migrate_timers(cpu); | ||
1381 | break; | ||
1382 | #endif | ||
1383 | default: | ||
1384 | break; | ||
1385 | } | ||
1386 | return NOTIFY_OK; | ||
1387 | } | ||
1388 | |||
1389 | static struct notifier_block __devinitdata timers_nb = { | ||
1390 | .notifier_call = timer_cpu_notify, | ||
1391 | }; | ||
1392 | |||
1393 | |||
1394 | void __init init_timers(void) | ||
1395 | { | ||
1396 | timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | ||
1397 | (void *)(long)smp_processor_id()); | ||
1398 | register_cpu_notifier(&timers_nb); | ||
1399 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); | ||
1400 | } | ||
1401 | |||
1402 | #ifdef CONFIG_TIME_INTERPOLATION | ||
1403 | |||
1404 | struct time_interpolator *time_interpolator; | ||
1405 | static struct time_interpolator *time_interpolator_list; | ||
1406 | static DEFINE_SPINLOCK(time_interpolator_lock); | ||
1407 | |||
1408 | static inline u64 time_interpolator_get_cycles(unsigned int src) | ||
1409 | { | ||
1410 | unsigned long (*x)(void); | ||
1411 | |||
1412 | switch (src) | ||
1413 | { | ||
1414 | case TIME_SOURCE_FUNCTION: | ||
1415 | x = time_interpolator->addr; | ||
1416 | return x(); | ||
1417 | |||
1418 | case TIME_SOURCE_MMIO64 : | ||
1419 | return readq((void __iomem *) time_interpolator->addr); | ||
1420 | |||
1421 | case TIME_SOURCE_MMIO32 : | ||
1422 | return readl((void __iomem *) time_interpolator->addr); | ||
1423 | |||
1424 | default: return get_cycles(); | ||
1425 | } | ||
1426 | } | ||
1427 | |||
1428 | static inline u64 time_interpolator_get_counter(void) | ||
1429 | { | ||
1430 | unsigned int src = time_interpolator->source; | ||
1431 | |||
1432 | if (time_interpolator->jitter) | ||
1433 | { | ||
1434 | u64 lcycle; | ||
1435 | u64 now; | ||
1436 | |||
1437 | do { | ||
1438 | lcycle = time_interpolator->last_cycle; | ||
1439 | now = time_interpolator_get_cycles(src); | ||
1440 | if (lcycle && time_after(lcycle, now)) | ||
1441 | return lcycle; | ||
1442 | /* Keep track of the last timer value returned. The use of cmpxchg here | ||
1443 | * will cause contention in an SMP environment. | ||
1444 | */ | ||
1445 | } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); | ||
1446 | return now; | ||
1447 | } | ||
1448 | else | ||
1449 | return time_interpolator_get_cycles(src); | ||
1450 | } | ||
1451 | |||
1452 | void time_interpolator_reset(void) | ||
1453 | { | ||
1454 | time_interpolator->offset = 0; | ||
1455 | time_interpolator->last_counter = time_interpolator_get_counter(); | ||
1456 | } | ||
1457 | |||
1458 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) | ||
1459 | |||
1460 | unsigned long time_interpolator_get_offset(void) | ||
1461 | { | ||
1462 | /* If we do not have a time interpolator set up then just return zero */ | ||
1463 | if (!time_interpolator) | ||
1464 | return 0; | ||
1465 | |||
1466 | return time_interpolator->offset + | ||
1467 | GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); | ||
1468 | } | ||
1469 | |||
1470 | #define INTERPOLATOR_ADJUST 65536 | ||
1471 | #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST | ||
1472 | |||
1473 | static void time_interpolator_update(long delta_nsec) | ||
1474 | { | ||
1475 | u64 counter; | ||
1476 | unsigned long offset; | ||
1477 | |||
1478 | /* If there is no time interpolator set up then do nothing */ | ||
1479 | if (!time_interpolator) | ||
1480 | return; | ||
1481 | |||
1482 | /* The interpolator compensates for late ticks by accumulating | ||
1483 | * the late time in time_interpolator->offset. A tick earlier than | ||
1484 | * expected will lead to a reset of the offset and a corresponding | ||
1485 | * jump of the clock forward. Again this only works if the | ||
1486 | * interpolator clock is running slightly slower than the regular clock | ||
1487 | * and the tuning logic ensures that. | ||
1488 | */ | ||
1489 | |||
1490 | counter = time_interpolator_get_counter(); | ||
1491 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); | ||
1492 | |||
1493 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | ||
1494 | time_interpolator->offset = offset - delta_nsec; | ||
1495 | else { | ||
1496 | time_interpolator->skips++; | ||
1497 | time_interpolator->ns_skipped += delta_nsec - offset; | ||
1498 | time_interpolator->offset = 0; | ||
1499 | } | ||
1500 | time_interpolator->last_counter = counter; | ||
1501 | |||
1502 | /* Tuning logic for time interpolator invoked every minute or so. | ||
1503 | * Decrease interpolator clock speed if no skips occurred and an offset is carried. | ||
1504 | * Increase interpolator clock speed if we skip too much time. | ||
1505 | */ | ||
1506 | if (jiffies % INTERPOLATOR_ADJUST == 0) | ||
1507 | { | ||
1508 | if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) | ||
1509 | time_interpolator->nsec_per_cyc--; | ||
1510 | if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) | ||
1511 | time_interpolator->nsec_per_cyc++; | ||
1512 | time_interpolator->skips = 0; | ||
1513 | time_interpolator->ns_skipped = 0; | ||
1514 | } | ||
1515 | } | ||
1516 | |||
1517 | static inline int | ||
1518 | is_better_time_interpolator(struct time_interpolator *new) | ||
1519 | { | ||
1520 | if (!time_interpolator) | ||
1521 | return 1; | ||
1522 | return new->frequency > 2*time_interpolator->frequency || | ||
1523 | (unsigned long)new->drift < (unsigned long)time_interpolator->drift; | ||
1524 | } | ||
1525 | |||
1526 | void | ||
1527 | register_time_interpolator(struct time_interpolator *ti) | ||
1528 | { | ||
1529 | unsigned long flags; | ||
1530 | |||
1531 | /* Sanity check */ | ||
1532 | if (ti->frequency == 0 || ti->mask == 0) | ||
1533 | BUG(); | ||
1534 | |||
1535 | ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; | ||
1536 | spin_lock(&time_interpolator_lock); | ||
1537 | write_seqlock_irqsave(&xtime_lock, flags); | ||
1538 | if (is_better_time_interpolator(ti)) { | ||
1539 | time_interpolator = ti; | ||
1540 | time_interpolator_reset(); | ||
1541 | } | ||
1542 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1543 | |||
1544 | ti->next = time_interpolator_list; | ||
1545 | time_interpolator_list = ti; | ||
1546 | spin_unlock(&time_interpolator_lock); | ||
1547 | } | ||
1548 | |||
1549 | void | ||
1550 | unregister_time_interpolator(struct time_interpolator *ti) | ||
1551 | { | ||
1552 | struct time_interpolator *curr, **prev; | ||
1553 | unsigned long flags; | ||
1554 | |||
1555 | spin_lock(&time_interpolator_lock); | ||
1556 | prev = &time_interpolator_list; | ||
1557 | for (curr = *prev; curr; curr = curr->next) { | ||
1558 | if (curr == ti) { | ||
1559 | *prev = curr->next; | ||
1560 | break; | ||
1561 | } | ||
1562 | prev = &curr->next; | ||
1563 | } | ||
1564 | |||
1565 | write_seqlock_irqsave(&xtime_lock, flags); | ||
1566 | if (ti == time_interpolator) { | ||
1567 | /* we lost the best time-interpolator: */ | ||
1568 | time_interpolator = NULL; | ||
1569 | /* find the next-best interpolator */ | ||
1570 | for (curr = time_interpolator_list; curr; curr = curr->next) | ||
1571 | if (is_better_time_interpolator(curr)) | ||
1572 | time_interpolator = curr; | ||
1573 | time_interpolator_reset(); | ||
1574 | } | ||
1575 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1576 | spin_unlock(&time_interpolator_lock); | ||
1577 | } | ||
1578 | #endif /* CONFIG_TIME_INTERPOLATION */ | ||
1579 | |||
1580 | /** | ||
1581 | * msleep - sleep safely even with waitqueue interruptions | ||
1582 | * @msecs: Time in milliseconds to sleep for | ||
1583 | */ | ||
1584 | void msleep(unsigned int msecs) | ||
1585 | { | ||
1586 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | ||
1587 | |||
1588 | while (timeout) { | ||
1589 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1590 | timeout = schedule_timeout(timeout); | ||
1591 | } | ||
1592 | } | ||
1593 | |||
1594 | EXPORT_SYMBOL(msleep); | ||
1595 | |||
1596 | /** | ||
1597 | * msleep_interruptible - sleep waiting for waitqueue interruptions | ||
1598 | * @msecs: Time in milliseconds to sleep for | ||
1599 | */ | ||
1600 | unsigned long msleep_interruptible(unsigned int msecs) | ||
1601 | { | ||
1602 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | ||
1603 | |||
1604 | while (timeout && !signal_pending(current)) { | ||
1605 | set_current_state(TASK_INTERRUPTIBLE); | ||
1606 | timeout = schedule_timeout(timeout); | ||
1607 | } | ||
1608 | return jiffies_to_msecs(timeout); | ||
1609 | } | ||
1610 | |||
1611 | EXPORT_SYMBOL(msleep_interruptible); | ||
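Typical callers of the two helpers above, sketched with made-up delay values: msleep() for an unconditional delay, msleep_interruptible() when a pending signal is allowed to cut the wait short.

/* Illustrative delays; the values and the function name are made up. */
static void example_delays(void)
{
        unsigned long left_ms;

        msleep(50);                             /* sleep for at least 50 ms */

        left_ms = msleep_interruptible(1000);   /* up to one second, signal-aware */
        if (left_ms)
                printk(KERN_DEBUG "woken early, %lu ms left\n", left_ms);
}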
diff --git a/kernel/uid16.c b/kernel/uid16.c new file mode 100644 index 000000000000..f669941e8b26 --- /dev/null +++ b/kernel/uid16.c | |||
@@ -0,0 +1,196 @@ | |||
1 | /* | ||
2 | * Wrapper functions for 16-bit uid backward compatibility. All nicely tied | ||
3 | * together in the faint hope we can take them out in five years' time. | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/utsname.h> | ||
8 | #include <linux/mman.h> | ||
9 | #include <linux/smp_lock.h> | ||
10 | #include <linux/notifier.h> | ||
11 | #include <linux/reboot.h> | ||
12 | #include <linux/prctl.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/highuid.h> | ||
15 | #include <linux/security.h> | ||
16 | #include <linux/syscalls.h> | ||
17 | |||
18 | #include <asm/uaccess.h> | ||
19 | |||
20 | asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) | ||
21 | { | ||
22 | return sys_chown(filename, low2highuid(user), low2highgid(group)); | ||
23 | } | ||
24 | |||
25 | asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) | ||
26 | { | ||
27 | return sys_lchown(filename, low2highuid(user), low2highgid(group)); | ||
28 | } | ||
29 | |||
30 | asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) | ||
31 | { | ||
32 | return sys_fchown(fd, low2highuid(user), low2highgid(group)); | ||
33 | } | ||
34 | |||
35 | asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) | ||
36 | { | ||
37 | return sys_setregid(low2highgid(rgid), low2highgid(egid)); | ||
38 | } | ||
39 | |||
40 | asmlinkage long sys_setgid16(old_gid_t gid) | ||
41 | { | ||
42 | return sys_setgid(low2highgid(gid)); | ||
43 | } | ||
44 | |||
45 | asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) | ||
46 | { | ||
47 | return sys_setreuid(low2highuid(ruid), low2highuid(euid)); | ||
48 | } | ||
49 | |||
50 | asmlinkage long sys_setuid16(old_uid_t uid) | ||
51 | { | ||
52 | return sys_setuid(low2highuid(uid)); | ||
53 | } | ||
54 | |||
55 | asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) | ||
56 | { | ||
57 | return sys_setresuid(low2highuid(ruid), low2highuid(euid), | ||
58 | low2highuid(suid)); | ||
59 | } | ||
60 | |||
61 | asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) | ||
62 | { | ||
63 | int retval; | ||
64 | |||
65 | if (!(retval = put_user(high2lowuid(current->uid), ruid)) && | ||
66 | !(retval = put_user(high2lowuid(current->euid), euid))) | ||
67 | retval = put_user(high2lowuid(current->suid), suid); | ||
68 | |||
69 | return retval; | ||
70 | } | ||
71 | |||
72 | asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) | ||
73 | { | ||
74 | return sys_setresgid(low2highgid(rgid), low2highgid(egid), | ||
75 | low2highgid(sgid)); | ||
76 | } | ||
77 | |||
78 | asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) | ||
79 | { | ||
80 | int retval; | ||
81 | |||
82 | if (!(retval = put_user(high2lowgid(current->gid), rgid)) && | ||
83 | !(retval = put_user(high2lowgid(current->egid), egid))) | ||
84 | retval = put_user(high2lowgid(current->sgid), sgid); | ||
85 | |||
86 | return retval; | ||
87 | } | ||
88 | |||
89 | asmlinkage long sys_setfsuid16(old_uid_t uid) | ||
90 | { | ||
91 | return sys_setfsuid(low2highuid(uid)); | ||
92 | } | ||
93 | |||
94 | asmlinkage long sys_setfsgid16(old_gid_t gid) | ||
95 | { | ||
96 | return sys_setfsgid(low2highgid(gid)); | ||
97 | } | ||
98 | |||
99 | static int groups16_to_user(old_gid_t __user *grouplist, | ||
100 | struct group_info *group_info) | ||
101 | { | ||
102 | int i; | ||
103 | old_gid_t group; | ||
104 | |||
105 | for (i = 0; i < group_info->ngroups; i++) { | ||
106 | group = high2lowgid(GROUP_AT(group_info, i)); | ||
107 | if (put_user(group, grouplist+i)) | ||
108 | return -EFAULT; | ||
109 | } | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | static int groups16_from_user(struct group_info *group_info, | ||
115 | old_gid_t __user *grouplist) | ||
116 | { | ||
117 | int i; | ||
118 | old_gid_t group; | ||
119 | |||
120 | for (i = 0; i < group_info->ngroups; i++) { | ||
121 | if (get_user(group, grouplist+i)) | ||
122 | return -EFAULT; | ||
123 | GROUP_AT(group_info, i) = low2highgid(group); | ||
124 | } | ||
125 | |||
126 | return 0; | ||
127 | } | ||
128 | |||
129 | asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) | ||
130 | { | ||
131 | int i = 0; | ||
132 | |||
133 | if (gidsetsize < 0) | ||
134 | return -EINVAL; | ||
135 | |||
136 | get_group_info(current->group_info); | ||
137 | i = current->group_info->ngroups; | ||
138 | if (gidsetsize) { | ||
139 | if (i > gidsetsize) { | ||
140 | i = -EINVAL; | ||
141 | goto out; | ||
142 | } | ||
143 | if (groups16_to_user(grouplist, current->group_info)) { | ||
144 | i = -EFAULT; | ||
145 | goto out; | ||
146 | } | ||
147 | } | ||
148 | out: | ||
149 | put_group_info(current->group_info); | ||
150 | return i; | ||
151 | } | ||
152 | |||
153 | asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) | ||
154 | { | ||
155 | struct group_info *group_info; | ||
156 | int retval; | ||
157 | |||
158 | if (!capable(CAP_SETGID)) | ||
159 | return -EPERM; | ||
160 | if ((unsigned)gidsetsize > NGROUPS_MAX) | ||
161 | return -EINVAL; | ||
162 | |||
163 | group_info = groups_alloc(gidsetsize); | ||
164 | if (!group_info) | ||
165 | return -ENOMEM; | ||
166 | retval = groups16_from_user(group_info, grouplist); | ||
167 | if (retval) { | ||
168 | put_group_info(group_info); | ||
169 | return retval; | ||
170 | } | ||
171 | |||
172 | retval = set_current_groups(group_info); | ||
173 | put_group_info(group_info); | ||
174 | |||
175 | return retval; | ||
176 | } | ||
177 | |||
178 | asmlinkage long sys_getuid16(void) | ||
179 | { | ||
180 | return high2lowuid(current->uid); | ||
181 | } | ||
182 | |||
183 | asmlinkage long sys_geteuid16(void) | ||
184 | { | ||
185 | return high2lowuid(current->euid); | ||
186 | } | ||
187 | |||
188 | asmlinkage long sys_getgid16(void) | ||
189 | { | ||
190 | return high2lowgid(current->gid); | ||
191 | } | ||
192 | |||
193 | asmlinkage long sys_getegid16(void) | ||
194 | { | ||
195 | return high2lowgid(current->egid); | ||
196 | } | ||
diff --git a/kernel/user.c b/kernel/user.c new file mode 100644 index 000000000000..734575d55769 --- /dev/null +++ b/kernel/user.c | |||
@@ -0,0 +1,189 @@ | |||
1 | /* | ||
2 | * The "user cache". | ||
3 | * | ||
4 | * (C) Copyright 1991-2000 Linus Torvalds | ||
5 | * | ||
6 | * We have a per-user structure to keep track of how many | ||
7 | * processes, files etc the user has claimed, in order to be | ||
8 | * able to have per-user limits for system resources. | ||
9 | */ | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/bitops.h> | ||
15 | #include <linux/key.h> | ||
16 | |||
17 | /* | ||
18 | * UID task count cache, to get fast user lookup in "alloc_uid" | ||
19 | * when changing user ID's (ie setuid() and friends). | ||
20 | */ | ||
21 | |||
22 | #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8) | ||
23 | #define UIDHASH_SZ (1 << UIDHASH_BITS) | ||
24 | #define UIDHASH_MASK (UIDHASH_SZ - 1) | ||
25 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) | ||
26 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) | ||
27 | |||
28 | static kmem_cache_t *uid_cachep; | ||
29 | static struct list_head uidhash_table[UIDHASH_SZ]; | ||
30 | static DEFINE_SPINLOCK(uidhash_lock); | ||
31 | |||
32 | struct user_struct root_user = { | ||
33 | .__count = ATOMIC_INIT(1), | ||
34 | .processes = ATOMIC_INIT(1), | ||
35 | .files = ATOMIC_INIT(0), | ||
36 | .sigpending = ATOMIC_INIT(0), | ||
37 | .mq_bytes = 0, | ||
38 | .locked_shm = 0, | ||
39 | #ifdef CONFIG_KEYS | ||
40 | .uid_keyring = &root_user_keyring, | ||
41 | .session_keyring = &root_session_keyring, | ||
42 | #endif | ||
43 | }; | ||
44 | |||
45 | /* | ||
46 | * These routines must be called with the uidhash spinlock held! | ||
47 | */ | ||
48 | static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) | ||
49 | { | ||
50 | list_add(&up->uidhash_list, hashent); | ||
51 | } | ||
52 | |||
53 | static inline void uid_hash_remove(struct user_struct *up) | ||
54 | { | ||
55 | list_del(&up->uidhash_list); | ||
56 | } | ||
57 | |||
58 | static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) | ||
59 | { | ||
60 | struct list_head *up; | ||
61 | |||
62 | list_for_each(up, hashent) { | ||
63 | struct user_struct *user; | ||
64 | |||
65 | user = list_entry(up, struct user_struct, uidhash_list); | ||
66 | |||
67 | if(user->uid == uid) { | ||
68 | atomic_inc(&user->__count); | ||
69 | return user; | ||
70 | } | ||
71 | } | ||
72 | |||
73 | return NULL; | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Locate the user_struct for the passed UID. If found, take a ref on it. The | ||
78 | * caller must undo that ref with free_uid(). | ||
79 | * | ||
80 | * If the user_struct could not be found, return NULL. | ||
81 | */ | ||
82 | struct user_struct *find_user(uid_t uid) | ||
83 | { | ||
84 | struct user_struct *ret; | ||
85 | |||
86 | spin_lock(&uidhash_lock); | ||
87 | ret = uid_hash_find(uid, uidhashentry(uid)); | ||
88 | spin_unlock(&uidhash_lock); | ||
89 | return ret; | ||
90 | } | ||
91 | |||
92 | void free_uid(struct user_struct *up) | ||
93 | { | ||
94 | if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { | ||
95 | uid_hash_remove(up); | ||
96 | key_put(up->uid_keyring); | ||
97 | key_put(up->session_keyring); | ||
98 | kmem_cache_free(uid_cachep, up); | ||
99 | spin_unlock(&uidhash_lock); | ||
100 | } | ||
101 | } | ||
102 | |||
103 | struct user_struct * alloc_uid(uid_t uid) | ||
104 | { | ||
105 | struct list_head *hashent = uidhashentry(uid); | ||
106 | struct user_struct *up; | ||
107 | |||
108 | spin_lock(&uidhash_lock); | ||
109 | up = uid_hash_find(uid, hashent); | ||
110 | spin_unlock(&uidhash_lock); | ||
111 | |||
112 | if (!up) { | ||
113 | struct user_struct *new; | ||
114 | |||
115 | new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); | ||
116 | if (!new) | ||
117 | return NULL; | ||
118 | new->uid = uid; | ||
119 | atomic_set(&new->__count, 1); | ||
120 | atomic_set(&new->processes, 0); | ||
121 | atomic_set(&new->files, 0); | ||
122 | atomic_set(&new->sigpending, 0); | ||
123 | |||
124 | new->mq_bytes = 0; | ||
125 | new->locked_shm = 0; | ||
126 | |||
127 | if (alloc_uid_keyring(new) < 0) { | ||
128 | kmem_cache_free(uid_cachep, new); | ||
129 | return NULL; | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Before adding this, check whether we raced | ||
134 | * on adding the same user already.. | ||
135 | */ | ||
136 | spin_lock(&uidhash_lock); | ||
137 | up = uid_hash_find(uid, hashent); | ||
138 | if (up) { | ||
139 | key_put(new->uid_keyring); | ||
140 | key_put(new->session_keyring); | ||
141 | kmem_cache_free(uid_cachep, new); | ||
142 | } else { | ||
143 | uid_hash_insert(new, hashent); | ||
144 | up = new; | ||
145 | } | ||
146 | spin_unlock(&uidhash_lock); | ||
147 | |||
148 | } | ||
149 | return up; | ||
150 | } | ||
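The race check in alloc_uid() above follows the usual shape: look up without allocating, allocate with no lock held (the allocation may sleep), then re-check under the lock and throw the new object away if another CPU won. A reduced, kernel-style sketch of that shape using a plain linked list; everything named obj_* is illustrative, and only the locking and allocation primitives are real API.

/* Race-free "get or create" in the style of alloc_uid() above. */
#include <linux/slab.h>
#include <linux/spinlock.h>

struct obj {
        int key;
        struct obj *next;
};

static struct obj *obj_list;
static DEFINE_SPINLOCK(obj_lock);

static struct obj *obj_find(int key)
{
        struct obj *o;

        for (o = obj_list; o; o = o->next)
                if (o->key == key)
                        return o;
        return NULL;
}

static struct obj *obj_get_or_create(int key)
{
        struct obj *o, *new;

        spin_lock(&obj_lock);
        o = obj_find(key);              /* fast path: it already exists */
        spin_unlock(&obj_lock);
        if (o)
                return o;

        new = kmalloc(sizeof(*new), GFP_KERNEL);        /* may sleep: no lock held */
        if (!new)
                return NULL;
        new->key = key;

        spin_lock(&obj_lock);
        o = obj_find(key);              /* did another CPU beat us to it? */
        if (o) {
                kfree(new);             /* yes: drop our copy, use the winner */
        } else {
                new->next = obj_list;   /* no: publish ours */
                obj_list = new;
                o = new;
        }
        spin_unlock(&obj_lock);
        return o;
}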
151 | |||
152 | void switch_uid(struct user_struct *new_user) | ||
153 | { | ||
154 | struct user_struct *old_user; | ||
155 | |||
156 | /* What if a process setreuid()'s and this brings the | ||
157 | * new uid over his NPROC rlimit? We can check this now | ||
158 | * cheaply with the new uid cache, so if it matters | ||
159 | * we should be checking for it. -DaveM | ||
160 | */ | ||
161 | old_user = current->user; | ||
162 | atomic_inc(&new_user->processes); | ||
163 | atomic_dec(&old_user->processes); | ||
164 | switch_uid_keyring(new_user); | ||
165 | current->user = new_user; | ||
166 | free_uid(old_user); | ||
167 | suid_keys(current); | ||
168 | } | ||
169 | |||
170 | |||
171 | static int __init uid_cache_init(void) | ||
172 | { | ||
173 | int n; | ||
174 | |||
175 | uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), | ||
176 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | ||
177 | |||
178 | for(n = 0; n < UIDHASH_SZ; ++n) | ||
179 | INIT_LIST_HEAD(uidhash_table + n); | ||
180 | |||
181 | /* Insert the root user immediately (init already runs as root) */ | ||
182 | spin_lock(&uidhash_lock); | ||
183 | uid_hash_insert(&root_user, uidhashentry(0)); | ||
184 | spin_unlock(&uidhash_lock); | ||
185 | |||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | module_init(uid_cache_init); | ||
diff --git a/kernel/wait.c b/kernel/wait.c new file mode 100644 index 000000000000..791681cfea98 --- /dev/null +++ b/kernel/wait.c | |||
@@ -0,0 +1,246 @@ | |||
1 | /* | ||
2 | * Generic waiting primitives. | ||
3 | * | ||
4 | * (C) 2004 William Irwin, Oracle | ||
5 | */ | ||
6 | #include <linux/config.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/sched.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/wait.h> | ||
12 | #include <linux/hash.h> | ||
13 | |||
14 | void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) | ||
15 | { | ||
16 | unsigned long flags; | ||
17 | |||
18 | wait->flags &= ~WQ_FLAG_EXCLUSIVE; | ||
19 | spin_lock_irqsave(&q->lock, flags); | ||
20 | __add_wait_queue(q, wait); | ||
21 | spin_unlock_irqrestore(&q->lock, flags); | ||
22 | } | ||
23 | EXPORT_SYMBOL(add_wait_queue); | ||
24 | |||
25 | void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) | ||
26 | { | ||
27 | unsigned long flags; | ||
28 | |||
29 | wait->flags |= WQ_FLAG_EXCLUSIVE; | ||
30 | spin_lock_irqsave(&q->lock, flags); | ||
31 | __add_wait_queue_tail(q, wait); | ||
32 | spin_unlock_irqrestore(&q->lock, flags); | ||
33 | } | ||
34 | EXPORT_SYMBOL(add_wait_queue_exclusive); | ||
35 | |||
36 | void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) | ||
37 | { | ||
38 | unsigned long flags; | ||
39 | |||
40 | spin_lock_irqsave(&q->lock, flags); | ||
41 | __remove_wait_queue(q, wait); | ||
42 | spin_unlock_irqrestore(&q->lock, flags); | ||
43 | } | ||
44 | EXPORT_SYMBOL(remove_wait_queue); | ||
45 | |||
46 | |||
47 | /* | ||
48 | * Note: we use "set_current_state()" _after_ the wait-queue add, | ||
49 | * because we need a memory barrier there on SMP, so that any | ||
50 | * wake-function that tests for the wait-queue being active | ||
51 | * will be guaranteed to see waitqueue addition _or_ subsequent | ||
52 | * tests in this thread will see the wakeup having taken place. | ||
53 | * | ||
54 | * The spin_unlock() itself is semi-permeable and only protects | ||
55 | * one way (it only protects stuff inside the critical region and | ||
56 | * stops them from bleeding out - it would still allow subsequent | ||
57 | * loads to move into the critical region). | ||
58 | */ | ||
59 | void fastcall | ||
60 | prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
61 | { | ||
62 | unsigned long flags; | ||
63 | |||
64 | wait->flags &= ~WQ_FLAG_EXCLUSIVE; | ||
65 | spin_lock_irqsave(&q->lock, flags); | ||
66 | if (list_empty(&wait->task_list)) | ||
67 | __add_wait_queue(q, wait); | ||
68 | /* | ||
69 | * don't alter the task state if this is just going to | ||
70 | * queue an async wait queue callback | ||
71 | */ | ||
72 | if (is_sync_wait(wait)) | ||
73 | set_current_state(state); | ||
74 | spin_unlock_irqrestore(&q->lock, flags); | ||
75 | } | ||
76 | EXPORT_SYMBOL(prepare_to_wait); | ||
77 | |||
78 | void fastcall | ||
79 | prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
80 | { | ||
81 | unsigned long flags; | ||
82 | |||
83 | wait->flags |= WQ_FLAG_EXCLUSIVE; | ||
84 | spin_lock_irqsave(&q->lock, flags); | ||
85 | if (list_empty(&wait->task_list)) | ||
86 | __add_wait_queue_tail(q, wait); | ||
87 | /* | ||
88 | * don't alter the task state if this is just going to | ||
89 | * queue an async wait queue callback | ||
90 | */ | ||
91 | if (is_sync_wait(wait)) | ||
92 | set_current_state(state); | ||
93 | spin_unlock_irqrestore(&q->lock, flags); | ||
94 | } | ||
95 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | ||
96 | |||
97 | void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | ||
98 | { | ||
99 | unsigned long flags; | ||
100 | |||
101 | __set_current_state(TASK_RUNNING); | ||
102 | /* | ||
103 | * We can check for list emptiness outside the lock | ||
104 | * IFF: | ||
105 | * - we use the "careful" check that verifies both | ||
106 | * the next and prev pointers, so that there cannot | ||
107 | * be any half-pending updates in progress on other | ||
108 | * CPUs that we haven't seen yet (and that might | ||
109 | * still change the stack area). | ||
110 | * and | ||
111 | * - all other users take the lock (ie we can only | ||
112 | * have _one_ other CPU that looks at or modifies | ||
113 | * the list). | ||
114 | */ | ||
115 | if (!list_empty_careful(&wait->task_list)) { | ||
116 | spin_lock_irqsave(&q->lock, flags); | ||
117 | list_del_init(&wait->task_list); | ||
118 | spin_unlock_irqrestore(&q->lock, flags); | ||
119 | } | ||
120 | } | ||
121 | EXPORT_SYMBOL(finish_wait); | ||
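prepare_to_wait()/finish_wait() are meant for the canonical wait loop: register on the queue, test the condition, and sleep only if it is still false. A hedged sketch follows; my_waitqueue, condition_met and wait_for_condition() are illustrative names, the prepare/finish calls and DEFINE_WAIT() are the real API.

/* Canonical wait loop built on the helpers above. */
static DECLARE_WAIT_QUEUE_HEAD(my_waitqueue);
static int condition_met;

static int wait_for_condition(void)
{
        DEFINE_WAIT(wait);
        int ret = 0;

        for (;;) {
                prepare_to_wait(&my_waitqueue, &wait, TASK_INTERRUPTIBLE);
                if (condition_met)
                        break;
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                schedule();
        }
        finish_wait(&my_waitqueue, &wait);
        return ret;
}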
122 | |||
123 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
124 | { | ||
125 | int ret = default_wake_function(wait, mode, sync, key); | ||
126 | |||
127 | if (ret) | ||
128 | list_del_init(&wait->task_list); | ||
129 | return ret; | ||
130 | } | ||
131 | EXPORT_SYMBOL(autoremove_wake_function); | ||
132 | |||
133 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) | ||
134 | { | ||
135 | struct wait_bit_key *key = arg; | ||
136 | struct wait_bit_queue *wait_bit | ||
137 | = container_of(wait, struct wait_bit_queue, wait); | ||
138 | |||
139 | if (wait_bit->key.flags != key->flags || | ||
140 | wait_bit->key.bit_nr != key->bit_nr || | ||
141 | test_bit(key->bit_nr, key->flags)) | ||
142 | return 0; | ||
143 | else | ||
144 | return autoremove_wake_function(wait, mode, sync, key); | ||
145 | } | ||
146 | EXPORT_SYMBOL(wake_bit_function); | ||
147 | |||
148 | /* | ||
149 | * To allow interruptible waiting and asynchronous (i.e. nonblocking) | ||
150 | * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are | ||
151 | * permitted return codes. Nonzero return codes halt waiting and return. | ||
152 | */ | ||
153 | int __sched fastcall | ||
154 | __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, | ||
155 | int (*action)(void *), unsigned mode) | ||
156 | { | ||
157 | int ret = 0; | ||
158 | |||
159 | do { | ||
160 | prepare_to_wait(wq, &q->wait, mode); | ||
161 | if (test_bit(q->key.bit_nr, q->key.flags)) | ||
162 | ret = (*action)(q->key.flags); | ||
163 | } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); | ||
164 | finish_wait(wq, &q->wait); | ||
165 | return ret; | ||
166 | } | ||
167 | EXPORT_SYMBOL(__wait_on_bit); | ||
168 | |||
169 | int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, | ||
170 | int (*action)(void *), unsigned mode) | ||
171 | { | ||
172 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | ||
173 | DEFINE_WAIT_BIT(wait, word, bit); | ||
174 | |||
175 | return __wait_on_bit(wq, &wait, action, mode); | ||
176 | } | ||
177 | EXPORT_SYMBOL(out_of_line_wait_on_bit); | ||
178 | |||
179 | int __sched fastcall | ||
180 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | ||
181 | int (*action)(void *), unsigned mode) | ||
182 | { | ||
183 | int ret = 0; | ||
184 | |||
185 | do { | ||
186 | prepare_to_wait_exclusive(wq, &q->wait, mode); | ||
187 | if (test_bit(q->key.bit_nr, q->key.flags)) { | ||
188 | if ((ret = (*action)(q->key.flags))) | ||
189 | break; | ||
190 | } | ||
191 | } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); | ||
192 | finish_wait(wq, &q->wait); | ||
193 | return ret; | ||
194 | } | ||
195 | EXPORT_SYMBOL(__wait_on_bit_lock); | ||
196 | |||
197 | int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, | ||
198 | int (*action)(void *), unsigned mode) | ||
199 | { | ||
200 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | ||
201 | DEFINE_WAIT_BIT(wait, word, bit); | ||
202 | |||
203 | return __wait_on_bit_lock(wq, &wait, action, mode); | ||
204 | } | ||
205 | EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); | ||
206 | |||
207 | void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) | ||
208 | { | ||
209 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); | ||
210 | if (waitqueue_active(wq)) | ||
211 | __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key); | ||
212 | } | ||
213 | EXPORT_SYMBOL(__wake_up_bit); | ||
214 | |||
215 | /** | ||
216 | * wake_up_bit - wake up a waiter on a bit | ||
217 | * @word: the word being waited on, a kernel virtual address | ||
218 | * @bit: the bit of the word being waited on | ||
219 | * | ||
220 | * There is a standard hashed waitqueue table for generic use. This | ||
221 | * is the part of the hashtable's accessor API that wakes up waiters | ||
222 | * on a bit. For instance, if one were to have waiters on a bitflag, | ||
223 | * one would call wake_up_bit() after clearing the bit. | ||
224 | * | ||
225 | * In order for this to function properly, as it uses waitqueue_active() | ||
226 | * internally, some kind of memory barrier must be done prior to calling | ||
227 | * this. Typically, this will be smp_mb__after_clear_bit(), but in some | ||
228 | * cases where bitflags are manipulated non-atomically under a lock, one | ||
229 | * may need to use a less regular barrier, such as fs/inode.c's smp_mb(), | ||
230 | * because spin_unlock() does not guarantee a memory barrier. | ||
231 | */ | ||
232 | void fastcall wake_up_bit(void *word, int bit) | ||
233 | { | ||
234 | __wake_up_bit(bit_waitqueue(word, bit), word, bit); | ||
235 | } | ||
236 | EXPORT_SYMBOL(wake_up_bit); | ||
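Putting the advice from the comment above into code: the waker clears the bit, issues the barrier, then calls wake_up_bit(), while the waiter goes through the out-of-line helper with an action that simply schedules. my_flags, MY_BIT, waiter(), waker() and my_wait_action() are illustrative names, not kernel API.

/* Sketch of the waker/waiter pairing described above. */
static unsigned long my_flags;
#define MY_BIT  0

static int my_wait_action(void *word)
{
        schedule();             /* sleep; prepare_to_wait() already set the state */
        return 0;               /* 0 = keep waiting while the bit stays set */
}

static void waiter(void)
{
        out_of_line_wait_on_bit(&my_flags, MY_BIT, my_wait_action,
                                TASK_UNINTERRUPTIBLE);
}

static void waker(void)
{
        clear_bit(MY_BIT, &my_flags);
        smp_mb__after_clear_bit();      /* the barrier the comment above demands */
        wake_up_bit(&my_flags, MY_BIT);
}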
237 | |||
238 | fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit) | ||
239 | { | ||
240 | const int shift = BITS_PER_LONG == 32 ? 5 : 6; | ||
241 | const struct zone *zone = page_zone(virt_to_page(word)); | ||
242 | unsigned long val = (unsigned long)word << shift | bit; | ||
243 | |||
244 | return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; | ||
245 | } | ||
246 | EXPORT_SYMBOL(bit_waitqueue); | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c new file mode 100644 index 000000000000..52ef419d2747 --- /dev/null +++ b/kernel/workqueue.c | |||
@@ -0,0 +1,555 @@ | |||
1 | /* | ||
2 | * linux/kernel/workqueue.c | ||
3 | * | ||
4 | * Generic mechanism for defining kernel helper threads for running | ||
5 | * arbitrary tasks in process context. | ||
6 | * | ||
7 | * Started by Ingo Molnar, Copyright (C) 2002 | ||
8 | * | ||
9 | * Derived from the taskqueue/keventd code by: | ||
10 | * | ||
11 | * David Woodhouse <dwmw2@infradead.org> | ||
12 | * Andrew Morton <andrewm@uow.edu.au> | ||
13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | ||
14 | * Theodore Ts'o <tytso@mit.edu> | ||
15 | */ | ||
16 | |||
17 | #include <linux/module.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/signal.h> | ||
22 | #include <linux/completion.h> | ||
23 | #include <linux/workqueue.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/cpu.h> | ||
26 | #include <linux/notifier.h> | ||
27 | #include <linux/kthread.h> | ||
28 | |||
29 | /* | ||
30 | * The per-CPU workqueue (if single thread, we always use cpu 0's). | ||
31 | * | ||
32 | * The sequence counters are for flush_scheduled_work(). It wants to wait | ||
33 | * until all currently-scheduled works are completed, but it doesn't | ||
34 | * want to be livelocked by new, incoming ones. So it waits until | ||
35 | * remove_sequence is >= the insert_sequence which pertained when | ||
36 | * flush_scheduled_work() was called. | ||
37 | */ | ||
38 | struct cpu_workqueue_struct { | ||
39 | |||
40 | spinlock_t lock; | ||
41 | |||
42 | long remove_sequence; /* Least-recently added (next to run) */ | ||
43 | long insert_sequence; /* Next to add */ | ||
44 | |||
45 | struct list_head worklist; | ||
46 | wait_queue_head_t more_work; | ||
47 | wait_queue_head_t work_done; | ||
48 | |||
49 | struct workqueue_struct *wq; | ||
50 | task_t *thread; | ||
51 | |||
52 | int run_depth; /* Detect run_workqueue() recursion depth */ | ||
53 | } ____cacheline_aligned; | ||
54 | |||
55 | /* | ||
56 | * The externally visible workqueue abstraction is an array of | ||
57 | * per-CPU workqueues: | ||
58 | */ | ||
59 | struct workqueue_struct { | ||
60 | struct cpu_workqueue_struct cpu_wq[NR_CPUS]; | ||
61 | const char *name; | ||
62 | struct list_head list; /* Empty if single thread */ | ||
63 | }; | ||
64 | |||
65 | /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove | ||
66 | threads to each one as cpus come/go. */ | ||
67 | static DEFINE_SPINLOCK(workqueue_lock); | ||
68 | static LIST_HEAD(workqueues); | ||
69 | |||
70 | /* If it's single threaded, it isn't in the list of workqueues. */ | ||
71 | static inline int is_single_threaded(struct workqueue_struct *wq) | ||
72 | { | ||
73 | return list_empty(&wq->list); | ||
74 | } | ||
75 | |||
76 | /* Preempt must be disabled. */ | ||
77 | static void __queue_work(struct cpu_workqueue_struct *cwq, | ||
78 | struct work_struct *work) | ||
79 | { | ||
80 | unsigned long flags; | ||
81 | |||
82 | spin_lock_irqsave(&cwq->lock, flags); | ||
83 | work->wq_data = cwq; | ||
84 | list_add_tail(&work->entry, &cwq->worklist); | ||
85 | cwq->insert_sequence++; | ||
86 | wake_up(&cwq->more_work); | ||
87 | spin_unlock_irqrestore(&cwq->lock, flags); | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * Queue work on a workqueue. Return non-zero if it was successfully | ||
92 | * added. | ||
93 | * | ||
94 | * We queue the work to the CPU on which it was submitted, but there is no | ||
95 | * guarantee that it will be processed by that CPU. | ||
96 | */ | ||
97 | int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | ||
98 | { | ||
99 | int ret = 0, cpu = get_cpu(); | ||
100 | |||
101 | if (!test_and_set_bit(0, &work->pending)) { | ||
102 | if (unlikely(is_single_threaded(wq))) | ||
103 | cpu = 0; | ||
104 | BUG_ON(!list_empty(&work->entry)); | ||
105 | __queue_work(wq->cpu_wq + cpu, work); | ||
106 | ret = 1; | ||
107 | } | ||
108 | put_cpu(); | ||
109 | return ret; | ||
110 | } | ||
111 | |||
112 | static void delayed_work_timer_fn(unsigned long __data) | ||
113 | { | ||
114 | struct work_struct *work = (struct work_struct *)__data; | ||
115 | struct workqueue_struct *wq = work->wq_data; | ||
116 | int cpu = smp_processor_id(); | ||
117 | |||
118 | if (unlikely(is_single_threaded(wq))) | ||
119 | cpu = 0; | ||
120 | |||
121 | __queue_work(wq->cpu_wq + cpu, work); | ||
122 | } | ||
123 | |||
124 | int fastcall queue_delayed_work(struct workqueue_struct *wq, | ||
125 | struct work_struct *work, unsigned long delay) | ||
126 | { | ||
127 | int ret = 0; | ||
128 | struct timer_list *timer = &work->timer; | ||
129 | |||
130 | if (!test_and_set_bit(0, &work->pending)) { | ||
131 | BUG_ON(timer_pending(timer)); | ||
132 | BUG_ON(!list_empty(&work->entry)); | ||
133 | |||
134 | /* This stores wq for the moment, for the timer_fn */ | ||
135 | work->wq_data = wq; | ||
136 | timer->expires = jiffies + delay; | ||
137 | timer->data = (unsigned long)work; | ||
138 | timer->function = delayed_work_timer_fn; | ||
139 | add_timer(timer); | ||
140 | ret = 1; | ||
141 | } | ||
142 | return ret; | ||
143 | } | ||
144 | |||
145 | static inline void run_workqueue(struct cpu_workqueue_struct *cwq) | ||
146 | { | ||
147 | unsigned long flags; | ||
148 | |||
149 | /* | ||
150 | * Keep taking off work from the queue until | ||
151 | * done. | ||
152 | */ | ||
153 | spin_lock_irqsave(&cwq->lock, flags); | ||
154 | cwq->run_depth++; | ||
155 | if (cwq->run_depth > 3) { | ||
156 | /* morton gets to eat his hat */ | ||
157 | printk("%s: recursion depth exceeded: %d\n", | ||
158 | __FUNCTION__, cwq->run_depth); | ||
159 | dump_stack(); | ||
160 | } | ||
161 | while (!list_empty(&cwq->worklist)) { | ||
162 | struct work_struct *work = list_entry(cwq->worklist.next, | ||
163 | struct work_struct, entry); | ||
164 | void (*f) (void *) = work->func; | ||
165 | void *data = work->data; | ||
166 | |||
167 | list_del_init(cwq->worklist.next); | ||
168 | spin_unlock_irqrestore(&cwq->lock, flags); | ||
169 | |||
170 | BUG_ON(work->wq_data != cwq); | ||
171 | clear_bit(0, &work->pending); | ||
172 | f(data); | ||
173 | |||
174 | spin_lock_irqsave(&cwq->lock, flags); | ||
175 | cwq->remove_sequence++; | ||
176 | wake_up(&cwq->work_done); | ||
177 | } | ||
178 | cwq->run_depth--; | ||
179 | spin_unlock_irqrestore(&cwq->lock, flags); | ||
180 | } | ||
181 | |||
182 | static int worker_thread(void *__cwq) | ||
183 | { | ||
184 | struct cpu_workqueue_struct *cwq = __cwq; | ||
185 | DECLARE_WAITQUEUE(wait, current); | ||
186 | struct k_sigaction sa; | ||
187 | sigset_t blocked; | ||
188 | |||
189 | current->flags |= PF_NOFREEZE; | ||
190 | |||
191 | set_user_nice(current, -5); | ||
192 | |||
193 | /* Block and flush all signals */ | ||
194 | sigfillset(&blocked); | ||
195 | sigprocmask(SIG_BLOCK, &blocked, NULL); | ||
196 | flush_signals(current); | ||
197 | |||
198 | /* SIG_IGN makes children autoreap: see do_notify_parent(). */ | ||
199 | sa.sa.sa_handler = SIG_IGN; | ||
200 | sa.sa.sa_flags = 0; | ||
201 | siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); | ||
202 | do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); | ||
203 | |||
204 | set_current_state(TASK_INTERRUPTIBLE); | ||
205 | while (!kthread_should_stop()) { | ||
206 | add_wait_queue(&cwq->more_work, &wait); | ||
207 | if (list_empty(&cwq->worklist)) | ||
208 | schedule(); | ||
209 | else | ||
210 | __set_current_state(TASK_RUNNING); | ||
211 | remove_wait_queue(&cwq->more_work, &wait); | ||
212 | |||
213 | if (!list_empty(&cwq->worklist)) | ||
214 | run_workqueue(cwq); | ||
215 | set_current_state(TASK_INTERRUPTIBLE); | ||
216 | } | ||
217 | __set_current_state(TASK_RUNNING); | ||
218 | return 0; | ||
219 | } | ||
220 | |||
221 | static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | ||
222 | { | ||
223 | if (cwq->thread == current) { | ||
224 | /* | ||
225 | * Probably keventd trying to flush its own queue. So simply run | ||
226 | * it by hand rather than deadlocking. | ||
227 | */ | ||
228 | run_workqueue(cwq); | ||
229 | } else { | ||
230 | DEFINE_WAIT(wait); | ||
231 | long sequence_needed; | ||
232 | |||
233 | spin_lock_irq(&cwq->lock); | ||
234 | sequence_needed = cwq->insert_sequence; | ||
235 | |||
236 | while (sequence_needed - cwq->remove_sequence > 0) { | ||
237 | prepare_to_wait(&cwq->work_done, &wait, | ||
238 | TASK_UNINTERRUPTIBLE); | ||
239 | spin_unlock_irq(&cwq->lock); | ||
240 | schedule(); | ||
241 | spin_lock_irq(&cwq->lock); | ||
242 | } | ||
243 | finish_wait(&cwq->work_done, &wait); | ||
244 | spin_unlock_irq(&cwq->lock); | ||
245 | } | ||
246 | } | ||
247 | |||
248 | /* | ||
249 | * flush_workqueue - ensure that any scheduled work has run to completion. | ||
250 | * | ||
251 | * Blocks until all work queued on the workqueue before the call has completed. | ||
252 | * This is typically used in driver shutdown handlers. | ||
253 | * | ||
254 | * This function samples each per-CPU workqueue's current insert_sequence | ||
255 | * number and sleeps until that queue's remove_sequence has caught up with it. | ||
256 | * This means that we sleep until all work which was queued on entry has been | ||
257 | * handled, but we are not livelocked by new incoming work. | ||
258 | * | ||
259 | * This function used to run the workqueues itself. Now we just wait for the | ||
260 | * helper threads to do it. | ||
261 | */ | ||
262 | void fastcall flush_workqueue(struct workqueue_struct *wq) | ||
263 | { | ||
264 | might_sleep(); | ||
265 | |||
266 | if (is_single_threaded(wq)) { | ||
267 | /* Always use cpu 0's area. */ | ||
268 | flush_cpu_workqueue(wq->cpu_wq + 0); | ||
269 | } else { | ||
270 | int cpu; | ||
271 | |||
272 | lock_cpu_hotplug(); | ||
273 | for_each_online_cpu(cpu) | ||
274 | flush_cpu_workqueue(wq->cpu_wq + cpu); | ||
275 | unlock_cpu_hotplug(); | ||
276 | } | ||
277 | } | ||
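/*
 * Illustrative sketch (not from this file): the shutdown-path pattern the
 * comment above describes.  flush_workqueue() waits for everything queued
 * so far to finish, after which it is safe to free the data the handlers
 * were using.  "my_wq" and "struct my_dev" are hypothetical.
 */
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_dev {
	void *buffer;
};

static struct workqueue_struct *my_wq;

static void my_dev_shutdown(struct my_dev *dev)
{
	/* the driver must guarantee no new work is queued past this point */
	flush_workqueue(my_wq);

	/* every handler queued before the flush has now completed */
	kfree(dev->buffer);
}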
278 | |||
279 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | ||
280 | int cpu) | ||
281 | { | ||
282 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | ||
283 | struct task_struct *p; | ||
284 | |||
285 | spin_lock_init(&cwq->lock); | ||
286 | cwq->wq = wq; | ||
287 | cwq->thread = NULL; | ||
288 | cwq->insert_sequence = 0; | ||
289 | cwq->remove_sequence = 0; | ||
290 | INIT_LIST_HEAD(&cwq->worklist); | ||
291 | init_waitqueue_head(&cwq->more_work); | ||
292 | init_waitqueue_head(&cwq->work_done); | ||
293 | |||
294 | if (is_single_threaded(wq)) | ||
295 | p = kthread_create(worker_thread, cwq, "%s", wq->name); | ||
296 | else | ||
297 | p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); | ||
298 | if (IS_ERR(p)) | ||
299 | return NULL; | ||
300 | cwq->thread = p; | ||
301 | return p; | ||
302 | } | ||
303 | |||
304 | struct workqueue_struct *__create_workqueue(const char *name, | ||
305 | int singlethread) | ||
306 | { | ||
307 | int cpu, destroy = 0; | ||
308 | struct workqueue_struct *wq; | ||
309 | struct task_struct *p; | ||
310 | |||
311 | BUG_ON(strlen(name) > 10); | ||
312 | |||
313 | wq = kmalloc(sizeof(*wq), GFP_KERNEL); | ||
314 | if (!wq) | ||
315 | return NULL; | ||
316 | memset(wq, 0, sizeof(*wq)); | ||
317 | |||
318 | wq->name = name; | ||
319 | /* We don't need the distraction of CPUs appearing and vanishing. */ | ||
320 | lock_cpu_hotplug(); | ||
321 | if (singlethread) { | ||
322 | INIT_LIST_HEAD(&wq->list); | ||
323 | p = create_workqueue_thread(wq, 0); | ||
324 | if (!p) | ||
325 | destroy = 1; | ||
326 | else | ||
327 | wake_up_process(p); | ||
328 | } else { | ||
329 | spin_lock(&workqueue_lock); | ||
330 | list_add(&wq->list, &workqueues); | ||
331 | spin_unlock(&workqueue_lock); | ||
332 | for_each_online_cpu(cpu) { | ||
333 | p = create_workqueue_thread(wq, cpu); | ||
334 | if (p) { | ||
335 | kthread_bind(p, cpu); | ||
336 | wake_up_process(p); | ||
337 | } else | ||
338 | destroy = 1; | ||
339 | } | ||
340 | } | ||
341 | unlock_cpu_hotplug(); | ||
342 | |||
343 | /* | ||
344 | * Was there any error during startup? If so, clean up: | ||
345 | */ | ||
346 | if (destroy) { | ||
347 | destroy_workqueue(wq); | ||
348 | wq = NULL; | ||
349 | } | ||
350 | return wq; | ||
351 | } | ||
352 | |||
353 | static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) | ||
354 | { | ||
355 | struct cpu_workqueue_struct *cwq; | ||
356 | unsigned long flags; | ||
357 | struct task_struct *p; | ||
358 | |||
359 | cwq = wq->cpu_wq + cpu; | ||
360 | spin_lock_irqsave(&cwq->lock, flags); | ||
361 | p = cwq->thread; | ||
362 | cwq->thread = NULL; | ||
363 | spin_unlock_irqrestore(&cwq->lock, flags); | ||
364 | if (p) | ||
365 | kthread_stop(p); | ||
366 | } | ||
367 | |||
368 | void destroy_workqueue(struct workqueue_struct *wq) | ||
369 | { | ||
370 | int cpu; | ||
371 | |||
372 | flush_workqueue(wq); | ||
373 | |||
374 | /* We don't need the distraction of CPUs appearing and vanishing. */ | ||
375 | lock_cpu_hotplug(); | ||
376 | if (is_single_threaded(wq)) | ||
377 | cleanup_workqueue_thread(wq, 0); | ||
378 | else { | ||
379 | for_each_online_cpu(cpu) | ||
380 | cleanup_workqueue_thread(wq, cpu); | ||
381 | spin_lock(&workqueue_lock); | ||
382 | list_del(&wq->list); | ||
383 | spin_unlock(&workqueue_lock); | ||
384 | } | ||
385 | unlock_cpu_hotplug(); | ||
386 | kfree(wq); | ||
387 | } | ||
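/*
 * Illustrative sketch (not from this file): lifecycle of a private
 * workqueue.  create_workqueue() and create_singlethread_workqueue() are
 * the <linux/workqueue.h> wrappers around __create_workqueue() above;
 * "my_wq", "my_work" and "my_handler" are hypothetical names.
 */
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static void my_handler(void *data)
{
	/* process context, runs on one of my_wq's worker threads */
}

static DECLARE_WORK(my_work, my_handler, NULL);

static int __init my_init(void)
{
	my_wq = create_workqueue("mywq");	/* note the short-name BUG_ON above */
	if (!my_wq)
		return -ENOMEM;
	queue_work(my_wq, &my_work);
	return 0;
}

static void __exit my_exit(void)
{
	/* destroy_workqueue() flushes pending work before freeing the wq */
	destroy_workqueue(my_wq);
}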
388 | |||
389 | static struct workqueue_struct *keventd_wq; | ||
390 | |||
391 | int fastcall schedule_work(struct work_struct *work) | ||
392 | { | ||
393 | return queue_work(keventd_wq, work); | ||
394 | } | ||
395 | |||
396 | int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) | ||
397 | { | ||
398 | return queue_delayed_work(keventd_wq, work, delay); | ||
399 | } | ||
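/*
 * Illustrative sketch (not from this file): the classic use of
 * schedule_work() -- an interrupt handler defers its slow, sleepable part
 * to keventd.  "my_irq_handler" and "my_bottom_half" are hypothetical;
 * the handler signature is the one used by this kernel version.
 */
#include <linux/interrupt.h>
#include <linux/workqueue.h>

static void my_bottom_half(void *data)
{
	/* process context: may sleep, allocate with GFP_KERNEL, etc. */
}

static DECLARE_WORK(my_deferred_work, my_bottom_half, NULL);

static irqreturn_t my_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
{
	/* acknowledge the hardware, then push the rest to process context */
	schedule_work(&my_deferred_work);
	return IRQ_HANDLED;
}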
400 | |||
401 | int schedule_delayed_work_on(int cpu, | ||
402 | struct work_struct *work, unsigned long delay) | ||
403 | { | ||
404 | int ret = 0; | ||
405 | struct timer_list *timer = &work->timer; | ||
406 | |||
407 | if (!test_and_set_bit(0, &work->pending)) { | ||
408 | BUG_ON(timer_pending(timer)); | ||
409 | BUG_ON(!list_empty(&work->entry)); | ||
410 | /* This stores keventd_wq for the moment, for the timer_fn */ | ||
411 | work->wq_data = keventd_wq; | ||
412 | timer->expires = jiffies + delay; | ||
413 | timer->data = (unsigned long)work; | ||
414 | timer->function = delayed_work_timer_fn; | ||
415 | add_timer_on(timer, cpu); | ||
416 | ret = 1; | ||
417 | } | ||
418 | return ret; | ||
419 | } | ||
420 | |||
421 | void flush_scheduled_work(void) | ||
422 | { | ||
423 | flush_workqueue(keventd_wq); | ||
424 | } | ||
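/*
 * Illustrative sketch (not from this file): per-CPU delayed work on
 * keventd, plus the matching teardown.  cancel_delayed_work() only stops
 * a still-pending timer, so a flush is needed if the work has already
 * been queued or is running.  "my_tick_fn" is a hypothetical handler.
 */
static void my_tick_fn(void *data)
{
	/* runs on the keventd thread of the CPU it was queued on */
}

static DECLARE_WORK(my_tick_work, my_tick_fn, NULL);

static void my_start_tick(int cpu)
{
	schedule_delayed_work_on(cpu, &my_tick_work, HZ / 10);
}

static void my_stop_tick(void)
{
	if (!cancel_delayed_work(&my_tick_work))
		flush_scheduled_work();	/* already queued or running: wait */
}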
425 | |||
426 | /** | ||
427 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed work item | ||
428 | * whose handler rearms the delayed work. | ||
429 | * @wq: the controlling workqueue structure | ||
430 | * @work: the delayed work struct | ||
431 | */ | ||
432 | static void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, | ||
433 | struct work_struct *work) | ||
434 | { | ||
435 | while (!cancel_delayed_work(work)) | ||
436 | flush_workqueue(wq); | ||
437 | } | ||
438 | |||
439 | /** | ||
440 | * cancel_rearming_delayed_work - reliably kill off a delayed keventd work | ||
441 | * item whose handler rearms the delayed work. | ||
442 | * @work: the delayed work struct | ||
443 | */ | ||
444 | void cancel_rearming_delayed_work(struct work_struct *work) | ||
445 | { | ||
446 | cancel_rearming_delayed_workqueue(keventd_wq, work); | ||
447 | } | ||
448 | EXPORT_SYMBOL(cancel_rearming_delayed_work); | ||
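/*
 * Illustrative sketch (not from this file): the self-rearming pattern the
 * helper above exists for.  A plain cancel_delayed_work() can lose the
 * race against the handler re-queueing itself, so the cancel/flush loop
 * is used instead.  "my_stop_flag" and "my_rearm_fn" are hypothetical.
 */
static int my_stop_flag;
static void my_rearm_fn(void *data);
static DECLARE_WORK(my_rearm_work, my_rearm_fn, NULL);

static void my_rearm_fn(void *data)
{
	/* ... periodic work ... */
	if (!my_stop_flag)
		schedule_delayed_work(&my_rearm_work, HZ);	/* rearm */
}

static void my_shutdown(void)
{
	my_stop_flag = 1;
	cancel_rearming_delayed_work(&my_rearm_work);
}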
449 | |||
450 | int keventd_up(void) | ||
451 | { | ||
452 | return keventd_wq != NULL; | ||
453 | } | ||
454 | |||
455 | int current_is_keventd(void) | ||
456 | { | ||
457 | struct cpu_workqueue_struct *cwq; | ||
458 | int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ | ||
459 | int ret = 0; | ||
460 | |||
461 | BUG_ON(!keventd_wq); | ||
462 | |||
463 | cwq = keventd_wq->cpu_wq + cpu; | ||
464 | if (current == cwq->thread) | ||
465 | ret = 1; | ||
466 | |||
467 | return ret; | ||
468 | |||
469 | } | ||
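/*
 * Illustrative sketch (not from this file): keventd_up() and
 * current_is_keventd() used as guards.  One plausible use is to skip a
 * synchronous flush when already running on keventd, where the flush
 * would simply re-run the queue by hand (see run_workqueue()'s recursion
 * check).  "my_drain" is hypothetical.
 */
static void my_drain(void)
{
	if (keventd_up() && !current_is_keventd())
		flush_scheduled_work();
}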
470 | |||
471 | #ifdef CONFIG_HOTPLUG_CPU | ||
472 | /* Take the work from this (downed) CPU. */ | ||
473 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | ||
474 | { | ||
475 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | ||
476 | LIST_HEAD(list); | ||
477 | struct work_struct *work; | ||
478 | |||
479 | spin_lock_irq(&cwq->lock); | ||
480 | list_splice_init(&cwq->worklist, &list); | ||
481 | |||
482 | while (!list_empty(&list)) { | ||
483 | printk("Taking work for %s\n", wq->name); | ||
484 | work = list_entry(list.next,struct work_struct,entry); | ||
485 | list_del(&work->entry); | ||
486 | __queue_work(wq->cpu_wq + smp_processor_id(), work); | ||
487 | } | ||
488 | spin_unlock_irq(&cwq->lock); | ||
489 | } | ||
490 | |||
491 | /* We're holding the cpucontrol mutex here */ | ||
492 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | ||
493 | unsigned long action, | ||
494 | void *hcpu) | ||
495 | { | ||
496 | unsigned int hotcpu = (unsigned long)hcpu; | ||
497 | struct workqueue_struct *wq; | ||
498 | |||
499 | switch (action) { | ||
500 | case CPU_UP_PREPARE: | ||
501 | /* Create a new workqueue thread for it. */ | ||
502 | list_for_each_entry(wq, &workqueues, list) { | ||
503 | if (!create_workqueue_thread(wq, hotcpu)) { | ||
504 | printk(KERN_ERR "workqueue thread creation failed for CPU %i\n", hotcpu); | ||
505 | return NOTIFY_BAD; | ||
506 | } | ||
507 | } | ||
508 | break; | ||
509 | |||
510 | case CPU_ONLINE: | ||
511 | /* Kick off worker threads. */ | ||
512 | list_for_each_entry(wq, &workqueues, list) { | ||
513 | kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); | ||
514 | wake_up_process(wq->cpu_wq[hotcpu].thread); | ||
515 | } | ||
516 | break; | ||
517 | |||
518 | case CPU_UP_CANCELED: | ||
519 | list_for_each_entry(wq, &workqueues, list) { | ||
520 | /* Unbind so it can run. */ | ||
521 | kthread_bind(wq->cpu_wq[hotcpu].thread, | ||
522 | smp_processor_id()); | ||
523 | cleanup_workqueue_thread(wq, hotcpu); | ||
524 | } | ||
525 | break; | ||
526 | |||
527 | case CPU_DEAD: | ||
528 | list_for_each_entry(wq, &workqueues, list) | ||
529 | cleanup_workqueue_thread(wq, hotcpu); | ||
530 | list_for_each_entry(wq, &workqueues, list) | ||
531 | take_over_work(wq, hotcpu); | ||
532 | break; | ||
533 | } | ||
534 | |||
535 | return NOTIFY_OK; | ||
536 | } | ||
537 | #endif | ||
538 | |||
539 | void init_workqueues(void) | ||
540 | { | ||
541 | hotcpu_notifier(workqueue_cpu_callback, 0); | ||
542 | keventd_wq = create_workqueue("events"); | ||
543 | BUG_ON(!keventd_wq); | ||
544 | } | ||
545 | |||
546 | EXPORT_SYMBOL_GPL(__create_workqueue); | ||
547 | EXPORT_SYMBOL_GPL(queue_work); | ||
548 | EXPORT_SYMBOL_GPL(queue_delayed_work); | ||
549 | EXPORT_SYMBOL_GPL(flush_workqueue); | ||
550 | EXPORT_SYMBOL_GPL(destroy_workqueue); | ||
551 | |||
552 | EXPORT_SYMBOL(schedule_work); | ||
553 | EXPORT_SYMBOL(schedule_delayed_work); | ||
554 | EXPORT_SYMBOL(schedule_delayed_work_on); | ||
555 | EXPORT_SYMBOL(flush_scheduled_work); | ||