Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/acct.c | 12
-rw-r--r--  kernel/audit.c | 333
-rw-r--r--  kernel/audit.h | 92
-rw-r--r--  kernel/auditfilter.c | 857
-rw-r--r--  kernel/auditsc.c | 917
-rw-r--r--  kernel/capability.c | 16
-rw-r--r--  kernel/compat.c | 82
-rw-r--r--  kernel/cpu.c | 32
-rw-r--r--  kernel/cpuset.c | 485
-rw-r--r--  kernel/exec_domain.c | 1
-rw-r--r--  kernel/exit.c | 165
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 183
-rw-r--r--  kernel/futex.c | 174
-rw-r--r--  kernel/futex_compat.c | 144
-rw-r--r--  kernel/hrtimer.c | 253
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/manage.c | 26
-rw-r--r--  kernel/irq/migration.c | 62
-rw-r--r--  kernel/itimer.c | 117
-rw-r--r--  kernel/kmod.c | 2
-rw-r--r--  kernel/kprobes.c | 27
-rw-r--r--  kernel/ksysfs.c | 7
-rw-r--r--  kernel/kthread.c | 9
-rw-r--r--  kernel/module.c | 473
-rw-r--r--  kernel/panic.c | 103
-rw-r--r--  kernel/params.c | 24
-rw-r--r--  kernel/pid.c | 250
-rw-r--r--  kernel/posix-timers.c | 69
-rw-r--r--  kernel/power/Kconfig | 2
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/disk.c | 20
-rw-r--r--  kernel/power/main.c | 4
-rw-r--r--  kernel/power/pm.c | 37
-rw-r--r--  kernel/power/power.h | 75
-rw-r--r--  kernel/power/process.c | 64
-rw-r--r--  kernel/power/smp.c | 4
-rw-r--r--  kernel/power/snapshot.c | 344
-rw-r--r--  kernel/power/swap.c | 545
-rw-r--r--  kernel/power/swsusp.c | 889
-rw-r--r--  kernel/power/user.c | 333
-rw-r--r--  kernel/printk.c | 82
-rw-r--r--  kernel/profile.c | 66
-rw-r--r--  kernel/ptrace.c | 103
-rw-r--r--  kernel/rcupdate.c | 122
-rw-r--r--  kernel/rcutorture.c | 37
-rw-r--r--  kernel/relay.c | 1012
-rw-r--r--  kernel/sched.c | 428
-rw-r--r--  kernel/signal.c | 374
-rw-r--r--  kernel/softirq.c | 24
-rw-r--r--  kernel/softlockup.c | 61
-rw-r--r--  kernel/spinlock.c | 9
-rw-r--r--  kernel/sys.c | 533
-rw-r--r--  kernel/sys_ni.c | 18
-rw-r--r--  kernel/sysctl.c | 54
-rw-r--r--  kernel/time.c | 71
-rw-r--r--  kernel/timer.c | 261
-rw-r--r--  kernel/uid16.c | 59
-rw-r--r--  kernel/user.c | 10
-rw-r--r--  kernel/workqueue.c | 31
61 files changed, 7186 insertions, 3413 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde81..58908f9d15 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
12 12
13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
14obj-$(CONFIG_FUTEX) += futex.o 14obj-$(CONFIG_FUTEX) += futex.o
15ifeq ($(CONFIG_COMPAT),y)
16obj-$(CONFIG_FUTEX) += futex_compat.o
17endif
15obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 18obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
16obj-$(CONFIG_SMP) += cpu.o spinlock.o 19obj-$(CONFIG_SMP) += cpu.o spinlock.o
17obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 20obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
@@ -26,7 +29,7 @@ obj-$(CONFIG_COMPAT) += compat.o
26obj-$(CONFIG_CPUSETS) += cpuset.o 29obj-$(CONFIG_CPUSETS) += cpuset.o
27obj-$(CONFIG_IKCONFIG) += configs.o 30obj-$(CONFIG_IKCONFIG) += configs.o
28obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 31obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
29obj-$(CONFIG_AUDIT) += audit.o 32obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
30obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 33obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
31obj-$(CONFIG_KPROBES) += kprobes.o 34obj-$(CONFIG_KPROBES) += kprobes.o
32obj-$(CONFIG_SYSFS) += ksysfs.o 35obj-$(CONFIG_SYSFS) += ksysfs.o
@@ -34,6 +37,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
34obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 37obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
35obj-$(CONFIG_SECCOMP) += seccomp.o 38obj-$(CONFIG_SECCOMP) += seccomp.o
36obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 39obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
40obj-$(CONFIG_RELAY) += relay.o
37 41
38ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 42ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
39# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 43# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51..b327f4d201 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
449 /* calculate run_time in nsec*/ 449 /* calculate run_time in nsec*/
450 do_posix_clock_monotonic_gettime(&uptime); 450 do_posix_clock_monotonic_gettime(&uptime);
451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; 451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
452 run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 452 run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
453 + current->start_time.tv_nsec; 453 + current->group_leader->start_time.tv_nsec;
454 /* convert nsec -> AHZ */ 454 /* convert nsec -> AHZ */
455 elapsed = nsec_to_AHZ(run_time); 455 elapsed = nsec_to_AHZ(run_time);
456#if ACCT_VERSION==3 456#if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
469#endif 469#endif
470 do_div(elapsed, AHZ); 470 do_div(elapsed, AHZ);
471 ac.ac_btime = xtime.tv_sec - elapsed; 471 ac.ac_btime = xtime.tv_sec - elapsed;
472 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, 472 jiffies = cputime_to_jiffies(cputime_add(current->utime,
473 current->signal->utime)); 473 current->signal->utime));
474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); 474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
475 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, 475 jiffies = cputime_to_jiffies(cputime_add(current->stime,
476 current->signal->stime)); 476 current->signal->stime));
477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); 477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
478 /* we really need to bite the bullet and change layout */ 478 /* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
524 ac.ac_minflt = encode_comp_t(current->signal->min_flt + 524 ac.ac_minflt = encode_comp_t(current->signal->min_flt +
525 current->group_leader->min_flt); 525 current->min_flt);
526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt + 526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
527 current->group_leader->maj_flt); 527 current->maj_flt);
528 ac.ac_swaps = encode_comp_t(0); 528 ac.ac_swaps = encode_comp_t(0);
529 ac.ac_exitcode = exitcode; 529 ac.ac_exitcode = exitcode;
530 530
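
The acct.c hunks above change which task the accounting totals are read from: the elapsed-time calculation now uses the thread group leader's start_time (the start of the whole process, not of the reporting thread), while the CPU-time and page-fault totals add the reporting thread's own counters to those in the shared signal struct, which already accumulates the counters of threads that have exited. A minimal sketch of that accounting model, with hypothetical types standing in for task_struct and signal_struct (illustrative only, not kernel code):

/* Hypothetical stand-ins for the task_struct/signal_struct fields involved. */
struct thread_counters {	/* per-thread; not yet folded into the group */
	unsigned long utime, stime, min_flt, maj_flt;
};
struct group_counters {		/* shared; already includes every exited thread */
	unsigned long utime, stime, min_flt, maj_flt;
};

/* Process-wide user time as the accounting thread sees it: its own counters
 * plus whatever the already-exited threads have contributed. */
static unsigned long acct_total_utime(const struct thread_counters *self,
				      const struct group_counters *sig)
{
	return self->utime + sig->utime;
}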
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a813d2883..df57b493e1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -52,8 +52,12 @@
52#include <linux/audit.h> 52#include <linux/audit.h>
53 53
54#include <net/sock.h> 54#include <net/sock.h>
55#include <net/netlink.h>
55#include <linux/skbuff.h> 56#include <linux/skbuff.h>
56#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h>
59
60#include "audit.h"
57 61
58/* No auditing will take place until audit_initialized != 0. 62/* No auditing will take place until audit_initialized != 0.
59 * (Initialization happens after skb_init is called.) */ 63 * (Initialization happens after skb_init is called.) */
@@ -72,7 +76,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
72 * contains the (non-zero) pid. */ 76 * contains the (non-zero) pid. */
73int audit_pid; 77int audit_pid;
74 78
75/* If audit_limit is non-zero, limit the rate of sending audit records 79/* If audit_rate_limit is non-zero, limit the rate of sending audit records
76 * to that number per second. This prevents DoS attacks, but results in 80 * to that number per second. This prevents DoS attacks, but results in
77 * audit records being dropped. */ 81 * audit records being dropped. */
78static int audit_rate_limit; 82static int audit_rate_limit;
@@ -102,7 +106,7 @@ static struct sock *audit_sock;
102 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of 106 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
103 * being placed on the freelist). */ 107 * being placed on the freelist). */
104static DEFINE_SPINLOCK(audit_freelist_lock); 108static DEFINE_SPINLOCK(audit_freelist_lock);
105static int audit_freelist_count = 0; 109static int audit_freelist_count;
106static LIST_HEAD(audit_freelist); 110static LIST_HEAD(audit_freelist);
107 111
108static struct sk_buff_head audit_skb_queue; 112static struct sk_buff_head audit_skb_queue;
@@ -113,7 +117,7 @@ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
113/* The netlink socket is only to be read by 1 CPU, which lets us assume 117/* The netlink socket is only to be read by 1 CPU, which lets us assume
114 * that list additions and deletions never happen simultaneously in 118 * that list additions and deletions never happen simultaneously in
115 * auditsc.c */ 119 * auditsc.c */
116DECLARE_MUTEX(audit_netlink_sem); 120DEFINE_MUTEX(audit_netlink_mutex);
117 121
118/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 122/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
119 * audit records. Since printk uses a 1024 byte buffer, this buffer 123 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -142,7 +146,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
142 nlh->nlmsg_pid = pid; 146 nlh->nlmsg_pid = pid;
143} 147}
144 148
145static void audit_panic(const char *message) 149void audit_panic(const char *message)
146{ 150{
147 switch (audit_failure) 151 switch (audit_failure)
148 { 152 {
@@ -186,8 +190,14 @@ static inline int audit_rate_check(void)
186 return retval; 190 return retval;
187} 191}
188 192
189/* Emit at least 1 message per second, even if audit_rate_check is 193/**
190 * throttling. */ 194 * audit_log_lost - conditionally log lost audit message event
195 * @message: the message stating reason for lost audit message
196 *
197 * Emit at least 1 message per second, even if audit_rate_check is
198 * throttling.
199 * Always increment the lost messages counter.
200*/
191void audit_log_lost(const char *message) 201void audit_log_lost(const char *message)
192{ 202{
193 static unsigned long last_msg = 0; 203 static unsigned long last_msg = 0;
@@ -218,52 +228,105 @@ void audit_log_lost(const char *message)
218 audit_backlog_limit); 228 audit_backlog_limit);
219 audit_panic(message); 229 audit_panic(message);
220 } 230 }
221
222} 231}
223 232
224static int audit_set_rate_limit(int limit, uid_t loginuid) 233static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
225{ 234{
226 int old = audit_rate_limit; 235 int old = audit_rate_limit;
227 audit_rate_limit = limit; 236
228 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 237 if (sid) {
238 char *ctx = NULL;
239 u32 len;
240 int rc;
241 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
242 return rc;
243 else
244 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
245 "audit_rate_limit=%d old=%d by auid=%u subj=%s",
246 limit, old, loginuid, ctx);
247 kfree(ctx);
248 } else
249 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
229 "audit_rate_limit=%d old=%d by auid=%u", 250 "audit_rate_limit=%d old=%d by auid=%u",
230 audit_rate_limit, old, loginuid); 251 limit, old, loginuid);
252 audit_rate_limit = limit;
231 return old; 253 return old;
232} 254}
233 255
234static int audit_set_backlog_limit(int limit, uid_t loginuid) 256static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
235{ 257{
236 int old = audit_backlog_limit; 258 int old = audit_backlog_limit;
237 audit_backlog_limit = limit; 259
238 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 260 if (sid) {
261 char *ctx = NULL;
262 u32 len;
263 int rc;
264 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
265 return rc;
266 else
267 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
268 "audit_backlog_limit=%d old=%d by auid=%u subj=%s",
269 limit, old, loginuid, ctx);
270 kfree(ctx);
271 } else
272 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
239 "audit_backlog_limit=%d old=%d by auid=%u", 273 "audit_backlog_limit=%d old=%d by auid=%u",
240 audit_backlog_limit, old, loginuid); 274 limit, old, loginuid);
275 audit_backlog_limit = limit;
241 return old; 276 return old;
242} 277}
243 278
244static int audit_set_enabled(int state, uid_t loginuid) 279static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
245{ 280{
246 int old = audit_enabled; 281 int old = audit_enabled;
282
247 if (state != 0 && state != 1) 283 if (state != 0 && state != 1)
248 return -EINVAL; 284 return -EINVAL;
249 audit_enabled = state; 285
250 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 286 if (sid) {
287 char *ctx = NULL;
288 u32 len;
289 int rc;
290 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
291 return rc;
292 else
293 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
294 "audit_enabled=%d old=%d by auid=%u subj=%s",
295 state, old, loginuid, ctx);
296 kfree(ctx);
297 } else
298 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
251 "audit_enabled=%d old=%d by auid=%u", 299 "audit_enabled=%d old=%d by auid=%u",
252 audit_enabled, old, loginuid); 300 state, old, loginuid);
301 audit_enabled = state;
253 return old; 302 return old;
254} 303}
255 304
256static int audit_set_failure(int state, uid_t loginuid) 305static int audit_set_failure(int state, uid_t loginuid, u32 sid)
257{ 306{
258 int old = audit_failure; 307 int old = audit_failure;
308
259 if (state != AUDIT_FAIL_SILENT 309 if (state != AUDIT_FAIL_SILENT
260 && state != AUDIT_FAIL_PRINTK 310 && state != AUDIT_FAIL_PRINTK
261 && state != AUDIT_FAIL_PANIC) 311 && state != AUDIT_FAIL_PANIC)
262 return -EINVAL; 312 return -EINVAL;
263 audit_failure = state; 313
264 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 314 if (sid) {
315 char *ctx = NULL;
316 u32 len;
317 int rc;
318 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
319 return rc;
320 else
321 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
322 "audit_failure=%d old=%d by auid=%u subj=%s",
323 state, old, loginuid, ctx);
324 kfree(ctx);
325 } else
326 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
265 "audit_failure=%d old=%d by auid=%u", 327 "audit_failure=%d old=%d by auid=%u",
266 audit_failure, old, loginuid); 328 state, old, loginuid);
329 audit_failure = state;
267 return old; 330 return old;
268} 331}
269 332
@@ -300,8 +363,22 @@ static int kauditd_thread(void *dummy)
300 remove_wait_queue(&kauditd_wait, &wait); 363 remove_wait_queue(&kauditd_wait, &wait);
301 } 364 }
302 } 365 }
366 return 0;
303} 367}
304 368
369/**
370 * audit_send_reply - send an audit reply message via netlink
371 * @pid: process id to send reply to
372 * @seq: sequence number
373 * @type: audit message type
374 * @done: done (last) flag
375 * @multi: multi-part message flag
376 * @payload: payload data
377 * @size: payload size
378 *
379 * Allocates an skb, builds the netlink message, and sends it to the pid.
380 * No failure notifications.
381 */
305void audit_send_reply(int pid, int seq, int type, int done, int multi, 382void audit_send_reply(int pid, int seq, int type, int done, int multi,
306 void *payload, int size) 383 void *payload, int size)
307{ 384{
@@ -342,15 +419,19 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
342 switch (msg_type) { 419 switch (msg_type) {
343 case AUDIT_GET: 420 case AUDIT_GET:
344 case AUDIT_LIST: 421 case AUDIT_LIST:
422 case AUDIT_LIST_RULES:
345 case AUDIT_SET: 423 case AUDIT_SET:
346 case AUDIT_ADD: 424 case AUDIT_ADD:
425 case AUDIT_ADD_RULE:
347 case AUDIT_DEL: 426 case AUDIT_DEL:
427 case AUDIT_DEL_RULE:
348 case AUDIT_SIGNAL_INFO: 428 case AUDIT_SIGNAL_INFO:
349 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) 429 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
350 err = -EPERM; 430 err = -EPERM;
351 break; 431 break;
352 case AUDIT_USER: 432 case AUDIT_USER:
353 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 433 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
434 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
354 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) 435 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
355 err = -EPERM; 436 err = -EPERM;
356 break; 437 break;
@@ -363,7 +444,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
363 444
364static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 445static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
365{ 446{
366 u32 uid, pid, seq; 447 u32 uid, pid, seq, sid;
367 void *data; 448 void *data;
368 struct audit_status *status_get, status_set; 449 struct audit_status *status_get, status_set;
369 int err; 450 int err;
@@ -376,7 +457,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
376 if (err) 457 if (err)
377 return err; 458 return err;
378 459
379 /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ 460 /* As soon as there's any sign of userspace auditd,
461 * start kauditd to talk to it */
380 if (!kauditd_task) 462 if (!kauditd_task)
381 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); 463 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
382 if (IS_ERR(kauditd_task)) { 464 if (IS_ERR(kauditd_task)) {
@@ -388,6 +470,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
388 pid = NETLINK_CREDS(skb)->pid; 470 pid = NETLINK_CREDS(skb)->pid;
389 uid = NETLINK_CREDS(skb)->uid; 471 uid = NETLINK_CREDS(skb)->uid;
390 loginuid = NETLINK_CB(skb).loginuid; 472 loginuid = NETLINK_CB(skb).loginuid;
473 sid = NETLINK_CB(skb).sid;
391 seq = nlh->nlmsg_seq; 474 seq = nlh->nlmsg_seq;
392 data = NLMSG_DATA(nlh); 475 data = NLMSG_DATA(nlh);
393 476
@@ -408,28 +491,47 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
408 return -EINVAL; 491 return -EINVAL;
409 status_get = (struct audit_status *)data; 492 status_get = (struct audit_status *)data;
410 if (status_get->mask & AUDIT_STATUS_ENABLED) { 493 if (status_get->mask & AUDIT_STATUS_ENABLED) {
411 err = audit_set_enabled(status_get->enabled, loginuid); 494 err = audit_set_enabled(status_get->enabled,
495 loginuid, sid);
412 if (err < 0) return err; 496 if (err < 0) return err;
413 } 497 }
414 if (status_get->mask & AUDIT_STATUS_FAILURE) { 498 if (status_get->mask & AUDIT_STATUS_FAILURE) {
415 err = audit_set_failure(status_get->failure, loginuid); 499 err = audit_set_failure(status_get->failure,
500 loginuid, sid);
416 if (err < 0) return err; 501 if (err < 0) return err;
417 } 502 }
418 if (status_get->mask & AUDIT_STATUS_PID) { 503 if (status_get->mask & AUDIT_STATUS_PID) {
419 int old = audit_pid; 504 int old = audit_pid;
505 if (sid) {
506 char *ctx = NULL;
507 u32 len;
508 int rc;
509 if ((rc = selinux_ctxid_to_string(
510 sid, &ctx, &len)))
511 return rc;
512 else
513 audit_log(NULL, GFP_KERNEL,
514 AUDIT_CONFIG_CHANGE,
515 "audit_pid=%d old=%d by auid=%u subj=%s",
516 status_get->pid, old,
517 loginuid, ctx);
518 kfree(ctx);
519 } else
520 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
521 "audit_pid=%d old=%d by auid=%u",
522 status_get->pid, old, loginuid);
420 audit_pid = status_get->pid; 523 audit_pid = status_get->pid;
421 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
422 "audit_pid=%d old=%d by auid=%u",
423 audit_pid, old, loginuid);
424 } 524 }
425 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 525 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
426 audit_set_rate_limit(status_get->rate_limit, loginuid); 526 audit_set_rate_limit(status_get->rate_limit,
527 loginuid, sid);
427 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 528 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
428 audit_set_backlog_limit(status_get->backlog_limit, 529 audit_set_backlog_limit(status_get->backlog_limit,
429 loginuid); 530 loginuid, sid);
430 break; 531 break;
431 case AUDIT_USER: 532 case AUDIT_USER:
432 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 533 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
534 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
433 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 535 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
434 return 0; 536 return 0;
435 537
@@ -439,8 +541,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
439 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 541 ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
440 if (ab) { 542 if (ab) {
441 audit_log_format(ab, 543 audit_log_format(ab,
442 "user pid=%d uid=%u auid=%u msg='%.1024s'", 544 "user pid=%d uid=%u auid=%u",
443 pid, uid, loginuid, (char *)data); 545 pid, uid, loginuid);
546 if (sid) {
547 char *ctx = NULL;
548 u32 len;
549 if (selinux_ctxid_to_string(
550 sid, &ctx, &len)) {
551 audit_log_format(ab,
552 " ssid=%u", sid);
553 /* Maybe call audit_panic? */
554 } else
555 audit_log_format(ab,
556 " subj=%s", ctx);
557 kfree(ctx);
558 }
559 audit_log_format(ab, " msg='%.1024s'",
560 (char *)data);
444 audit_set_pid(ab, pid); 561 audit_set_pid(ab, pid);
445 audit_log_end(ab); 562 audit_log_end(ab);
446 } 563 }
@@ -448,12 +565,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
448 break; 565 break;
449 case AUDIT_ADD: 566 case AUDIT_ADD:
450 case AUDIT_DEL: 567 case AUDIT_DEL:
451 if (nlh->nlmsg_len < sizeof(struct audit_rule)) 568 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
452 return -EINVAL; 569 return -EINVAL;
453 /* fallthrough */ 570 /* fallthrough */
454 case AUDIT_LIST: 571 case AUDIT_LIST:
455 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 572 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
456 uid, seq, data, loginuid); 573 uid, seq, data, nlmsg_len(nlh),
574 loginuid, sid);
575 break;
576 case AUDIT_ADD_RULE:
577 case AUDIT_DEL_RULE:
578 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
579 return -EINVAL;
580 /* fallthrough */
581 case AUDIT_LIST_RULES:
582 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
583 uid, seq, data, nlmsg_len(nlh),
584 loginuid, sid);
457 break; 585 break;
458 case AUDIT_SIGNAL_INFO: 586 case AUDIT_SIGNAL_INFO:
459 sig_data.uid = audit_sig_uid; 587 sig_data.uid = audit_sig_uid;
@@ -469,9 +597,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
469 return err < 0 ? err : 0; 597 return err < 0 ? err : 0;
470} 598}
471 599
472/* Get message from skb (based on rtnetlink_rcv_skb). Each message is 600/*
601 * Get message from skb (based on rtnetlink_rcv_skb). Each message is
473 * processed by audit_receive_msg. Malformed skbs with wrong length are 602 * processed by audit_receive_msg. Malformed skbs with wrong length are
474 * discarded silently. */ 603 * discarded silently.
604 */
475static void audit_receive_skb(struct sk_buff *skb) 605static void audit_receive_skb(struct sk_buff *skb)
476{ 606{
477 int err; 607 int err;
@@ -499,14 +629,14 @@ static void audit_receive(struct sock *sk, int length)
499 struct sk_buff *skb; 629 struct sk_buff *skb;
500 unsigned int qlen; 630 unsigned int qlen;
501 631
502 down(&audit_netlink_sem); 632 mutex_lock(&audit_netlink_mutex);
503 633
504 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { 634 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
505 skb = skb_dequeue(&sk->sk_receive_queue); 635 skb = skb_dequeue(&sk->sk_receive_queue);
506 audit_receive_skb(skb); 636 audit_receive_skb(skb);
507 kfree_skb(skb); 637 kfree_skb(skb);
508 } 638 }
509 up(&audit_netlink_sem); 639 mutex_unlock(&audit_netlink_mutex);
510} 640}
511 641
512 642
@@ -519,11 +649,17 @@ static int __init audit_init(void)
519 THIS_MODULE); 649 THIS_MODULE);
520 if (!audit_sock) 650 if (!audit_sock)
521 audit_panic("cannot initialize netlink socket"); 651 audit_panic("cannot initialize netlink socket");
652 else
653 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
522 654
523 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
524 skb_queue_head_init(&audit_skb_queue); 655 skb_queue_head_init(&audit_skb_queue);
525 audit_initialized = 1; 656 audit_initialized = 1;
526 audit_enabled = audit_default; 657 audit_enabled = audit_default;
658
659 /* Register the callback with selinux. This callback will be invoked
660 * when a new policy is loaded. */
661 selinux_audit_set_callback(&selinux_audit_rule_update);
662
527 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 663 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
528 return 0; 664 return 0;
529} 665}
@@ -538,7 +674,7 @@ static int __init audit_enable(char *str)
538 audit_initialized ? "" : " (after initialization)"); 674 audit_initialized ? "" : " (after initialization)");
539 if (audit_initialized) 675 if (audit_initialized)
540 audit_enabled = audit_default; 676 audit_enabled = audit_default;
541 return 0; 677 return 1;
542} 678}
543 679
544__setup("audit=", audit_enable); 680__setup("audit=", audit_enable);
@@ -600,7 +736,10 @@ err:
600 return NULL; 736 return NULL;
601} 737}
602 738
603/* Compute a serial number for the audit record. Audit records are 739/**
740 * audit_serial - compute a serial number for the audit record
741 *
742 * Compute a serial number for the audit record. Audit records are
604 * written to user-space as soon as they are generated, so a complete 743 * written to user-space as soon as they are generated, so a complete
605 * audit record may be written in several pieces. The timestamp of the 744 * audit record may be written in several pieces. The timestamp of the
606 * record and this serial number are used by the user-space tools to 745 * record and this serial number are used by the user-space tools to
@@ -612,8 +751,8 @@ err:
612 * audit context (for those records that have a context), and emit them 751 * audit context (for those records that have a context), and emit them
613 * all at syscall exit. However, this could delay the reporting of 752 * all at syscall exit. However, this could delay the reporting of
614 * significant errors until syscall exit (or never, if the system 753 * significant errors until syscall exit (or never, if the system
615 * halts). */ 754 * halts).
616 755 */
617unsigned int audit_serial(void) 756unsigned int audit_serial(void)
618{ 757{
619 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; 758 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
@@ -649,6 +788,21 @@ static inline void audit_get_stamp(struct audit_context *ctx,
649 * will be written at syscall exit. If there is no associated task, tsk 788 * will be written at syscall exit. If there is no associated task, tsk
650 * should be NULL. */ 789 * should be NULL. */
651 790
791/**
792 * audit_log_start - obtain an audit buffer
793 * @ctx: audit_context (may be NULL)
794 * @gfp_mask: type of allocation
795 * @type: audit message type
796 *
797 * Returns audit_buffer pointer on success or NULL on error.
798 *
799 * Obtain an audit buffer. This routine does locking to obtain the
800 * audit buffer, but then no locking is required for calls to
801 * audit_log_*format. If the task (ctx) is a task that is currently in a
802 * syscall, then the syscall is marked as auditable and an audit record
803 * will be written at syscall exit. If there is no associated task, then
804 * task context (ctx) should be NULL.
805 */
652struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, 806struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
653 int type) 807 int type)
654{ 808{
@@ -661,6 +815,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
661 if (!audit_initialized) 815 if (!audit_initialized)
662 return NULL; 816 return NULL;
663 817
818 if (unlikely(audit_filter_type(type)))
819 return NULL;
820
664 if (gfp_mask & __GFP_WAIT) 821 if (gfp_mask & __GFP_WAIT)
665 reserve = 0; 822 reserve = 0;
666 else 823 else
@@ -713,6 +870,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
713/** 870/**
714 * audit_expand - expand skb in the audit buffer 871 * audit_expand - expand skb in the audit buffer
715 * @ab: audit_buffer 872 * @ab: audit_buffer
873 * @extra: space to add at tail of the skb
716 * 874 *
717 * Returns 0 (no space) on failed expansion, or available space if 875 * Returns 0 (no space) on failed expansion, or available space if
718 * successful. 876 * successful.
@@ -729,10 +887,12 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
729 return skb_tailroom(skb); 887 return skb_tailroom(skb);
730} 888}
731 889
732/* Format an audit message into the audit buffer. If there isn't enough 890/*
891 * Format an audit message into the audit buffer. If there isn't enough
733 * room in the audit buffer, more room will be allocated and vsnprint 892 * room in the audit buffer, more room will be allocated and vsnprint
734 * will be called a second time. Currently, we assume that a printk 893 * will be called a second time. Currently, we assume that a printk
735 * can't format message larger than 1024 bytes, so we don't either. */ 894 * can't format message larger than 1024 bytes, so we don't either.
895 */
736static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, 896static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
737 va_list args) 897 va_list args)
738{ 898{
@@ -757,7 +917,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
757 /* The printk buffer is 1024 bytes long, so if we get 917 /* The printk buffer is 1024 bytes long, so if we get
758 * here and AUDIT_BUFSIZ is at least 1024, then we can 918 * here and AUDIT_BUFSIZ is at least 1024, then we can
759 * log everything that printk could have logged. */ 919 * log everything that printk could have logged. */
760 avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 920 avail = audit_expand(ab,
921 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
761 if (!avail) 922 if (!avail)
762 goto out; 923 goto out;
763 len = vsnprintf(skb->tail, avail, fmt, args2); 924 len = vsnprintf(skb->tail, avail, fmt, args2);
@@ -768,8 +929,14 @@ out:
768 return; 929 return;
769} 930}
770 931
771/* Format a message into the audit buffer. All the work is done in 932/**
772 * audit_log_vformat. */ 933 * audit_log_format - format a message into the audit buffer.
934 * @ab: audit_buffer
935 * @fmt: format string
936 * @...: optional parameters matching @fmt string
937 *
938 * All the work is done in audit_log_vformat.
939 */
773void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 940void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
774{ 941{
775 va_list args; 942 va_list args;
@@ -781,9 +948,18 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
781 va_end(args); 948 va_end(args);
782} 949}
783 950
784/* This function will take the passed buf and convert it into a string of 951/**
785 * ascii hex digits. The new string is placed onto the skb. */ 952 * audit_log_hex - convert a buffer to hex and append it to the audit skb
786void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, 953 * @ab: the audit_buffer
954 * @buf: buffer to convert to hex
955 * @len: length of @buf to be converted
956 *
957 * No return value; failure to expand is silently ignored.
958 *
959 * This function will take the passed buf and convert it into a string of
960 * ascii hex digits. The new string is placed onto the skb.
961 */
962void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
787 size_t len) 963 size_t len)
788{ 964{
789 int i, avail, new_len; 965 int i, avail, new_len;
@@ -812,10 +988,16 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
812 skb_put(skb, len << 1); /* new string is twice the old string */ 988 skb_put(skb, len << 1); /* new string is twice the old string */
813} 989}
814 990
815/* This code will escape a string that is passed to it if the string 991/**
816 * contains a control character, unprintable character, double quote mark, 992 * audit_log_unstrustedstring - log a string that may contain random characters
993 * @ab: audit_buffer
994 * @string: string to be logged
995 *
996 * This code will escape a string that is passed to it if the string
997 * contains a control character, unprintable character, double quote mark,
817 * or a space. Unescaped strings will start and end with a double quote mark. 998 * or a space. Unescaped strings will start and end with a double quote mark.
818 * Strings that are escaped are printed in hex (2 digits per char). */ 999 * Strings that are escaped are printed in hex (2 digits per char).
1000 */
819void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1001void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
820{ 1002{
821 const unsigned char *p = string; 1003 const unsigned char *p = string;
@@ -854,10 +1036,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
854 kfree(path); 1036 kfree(path);
855} 1037}
856 1038
857/* The netlink_* functions cannot be called inside an irq context, so 1039/**
858 * the audit buffer is places on a queue and a tasklet is scheduled to 1040 * audit_log_end - end one audit record
1041 * @ab: the audit_buffer
1042 *
1043 * The netlink_* functions cannot be called inside an irq context, so
1044 * the audit buffer is placed on a queue and a tasklet is scheduled to
859 * remove them from the queue outside the irq context. May be called in 1045 * remove them from the queue outside the irq context. May be called in
860 * any context. */ 1046 * any context.
1047 */
861void audit_log_end(struct audit_buffer *ab) 1048void audit_log_end(struct audit_buffer *ab)
862{ 1049{
863 if (!ab) 1050 if (!ab)
@@ -878,9 +1065,18 @@ void audit_log_end(struct audit_buffer *ab)
878 audit_buffer_free(ab); 1065 audit_buffer_free(ab);
879} 1066}
880 1067
881/* Log an audit record. This is a convenience function that calls 1068/**
882 * audit_log_start, audit_log_vformat, and audit_log_end. It may be 1069 * audit_log - Log an audit record
883 * called in any context. */ 1070 * @ctx: audit context
1071 * @gfp_mask: type of allocation
1072 * @type: audit message type
1073 * @fmt: format string to use
1074 * @...: variable parameters matching the format string
1075 *
1076 * This is a convenience function that calls audit_log_start,
1077 * audit_log_vformat, and audit_log_end. It may be called
1078 * in any context.
1079 */
884void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 1080void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
885 const char *fmt, ...) 1081 const char *fmt, ...)
886{ 1082{
@@ -895,3 +1091,8 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
895 audit_log_end(ab); 1091 audit_log_end(ab);
896 } 1092 }
897} 1093}
1094
1095EXPORT_SYMBOL(audit_log_start);
1096EXPORT_SYMBOL(audit_log_end);
1097EXPORT_SYMBOL(audit_log_format);
1098EXPORT_SYMBOL(audit_log);
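
Every configuration-change path touched in audit.c above repeats the same pattern: if the netlink sender carried an SELinux sid, resolve it to a context string with selinux_ctxid_to_string() and log it as subj= (refusing the change when the lookup fails), otherwise fall back to the plain auid-only record; the new value is committed only after the record has been emitted. A condensed sketch of that pattern as a helper; audit_log_config_change() is a hypothetical name for illustration, not something the patch adds:

/* Sketch of the pattern used by audit_set_enabled(), audit_set_failure(),
 * audit_set_rate_limit(), audit_set_backlog_limit() and the AUDIT_STATUS_PID
 * branch.  Kernel context assumed (audit_log, selinux_ctxid_to_string, kfree). */
static int audit_log_config_change(const char *name, int newval, int oldval,
				   uid_t loginuid, u32 sid)
{
	if (sid) {
		char *ctx = NULL;
		u32 len;
		int rc = selinux_ctxid_to_string(sid, &ctx, &len);
		if (rc)
			return rc;	/* caller refuses the change */
		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
			  "%s=%d old=%d by auid=%u subj=%s",
			  name, newval, oldval, loginuid, ctx);
		kfree(ctx);
	} else
		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
			  "%s=%d old=%d by auid=%u",
			  name, newval, oldval, loginuid);
	return 0;
}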
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 0000000000..6f733920fd
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,92 @@
1/* audit -- definition of audit_context structure and supporting types
2 *
3 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/mutex.h>
23#include <linux/fs.h>
24#include <linux/audit.h>
25
26/* 0 = no checking
27 1 = put_count checking
28 2 = verbose put_count checking
29*/
30#define AUDIT_DEBUG 0
31
32/* At task start time, the audit_state is set in the audit_context using
33 a per-task filter. At syscall entry, the audit_state is augmented by
34 the syscall filter. */
35enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can
38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
44 * and always fill it in at syscall
45 * entry time. This makes a full
46 * syscall record available if some
47 * other part of the kernel decides it
48 * should be recorded. */
49 AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
50 * always fill it in at syscall entry
51 * time, and always write out the audit
52 * record at syscall exit time. */
53};
54
55/* Rule lists */
56struct audit_field {
57 u32 type;
58 u32 val;
59 u32 op;
60 char *se_str;
61 struct selinux_audit_rule *se_rule;
62};
63
64struct audit_krule {
65 int vers_ops;
66 u32 flags;
67 u32 listnr;
68 u32 action;
69 u32 mask[AUDIT_BITMASK_SIZE];
70 u32 buflen; /* for data alloc on list rules */
71 u32 field_count;
72 struct audit_field *fields;
73};
74
75struct audit_entry {
76 struct list_head list;
77 struct rcu_head rcu;
78 struct audit_krule rule;
79};
80
81
82extern int audit_pid;
83extern int audit_comparator(const u32 left, const u32 op, const u32 right);
84
85extern void audit_send_reply(int pid, int seq, int type,
86 int done, int multi,
87 void *payload, int size);
88extern void audit_log_lost(const char *message);
89extern void audit_panic(const char *message);
90extern struct mutex audit_netlink_mutex;
91
92extern int selinux_audit_rule_update(void);
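
The new audit.h above introduces the in-kernel rule representation: an audit_entry (list node plus RCU head) wraps an audit_krule, which carries the filter list number, action, syscall mask and an array of audit_field entries holding a type, a comparison operator and a value, with an optional SELinux string and opaque rule attached. A minimal illustration of how such a rule is evaluated with audit_comparator(), in the style of the matching loops in auditfilter.c and auditsc.c (field types reduced to PID and UID for brevity; illustrative only, not code from the patch):

/* Returns 1 only if every field in the rule is satisfied by the given values. */
static int rule_matches(const struct audit_krule *rule, u32 pid, u32 uid)
{
	int i;

	for (i = 0; i < rule->field_count; i++) {
		const struct audit_field *f = &rule->fields[i];
		int result = 0;

		switch (f->type) {
		case AUDIT_PID:
			result = audit_comparator(pid, f->op, f->val);
			break;
		case AUDIT_UID:
			result = audit_comparator(uid, f->op, f->val);
			break;
		}
		if (!result)		/* any non-matching field rejects the rule */
			return 0;
	}
	return 1;			/* all fields matched; apply rule->action */
}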
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 0000000000..7c134906d6
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,857 @@
1/* auditfilter.c -- filtering of audit events
2 *
3 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/netlink.h>
26#include <linux/selinux.h>
27#include "audit.h"
28
29/* There are three lists of rules -- one to search at task creation
30 * time, one to search at syscall entry time, and another to search at
31 * syscall exit time. */
32struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
33 LIST_HEAD_INIT(audit_filter_list[0]),
34 LIST_HEAD_INIT(audit_filter_list[1]),
35 LIST_HEAD_INIT(audit_filter_list[2]),
36 LIST_HEAD_INIT(audit_filter_list[3]),
37 LIST_HEAD_INIT(audit_filter_list[4]),
38 LIST_HEAD_INIT(audit_filter_list[5]),
39#if AUDIT_NR_FILTERS != 6
40#error Fix audit_filter_list initialiser
41#endif
42};
43
44static inline void audit_free_rule(struct audit_entry *e)
45{
46 int i;
47 if (e->rule.fields)
48 for (i = 0; i < e->rule.field_count; i++) {
49 struct audit_field *f = &e->rule.fields[i];
50 kfree(f->se_str);
51 selinux_audit_rule_free(f->se_rule);
52 }
53 kfree(e->rule.fields);
54 kfree(e);
55}
56
57static inline void audit_free_rule_rcu(struct rcu_head *head)
58{
59 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
60 audit_free_rule(e);
61}
62
63/* Initialize an audit filterlist entry. */
64static inline struct audit_entry *audit_init_entry(u32 field_count)
65{
66 struct audit_entry *entry;
67 struct audit_field *fields;
68
69 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
70 if (unlikely(!entry))
71 return NULL;
72
73 fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
74 if (unlikely(!fields)) {
75 kfree(entry);
76 return NULL;
77 }
78 entry->rule.fields = fields;
79
80 return entry;
81}
82
83/* Unpack a filter field's string representation from user-space
84 * buffer. */
85static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
86{
87 char *str;
88
89 if (!*bufp || (len == 0) || (len > *remain))
90 return ERR_PTR(-EINVAL);
91
92 /* Of the currently implemented string fields, PATH_MAX
93 * defines the longest valid length.
94 */
95 if (len > PATH_MAX)
96 return ERR_PTR(-ENAMETOOLONG);
97
98 str = kmalloc(len + 1, GFP_KERNEL);
99 if (unlikely(!str))
100 return ERR_PTR(-ENOMEM);
101
102 memcpy(str, *bufp, len);
103 str[len] = 0;
104 *bufp += len;
105 *remain -= len;
106
107 return str;
108}
109
110/* Common user-space to kernel rule translation. */
111static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
112{
113 unsigned listnr;
114 struct audit_entry *entry;
115 int i, err;
116
117 err = -EINVAL;
118 listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
119 switch(listnr) {
120 default:
121 goto exit_err;
122 case AUDIT_FILTER_USER:
123 case AUDIT_FILTER_TYPE:
124#ifdef CONFIG_AUDITSYSCALL
125 case AUDIT_FILTER_ENTRY:
126 case AUDIT_FILTER_EXIT:
127 case AUDIT_FILTER_TASK:
128#endif
129 ;
130 }
131 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
132 rule->action != AUDIT_ALWAYS)
133 goto exit_err;
134 if (rule->field_count > AUDIT_MAX_FIELDS)
135 goto exit_err;
136
137 err = -ENOMEM;
138 entry = audit_init_entry(rule->field_count);
139 if (!entry)
140 goto exit_err;
141
142 entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
143 entry->rule.listnr = listnr;
144 entry->rule.action = rule->action;
145 entry->rule.field_count = rule->field_count;
146
147 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
148 entry->rule.mask[i] = rule->mask[i];
149
150 return entry;
151
152exit_err:
153 return ERR_PTR(err);
154}
155
156/* Translate struct audit_rule to kernel's rule representation.
157 * Exists for backward compatibility with userspace. */
158static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
159{
160 struct audit_entry *entry;
161 int err = 0;
162 int i;
163
164 entry = audit_to_entry_common(rule);
165 if (IS_ERR(entry))
166 goto exit_nofree;
167
168 for (i = 0; i < rule->field_count; i++) {
169 struct audit_field *f = &entry->rule.fields[i];
170
171 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
173 f->val = rule->values[i];
174
175 if (f->type & AUDIT_UNUSED_BITS ||
176 f->type == AUDIT_SE_USER ||
177 f->type == AUDIT_SE_ROLE ||
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free;
183 }
184
185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
186
187 /* Support for legacy operators where
188 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
189 if (f->op & AUDIT_NEGATE)
190 f->op = AUDIT_NOT_EQUAL;
191 else if (!f->op)
192 f->op = AUDIT_EQUAL;
193 else if (f->op == AUDIT_OPERATORS) {
194 err = -EINVAL;
195 goto exit_free;
196 }
197 }
198
199exit_nofree:
200 return entry;
201
202exit_free:
203 audit_free_rule(entry);
204 return ERR_PTR(err);
205}
206
207/* Translate struct audit_rule_data to kernel's rule representation. */
208static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
209 size_t datasz)
210{
211 int err = 0;
212 struct audit_entry *entry;
213 void *bufp;
214 size_t remain = datasz - sizeof(struct audit_rule_data);
215 int i;
216 char *str;
217
218 entry = audit_to_entry_common((struct audit_rule *)data);
219 if (IS_ERR(entry))
220 goto exit_nofree;
221
222 bufp = data->buf;
223 entry->rule.vers_ops = 2;
224 for (i = 0; i < data->field_count; i++) {
225 struct audit_field *f = &entry->rule.fields[i];
226
227 err = -EINVAL;
228 if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
229 data->fieldflags[i] & ~AUDIT_OPERATORS)
230 goto exit_free;
231
232 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
233 f->type = data->fields[i];
234 f->val = data->values[i];
235 f->se_str = NULL;
236 f->se_rule = NULL;
237 switch(f->type) {
238 case AUDIT_SE_USER:
239 case AUDIT_SE_ROLE:
240 case AUDIT_SE_TYPE:
241 case AUDIT_SE_SEN:
242 case AUDIT_SE_CLR:
243 str = audit_unpack_string(&bufp, &remain, f->val);
244 if (IS_ERR(str))
245 goto exit_free;
246 entry->rule.buflen += f->val;
247
248 err = selinux_audit_rule_init(f->type, f->op, str,
249 &f->se_rule);
250 /* Keep currently invalid fields around in case they
251 * become valid after a policy reload. */
252 if (err == -EINVAL) {
253 printk(KERN_WARNING "audit rule for selinux "
254 "\'%s\' is invalid\n", str);
255 err = 0;
256 }
257 if (err) {
258 kfree(str);
259 goto exit_free;
260 } else
261 f->se_str = str;
262 break;
263 }
264 }
265
266exit_nofree:
267 return entry;
268
269exit_free:
270 audit_free_rule(entry);
271 return ERR_PTR(err);
272}
273
274/* Pack a filter field's string representation into data block. */
275static inline size_t audit_pack_string(void **bufp, char *str)
276{
277 size_t len = strlen(str);
278
279 memcpy(*bufp, str, len);
280 *bufp += len;
281
282 return len;
283}
284
285/* Translate kernel rule representation to struct audit_rule.
286 * Exists for backward compatibility with userspace. */
287static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
288{
289 struct audit_rule *rule;
290 int i;
291
292 rule = kmalloc(sizeof(*rule), GFP_KERNEL);
293 if (unlikely(!rule))
294 return ERR_PTR(-ENOMEM);
295 memset(rule, 0, sizeof(*rule));
296
297 rule->flags = krule->flags | krule->listnr;
298 rule->action = krule->action;
299 rule->field_count = krule->field_count;
300 for (i = 0; i < rule->field_count; i++) {
301 rule->values[i] = krule->fields[i].val;
302 rule->fields[i] = krule->fields[i].type;
303
304 if (krule->vers_ops == 1) {
305 if (krule->fields[i].op & AUDIT_NOT_EQUAL)
306 rule->fields[i] |= AUDIT_NEGATE;
307 } else {
308 rule->fields[i] |= krule->fields[i].op;
309 }
310 }
311 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
312
313 return rule;
314}
315
316/* Translate kernel rule representation to struct audit_rule_data. */
317static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
318{
319 struct audit_rule_data *data;
320 void *bufp;
321 int i;
322
323 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
324 if (unlikely(!data))
325 return ERR_PTR(-ENOMEM);
326 memset(data, 0, sizeof(*data));
327
328 data->flags = krule->flags | krule->listnr;
329 data->action = krule->action;
330 data->field_count = krule->field_count;
331 bufp = data->buf;
332 for (i = 0; i < data->field_count; i++) {
333 struct audit_field *f = &krule->fields[i];
334
335 data->fields[i] = f->type;
336 data->fieldflags[i] = f->op;
337 switch(f->type) {
338 case AUDIT_SE_USER:
339 case AUDIT_SE_ROLE:
340 case AUDIT_SE_TYPE:
341 case AUDIT_SE_SEN:
342 case AUDIT_SE_CLR:
343 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str);
345 break;
346 default:
347 data->values[i] = f->val;
348 }
349 }
350 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
351
352 return data;
353}
354
355/* Compare two rules in kernel format. Considered success if rules
356 * don't match. */
357static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
358{
359 int i;
360
361 if (a->flags != b->flags ||
362 a->listnr != b->listnr ||
363 a->action != b->action ||
364 a->field_count != b->field_count)
365 return 1;
366
367 for (i = 0; i < a->field_count; i++) {
368 if (a->fields[i].type != b->fields[i].type ||
369 a->fields[i].op != b->fields[i].op)
370 return 1;
371
372 switch(a->fields[i].type) {
373 case AUDIT_SE_USER:
374 case AUDIT_SE_ROLE:
375 case AUDIT_SE_TYPE:
376 case AUDIT_SE_SEN:
377 case AUDIT_SE_CLR:
378 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
379 return 1;
380 break;
381 default:
382 if (a->fields[i].val != b->fields[i].val)
383 return 1;
384 }
385 }
386
387 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
388 if (a->mask[i] != b->mask[i])
389 return 1;
390
391 return 0;
392}
393
394/* Duplicate selinux field information. The se_rule is opaque, so must be
395 * re-initialized. */
396static inline int audit_dupe_selinux_field(struct audit_field *df,
397 struct audit_field *sf)
398{
399 int ret = 0;
400 char *se_str;
401
402 /* our own copy of se_str */
403 se_str = kstrdup(sf->se_str, GFP_KERNEL);
404 if (unlikely(!se_str)) /* kstrdup() returns NULL on failure, not an ERR_PTR */
405 return -ENOMEM;
406 df->se_str = se_str;
407
408 /* our own (refreshed) copy of se_rule */
409 ret = selinux_audit_rule_init(df->type, df->op, df->se_str,
410 &df->se_rule);
411 /* Keep currently invalid fields around in case they
412 * become valid after a policy reload. */
413 if (ret == -EINVAL) {
414 printk(KERN_WARNING "audit rule for selinux \'%s\' is "
415 "invalid\n", df->se_str);
416 ret = 0;
417 }
418
419 return ret;
420}
421
422/* Duplicate an audit rule. This will be a deep copy with the exception
423 * of the watch - that pointer is carried over. The selinux specific fields
424 * will be updated in the copy. The point is to be able to replace the old
425 * rule with the new rule in the filterlist, then free the old rule. */
426static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
427{
428 u32 fcount = old->field_count;
429 struct audit_entry *entry;
430 struct audit_krule *new;
431 int i, err = 0;
432
433 entry = audit_init_entry(fcount);
434 if (unlikely(!entry))
435 return ERR_PTR(-ENOMEM);
436
437 new = &entry->rule;
438 new->vers_ops = old->vers_ops;
439 new->flags = old->flags;
440 new->listnr = old->listnr;
441 new->action = old->action;
442 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
443 new->mask[i] = old->mask[i];
444 new->buflen = old->buflen;
445 new->field_count = old->field_count;
446 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
447
448 /* deep copy this information, updating the se_rule fields, because
449 * the originals will all be freed when the old rule is freed. */
450 for (i = 0; i < fcount; i++) {
451 switch (new->fields[i].type) {
452 case AUDIT_SE_USER:
453 case AUDIT_SE_ROLE:
454 case AUDIT_SE_TYPE:
455 case AUDIT_SE_SEN:
456 case AUDIT_SE_CLR:
457 err = audit_dupe_selinux_field(&new->fields[i],
458 &old->fields[i]);
459 }
460 if (err) {
461 audit_free_rule(entry);
462 return ERR_PTR(err);
463 }
464 }
465
466 return entry;
467}
468
469/* Add rule to given filterlist if not a duplicate. Protected by
470 * audit_netlink_mutex. */
471static inline int audit_add_rule(struct audit_entry *entry,
472 struct list_head *list)
473{
474 struct audit_entry *e;
475
476 /* Do not use the _rcu iterator here, since this is the only
477 * addition routine. */
478 list_for_each_entry(e, list, list) {
479 if (!audit_compare_rule(&entry->rule, &e->rule))
480 return -EEXIST;
481 }
482
483 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
484 list_add_rcu(&entry->list, list);
485 } else {
486 list_add_tail_rcu(&entry->list, list);
487 }
488
489 return 0;
490}
491
492/* Remove an existing rule from filterlist. Protected by
493 * audit_netlink_mutex. */
494static inline int audit_del_rule(struct audit_entry *entry,
495 struct list_head *list)
496{
497 struct audit_entry *e;
498
499 /* Do not use the _rcu iterator here, since this is the only
500 * deletion routine. */
501 list_for_each_entry(e, list, list) {
502 if (!audit_compare_rule(&entry->rule, &e->rule)) {
503 list_del_rcu(&e->list);
504 call_rcu(&e->rcu, audit_free_rule_rcu);
505 return 0;
506 }
507 }
508 return -ENOENT; /* No matching rule */
509}
510
511/* List rules using struct audit_rule. Exists for backward
512 * compatibility with userspace. */
513static int audit_list(void *_dest)
514{
515 int pid, seq;
516 int *dest = _dest;
517 struct audit_entry *entry;
518 int i;
519
520 pid = dest[0];
521 seq = dest[1];
522 kfree(dest);
523
524 mutex_lock(&audit_netlink_mutex);
525
526 /* The *_rcu iterators are not needed here because we are
527 always called with audit_netlink_mutex held. */
528 for (i=0; i<AUDIT_NR_FILTERS; i++) {
529 list_for_each_entry(entry, &audit_filter_list[i], list) {
530 struct audit_rule *rule;
531
532 rule = audit_krule_to_rule(&entry->rule);
533 if (unlikely(!rule))
534 break;
535 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
536 rule, sizeof(*rule));
537 kfree(rule);
538 }
539 }
540 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
541
542 mutex_unlock(&audit_netlink_mutex);
543 return 0;
544}
545
546/* List rules using struct audit_rule_data. */
547static int audit_list_rules(void *_dest)
548{
549 int pid, seq;
550 int *dest = _dest;
551 struct audit_entry *e;
552 int i;
553
554 pid = dest[0];
555 seq = dest[1];
556 kfree(dest);
557
558 mutex_lock(&audit_netlink_mutex);
559
560 /* The *_rcu iterators are not needed here because we are
561 always called with audit_netlink_mutex held. */
562 for (i=0; i<AUDIT_NR_FILTERS; i++) {
563 list_for_each_entry(e, &audit_filter_list[i], list) {
564 struct audit_rule_data *data;
565
566 data = audit_krule_to_data(&e->rule);
567 if (unlikely(!data))
568 break;
569 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
570 data, sizeof(*data));
571 kfree(data);
572 }
573 }
574 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
575
576 mutex_unlock(&audit_netlink_mutex);
577 return 0;
578}
579
580/**
581 * audit_receive_filter - apply all rules to the specified message type
582 * @type: audit message type
583 * @pid: target pid for netlink audit messages
584 * @uid: target uid for netlink audit messages
585 * @seq: netlink audit message sequence (serial) number
586 * @data: payload data
587 * @datasz: size of payload data
588 * @loginuid: loginuid of sender
589 * @sid: SE Linux Security ID of sender
590 */
591int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
592 size_t datasz, uid_t loginuid, u32 sid)
593{
594 struct task_struct *tsk;
595 int *dest;
596 int err = 0;
597 struct audit_entry *entry;
598
599 switch (type) {
600 case AUDIT_LIST:
601 case AUDIT_LIST_RULES:
602 /* We can't just spew out the rules here because we might fill
603 * the available socket buffer space and deadlock waiting for
604 * auditctl to read from it... which isn't ever going to
605 * happen if we're actually running in the context of auditctl
606 * trying to _send_ the stuff */
607
608 dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
609 if (!dest)
610 return -ENOMEM;
611 dest[0] = pid;
612 dest[1] = seq;
613
614 if (type == AUDIT_LIST)
615 tsk = kthread_run(audit_list, dest, "audit_list");
616 else
617 tsk = kthread_run(audit_list_rules, dest,
618 "audit_list_rules");
619 if (IS_ERR(tsk)) {
620 kfree(dest);
621 err = PTR_ERR(tsk);
622 }
623 break;
624 case AUDIT_ADD:
625 case AUDIT_ADD_RULE:
626 if (type == AUDIT_ADD)
627 entry = audit_rule_to_entry(data);
628 else
629 entry = audit_data_to_entry(data, datasz);
630 if (IS_ERR(entry))
631 return PTR_ERR(entry);
632
633 err = audit_add_rule(entry,
634 &audit_filter_list[entry->rule.listnr]);
635 if (sid) {
636 char *ctx = NULL;
637 u32 len;
638 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
639 /* Maybe call audit_panic? */
640 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
641 "auid=%u ssid=%u add rule to list=%d res=%d",
642 loginuid, sid, entry->rule.listnr, !err);
643 } else
644 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
645 "auid=%u subj=%s add rule to list=%d res=%d",
646 loginuid, ctx, entry->rule.listnr, !err);
647 kfree(ctx);
648 } else
649 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
650 "auid=%u add rule to list=%d res=%d",
651 loginuid, entry->rule.listnr, !err);
652
653 if (err)
654 audit_free_rule(entry);
655 break;
656 case AUDIT_DEL:
657 case AUDIT_DEL_RULE:
658 if (type == AUDIT_DEL)
659 entry = audit_rule_to_entry(data);
660 else
661 entry = audit_data_to_entry(data, datasz);
662 if (IS_ERR(entry))
663 return PTR_ERR(entry);
664
665 err = audit_del_rule(entry,
666 &audit_filter_list[entry->rule.listnr]);
667
668 if (sid) {
669 char *ctx = NULL;
670 u32 len;
671 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
672 /* Maybe call audit_panic? */
673 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
674 "auid=%u ssid=%u remove rule from list=%d res=%d",
675 loginuid, sid, entry->rule.listnr, !err);
676 } else
677 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
678 "auid=%u subj=%s remove rule from list=%d res=%d",
679 loginuid, ctx, entry->rule.listnr, !err);
680 kfree(ctx);
681 } else
682 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
683 "auid=%u remove rule from list=%d res=%d",
684 loginuid, entry->rule.listnr, !err);
685
686 audit_free_rule(entry);
687 break;
688 default:
689 return -EINVAL;
690 }
691
692 return err;
693}
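The deadlock note above explains why rule listing is pushed to a kernel thread: the requester's pid and sequence number are packed into a small kmalloc'd buffer whose ownership passes to the helper thread, which frees it. A minimal user-space sketch of the same ownership handoff, using pthreads instead of kthread_run(); all names and values here are illustrative, not kernel APIs:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* The worker owns the buffer it is given and must free it, mirroring
 * the way audit_list_rules() frees the dest[] array. */
static void *list_worker(void *_dest)
{
	int *dest = _dest;
	int pid = dest[0];
	int seq = dest[1];

	free(dest);
	printf("replying to pid=%d seq=%d from a separate thread\n", pid, seq);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	int *dest = malloc(2 * sizeof(int));

	if (!dest)
		return 1;
	dest[0] = 1234;	/* requester pid (hypothetical) */
	dest[1] = 42;	/* netlink sequence number (hypothetical) */

	if (pthread_create(&tid, NULL, list_worker, dest)) {
		free(dest);	/* on error the creator still owns the buffer */
		return 1;
	}
	pthread_join(tid, NULL);
	return 0;
}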
694
695int audit_comparator(const u32 left, const u32 op, const u32 right)
696{
697 switch (op) {
698 case AUDIT_EQUAL:
699 return (left == right);
700 case AUDIT_NOT_EQUAL:
701 return (left != right);
702 case AUDIT_LESS_THAN:
703 return (left < right);
704 case AUDIT_LESS_THAN_OR_EQUAL:
705 return (left <= right);
706 case AUDIT_GREATER_THAN:
707 return (left > right);
708 case AUDIT_GREATER_THAN_OR_EQUAL:
709 return (left >= right);
710 }
711 BUG();
712 return 0;
713}
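audit_comparator() reduces every rule field to one of six unsigned comparisons. A stand-alone sketch of the same semantics, with local stand-ins for the kernel's AUDIT_* operator constants:

#include <assert.h>
#include <stdint.h>

/* Stand-ins for AUDIT_EQUAL .. AUDIT_GREATER_THAN_OR_EQUAL */
enum op { EQ, NE, LT, LE, GT, GE };

static int compare(uint32_t left, enum op op, uint32_t right)
{
	switch (op) {
	case EQ: return left == right;
	case NE: return left != right;
	case LT: return left <  right;
	case LE: return left <= right;
	case GT: return left >  right;
	case GE: return left >= right;
	}
	return 0;
}

int main(void)
{
	/* e.g. a "uid>=500" field matches uid 500 and 1000, but not 499 */
	assert(compare(500, GE, 500));
	assert(compare(1000, GE, 500));
	assert(!compare(499, GE, 500));
	return 0;
}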
714
715
716
717static int audit_filter_user_rules(struct netlink_skb_parms *cb,
718 struct audit_krule *rule,
719 enum audit_state *state)
720{
721 int i;
722
723 for (i = 0; i < rule->field_count; i++) {
724 struct audit_field *f = &rule->fields[i];
725 int result = 0;
726
727 switch (f->type) {
728 case AUDIT_PID:
729 result = audit_comparator(cb->creds.pid, f->op, f->val);
730 break;
731 case AUDIT_UID:
732 result = audit_comparator(cb->creds.uid, f->op, f->val);
733 break;
734 case AUDIT_GID:
735 result = audit_comparator(cb->creds.gid, f->op, f->val);
736 break;
737 case AUDIT_LOGINUID:
738 result = audit_comparator(cb->loginuid, f->op, f->val);
739 break;
740 }
741
742 if (!result)
743 return 0;
744 }
745 switch (rule->action) {
746 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
747 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
748 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
749 }
750 return 1;
751}
752
753int audit_filter_user(struct netlink_skb_parms *cb, int type)
754{
755 struct audit_entry *e;
756 enum audit_state state;
757 int ret = 1;
758
759 rcu_read_lock();
760 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
761 if (audit_filter_user_rules(cb, &e->rule, &state)) {
762 if (state == AUDIT_DISABLED)
763 ret = 0;
764 break;
765 }
766 }
767 rcu_read_unlock();
768
769 return ret; /* Audit by default */
770}
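A rule applies only if every one of its fields matches (logical AND), the first applicable rule's action decides, and a message that matches no rule is audited by default. A small user-space model of that decision flow; the field and action encodings are simplified stand-ins, not the kernel's structures:

#include <stdio.h>

enum ftype { F_UID, F_PID };
enum action { NEVER, ALWAYS };

struct field { enum ftype type; int val; };
struct rule  { int nfields; struct field f[4]; enum action action; };
struct creds { int pid, uid; };

/* All fields must match for the rule to apply; with several rules,
 * the first one that applies would end the search. */
static int rule_match(const struct rule *r, const struct creds *c, int *audit)
{
	for (int i = 0; i < r->nfields; i++) {
		int v = (r->f[i].type == F_UID) ? c->uid : c->pid;
		if (v != r->f[i].val)
			return 0;
	}
	*audit = (r->action == ALWAYS);
	return 1;
}

int main(void)
{
	/* "never audit messages from uid 500 sent by pid 1" (illustrative) */
	struct rule r = { 2, { { F_UID, 500 }, { F_PID, 1 } }, NEVER };
	struct creds match = { 1, 500 }, other = { 7, 500 };
	int audit;

	audit = 1;			/* audit by default */
	rule_match(&r, &match, &audit);
	printf("match: %s\n", audit ? "audit" : "skip");

	audit = 1;
	rule_match(&r, &other, &audit);
	printf("other: %s\n", audit ? "audit" : "skip");
	return 0;
}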
771
772int audit_filter_type(int type)
773{
774 struct audit_entry *e;
775 int result = 0;
776
777 rcu_read_lock();
778 if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
779 goto unlock_and_return;
780
781 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
782 list) {
783 int i;
784 for (i = 0; i < e->rule.field_count; i++) {
785 struct audit_field *f = &e->rule.fields[i];
786 if (f->type == AUDIT_MSGTYPE) {
787 result = audit_comparator(type, f->op, f->val);
788 if (!result)
789 break;
790 }
791 }
792 if (result)
793 goto unlock_and_return;
794 }
795unlock_and_return:
796 rcu_read_unlock();
797 return result;
798}
799
800/* Check to see if the rule contains any selinux fields. Returns 1 if there
801 are selinux fields specified in the rule, 0 otherwise. */
802static inline int audit_rule_has_selinux(struct audit_krule *rule)
803{
804 int i;
805
806 for (i = 0; i < rule->field_count; i++) {
807 struct audit_field *f = &rule->fields[i];
808 switch (f->type) {
809 case AUDIT_SE_USER:
810 case AUDIT_SE_ROLE:
811 case AUDIT_SE_TYPE:
812 case AUDIT_SE_SEN:
813 case AUDIT_SE_CLR:
814 return 1;
815 }
816 }
817
818 return 0;
819}
820
821/* This function will re-initialize the se_rule field of all applicable rules.
822 * It will traverse the filter lists searching for rules that contain selinux
823 * specific filter fields. When such a rule is found, it is copied, the
824 * selinux field is re-initialized, and the old rule is replaced with the
825 * updated rule. */
826int selinux_audit_rule_update(void)
827{
828 struct audit_entry *entry, *n, *nentry;
829 int i, err = 0;
830
831 /* audit_netlink_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex);
833
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule))
837 continue;
838
839 nentry = audit_dupe_rule(&entry->rule);
840 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the
842 * return value */
843 if (!err)
844 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters");
846 list_del_rcu(&entry->list);
847 } else {
848 list_replace_rcu(&entry->list, &nentry->list);
849 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 }
852 }
853
854 mutex_unlock(&audit_netlink_mutex);
855
856 return err;
857}
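selinux_audit_rule_update() never edits a live rule in place: it duplicates the entry, refreshes the SELinux part of the copy, swaps the copy into the list with list_replace_rcu(), and lets call_rcu() free the original once readers have moved on. A single-threaded user-space sketch of that copy-and-swap discipline; the deferred-free step is elided, as the comments note:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rule { int se_generation; };

static _Atomic(struct rule *) current_rule;

/* Updater: build a copy, swap the pointer, then dispose of the old copy.
 * (The kernel defers the free with call_rcu() until all readers are done;
 * that part is not modelled here.) */
static int update_rule(int new_generation)
{
	struct rule *old = atomic_load(&current_rule);
	struct rule *nrule = malloc(sizeof(*nrule));

	if (!nrule)
		return -1;
	memcpy(nrule, old, sizeof(*nrule));
	nrule->se_generation = new_generation;
	atomic_store(&current_rule, nrule);
	free(old);
	return 0;
}

int main(void)
{
	struct rule *first = calloc(1, sizeof(*first));

	if (!first)
		return 1;
	atomic_store(&current_rule, first);
	update_rule(2);
	printf("generation=%d\n", atomic_load(&current_rule)->se_generation);
	free(atomic_load(&current_rule));
	return 0;
}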
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 685c25175d..1c03a4ed1b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2,6 +2,8 @@
2 * Handles all system-call specific auditing features. 2 * Handles all system-call specific auditing features.
3 * 3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
6 * Copyright (C) 2005 IBM Corporation
5 * All Rights Reserved. 7 * All Rights Reserved.
6 * 8 *
7 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -27,11 +29,22 @@
27 * this file -- see entry.S) is based on a GPL'd patch written by 29 * this file -- see entry.S) is based on a GPL'd patch written by
28 * okir@suse.de and Copyright 2003 SuSE Linux AG. 30 * okir@suse.de and Copyright 2003 SuSE Linux AG.
29 * 31 *
32 * Support for additional filter rule comparators (>, <, >=, <=) was
33 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
34 *
35 * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
36 * filesystem information.
37 *
38 * Subject and object context labeling support added by <danjones@us.ibm.com>
39 * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
30 */ 40 */
31 41
32#include <linux/init.h> 42#include <linux/init.h>
33#include <asm/types.h> 43#include <asm/types.h>
34#include <asm/atomic.h> 44#include <asm/atomic.h>
45#include <asm/types.h>
46#include <linux/fs.h>
47#include <linux/namei.h>
35#include <linux/mm.h> 48#include <linux/mm.h>
36#include <linux/module.h> 49#include <linux/module.h>
37#include <linux/mount.h> 50#include <linux/mount.h>
@@ -39,16 +52,17 @@
39#include <linux/audit.h> 52#include <linux/audit.h>
40#include <linux/personality.h> 53#include <linux/personality.h>
41#include <linux/time.h> 54#include <linux/time.h>
42#include <linux/kthread.h>
43#include <linux/netlink.h> 55#include <linux/netlink.h>
44#include <linux/compiler.h> 56#include <linux/compiler.h>
45#include <asm/unistd.h> 57#include <asm/unistd.h>
58#include <linux/security.h>
59#include <linux/list.h>
60#include <linux/tty.h>
61#include <linux/selinux.h>
62
63#include "audit.h"
46 64
47/* 0 = no checking 65extern struct list_head audit_filter_list[];
48 1 = put_count checking
49 2 = verbose put_count checking
50*/
51#define AUDIT_DEBUG 0
52 66
53/* No syscall auditing will take place unless audit_enabled != 0. */ 67/* No syscall auditing will take place unless audit_enabled != 0. */
54extern int audit_enabled; 68extern int audit_enabled;
@@ -62,29 +76,6 @@ extern int audit_enabled;
62 * path_lookup. */ 76 * path_lookup. */
63#define AUDIT_NAMES_RESERVED 7 77#define AUDIT_NAMES_RESERVED 7
64 78
65/* At task start time, the audit_state is set in the audit_context using
66 a per-task filter. At syscall entry, the audit_state is augmented by
67 the syscall filter. */
68enum audit_state {
69 AUDIT_DISABLED, /* Do not create per-task audit_context.
70 * No syscall-specific audit records can
71 * be generated. */
72 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
73 * but don't necessarily fill it in at
74 * syscall entry time (i.e., filter
75 * instead). */
76 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
77 * and always fill it in at syscall
78 * entry time. This makes a full
79 * syscall record available if some
80 * other part of the kernel decides it
81 * should be recorded. */
82 AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
83 * always fill it in at syscall entry
84 * time, and always write out the audit
85 * record at syscall exit time. */
86};
87
88/* When fs/namei.c:getname() is called, we store the pointer in name and 79/* When fs/namei.c:getname() is called, we store the pointer in name and
89 * we don't let putname() free it (instead we free all of the saved 80 * we don't let putname() free it (instead we free all of the saved
90 * pointers at syscall exit time). 81 * pointers at syscall exit time).
@@ -93,12 +84,13 @@ enum audit_state {
93struct audit_names { 84struct audit_names {
94 const char *name; 85 const char *name;
95 unsigned long ino; 86 unsigned long ino;
87 unsigned long pino;
96 dev_t dev; 88 dev_t dev;
97 umode_t mode; 89 umode_t mode;
98 uid_t uid; 90 uid_t uid;
99 gid_t gid; 91 gid_t gid;
100 dev_t rdev; 92 dev_t rdev;
101 unsigned flags; 93 u32 osid;
102}; 94};
103 95
104struct audit_aux_data { 96struct audit_aux_data {
@@ -115,6 +107,7 @@ struct audit_aux_data_ipcctl {
115 uid_t uid; 107 uid_t uid;
116 gid_t gid; 108 gid_t gid;
117 mode_t mode; 109 mode_t mode;
110 u32 osid;
118}; 111};
119 112
120struct audit_aux_data_socketcall { 113struct audit_aux_data_socketcall {
@@ -167,290 +160,73 @@ struct audit_context {
167#endif 160#endif
168}; 161};
169 162
170 /* Public API */
171/* There are three lists of rules -- one to search at task creation
172 * time, one to search at syscall entry time, and another to search at
173 * syscall exit time. */
174static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
175 LIST_HEAD_INIT(audit_filter_list[0]),
176 LIST_HEAD_INIT(audit_filter_list[1]),
177 LIST_HEAD_INIT(audit_filter_list[2]),
178 LIST_HEAD_INIT(audit_filter_list[3]),
179 LIST_HEAD_INIT(audit_filter_list[4]),
180#if AUDIT_NR_FILTERS != 5
181#error Fix audit_filter_list initialiser
182#endif
183};
184
185struct audit_entry {
186 struct list_head list;
187 struct rcu_head rcu;
188 struct audit_rule rule;
189};
190
191extern int audit_pid;
192
193/* Copy rule from user-space to kernel-space. Called from
194 * audit_add_rule during AUDIT_ADD. */
195static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
196{
197 int i;
198
199 if (s->action != AUDIT_NEVER
200 && s->action != AUDIT_POSSIBLE
201 && s->action != AUDIT_ALWAYS)
202 return -1;
203 if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
204 return -1;
205 if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
206 return -1;
207
208 d->flags = s->flags;
209 d->action = s->action;
210 d->field_count = s->field_count;
211 for (i = 0; i < d->field_count; i++) {
212 d->fields[i] = s->fields[i];
213 d->values[i] = s->values[i];
214 }
215 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
216 return 0;
217}
218
219/* Check to see if two rules are identical. It is called from
220 * audit_add_rule during AUDIT_ADD and
221 * audit_del_rule during AUDIT_DEL. */
222static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
223{
224 int i;
225
226 if (a->flags != b->flags)
227 return 1;
228
229 if (a->action != b->action)
230 return 1;
231
232 if (a->field_count != b->field_count)
233 return 1;
234
235 for (i = 0; i < a->field_count; i++) {
236 if (a->fields[i] != b->fields[i]
237 || a->values[i] != b->values[i])
238 return 1;
239 }
240
241 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
242 if (a->mask[i] != b->mask[i])
243 return 1;
244
245 return 0;
246}
247
248/* Note that audit_add_rule and audit_del_rule are called via
249 * audit_receive() in audit.c, and are protected by
250 * audit_netlink_sem. */
251static inline int audit_add_rule(struct audit_rule *rule,
252 struct list_head *list)
253{
254 struct audit_entry *entry;
255
256 /* Do not use the _rcu iterator here, since this is the only
257 * addition routine. */
258 list_for_each_entry(entry, list, list) {
259 if (!audit_compare_rule(rule, &entry->rule)) {
260 return -EEXIST;
261 }
262 }
263
264 if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
265 return -ENOMEM;
266 if (audit_copy_rule(&entry->rule, rule)) {
267 kfree(entry);
268 return -EINVAL;
269 }
270
271 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
272 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
273 list_add_rcu(&entry->list, list);
274 } else {
275 list_add_tail_rcu(&entry->list, list);
276 }
277
278 return 0;
279}
280
281static inline void audit_free_rule(struct rcu_head *head)
282{
283 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
284 kfree(e);
285}
286
287/* Note that audit_add_rule and audit_del_rule are called via
288 * audit_receive() in audit.c, and are protected by
289 * audit_netlink_sem. */
290static inline int audit_del_rule(struct audit_rule *rule,
291 struct list_head *list)
292{
293 struct audit_entry *e;
294
295 /* Do not use the _rcu iterator here, since this is the only
296 * deletion routine. */
297 list_for_each_entry(e, list, list) {
298 if (!audit_compare_rule(rule, &e->rule)) {
299 list_del_rcu(&e->list);
300 call_rcu(&e->rcu, audit_free_rule);
301 return 0;
302 }
303 }
304 return -ENOENT; /* No matching rule */
305}
306
307static int audit_list_rules(void *_dest)
308{
309 int pid, seq;
310 int *dest = _dest;
311 struct audit_entry *entry;
312 int i;
313
314 pid = dest[0];
315 seq = dest[1];
316 kfree(dest);
317
318 down(&audit_netlink_sem);
319
320 /* The *_rcu iterators not needed here because we are
321 always called with audit_netlink_sem held. */
322 for (i=0; i<AUDIT_NR_FILTERS; i++) {
323 list_for_each_entry(entry, &audit_filter_list[i], list)
324 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
325 &entry->rule, sizeof(entry->rule));
326 }
327 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
328
329 up(&audit_netlink_sem);
330 return 0;
331}
332
333int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
334 uid_t loginuid)
335{
336 struct task_struct *tsk;
337 int *dest;
338 int err = 0;
339 unsigned listnr;
340
341 switch (type) {
342 case AUDIT_LIST:
343 /* We can't just spew out the rules here because we might fill
344 * the available socket buffer space and deadlock waiting for
345 * auditctl to read from it... which isn't ever going to
346 * happen if we're actually running in the context of auditctl
347 * trying to _send_ the stuff */
348
349 dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
350 if (!dest)
351 return -ENOMEM;
352 dest[0] = pid;
353 dest[1] = seq;
354
355 tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
356 if (IS_ERR(tsk)) {
357 kfree(dest);
358 err = PTR_ERR(tsk);
359 }
360 break;
361 case AUDIT_ADD:
362 listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
363 if (listnr >= AUDIT_NR_FILTERS)
364 return -EINVAL;
365
366 err = audit_add_rule(data, &audit_filter_list[listnr]);
367 if (!err)
368 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
369 "auid=%u added an audit rule\n", loginuid);
370 break;
371 case AUDIT_DEL:
372 listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
373 if (listnr >= AUDIT_NR_FILTERS)
374 return -EINVAL;
375
376 err = audit_del_rule(data, &audit_filter_list[listnr]);
377 if (!err)
378 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
379 "auid=%u removed an audit rule\n", loginuid);
380 break;
381 default:
382 return -EINVAL;
383 }
384
385 return err;
386}
387 163
388/* Compare a task_struct with an audit_rule. Return 1 on match, 0 164/* Compare a task_struct with an audit_rule. Return 1 on match, 0
389 * otherwise. */ 165 * otherwise. */
390static int audit_filter_rules(struct task_struct *tsk, 166static int audit_filter_rules(struct task_struct *tsk,
391 struct audit_rule *rule, 167 struct audit_krule *rule,
392 struct audit_context *ctx, 168 struct audit_context *ctx,
393 enum audit_state *state) 169 enum audit_state *state)
394{ 170{
395 int i, j; 171 int i, j, need_sid = 1;
172 u32 sid;
396 173
397 for (i = 0; i < rule->field_count; i++) { 174 for (i = 0; i < rule->field_count; i++) {
398 u32 field = rule->fields[i] & ~AUDIT_NEGATE; 175 struct audit_field *f = &rule->fields[i];
399 u32 value = rule->values[i];
400 int result = 0; 176 int result = 0;
401 177
402 switch (field) { 178 switch (f->type) {
403 case AUDIT_PID: 179 case AUDIT_PID:
404 result = (tsk->pid == value); 180 result = audit_comparator(tsk->pid, f->op, f->val);
405 break; 181 break;
406 case AUDIT_UID: 182 case AUDIT_UID:
407 result = (tsk->uid == value); 183 result = audit_comparator(tsk->uid, f->op, f->val);
408 break; 184 break;
409 case AUDIT_EUID: 185 case AUDIT_EUID:
410 result = (tsk->euid == value); 186 result = audit_comparator(tsk->euid, f->op, f->val);
411 break; 187 break;
412 case AUDIT_SUID: 188 case AUDIT_SUID:
413 result = (tsk->suid == value); 189 result = audit_comparator(tsk->suid, f->op, f->val);
414 break; 190 break;
415 case AUDIT_FSUID: 191 case AUDIT_FSUID:
416 result = (tsk->fsuid == value); 192 result = audit_comparator(tsk->fsuid, f->op, f->val);
417 break; 193 break;
418 case AUDIT_GID: 194 case AUDIT_GID:
419 result = (tsk->gid == value); 195 result = audit_comparator(tsk->gid, f->op, f->val);
420 break; 196 break;
421 case AUDIT_EGID: 197 case AUDIT_EGID:
422 result = (tsk->egid == value); 198 result = audit_comparator(tsk->egid, f->op, f->val);
423 break; 199 break;
424 case AUDIT_SGID: 200 case AUDIT_SGID:
425 result = (tsk->sgid == value); 201 result = audit_comparator(tsk->sgid, f->op, f->val);
426 break; 202 break;
427 case AUDIT_FSGID: 203 case AUDIT_FSGID:
428 result = (tsk->fsgid == value); 204 result = audit_comparator(tsk->fsgid, f->op, f->val);
429 break; 205 break;
430 case AUDIT_PERS: 206 case AUDIT_PERS:
431 result = (tsk->personality == value); 207 result = audit_comparator(tsk->personality, f->op, f->val);
432 break; 208 break;
433 case AUDIT_ARCH: 209 case AUDIT_ARCH:
434 if (ctx) 210 if (ctx)
435 result = (ctx->arch == value); 211 result = audit_comparator(ctx->arch, f->op, f->val);
436 break; 212 break;
437 213
438 case AUDIT_EXIT: 214 case AUDIT_EXIT:
439 if (ctx && ctx->return_valid) 215 if (ctx && ctx->return_valid)
440 result = (ctx->return_code == value); 216 result = audit_comparator(ctx->return_code, f->op, f->val);
441 break; 217 break;
442 case AUDIT_SUCCESS: 218 case AUDIT_SUCCESS:
443 if (ctx && ctx->return_valid) { 219 if (ctx && ctx->return_valid) {
444 if (value) 220 if (f->val)
445 result = (ctx->return_valid == AUDITSC_SUCCESS); 221 result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
446 else 222 else
447 result = (ctx->return_valid == AUDITSC_FAILURE); 223 result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
448 } 224 }
449 break; 225 break;
450 case AUDIT_DEVMAJOR: 226 case AUDIT_DEVMAJOR:
451 if (ctx) { 227 if (ctx) {
452 for (j = 0; j < ctx->name_count; j++) { 228 for (j = 0; j < ctx->name_count; j++) {
453 if (MAJOR(ctx->names[j].dev)==value) { 229 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
454 ++result; 230 ++result;
455 break; 231 break;
456 } 232 }
@@ -460,7 +236,7 @@ static int audit_filter_rules(struct task_struct *tsk,
460 case AUDIT_DEVMINOR: 236 case AUDIT_DEVMINOR:
461 if (ctx) { 237 if (ctx) {
462 for (j = 0; j < ctx->name_count; j++) { 238 for (j = 0; j < ctx->name_count; j++) {
463 if (MINOR(ctx->names[j].dev)==value) { 239 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
464 ++result; 240 ++result;
465 break; 241 break;
466 } 242 }
@@ -470,7 +246,8 @@ static int audit_filter_rules(struct task_struct *tsk,
470 case AUDIT_INODE: 246 case AUDIT_INODE:
471 if (ctx) { 247 if (ctx) {
472 for (j = 0; j < ctx->name_count; j++) { 248 for (j = 0; j < ctx->name_count; j++) {
473 if (ctx->names[j].ino == value) { 249 if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
250 audit_comparator(ctx->names[j].pino, f->op, f->val)) {
474 ++result; 251 ++result;
475 break; 252 break;
476 } 253 }
@@ -480,19 +257,38 @@ static int audit_filter_rules(struct task_struct *tsk,
480 case AUDIT_LOGINUID: 257 case AUDIT_LOGINUID:
481 result = 0; 258 result = 0;
482 if (ctx) 259 if (ctx)
483 result = (ctx->loginuid == value); 260 result = audit_comparator(ctx->loginuid, f->op, f->val);
261 break;
262 case AUDIT_SE_USER:
263 case AUDIT_SE_ROLE:
264 case AUDIT_SE_TYPE:
265 case AUDIT_SE_SEN:
266 case AUDIT_SE_CLR:
267 /* NOTE: this may return negative values indicating
268 a temporary error. We simply treat this as a
269 match for now to avoid losing information that
270 may be wanted. An error message will also be
271 logged upon error */
272 if (f->se_rule) {
273 if (need_sid) {
274 selinux_task_ctxid(tsk, &sid);
275 need_sid = 0;
276 }
277 result = selinux_audit_rule_match(sid, f->type,
278 f->op,
279 f->se_rule,
280 ctx);
281 }
484 break; 282 break;
485 case AUDIT_ARG0: 283 case AUDIT_ARG0:
486 case AUDIT_ARG1: 284 case AUDIT_ARG1:
487 case AUDIT_ARG2: 285 case AUDIT_ARG2:
488 case AUDIT_ARG3: 286 case AUDIT_ARG3:
489 if (ctx) 287 if (ctx)
490 result = (ctx->argv[field-AUDIT_ARG0]==value); 288 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
491 break; 289 break;
492 } 290 }
493 291
494 if (rule->fields[i] & AUDIT_NEGATE)
495 result = !result;
496 if (!result) 292 if (!result)
497 return 0; 293 return 0;
498 } 294 }
@@ -527,7 +323,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
527/* At syscall entry and exit time, this filter is called if the 323/* At syscall entry and exit time, this filter is called if the
528 * audit_state is not low enough that auditing cannot take place, but is 324 * audit_state is not low enough that auditing cannot take place, but is
529 * also not high enough that we already know we have to write an audit 325 * also not high enough that we already know we have to write an audit
530 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). 326 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
531 */ 327 */
532static enum audit_state audit_filter_syscall(struct task_struct *tsk, 328static enum audit_state audit_filter_syscall(struct task_struct *tsk,
533 struct audit_context *ctx, 329 struct audit_context *ctx,
@@ -541,80 +337,21 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
541 337
542 rcu_read_lock(); 338 rcu_read_lock();
543 if (!list_empty(list)) { 339 if (!list_empty(list)) {
544 int word = AUDIT_WORD(ctx->major); 340 int word = AUDIT_WORD(ctx->major);
545 int bit = AUDIT_BIT(ctx->major); 341 int bit = AUDIT_BIT(ctx->major);
546 342
547 list_for_each_entry_rcu(e, list, list) { 343 list_for_each_entry_rcu(e, list, list) {
548 if ((e->rule.mask[word] & bit) == bit 344 if ((e->rule.mask[word] & bit) == bit
549 && audit_filter_rules(tsk, &e->rule, ctx, &state)) { 345 && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
550 rcu_read_unlock(); 346 rcu_read_unlock();
551 return state; 347 return state;
552 } 348 }
553 }
554 }
555 rcu_read_unlock();
556 return AUDIT_BUILD_CONTEXT;
557}
558
559static int audit_filter_user_rules(struct netlink_skb_parms *cb,
560 struct audit_rule *rule,
561 enum audit_state *state)
562{
563 int i;
564
565 for (i = 0; i < rule->field_count; i++) {
566 u32 field = rule->fields[i] & ~AUDIT_NEGATE;
567 u32 value = rule->values[i];
568 int result = 0;
569
570 switch (field) {
571 case AUDIT_PID:
572 result = (cb->creds.pid == value);
573 break;
574 case AUDIT_UID:
575 result = (cb->creds.uid == value);
576 break;
577 case AUDIT_GID:
578 result = (cb->creds.gid == value);
579 break;
580 case AUDIT_LOGINUID:
581 result = (cb->loginuid == value);
582 break;
583 }
584
585 if (rule->fields[i] & AUDIT_NEGATE)
586 result = !result;
587 if (!result)
588 return 0;
589 }
590 switch (rule->action) {
591 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
592 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
593 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
594 }
595 return 1;
596}
597
598int audit_filter_user(struct netlink_skb_parms *cb, int type)
599{
600 struct audit_entry *e;
601 enum audit_state state;
602 int ret = 1;
603
604 rcu_read_lock();
605 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
606 if (audit_filter_user_rules(cb, &e->rule, &state)) {
607 if (state == AUDIT_DISABLED)
608 ret = 0;
609 break;
610 } 349 }
611 } 350 }
612 rcu_read_unlock(); 351 rcu_read_unlock();
613 352 return AUDIT_BUILD_CONTEXT;
614 return ret; /* Audit by default */
615} 353}
616 354
617/* This should be called with task_lock() held. */
618static inline struct audit_context *audit_get_context(struct task_struct *tsk, 355static inline struct audit_context *audit_get_context(struct task_struct *tsk,
619 int return_valid, 356 int return_valid,
620 int return_code) 357 int return_code)
@@ -654,17 +391,18 @@ static inline void audit_free_names(struct audit_context *context)
654#if AUDIT_DEBUG == 2 391#if AUDIT_DEBUG == 2
655 if (context->auditable 392 if (context->auditable
656 ||context->put_count + context->ino_count != context->name_count) { 393 ||context->put_count + context->ino_count != context->name_count) {
657 printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" 394 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
658 " name_count=%d put_count=%d" 395 " name_count=%d put_count=%d"
659 " ino_count=%d [NOT freeing]\n", 396 " ino_count=%d [NOT freeing]\n",
660 __LINE__, 397 __FILE__, __LINE__,
661 context->serial, context->major, context->in_syscall, 398 context->serial, context->major, context->in_syscall,
662 context->name_count, context->put_count, 399 context->name_count, context->put_count,
663 context->ino_count); 400 context->ino_count);
664 for (i = 0; i < context->name_count; i++) 401 for (i = 0; i < context->name_count; i++) {
665 printk(KERN_ERR "names[%d] = %p = %s\n", i, 402 printk(KERN_ERR "names[%d] = %p = %s\n", i,
666 context->names[i].name, 403 context->names[i].name,
667 context->names[i].name); 404 context->names[i].name ?: "(null)");
405 }
668 dump_stack(); 406 dump_stack();
669 return; 407 return;
670 } 408 }
@@ -674,9 +412,10 @@ static inline void audit_free_names(struct audit_context *context)
674 context->ino_count = 0; 412 context->ino_count = 0;
675#endif 413#endif
676 414
677 for (i = 0; i < context->name_count; i++) 415 for (i = 0; i < context->name_count; i++) {
678 if (context->names[i].name) 416 if (context->names[i].name)
679 __putname(context->names[i].name); 417 __putname(context->names[i].name);
418 }
680 context->name_count = 0; 419 context->name_count = 0;
681 if (context->pwd) 420 if (context->pwd)
682 dput(context->pwd); 421 dput(context->pwd);
@@ -696,6 +435,7 @@ static inline void audit_free_aux(struct audit_context *context)
696 dput(axi->dentry); 435 dput(axi->dentry);
697 mntput(axi->mnt); 436 mntput(axi->mnt);
698 } 437 }
438
699 context->aux = aux->next; 439 context->aux = aux->next;
700 kfree(aux); 440 kfree(aux);
701 } 441 }
@@ -721,10 +461,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
721 return context; 461 return context;
722} 462}
723 463
724/* Filter on the task information and allocate a per-task audit context 464/**
465 * audit_alloc - allocate an audit context block for a task
466 * @tsk: task
467 *
468 * Filter on the task information and allocate a per-task audit context
725 * if necessary. Doing so turns on system call auditing for the 469 * if necessary. Doing so turns on system call auditing for the
726 * specified task. This is called from copy_process, so no lock is 470 * specified task. This is called from copy_process, so no lock is
727 * needed. */ 471 * needed.
472 */
728int audit_alloc(struct task_struct *tsk) 473int audit_alloc(struct task_struct *tsk)
729{ 474{
730 struct audit_context *context; 475 struct audit_context *context;
@@ -775,41 +520,76 @@ static inline void audit_free_context(struct audit_context *context)
775 printk(KERN_ERR "audit: freed %d contexts\n", count); 520 printk(KERN_ERR "audit: freed %d contexts\n", count);
776} 521}
777 522
778static void audit_log_task_info(struct audit_buffer *ab) 523static void audit_log_task_context(struct audit_buffer *ab)
779{ 524{
780 char name[sizeof(current->comm)]; 525 char *ctx = NULL;
781 struct mm_struct *mm = current->mm; 526 ssize_t len = 0;
527
528 len = security_getprocattr(current, "current", NULL, 0);
529 if (len < 0) {
530 if (len != -EINVAL)
531 goto error_path;
532 return;
533 }
534
535 ctx = kmalloc(len, GFP_KERNEL);
536 if (!ctx)
537 goto error_path;
538
539 len = security_getprocattr(current, "current", ctx, len);
540 	if (len < 0)
541 goto error_path;
542
543 audit_log_format(ab, " subj=%s", ctx);
544 return;
545
546error_path:
547 if (ctx)
548 kfree(ctx);
549 audit_panic("error in audit_log_task_context");
550 return;
551}
552
553static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
554{
555 char name[sizeof(tsk->comm)];
556 struct mm_struct *mm = tsk->mm;
782 struct vm_area_struct *vma; 557 struct vm_area_struct *vma;
783 558
784 get_task_comm(name, current); 559 /* tsk == current */
560
561 get_task_comm(name, tsk);
785 audit_log_format(ab, " comm="); 562 audit_log_format(ab, " comm=");
786 audit_log_untrustedstring(ab, name); 563 audit_log_untrustedstring(ab, name);
787 564
788 if (!mm) 565 if (mm) {
789 return; 566 down_read(&mm->mmap_sem);
790 567 vma = mm->mmap;
791 down_read(&mm->mmap_sem); 568 while (vma) {
792 vma = mm->mmap; 569 if ((vma->vm_flags & VM_EXECUTABLE) &&
793 while (vma) { 570 vma->vm_file) {
794 if ((vma->vm_flags & VM_EXECUTABLE) && 571 audit_log_d_path(ab, "exe=",
795 vma->vm_file) { 572 vma->vm_file->f_dentry,
796 audit_log_d_path(ab, "exe=", 573 vma->vm_file->f_vfsmnt);
797 vma->vm_file->f_dentry, 574 break;
798 vma->vm_file->f_vfsmnt); 575 }
799 break; 576 vma = vma->vm_next;
800 } 577 }
801 vma = vma->vm_next; 578 up_read(&mm->mmap_sem);
802 } 579 }
803 up_read(&mm->mmap_sem); 580 audit_log_task_context(ab);
804} 581}
805 582
806static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) 583static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
807{ 584{
808 int i; 585 int i, call_panic = 0;
809 struct audit_buffer *ab; 586 struct audit_buffer *ab;
810 struct audit_aux_data *aux; 587 struct audit_aux_data *aux;
588 const char *tty;
811 589
812 ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); 590 /* tsk == current */
591
592 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
813 if (!ab) 593 if (!ab)
814 return; /* audit_panic has been called */ 594 return; /* audit_panic has been called */
815 audit_log_format(ab, "arch=%x syscall=%d", 595 audit_log_format(ab, "arch=%x syscall=%d",
@@ -820,11 +600,15 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
820 audit_log_format(ab, " success=%s exit=%ld", 600 audit_log_format(ab, " success=%s exit=%ld",
821 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 601 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
822 context->return_code); 602 context->return_code);
603 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
604 tty = tsk->signal->tty->name;
605 else
606 tty = "(none)";
823 audit_log_format(ab, 607 audit_log_format(ab,
824 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 608 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
825 " pid=%d auid=%u uid=%u gid=%u" 609 " pid=%d auid=%u uid=%u gid=%u"
826 " euid=%u suid=%u fsuid=%u" 610 " euid=%u suid=%u fsuid=%u"
827 " egid=%u sgid=%u fsgid=%u", 611 " egid=%u sgid=%u fsgid=%u tty=%s",
828 context->argv[0], 612 context->argv[0],
829 context->argv[1], 613 context->argv[1],
830 context->argv[2], 614 context->argv[2],
@@ -835,8 +619,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
835 context->uid, 619 context->uid,
836 context->gid, 620 context->gid,
837 context->euid, context->suid, context->fsuid, 621 context->euid, context->suid, context->fsuid,
838 context->egid, context->sgid, context->fsgid); 622 context->egid, context->sgid, context->fsgid, tty);
839 audit_log_task_info(ab); 623 audit_log_task_info(ab, tsk);
840 audit_log_end(ab); 624 audit_log_end(ab);
841 625
842 for (aux = context->aux; aux; aux = aux->next) { 626 for (aux = context->aux; aux; aux = aux->next) {
@@ -849,8 +633,39 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
849 case AUDIT_IPC: { 633 case AUDIT_IPC: {
850 struct audit_aux_data_ipcctl *axi = (void *)aux; 634 struct audit_aux_data_ipcctl *axi = (void *)aux;
851 audit_log_format(ab, 635 audit_log_format(ab,
852 " qbytes=%lx iuid=%u igid=%u mode=%x", 636 " qbytes=%lx iuid=%u igid=%u mode=%x",
853 axi->qbytes, axi->uid, axi->gid, axi->mode); 637 axi->qbytes, axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) {
639 char *ctx = NULL;
640 u32 len;
641 if (selinux_ctxid_to_string(
642 axi->osid, &ctx, &len)) {
643 audit_log_format(ab, " osid=%u",
644 axi->osid);
645 call_panic = 1;
646 } else
647 audit_log_format(ab, " obj=%s", ctx);
648 kfree(ctx);
649 }
650 break; }
651
652 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) {
658 char *ctx = NULL;
659 u32 len;
660 if (selinux_ctxid_to_string(
661 axi->osid, &ctx, &len)) {
662 audit_log_format(ab, " osid=%u",
663 axi->osid);
664 call_panic = 1;
665 } else
666 audit_log_format(ab, " obj=%s", ctx);
667 kfree(ctx);
668 }
854 break; } 669 break; }
855 670
856 case AUDIT_SOCKETCALL: { 671 case AUDIT_SOCKETCALL: {
@@ -885,42 +700,65 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
885 } 700 }
886 } 701 }
887 for (i = 0; i < context->name_count; i++) { 702 for (i = 0; i < context->name_count; i++) {
703 unsigned long ino = context->names[i].ino;
704 unsigned long pino = context->names[i].pino;
705
888 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
889 if (!ab) 707 if (!ab)
890 continue; /* audit_panic has been called */ 708 continue; /* audit_panic has been called */
891 709
892 audit_log_format(ab, "item=%d", i); 710 audit_log_format(ab, "item=%d", i);
893 if (context->names[i].name) { 711
894 audit_log_format(ab, " name="); 712 audit_log_format(ab, " name=");
713 if (context->names[i].name)
895 audit_log_untrustedstring(ab, context->names[i].name); 714 audit_log_untrustedstring(ab, context->names[i].name);
896 } 715 else
897 audit_log_format(ab, " flags=%x\n", context->names[i].flags); 716 audit_log_format(ab, "(null)");
898 717
899 if (context->names[i].ino != (unsigned long)-1) 718 if (pino != (unsigned long)-1)
900 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" 719 audit_log_format(ab, " parent=%lu", pino);
901 " ouid=%u ogid=%u rdev=%02x:%02x", 720 if (ino != (unsigned long)-1)
902 context->names[i].ino, 721 audit_log_format(ab, " inode=%lu", ino);
903 MAJOR(context->names[i].dev), 722 if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1))
904 MINOR(context->names[i].dev), 723 audit_log_format(ab, " dev=%02x:%02x mode=%#o"
905 context->names[i].mode, 724 " ouid=%u ogid=%u rdev=%02x:%02x",
906 context->names[i].uid, 725 MAJOR(context->names[i].dev),
907 context->names[i].gid, 726 MINOR(context->names[i].dev),
908 MAJOR(context->names[i].rdev), 727 context->names[i].mode,
728 context->names[i].uid,
729 context->names[i].gid,
730 MAJOR(context->names[i].rdev),
909 MINOR(context->names[i].rdev)); 731 MINOR(context->names[i].rdev));
732 if (context->names[i].osid != 0) {
733 char *ctx = NULL;
734 u32 len;
735 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u",
738 context->names[i].osid);
739 call_panic = 2;
740 } else
741 audit_log_format(ab, " obj=%s", ctx);
742 kfree(ctx);
743 }
744
910 audit_log_end(ab); 745 audit_log_end(ab);
911 } 746 }
747 if (call_panic)
748 audit_panic("error converting sid to string");
912} 749}
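Each record emitted above is a flat run of space-separated key=value pairs, and untrusted values such as path names are either quoted or hex-encoded so they cannot smuggle extra fields into the log line. A deliberately simplified user-space model of that formatting; the kernel's exact escaping rules differ in detail:

#include <ctype.h>
#include <stdio.h>

/* Simplified model of untrusted-string logging: emit the value in hex
 * if it contains anything that could confuse a key=value parser.
 * Illustrative only, not the kernel's precise rule set. */
static void log_untrusted(const char *key, const char *val)
{
	int clean = 1;

	for (const char *p = val; *p; p++)
		if (!isprint((unsigned char)*p) || *p == '"' || *p == ' ')
			clean = 0;

	printf(" %s=", key);
	if (clean) {
		printf("\"%s\"", val);
	} else {
		for (const char *p = val; *p; p++)
			printf("%02X", (unsigned char)*p);
	}
}

int main(void)
{
	printf("type=PATH item=0");
	log_untrusted("name", "/etc/passwd");
	log_untrusted("name", "evil name\n");
	printf("\n");
	return 0;
}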
913 750
914/* Free a per-task audit context. Called from copy_process and 751/**
915 * __put_task_struct. */ 752 * audit_free - free a per-task audit context
753 * @tsk: task whose audit context block to free
754 *
755 * Called from copy_process and do_exit
756 */
916void audit_free(struct task_struct *tsk) 757void audit_free(struct task_struct *tsk)
917{ 758{
918 struct audit_context *context; 759 struct audit_context *context;
919 760
920 task_lock(tsk);
921 context = audit_get_context(tsk, 0, 0); 761 context = audit_get_context(tsk, 0, 0);
922 task_unlock(tsk);
923
924 if (likely(!context)) 762 if (likely(!context))
925 return; 763 return;
926 764
@@ -928,29 +766,43 @@ void audit_free(struct task_struct *tsk)
928 * function (e.g., exit_group), then free context block. 766 * function (e.g., exit_group), then free context block.
929 * We use GFP_ATOMIC here because we might be doing this 767 * We use GFP_ATOMIC here because we might be doing this
930 * in the context of the idle thread */ 768 * in the context of the idle thread */
769 /* that can happen only if we are called from do_exit() */
931 if (context->in_syscall && context->auditable) 770 if (context->in_syscall && context->auditable)
932 audit_log_exit(context, GFP_ATOMIC); 771 audit_log_exit(context, tsk);
933 772
934 audit_free_context(context); 773 audit_free_context(context);
935} 774}
936 775
937/* Fill in audit context at syscall entry. This only happens if the 776/**
777 * audit_syscall_entry - fill in an audit record at syscall entry
779 * @arch: architecture type
780 * @major: major syscall type (function)
781 * @a1: additional syscall register 1
782 * @a2: additional syscall register 2
783 * @a3: additional syscall register 3
784 * @a4: additional syscall register 4
785 *
786 * Fill in audit context at syscall entry. This only happens if the
938 * audit context was created when the task was created and the state or 787 * audit context was created when the task was created and the state or
939 * filters demand the audit context be built. If the state from the 788 * filters demand the audit context be built. If the state from the
940 * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, 789 * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
941 * then the record will be written at syscall exit time (otherwise, it 790 * then the record will be written at syscall exit time (otherwise, it
942 * will only be written if another part of the kernel requests that it 791 * will only be written if another part of the kernel requests that it
943 * be written). */ 792 * be written).
944void audit_syscall_entry(struct task_struct *tsk, int arch, int major, 793 */
794void audit_syscall_entry(int arch, int major,
945 unsigned long a1, unsigned long a2, 795 unsigned long a1, unsigned long a2,
946 unsigned long a3, unsigned long a4) 796 unsigned long a3, unsigned long a4)
947{ 797{
798 struct task_struct *tsk = current;
948 struct audit_context *context = tsk->audit_context; 799 struct audit_context *context = tsk->audit_context;
949 enum audit_state state; 800 enum audit_state state;
950 801
951 BUG_ON(!context); 802 BUG_ON(!context);
952 803
953 /* This happens only on certain architectures that make system 804 /*
805 * This happens only on certain architectures that make system
954 * calls in kernel_thread via the entry.S interface, instead of 806 * calls in kernel_thread via the entry.S interface, instead of
955 * with direct calls. (If you are porting to a new 807 * with direct calls. (If you are porting to a new
956 * architecture, hitting this condition can indicate that you 808 * architecture, hitting this condition can indicate that you
@@ -958,7 +810,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
958 * 810 *
959 * i386 no 811 * i386 no
960 * x86_64 no 812 * x86_64 no
961 * ppc64 yes (see arch/ppc64/kernel/misc.S) 813 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
962 * 814 *
963 * This also happens with vm86 emulation in a non-nested manner 815 * This also happens with vm86 emulation in a non-nested manner
964 * (entries without exits), so this case must be caught. 816 * (entries without exits), so this case must be caught.
@@ -966,11 +818,6 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
966 if (context->in_syscall) { 818 if (context->in_syscall) {
967 struct audit_context *newctx; 819 struct audit_context *newctx;
968 820
969#if defined(__NR_vm86) && defined(__NR_vm86old)
970 /* vm86 mode should only be entered once */
971 if (major == __NR_vm86 || major == __NR_vm86old)
972 return;
973#endif
974#if AUDIT_DEBUG 821#if AUDIT_DEBUG
975 printk(KERN_ERR 822 printk(KERN_ERR
976 "audit(:%d) pid=%d in syscall=%d;" 823 "audit(:%d) pid=%d in syscall=%d;"
@@ -1014,27 +861,30 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
1014 context->auditable = !!(state == AUDIT_RECORD_CONTEXT); 861 context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
1015} 862}
1016 863
1017/* Tear down after system call. If the audit context has been marked as 864/**
865 * audit_syscall_exit - deallocate audit context after a system call
867 * @valid: success/failure flag
868 * @return_code: syscall return value
869 *
870 * Tear down after system call. If the audit context has been marked as
1018 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 871 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
1019 * filtering, or because some other part of the kernel write an audit 872 * filtering, or because some other part of the kernel write an audit
1020 * message), then write out the syscall information. In call cases, 873 * message), then write out the syscall information. In call cases,
1021 * free the names stored from getname(). */ 874 * free the names stored from getname().
1022void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) 875 */
876void audit_syscall_exit(int valid, long return_code)
1023{ 877{
878 struct task_struct *tsk = current;
1024 struct audit_context *context; 879 struct audit_context *context;
1025 880
1026 get_task_struct(tsk);
1027 task_lock(tsk);
1028 context = audit_get_context(tsk, valid, return_code); 881 context = audit_get_context(tsk, valid, return_code);
1029 task_unlock(tsk);
1030 882
1031 /* Not having a context here is ok, since the parent may have
1032 * called __put_task_struct. */
1033 if (likely(!context)) 883 if (likely(!context))
1034 goto out; 884 return;
1035 885
1036 if (context->in_syscall && context->auditable) 886 if (context->in_syscall && context->auditable)
1037 audit_log_exit(context, GFP_KERNEL); 887 audit_log_exit(context, tsk);
1038 888
1039 context->in_syscall = 0; 889 context->in_syscall = 0;
1040 context->auditable = 0; 890 context->auditable = 0;
@@ -1049,11 +899,15 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
1049 audit_free_aux(context); 899 audit_free_aux(context);
1050 tsk->audit_context = context; 900 tsk->audit_context = context;
1051 } 901 }
1052 out:
1053 put_task_struct(tsk);
1054} 902}
1055 903
1056/* Add a name to the list. Called from fs/namei.c:getname(). */ 904/**
905 * audit_getname - add a name to the list
906 * @name: name to add
907 *
908 * Add a name to the list of audit names for this context.
909 * Called from fs/namei.c:getname().
910 */
1057void audit_getname(const char *name) 911void audit_getname(const char *name)
1058{ 912{
1059 struct audit_context *context = current->audit_context; 913 struct audit_context *context = current->audit_context;
@@ -1082,10 +936,13 @@ void audit_getname(const char *name)
1082 936
1083} 937}
1084 938
1085/* Intercept a putname request. Called from 939/* audit_putname - intercept a putname request
1086 * include/linux/fs.h:putname(). If we have stored the name from 940 * @name: name to intercept and delay for putname
1087 * getname in the audit context, then we delay the putname until syscall 941 *
1088 * exit. */ 942 * If we have stored the name from getname in the audit context,
943 * then we delay the putname until syscall exit.
944 * Called from include/linux/fs.h:putname().
945 */
1089void audit_putname(const char *name) 946void audit_putname(const char *name)
1090{ 947{
1091 struct audit_context *context = current->audit_context; 948 struct audit_context *context = current->audit_context;
@@ -1100,7 +957,7 @@ void audit_putname(const char *name)
1100 for (i = 0; i < context->name_count; i++) 957 for (i = 0; i < context->name_count; i++)
1101 printk(KERN_ERR "name[%d] = %p = %s\n", i, 958 printk(KERN_ERR "name[%d] = %p = %s\n", i,
1102 context->names[i].name, 959 context->names[i].name,
1103 context->names[i].name); 960 context->names[i].name ?: "(null)");
1104 } 961 }
1105#endif 962#endif
1106 __putname(name); 963 __putname(name);
@@ -1122,9 +979,23 @@ void audit_putname(const char *name)
1122#endif 979#endif
1123} 980}
1124 981
1125/* Store the inode and device from a lookup. Called from 982static void audit_inode_context(int idx, const struct inode *inode)
1126 * fs/namei.c:path_lookup(). */ 983{
1127void audit_inode(const char *name, const struct inode *inode, unsigned flags) 984 struct audit_context *context = current->audit_context;
985
986 selinux_get_inode_sid(inode, &context->names[idx].osid);
987}
988
989
990/**
991 * audit_inode - store the inode and device from a lookup
992 * @name: name being audited
993 * @inode: inode being audited
994 * @flags: lookup flags (as used in path_lookup())
995 *
996 * Called from fs/namei.c:path_lookup().
997 */
998void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
1128{ 999{
1129 int idx; 1000 int idx;
1130 struct audit_context *context = current->audit_context; 1001 struct audit_context *context = current->audit_context;
@@ -1150,15 +1021,105 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags)
1150 ++context->ino_count; 1021 ++context->ino_count;
1151#endif 1022#endif
1152 } 1023 }
1153 context->names[idx].flags = flags;
1154 context->names[idx].ino = inode->i_ino;
1155 context->names[idx].dev = inode->i_sb->s_dev; 1024 context->names[idx].dev = inode->i_sb->s_dev;
1156 context->names[idx].mode = inode->i_mode; 1025 context->names[idx].mode = inode->i_mode;
1157 context->names[idx].uid = inode->i_uid; 1026 context->names[idx].uid = inode->i_uid;
1158 context->names[idx].gid = inode->i_gid; 1027 context->names[idx].gid = inode->i_gid;
1159 context->names[idx].rdev = inode->i_rdev; 1028 context->names[idx].rdev = inode->i_rdev;
1029 audit_inode_context(idx, inode);
1030 if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
1031 (strcmp(name, ".") != 0)) {
1032 context->names[idx].ino = (unsigned long)-1;
1033 context->names[idx].pino = inode->i_ino;
1034 } else {
1035 context->names[idx].ino = inode->i_ino;
1036 context->names[idx].pino = (unsigned long)-1;
1037 }
1160} 1038}
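For a LOOKUP_PARENT-style lookup the inode that was resolved is the parent directory, so it is stored in pino and the target inode is left unknown; an ordinary lookup stores the target inode instead. A tiny stand-alone model of that decision, where NO_INO stands in for the kernel's (unsigned long)-1 sentinel:

#include <stdio.h>
#include <string.h>

#define NO_INO ((unsigned long)-1)

struct name_rec { unsigned long ino, pino; };

/* When only the parent was resolved (and the name is not "/" or "."),
 * the inode we have is recorded as the parent. */
static struct name_rec record(const char *name, unsigned long ino, int parent_only)
{
	struct name_rec r;

	if (parent_only && strcmp(name, "/") && strcmp(name, ".")) {
		r.ino = NO_INO;
		r.pino = ino;
	} else {
		r.ino = ino;
		r.pino = NO_INO;
	}
	return r;
}

int main(void)
{
	struct name_rec a = record("/tmp/newfile", 100, 1);	/* creating a file */
	struct name_rec b = record("/etc/passwd", 200, 0);	/* plain lookup */

	printf("a: ino=%ld pino=%ld\n", (long)a.ino, (long)a.pino);
	printf("b: ino=%ld pino=%ld\n", (long)b.ino, (long)b.pino);
	return 0;
}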
1161 1039
1040/**
1041 * audit_inode_child - collect inode info for created/removed objects
1042 * @dname: inode's dentry name
1043 * @inode: inode being audited
1044 * @pino: inode number of dentry parent
1045 *
1046 * For syscalls that create or remove filesystem objects, audit_inode
1047 * can only collect information for the filesystem object's parent.
1048 * This call updates the audit context with the child's information.
1049 * Syscalls that create a new filesystem object must be hooked after
1050 * the object is created. Syscalls that remove a filesystem object
1051 * must be hooked prior, in order to capture the target inode during
1052 * unsuccessful attempts.
1053 */
1054void __audit_inode_child(const char *dname, const struct inode *inode,
1055 unsigned long pino)
1056{
1057 int idx;
1058 struct audit_context *context = current->audit_context;
1059
1060 if (!context->in_syscall)
1061 return;
1062
1063 /* determine matching parent */
1064 if (dname)
1065 for (idx = 0; idx < context->name_count; idx++)
1066 if (context->names[idx].pino == pino) {
1067 const char *n;
1068 const char *name = context->names[idx].name;
1069 int dlen = strlen(dname);
1070 int nlen = name ? strlen(name) : 0;
1071
1072 if (nlen < dlen)
1073 continue;
1074
1075 /* disregard trailing slashes */
1076 n = name + nlen - 1;
1077 while ((*n == '/') && (n > name))
1078 n--;
1079
1080 /* find last path component */
1081 n = n - dlen + 1;
1082 if (n < name)
1083 continue;
1084 else if (n > name) {
1085 if (*--n != '/')
1086 continue;
1087 else
1088 n++;
1089 }
1090
1091 if (strncmp(n, dname, dlen) == 0)
1092 goto update_context;
1093 }
1094
1095 /* catch-all in case match not found */
1096 idx = context->name_count++;
1097 context->names[idx].name = NULL;
1098 context->names[idx].pino = pino;
1099#if AUDIT_DEBUG
1100 context->ino_count++;
1101#endif
1102
1103update_context:
1104 if (inode) {
1105 context->names[idx].ino = inode->i_ino;
1106 context->names[idx].dev = inode->i_sb->s_dev;
1107 context->names[idx].mode = inode->i_mode;
1108 context->names[idx].uid = inode->i_uid;
1109 context->names[idx].gid = inode->i_gid;
1110 context->names[idx].rdev = inode->i_rdev;
1111 audit_inode_context(idx, inode);
1112 }
1113}
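The parent-matching loop above strips trailing slashes from each stored name and then checks whether its final path component equals the dentry name. The same string logic as a stand-alone, testable function:

#include <assert.h>
#include <string.h>

/* Does 'name' (a possibly slash-terminated path) end in the path
 * component 'dname'?  Mirrors the matching done in __audit_inode_child(). */
static int ends_in_component(const char *name, const char *dname)
{
	size_t dlen = strlen(dname);
	size_t nlen = strlen(name);
	const char *n;

	if (nlen < dlen)
		return 0;

	/* disregard trailing slashes */
	n = name + nlen - 1;
	while (*n == '/' && n > name)
		n--;

	/* step back to where the component would start */
	n = n - dlen + 1;
	if (n < name)
		return 0;
	if (n > name && *(n - 1) != '/')
		return 0;
	return strncmp(n, dname, dlen) == 0;
}

int main(void)
{
	assert(ends_in_component("/tmp/dir/", "dir"));
	assert(ends_in_component("/tmp/file", "file"));
	assert(!ends_in_component("/tmp/notdir", "dir"));
	assert(ends_in_component("file", "file"));
	return 0;
}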
1114
1115/**
1116 * auditsc_get_stamp - get local copies of audit_context values
1117 * @ctx: audit_context for the task
1118 * @t: timespec to store time recorded in the audit_context
1119 * @serial: serial value that is recorded in the audit_context
1120 *
1121 * Also sets the context as auditable.
1122 */
1162void auditsc_get_stamp(struct audit_context *ctx, 1123void auditsc_get_stamp(struct audit_context *ctx,
1163 struct timespec *t, unsigned int *serial) 1124 struct timespec *t, unsigned int *serial)
1164{ 1125{
@@ -1170,6 +1131,15 @@ void auditsc_get_stamp(struct audit_context *ctx,
1170 ctx->auditable = 1; 1131 ctx->auditable = 1;
1171} 1132}
1172 1133
1134/**
1135 * audit_set_loginuid - set a task's audit_context loginuid
1136 * @task: task whose audit context is being modified
1137 * @loginuid: loginuid value
1138 *
1139 * Returns 0.
1140 *
1141 * Called (set) from fs/proc/base.c::proc_loginuid_write().
1142 */
1173int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1143int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1174{ 1144{
1175 if (task->audit_context) { 1145 if (task->audit_context) {
@@ -1188,12 +1158,24 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1188 return 0; 1158 return 0;
1189} 1159}
1190 1160
1161/**
1162 * audit_get_loginuid - get the loginuid for an audit_context
1163 * @ctx: the audit_context
1164 *
1165 * Returns the context's loginuid or -1 if @ctx is NULL.
1166 */
1191uid_t audit_get_loginuid(struct audit_context *ctx) 1167uid_t audit_get_loginuid(struct audit_context *ctx)
1192{ 1168{
1193 return ctx ? ctx->loginuid : -1; 1169 return ctx ? ctx->loginuid : -1;
1194} 1170}
1195 1171
1196int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 1172/**
1173 * audit_ipc_obj - record audit data for ipc object
1174 * @ipcp: ipc permissions
1175 *
1176 * Returns 0 for success or NULL context or < 0 on error.
1177 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1197{ 1179{
1198 struct audit_aux_data_ipcctl *ax; 1180 struct audit_aux_data_ipcctl *ax;
1199 struct audit_context *context = current->audit_context; 1181 struct audit_context *context = current->audit_context;
@@ -1201,7 +1183,39 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1201 if (likely(!context)) 1183 if (likely(!context))
1202 return 0; 1184 return 0;
1203 1185
1204 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 1186 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1187 if (!ax)
1188 return -ENOMEM;
1189
1190 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode;
1193 selinux_get_ipc_sid(ipcp, &ax->osid);
1194
1195 ax->d.type = AUDIT_IPC;
1196 ax->d.next = context->aux;
1197 context->aux = (void *)ax;
1198 return 0;
1199}
1200
1201/**
1202 * audit_ipc_set_perm - record audit data for new ipc permissions
1203 * @qbytes: msgq bytes
1204 * @uid: msgq user id
1205 * @gid: msgq group id
1206 * @mode: msgq mode (permissions)
1207 *
1208 * Returns 0 for success or NULL context or < 0 on error.
1209 */
1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
1211{
1212 struct audit_aux_data_ipcctl *ax;
1213 struct audit_context *context = current->audit_context;
1214
1215 if (likely(!context))
1216 return 0;
1217
1218 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1205 if (!ax) 1219 if (!ax)
1206 return -ENOMEM; 1220 return -ENOMEM;
1207 1221
@@ -1209,13 +1223,21 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1209 ax->uid = uid; 1223 ax->uid = uid;
1210 ax->gid = gid; 1224 ax->gid = gid;
1211 ax->mode = mode; 1225 ax->mode = mode;
1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1212 1227
1213 ax->d.type = AUDIT_IPC; 1228 ax->d.type = AUDIT_IPC_SET_PERM;
1214 ax->d.next = context->aux; 1229 ax->d.next = context->aux;
1215 context->aux = (void *)ax; 1230 context->aux = (void *)ax;
1216 return 0; 1231 return 0;
1217} 1232}
1218 1233
1234/**
1235 * audit_socketcall - record audit data for sys_socketcall
1236 * @nargs: number of args
1237 * @args: args array
1238 *
1239 * Returns 0 for success or NULL context or < 0 on error.
1240 */
1219int audit_socketcall(int nargs, unsigned long *args) 1241int audit_socketcall(int nargs, unsigned long *args)
1220{ 1242{
1221 struct audit_aux_data_socketcall *ax; 1243 struct audit_aux_data_socketcall *ax;
@@ -1237,6 +1259,13 @@ int audit_socketcall(int nargs, unsigned long *args)
1237 return 0; 1259 return 0;
1238} 1260}
1239 1261
1262/**
1263 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
1264 * @len: data length in user space
1265 * @a: data address in kernel space
1266 *
1267 * Returns 0 for success or NULL context or < 0 on error.
1268 */
1240int audit_sockaddr(int len, void *a) 1269int audit_sockaddr(int len, void *a)
1241{ 1270{
1242 struct audit_aux_data_sockaddr *ax; 1271 struct audit_aux_data_sockaddr *ax;
@@ -1258,6 +1287,15 @@ int audit_sockaddr(int len, void *a)
1258 return 0; 1287 return 0;
1259} 1288}
1260 1289
1290/**
1291 * audit_avc_path - record the granting or denial of permissions
1292 * @dentry: dentry to record
1293 * @mnt: mnt to record
1294 *
1295 * Returns 0 for success or NULL context or < 0 on error.
1296 *
1297 * Called from security/selinux/avc.c::avc_audit()
1298 */
1261int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) 1299int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1262{ 1300{
1263 struct audit_aux_data_path *ax; 1301 struct audit_aux_data_path *ax;
@@ -1279,6 +1317,14 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1279 return 0; 1317 return 0;
1280} 1318}
1281 1319
1320/**
1321 * audit_signal_info - record signal info for shutting down audit subsystem
1322 * @sig: signal value
1323 * @t: task being signaled
1324 *
1325 * If the audit subsystem is being terminated, record the task (pid)
1326 * and uid that is doing that.
1327 */
1282void audit_signal_info(int sig, struct task_struct *t) 1328void audit_signal_info(int sig, struct task_struct *t)
1283{ 1329{
1284 extern pid_t audit_sig_pid; 1330 extern pid_t audit_sig_pid;
@@ -1295,4 +1341,3 @@ void audit_signal_info(int sig, struct task_struct *t)
1295 } 1341 }
1296 } 1342 }
1297} 1343}
1298
diff --git a/kernel/capability.c b/kernel/capability.c
index bfa3c92e16..1a4d8a40d3 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -233,3 +233,19 @@ out:
233 233
234 return ret; 234 return ret;
235} 235}
236
237int __capable(struct task_struct *t, int cap)
238{
239 if (security_capable(t, cap) == 0) {
240 t->flags |= PF_SUPERPRIV;
241 return 1;
242 }
243 return 0;
244}
245EXPORT_SYMBOL(__capable);
246
247int capable(int cap)
248{
249 return __capable(current, cap);
250}
251EXPORT_SYMBOL(capable);
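capable() and __capable() are kernel-internal: in-kernel callers simply test capable(CAP_FOO) before a privileged operation and get PF_SUPERPRIV accounting as a side effect. The closest user-space way to ask the same question about the current process is to read its effective capability set, for example with libcap (assumed installed; compile with -lcap). This is only an analogue, not how the kernel call works:

/* Compile with: cc check_cap.c -lcap */
#include <stdio.h>
#include <sys/capability.h>

int main(void)
{
	cap_t caps = cap_get_proc();
	cap_flag_value_t has = CAP_CLEAR;

	if (!caps) {
		perror("cap_get_proc");
		return 1;
	}
	if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &has) == -1)
		perror("cap_get_flag");
	printf("CAP_NET_ADMIN effective: %s\n", has == CAP_SET ? "yes" : "no");
	cap_free(caps);
	return 0;
}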
diff --git a/kernel/compat.c b/kernel/compat.c
index 8c9cd88b67..c1601a84f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,10 +17,10 @@
17#include <linux/time.h> 17#include <linux/time.h>
18#include <linux/signal.h> 18#include <linux/signal.h>
19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ 19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
20#include <linux/futex.h> /* for FUTEX_WAIT */
21#include <linux/syscalls.h> 20#include <linux/syscalls.h>
22#include <linux/unistd.h> 21#include <linux/unistd.h>
23#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h>
24 24
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26 26
@@ -238,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
238 return ret; 238 return ret;
239} 239}
240 240
241#ifdef CONFIG_FUTEX
242asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
243 struct compat_timespec __user *utime, u32 __user *uaddr2,
244 int val3)
245{
246 struct timespec t;
247 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
248 int val2 = 0;
249
250 if ((op == FUTEX_WAIT) && utime) {
251 if (get_compat_timespec(&t, utime))
252 return -EFAULT;
253 timeout = timespec_to_jiffies(&t) + 1;
254 }
255 if (op >= FUTEX_REQUEUE)
256 val2 = (int) (unsigned long) utime;
257
258 return do_futex((unsigned long)uaddr, op, val, timeout,
259 (unsigned long)uaddr2, val2, val3);
260}
261#endif
262
263asmlinkage long compat_sys_setrlimit(unsigned int resource, 241asmlinkage long compat_sys_setrlimit(unsigned int resource,
264 struct compat_rlimit __user *rlim) 242 struct compat_rlimit __user *rlim)
265{ 243{
@@ -898,3 +876,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
898 return -ERESTARTNOHAND; 876 return -ERESTARTNOHAND;
899} 877}
900#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 878#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
879
880asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
881{
882 struct timex txc;
883 int ret;
884
885 memset(&txc, 0, sizeof(struct timex));
886
887 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
888 __get_user(txc.modes, &utp->modes) ||
889 __get_user(txc.offset, &utp->offset) ||
890 __get_user(txc.freq, &utp->freq) ||
891 __get_user(txc.maxerror, &utp->maxerror) ||
892 __get_user(txc.esterror, &utp->esterror) ||
893 __get_user(txc.status, &utp->status) ||
894 __get_user(txc.constant, &utp->constant) ||
895 __get_user(txc.precision, &utp->precision) ||
896 __get_user(txc.tolerance, &utp->tolerance) ||
897 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
898 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
899 __get_user(txc.tick, &utp->tick) ||
900 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
901 __get_user(txc.jitter, &utp->jitter) ||
902 __get_user(txc.shift, &utp->shift) ||
903 __get_user(txc.stabil, &utp->stabil) ||
904 __get_user(txc.jitcnt, &utp->jitcnt) ||
905 __get_user(txc.calcnt, &utp->calcnt) ||
906 __get_user(txc.errcnt, &utp->errcnt) ||
907 __get_user(txc.stbcnt, &utp->stbcnt))
908 return -EFAULT;
909
910 ret = do_adjtimex(&txc);
911
912 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
913 __put_user(txc.modes, &utp->modes) ||
914 __put_user(txc.offset, &utp->offset) ||
915 __put_user(txc.freq, &utp->freq) ||
916 __put_user(txc.maxerror, &utp->maxerror) ||
917 __put_user(txc.esterror, &utp->esterror) ||
918 __put_user(txc.status, &utp->status) ||
919 __put_user(txc.constant, &utp->constant) ||
920 __put_user(txc.precision, &utp->precision) ||
921 __put_user(txc.tolerance, &utp->tolerance) ||
922 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
923 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
924 __put_user(txc.tick, &utp->tick) ||
925 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
926 __put_user(txc.jitter, &utp->jitter) ||
927 __put_user(txc.shift, &utp->shift) ||
928 __put_user(txc.stabil, &utp->stabil) ||
929 __put_user(txc.jitcnt, &utp->jitcnt) ||
930 __put_user(txc.calcnt, &utp->calcnt) ||
931 __put_user(txc.errcnt, &utp->errcnt) ||
932 __put_user(txc.stbcnt, &utp->stbcnt))
933 ret = -EFAULT;
934
935 return ret;
936}
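
compat_sys_adjtimex() exists because struct timex is built from native longs, so its 32-bit and 64-bit layouts differ and every field has to be translated in both directions around do_adjtimex(). Nothing changes for user space; a 32-bit caller keeps using the ordinary interface. A small user-space sketch (assuming the usual libc adjtimex() wrapper) of the call this handler services:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* modes == 0: read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	/* freq is ppm shifted left 16 bits, maxerror is in microseconds */
	printf("clock state %d  freq %ld  maxerror %ld us\n",
	       state, tx.freq, tx.maxerror);
	return 0;
}
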
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e882c6babf..fe2b8d0bfe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DECLARE_MUTEX(cpucontrol);
20 20
21static struct notifier_block *cpu_chain; 21static BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 int ret; 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75
76 if ((ret = lock_cpu_hotplug_interruptible()) != 0)
77 return ret;
78 ret = notifier_chain_register(&cpu_chain, nb);
79 unlock_cpu_hotplug();
80 return ret;
81} 75}
82EXPORT_SYMBOL(register_cpu_notifier); 76EXPORT_SYMBOL(register_cpu_notifier);
83 77
84void unregister_cpu_notifier(struct notifier_block *nb) 78void unregister_cpu_notifier(struct notifier_block *nb)
85{ 79{
86 lock_cpu_hotplug(); 80 blocking_notifier_chain_unregister(&cpu_chain, nb);
87 notifier_chain_unregister(&cpu_chain, nb);
88 unlock_cpu_hotplug();
89} 81}
90EXPORT_SYMBOL(unregister_cpu_notifier); 82EXPORT_SYMBOL(unregister_cpu_notifier);
91 83
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
141 goto out; 133 goto out;
142 } 134 }
143 135
144 err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 136 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
145 (void *)(long)cpu); 137 (void *)(long)cpu);
146 if (err == NOTIFY_BAD) { 138 if (err == NOTIFY_BAD) {
147 printk("%s: attempt to take down CPU %u failed\n", 139 printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
159 p = __stop_machine_run(take_cpu_down, NULL, cpu); 151 p = __stop_machine_run(take_cpu_down, NULL, cpu);
160 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
161 /* CPU didn't die: tell everyone. Can't complain. */ 153 /* CPU didn't die: tell everyone. Can't complain. */
162 if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 154 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
163 (void *)(long)cpu) == NOTIFY_BAD) 155 (void *)(long)cpu) == NOTIFY_BAD)
164 BUG(); 156 BUG();
165 157
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
182 put_cpu(); 174 put_cpu();
183 175
184 /* CPU is completely dead: tell everyone. Too late to complain. */ 176 /* CPU is completely dead: tell everyone. Too late to complain. */
185 if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) 177 if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
186 == NOTIFY_BAD) 178 (void *)(long)cpu) == NOTIFY_BAD)
187 BUG(); 179 BUG();
188 180
189 check_for_tasks(cpu); 181 check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
211 goto out; 203 goto out;
212 } 204 }
213 205
214 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 206 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
215 if (ret == NOTIFY_BAD) { 207 if (ret == NOTIFY_BAD) {
216 printk("%s: attempt to bring up CPU %u failed\n", 208 printk("%s: attempt to bring up CPU %u failed\n",
217 __FUNCTION__, cpu); 209 __FUNCTION__, cpu);
@@ -223,15 +215,15 @@ int __devinit cpu_up(unsigned int cpu)
223 ret = __cpu_up(cpu); 215 ret = __cpu_up(cpu);
224 if (ret != 0) 216 if (ret != 0)
225 goto out_notify; 217 goto out_notify;
226 if (!cpu_online(cpu)) 218 BUG_ON(!cpu_online(cpu));
227 BUG();
228 219
229 /* Now call notifier in preparation. */ 220 /* Now call notifier in preparation. */
230 notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 221 blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
231 222
232out_notify: 223out_notify:
233 if (ret != 0) 224 if (ret != 0)
234 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); 225 blocking_notifier_call_chain(&cpu_chain,
226 CPU_UP_CANCELED, hcpu);
235out: 227out:
236 unlock_cpu_hotplug(); 228 unlock_cpu_hotplug();
237 return ret; 229 return ret;
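
The cpu.c conversion above replaces the open-coded notifier chain (and the hotplug locking around its registration) with a BLOCKING_NOTIFIER_HEAD whose register/unregister/call helpers do their own locking. Clients of register_cpu_notifier() are unaffected; a hedged sketch of such a client, with a hypothetical callback managing some per-CPU resource:

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		printk(KERN_DEBUG "allocating per-CPU state for cpu %u\n", cpu);
		break;
	case CPU_ONLINE:
		printk(KERN_DEBUG "cpu %u online, state in use\n", cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		printk(KERN_DEBUG "cpu %u gone, releasing state\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};

/* driver init:  register_cpu_notifier(&my_cpu_nb);   */
/* driver exit:  unregister_cpu_notifier(&my_cpu_nb); */
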
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba42b0a769..ab81fdd457 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
4 * Processor and Memory placement constraints for sets of tasks. 4 * Processor and Memory placement constraints for sets of tasks.
5 * 5 *
6 * Copyright (C) 2003 BULL SA. 6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 * Portions derived from Patrick Mochel's sysfs code. 9 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel 10 * sysfs is Copyright (c) 2001-3 Patrick Mochel
11 * Portions Copyright (c) 2004 Silicon Graphics, Inc.
12 * 11 *
13 * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> 12 * 2003-10-10 Written by Simon Derr.
14 * 2003-10-22 Updates by Stephen Hemminger. 13 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson <pj@sgi.com> 14 * 2004 May-July Rework by Paul Jackson.
16 * 15 *
17 * This file is subject to the terms and conditions of the GNU General Public 16 * This file is subject to the terms and conditions of the GNU General Public
18 * License. See the file COPYING in the main directory of the Linux 17 * License. See the file COPYING in the main directory of the Linux
@@ -53,7 +52,7 @@
53 52
54#include <asm/uaccess.h> 53#include <asm/uaccess.h>
55#include <asm/atomic.h> 54#include <asm/atomic.h>
56#include <asm/semaphore.h> 55#include <linux/mutex.h>
57 56
58#define CPUSET_SUPER_MAGIC 0x27e0eb 57#define CPUSET_SUPER_MAGIC 0x27e0eb
59 58
@@ -108,37 +107,49 @@ typedef enum {
108 CS_MEM_EXCLUSIVE, 107 CS_MEM_EXCLUSIVE,
109 CS_MEMORY_MIGRATE, 108 CS_MEMORY_MIGRATE,
110 CS_REMOVED, 109 CS_REMOVED,
111 CS_NOTIFY_ON_RELEASE 110 CS_NOTIFY_ON_RELEASE,
111 CS_SPREAD_PAGE,
112 CS_SPREAD_SLAB,
112} cpuset_flagbits_t; 113} cpuset_flagbits_t;
113 114
114/* convenient tests for these bits */ 115/* convenient tests for these bits */
115static inline int is_cpu_exclusive(const struct cpuset *cs) 116static inline int is_cpu_exclusive(const struct cpuset *cs)
116{ 117{
117 return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 118 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
118} 119}
119 120
120static inline int is_mem_exclusive(const struct cpuset *cs) 121static inline int is_mem_exclusive(const struct cpuset *cs)
121{ 122{
122 return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 123 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
123} 124}
124 125
125static inline int is_removed(const struct cpuset *cs) 126static inline int is_removed(const struct cpuset *cs)
126{ 127{
127 return !!test_bit(CS_REMOVED, &cs->flags); 128 return test_bit(CS_REMOVED, &cs->flags);
128} 129}
129 130
130static inline int notify_on_release(const struct cpuset *cs) 131static inline int notify_on_release(const struct cpuset *cs)
131{ 132{
132 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 133 return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
133} 134}
134 135
135static inline int is_memory_migrate(const struct cpuset *cs) 136static inline int is_memory_migrate(const struct cpuset *cs)
136{ 137{
137 return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); 138 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
139}
140
141static inline int is_spread_page(const struct cpuset *cs)
142{
143 return test_bit(CS_SPREAD_PAGE, &cs->flags);
144}
145
146static inline int is_spread_slab(const struct cpuset *cs)
147{
148 return test_bit(CS_SPREAD_SLAB, &cs->flags);
138} 149}
139 150
140/* 151/*
141 * Increment this atomic integer everytime any cpuset changes its 152 * Increment this integer every time any cpuset changes its
142 * mems_allowed value. Users of cpusets can track this generation 153 * mems_allowed value. Users of cpusets can track this generation
143 * number, and avoid having to lock and reload mems_allowed unless 154 * number, and avoid having to lock and reload mems_allowed unless
144 * the cpuset they're using changes generation. 155 * the cpuset they're using changes generation.
@@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs)
152 * on every visit to __alloc_pages(), to efficiently check whether 163 * on every visit to __alloc_pages(), to efficiently check whether
153 * its current->cpuset->mems_allowed has changed, requiring an update 164 * its current->cpuset->mems_allowed has changed, requiring an update
154 * of its current->mems_allowed. 165 * of its current->mems_allowed.
166 *
167 * Since cpuset_mems_generation is guarded by manage_mutex,
168 * there is no need to mark it atomic.
155 */ 169 */
156static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); 170static int cpuset_mems_generation;
157 171
158static struct cpuset top_cpuset = { 172static struct cpuset top_cpuset = {
159 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 173 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
@@ -168,63 +182,57 @@ static struct vfsmount *cpuset_mount;
168static struct super_block *cpuset_sb; 182static struct super_block *cpuset_sb;
169 183
170/* 184/*
171 * We have two global cpuset semaphores below. They can nest. 185 * We have two global cpuset mutexes below. They can nest.
172 * It is ok to first take manage_sem, then nest callback_sem. We also 186 * It is ok to first take manage_mutex, then nest callback_mutex. We also
173 * require taking task_lock() when dereferencing a tasks cpuset pointer. 187 * require taking task_lock() when dereferencing a tasks cpuset pointer.
174 * See "The task_lock() exception", at the end of this comment. 188 * See "The task_lock() exception", at the end of this comment.
175 * 189 *
176 * A task must hold both semaphores to modify cpusets. If a task 190 * A task must hold both mutexes to modify cpusets. If a task
177 * holds manage_sem, then it blocks others wanting that semaphore, 191 * holds manage_mutex, then it blocks others wanting that mutex,
178 * ensuring that it is the only task able to also acquire callback_sem 192 * ensuring that it is the only task able to also acquire callback_mutex
179 * and be able to modify cpusets. It can perform various checks on 193 * and be able to modify cpusets. It can perform various checks on
180 * the cpuset structure first, knowing nothing will change. It can 194 * the cpuset structure first, knowing nothing will change. It can
181 * also allocate memory while just holding manage_sem. While it is 195 * also allocate memory while just holding manage_mutex. While it is
182 * performing these checks, various callback routines can briefly 196 * performing these checks, various callback routines can briefly
183 * acquire callback_sem to query cpusets. Once it is ready to make 197 * acquire callback_mutex to query cpusets. Once it is ready to make
184 * the changes, it takes callback_sem, blocking everyone else. 198 * the changes, it takes callback_mutex, blocking everyone else.
185 * 199 *
186 * Calls to the kernel memory allocator can not be made while holding 200 * Calls to the kernel memory allocator can not be made while holding
187 * callback_sem, as that would risk double tripping on callback_sem 201 * callback_mutex, as that would risk double tripping on callback_mutex
188 * from one of the callbacks into the cpuset code from within 202 * from one of the callbacks into the cpuset code from within
189 * __alloc_pages(). 203 * __alloc_pages().
190 * 204 *
191 * If a task is only holding callback_sem, then it has read-only 205 * If a task is only holding callback_mutex, then it has read-only
192 * access to cpusets. 206 * access to cpusets.
193 * 207 *
194 * The task_struct fields mems_allowed and mems_generation may only 208 * The task_struct fields mems_allowed and mems_generation may only
195 * be accessed in the context of that task, so require no locks. 209 * be accessed in the context of that task, so require no locks.
196 * 210 *
197 * Any task can increment and decrement the count field without lock. 211 * Any task can increment and decrement the count field without lock.
198 * So in general, code holding manage_sem or callback_sem can't rely 212 * So in general, code holding manage_mutex or callback_mutex can't rely
199 * on the count field not changing. However, if the count goes to 213 * on the count field not changing. However, if the count goes to
200 * zero, then only attach_task(), which holds both semaphores, can 214 * zero, then only attach_task(), which holds both mutexes, can
201 * increment it again. Because a count of zero means that no tasks 215 * increment it again. Because a count of zero means that no tasks
202 * are currently attached, therefore there is no way a task attached 216 * are currently attached, therefore there is no way a task attached
203 * to that cpuset can fork (the other way to increment the count). 217 * to that cpuset can fork (the other way to increment the count).
204 * So code holding manage_sem or callback_sem can safely assume that 218 * So code holding manage_mutex or callback_mutex can safely assume that
205 * if the count is zero, it will stay zero. Similarly, if a task 219 * if the count is zero, it will stay zero. Similarly, if a task
206 * holds manage_sem or callback_sem on a cpuset with zero count, it 220 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs 221 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
208 * both of those semaphores. 222 * both of those mutexes.
209 *
210 * A possible optimization to improve parallelism would be to make
211 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
212 * to proceed in parallel, with read access, until the holder of
213 * manage_sem needed to take this rwsem for exclusive write access
214 * and modify some cpusets.
215 * 223 *
216 * The cpuset_common_file_write handler for operations that modify 224 * The cpuset_common_file_write handler for operations that modify
217 * the cpuset hierarchy holds manage_sem across the entire operation, 225 * the cpuset hierarchy holds manage_mutex across the entire operation,
218 * single threading all such cpuset modifications across the system. 226 * single threading all such cpuset modifications across the system.
219 * 227 *
220 * The cpuset_common_file_read() handlers only hold callback_sem across 228 * The cpuset_common_file_read() handlers only hold callback_mutex across
221 * small pieces of code, such as when reading out possibly multi-word 229 * small pieces of code, such as when reading out possibly multi-word
222 * cpumasks and nodemasks. 230 * cpumasks and nodemasks.
223 * 231 *
224 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't 232 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
225 * (usually) take either semaphore. These are the two most performance 233 * (usually) take either mutex. These are the two most performance
226 * critical pieces of code here. The exception occurs on cpuset_exit(), 234 * critical pieces of code here. The exception occurs on cpuset_exit(),
227 * when a task in a notify_on_release cpuset exits. Then manage_sem 235 * when a task in a notify_on_release cpuset exits. Then manage_mutex
228 * is taken, and if the cpuset count is zero, a usermode call made 236 * is taken, and if the cpuset count is zero, a usermode call made
229 * to /sbin/cpuset_release_agent with the name of the cpuset (path 237 * to /sbin/cpuset_release_agent with the name of the cpuset (path
230 * relative to the root of cpuset file system) as the argument. 238 * relative to the root of cpuset file system) as the argument.
@@ -242,9 +250,9 @@ static struct super_block *cpuset_sb;
242 * 250 *
243 * The need for this exception arises from the action of attach_task(), 251 * The need for this exception arises from the action of attach_task(),
244 * which overwrites one tasks cpuset pointer with another. It does 252 * which overwrites one tasks cpuset pointer with another. It does
245 * so using both semaphores, however there are several performance 253 * so using both mutexes, however there are several performance
246 * critical places that need to reference task->cpuset without the 254 * critical places that need to reference task->cpuset without the
247 * expense of grabbing a system global semaphore. Therefore except as 255 * expense of grabbing a system global mutex. Therefore except as
248 * noted below, when dereferencing or, as in attach_task(), modifying 256 * noted below, when dereferencing or, as in attach_task(), modifying
249 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock 257 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
250 * (task->alloc_lock) already in the task_struct routinely used for 258 * (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +264,8 @@ static struct super_block *cpuset_sb;
256 * the routine cpuset_update_task_memory_state(). 264 * the routine cpuset_update_task_memory_state().
257 */ 265 */
258 266
259static DECLARE_MUTEX(manage_sem); 267static DEFINE_MUTEX(manage_mutex);
260static DECLARE_MUTEX(callback_sem); 268static DEFINE_MUTEX(callback_mutex);
261 269
262/* 270/*
263 * A couple of forward declarations required, due to cyclic reference loop: 271 * A couple of forward declarations required, due to cyclic reference loop:
@@ -432,7 +440,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
432} 440}
433 441
434/* 442/*
435 * Call with manage_sem held. Writes path of cpuset into buf. 443 * Call with manage_mutex held. Writes path of cpuset into buf.
436 * Returns 0 on success, -errno on error. 444 * Returns 0 on success, -errno on error.
437 */ 445 */
438 446
@@ -484,11 +492,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
484 * status of the /sbin/cpuset_release_agent task, so no sense holding 492 * status of the /sbin/cpuset_release_agent task, so no sense holding
485 * our caller up for that. 493 * our caller up for that.
486 * 494 *
487 * When we had only one cpuset semaphore, we had to call this 495 * When we had only one cpuset mutex, we had to call this
488 * without holding it, to avoid deadlock when call_usermodehelper() 496 * without holding it, to avoid deadlock when call_usermodehelper()
489 * allocated memory. With two locks, we could now call this while 497 * allocated memory. With two locks, we could now call this while
490 * holding manage_sem, but we still don't, so as to minimize 498 * holding manage_mutex, but we still don't, so as to minimize
491 * the time manage_sem is held. 499 * the time manage_mutex is held.
492 */ 500 */
493 501
494static void cpuset_release_agent(const char *pathbuf) 502static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +528,15 @@ static void cpuset_release_agent(const char *pathbuf)
520 * cs is notify_on_release() and now both the user count is zero and 528 * cs is notify_on_release() and now both the user count is zero and
521 * the list of children is empty, prepare cpuset path in a kmalloc'd 529 * the list of children is empty, prepare cpuset path in a kmalloc'd
522 * buffer, to be returned via ppathbuf, so that the caller can invoke 530 * buffer, to be returned via ppathbuf, so that the caller can invoke
523 * cpuset_release_agent() with it later on, once manage_sem is dropped. 531 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
524 * Call here with manage_sem held. 532 * Call here with manage_mutex held.
525 * 533 *
526 * This check_for_release() routine is responsible for kmalloc'ing 534 * This check_for_release() routine is responsible for kmalloc'ing
527 * pathbuf. The above cpuset_release_agent() is responsible for 535 * pathbuf. The above cpuset_release_agent() is responsible for
528 * kfree'ing pathbuf. The caller of these routines is responsible 536 * kfree'ing pathbuf. The caller of these routines is responsible
529 * for providing a pathbuf pointer, initialized to NULL, then 537 * for providing a pathbuf pointer, initialized to NULL, then
530 * calling check_for_release() with manage_sem held and the address 538 * calling check_for_release() with manage_mutex held and the address
531 * of the pathbuf pointer, then dropping manage_sem, then calling 539 * of the pathbuf pointer, then dropping manage_mutex, then calling
532 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 540 * cpuset_release_agent() with pathbuf, as set by check_for_release().
533 */ 541 */
534 542
@@ -559,7 +567,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
559 * One way or another, we guarantee to return some non-empty subset 567 * One way or another, we guarantee to return some non-empty subset
560 * of cpu_online_map. 568 * of cpu_online_map.
561 * 569 *
562 * Call with callback_sem held. 570 * Call with callback_mutex held.
563 */ 571 */
564 572
565static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 573static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +591,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
583 * One way or another, we guarantee to return some non-empty subset 591 * One way or another, we guarantee to return some non-empty subset
584 * of node_online_map. 592 * of node_online_map.
585 * 593 *
586 * Call with callback_sem held. 594 * Call with callback_mutex held.
587 */ 595 */
588 596
589static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 597static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
608 * current->cpuset if a task has its memory placement changed. 616 * current->cpuset if a task has its memory placement changed.
609 * Do not call this routine if in_interrupt(). 617 * Do not call this routine if in_interrupt().
610 * 618 *
611 * Call without callback_sem or task_lock() held. May be called 619 * Call without callback_mutex or task_lock() held. May be
612 * with or without manage_sem held. Doesn't need task_lock to guard 620 * called with or without manage_mutex held. Thanks in part to
613 * against another task changing a non-NULL cpuset pointer to NULL, 621 * 'the_top_cpuset_hack', the tasks cpuset pointer will never
614 * as that is only done by a task on itself, and if the current task 622 * be NULL. This routine also might acquire callback_mutex and
615 * is here, it is not simultaneously in the exit code NULL'ing its
616 * cpuset pointer. This routine also might acquire callback_sem and
617 * current->mm->mmap_sem during call. 623 * current->mm->mmap_sem during call.
618 * 624 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock 625 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +664,21 @@ void cpuset_update_task_memory_state(void)
658 } 664 }
659 665
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 666 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
661 down(&callback_sem); 667 mutex_lock(&callback_mutex);
662 task_lock(tsk); 668 task_lock(tsk);
663 cs = tsk->cpuset; /* Maybe changed when task not locked */ 669 cs = tsk->cpuset; /* Maybe changed when task not locked */
664 guarantee_online_mems(cs, &tsk->mems_allowed); 670 guarantee_online_mems(cs, &tsk->mems_allowed);
665 tsk->cpuset_mems_generation = cs->mems_generation; 671 tsk->cpuset_mems_generation = cs->mems_generation;
672 if (is_spread_page(cs))
673 tsk->flags |= PF_SPREAD_PAGE;
674 else
675 tsk->flags &= ~PF_SPREAD_PAGE;
676 if (is_spread_slab(cs))
677 tsk->flags |= PF_SPREAD_SLAB;
678 else
679 tsk->flags &= ~PF_SPREAD_SLAB;
666 task_unlock(tsk); 680 task_unlock(tsk);
667 up(&callback_sem); 681 mutex_unlock(&callback_mutex);
668 mpol_rebind_task(tsk, &tsk->mems_allowed); 682 mpol_rebind_task(tsk, &tsk->mems_allowed);
669 } 683 }
670} 684}
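
This hunk is where the new spread flags become visible outside cpuset.c: whenever a task refreshes its memory state, CS_SPREAD_PAGE and CS_SPREAD_SLAB are mirrored into the task flags PF_SPREAD_PAGE and PF_SPREAD_SLAB. A hedged sketch of how an allocator-side consumer could test them; the helper names are illustrative only, the real consumers sit in the page and slab allocation paths:

/* illustrative helpers, not part of this patch */
static inline int task_spreads_pages(struct task_struct *tsk)
{
	return tsk->flags & PF_SPREAD_PAGE;
}

static inline int task_spreads_slabs(struct task_struct *tsk)
{
	return tsk->flags & PF_SPREAD_SLAB;
}

/*
 * e.g. in an allocation path:
 *	if (task_spreads_pages(current))
 *		interleave the page over current->mems_allowed
 *	else
 *		prefer the local node as usual
 */
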
@@ -674,7 +688,7 @@ void cpuset_update_task_memory_state(void)
674 * 688 *
675 * One cpuset is a subset of another if all its allowed CPUs and 689 * One cpuset is a subset of another if all its allowed CPUs and
676 * Memory Nodes are a subset of the other, and its exclusive flags 690 * Memory Nodes are a subset of the other, and its exclusive flags
677 * are only set if the other's are set. Call holding manage_sem. 691 * are only set if the other's are set. Call holding manage_mutex.
678 */ 692 */
679 693
680static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 694static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +706,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
692 * If we replaced the flag and mask values of the current cpuset 706 * If we replaced the flag and mask values of the current cpuset
693 * (cur) with those values in the trial cpuset (trial), would 707 * (cur) with those values in the trial cpuset (trial), would
694 * our various subset and exclusive rules still be valid? Presumes 708 * our various subset and exclusive rules still be valid? Presumes
695 * manage_sem held. 709 * manage_mutex held.
696 * 710 *
697 * 'cur' is the address of an actual, in-use cpuset. Operations 711 * 'cur' is the address of an actual, in-use cpuset. Operations
698 * such as list traversal that depend on the actual address of the 712 * such as list traversal that depend on the actual address of the
@@ -746,7 +760,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
746 * exclusive child cpusets 760 * exclusive child cpusets
747 * Build these two partitions by calling partition_sched_domains 761 * Build these two partitions by calling partition_sched_domains
748 * 762 *
749 * Call with manage_sem held. May nest a call to the 763 * Call with manage_mutex held. May nest a call to the
750 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 764 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
751 */ 765 */
752 766
@@ -792,7 +806,7 @@ static void update_cpu_domains(struct cpuset *cur)
792} 806}
793 807
794/* 808/*
795 * Call with manage_sem held. May take callback_sem during call. 809 * Call with manage_mutex held. May take callback_mutex during call.
796 */ 810 */
797 811
798static int update_cpumask(struct cpuset *cs, char *buf) 812static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,15 +825,64 @@ static int update_cpumask(struct cpuset *cs, char *buf)
811 if (retval < 0) 825 if (retval < 0)
812 return retval; 826 return retval;
813 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 827 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
814 down(&callback_sem); 828 mutex_lock(&callback_mutex);
815 cs->cpus_allowed = trialcs.cpus_allowed; 829 cs->cpus_allowed = trialcs.cpus_allowed;
816 up(&callback_sem); 830 mutex_unlock(&callback_mutex);
817 if (is_cpu_exclusive(cs) && !cpus_unchanged) 831 if (is_cpu_exclusive(cs) && !cpus_unchanged)
818 update_cpu_domains(cs); 832 update_cpu_domains(cs);
819 return 0; 833 return 0;
820} 834}
821 835
822/* 836/*
837 * cpuset_migrate_mm
838 *
839 * Migrate memory region from one set of nodes to another.
840 *
841 * Temporarily set the task's mems_allowed to the target nodes of migration,
842 * so that the migration code can allocate pages on these nodes.
843 *
844 * Call holding manage_mutex, so our current->cpuset won't change
845 * during this call, as manage_mutex holds off any attach_task()
846 * calls. Therefore we don't need to take task_lock around the
847 * call to guarantee_online_mems(), as we know no one is changing
848 * our tasks cpuset.
849 *
850 * Hold callback_mutex around the two modifications of our tasks
851 * mems_allowed to synchronize with cpuset_mems_allowed().
852 *
853 * While the mm_struct we are migrating is typically from some
854 * other task, the task_struct mems_allowed that we are hacking
855 * is for our current task, which must allocate new pages for that
856 * migrating memory region.
857 *
858 * We call cpuset_update_task_memory_state() before hacking
859 * our tasks mems_allowed, so that we are assured of being in
860 * sync with our tasks cpuset, and in particular, callbacks to
861 * cpuset_update_task_memory_state() from nested page allocations
862 * won't see any mismatch of our cpuset and task mems_generation
863 * values, so won't overwrite our hacked tasks mems_allowed
864 * nodemask.
865 */
866
867static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
868 const nodemask_t *to)
869{
870 struct task_struct *tsk = current;
871
872 cpuset_update_task_memory_state();
873
874 mutex_lock(&callback_mutex);
875 tsk->mems_allowed = *to;
876 mutex_unlock(&callback_mutex);
877
878 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
879
880 mutex_lock(&callback_mutex);
881 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
882 mutex_unlock(&callback_mutex);
883}
884
885/*
823 * Handle user request to change the 'mems' memory placement 886 * Handle user request to change the 'mems' memory placement
824 * of a cpuset. Needs to validate the request, update the 887 * of a cpuset. Needs to validate the request, update the
825 * cpusets mems_allowed and mems_generation, and for each 888 * cpusets mems_allowed and mems_generation, and for each
@@ -827,7 +890,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
827 * the cpuset is marked 'memory_migrate', migrate the tasks 890 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory. 891 * pages to the new memory.
829 * 892 *
830 * Call with manage_sem held. May take callback_sem during call. 893 * Call with manage_mutex held. May take callback_mutex during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 894 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 895 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 * their mempolicies to the cpusets new mems_allowed. 896 * their mempolicies to the cpusets new mems_allowed.
@@ -862,11 +925,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
862 if (retval < 0) 925 if (retval < 0)
863 goto done; 926 goto done;
864 927
865 down(&callback_sem); 928 mutex_lock(&callback_mutex);
866 cs->mems_allowed = trialcs.mems_allowed; 929 cs->mems_allowed = trialcs.mems_allowed;
867 atomic_inc(&cpuset_mems_generation); 930 cs->mems_generation = cpuset_mems_generation++;
868 cs->mems_generation = atomic_read(&cpuset_mems_generation); 931 mutex_unlock(&callback_mutex);
869 up(&callback_sem);
870 932
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 933 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872 934
@@ -922,7 +984,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 * tasklist_lock. Forks can happen again now - the mpol_copy() 984 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind 985 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global 986 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will 987 * cpuset manage_mutex, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound. 988 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 989 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes. 990 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -932,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
932 struct mm_struct *mm = mmarray[i]; 994 struct mm_struct *mm = mmarray[i];
933 995
934 mpol_rebind_mm(mm, &cs->mems_allowed); 996 mpol_rebind_mm(mm, &cs->mems_allowed);
935 if (migrate) { 997 if (migrate)
936 do_migrate_pages(mm, &oldmem, &cs->mems_allowed, 998 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
937 MPOL_MF_MOVE_ALL);
938 }
939 mmput(mm); 999 mmput(mm);
940 } 1000 }
941 1001
@@ -948,7 +1008,7 @@ done:
948} 1008}
949 1009
950/* 1010/*
951 * Call with manage_sem held. 1011 * Call with manage_mutex held.
952 */ 1012 */
953 1013
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 1014static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -963,11 +1023,12 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
963/* 1023/*
964 * update_flag - read a 0 or a 1 in a file and update associated flag 1024 * update_flag - read a 0 or a 1 in a file and update associated flag
965 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1025 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
966 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) 1026 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
1027 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
967 * cs: the cpuset to update 1028 * cs: the cpuset to update
968 * buf: the buffer where we read the 0 or 1 1029 * buf: the buffer where we read the 0 or 1
969 * 1030 *
970 * Call with manage_sem held. 1031 * Call with manage_mutex held.
971 */ 1032 */
972 1033
973static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1034static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +1050,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
989 return err; 1050 return err;
990 cpu_exclusive_changed = 1051 cpu_exclusive_changed =
991 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1052 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
992 down(&callback_sem); 1053 mutex_lock(&callback_mutex);
993 if (turning_on) 1054 if (turning_on)
994 set_bit(bit, &cs->flags); 1055 set_bit(bit, &cs->flags);
995 else 1056 else
996 clear_bit(bit, &cs->flags); 1057 clear_bit(bit, &cs->flags);
997 up(&callback_sem); 1058 mutex_unlock(&callback_mutex);
998 1059
999 if (cpu_exclusive_changed) 1060 if (cpu_exclusive_changed)
1000 update_cpu_domains(cs); 1061 update_cpu_domains(cs);
@@ -1104,7 +1165,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1165 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1105 * notified on release. 1166 * notified on release.
1106 * 1167 *
1107 * Call holding manage_sem. May take callback_sem and task_lock of 1168 * Call holding manage_mutex. May take callback_mutex and task_lock of
1108 * the task 'pid' during call. 1169 * the task 'pid' during call.
1109 */ 1170 */
1110 1171
@@ -1144,13 +1205,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1144 get_task_struct(tsk); 1205 get_task_struct(tsk);
1145 } 1206 }
1146 1207
1147 down(&callback_sem); 1208 mutex_lock(&callback_mutex);
1148 1209
1149 task_lock(tsk); 1210 task_lock(tsk);
1150 oldcs = tsk->cpuset; 1211 oldcs = tsk->cpuset;
1151 if (!oldcs) { 1212 if (!oldcs) {
1152 task_unlock(tsk); 1213 task_unlock(tsk);
1153 up(&callback_sem); 1214 mutex_unlock(&callback_mutex);
1154 put_task_struct(tsk); 1215 put_task_struct(tsk);
1155 return -ESRCH; 1216 return -ESRCH;
1156 } 1217 }
@@ -1164,16 +1225,16 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1164 from = oldcs->mems_allowed; 1225 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed; 1226 to = cs->mems_allowed;
1166 1227
1167 up(&callback_sem); 1228 mutex_unlock(&callback_mutex);
1168 1229
1169 mm = get_task_mm(tsk); 1230 mm = get_task_mm(tsk);
1170 if (mm) { 1231 if (mm) {
1171 mpol_rebind_mm(mm, &to); 1232 mpol_rebind_mm(mm, &to);
1233 if (is_memory_migrate(cs))
1234 cpuset_migrate_mm(mm, &from, &to);
1172 mmput(mm); 1235 mmput(mm);
1173 } 1236 }
1174 1237
1175 if (is_memory_migrate(cs))
1176 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1177 put_task_struct(tsk); 1238 put_task_struct(tsk);
1178 synchronize_rcu(); 1239 synchronize_rcu();
1179 if (atomic_dec_and_test(&oldcs->count)) 1240 if (atomic_dec_and_test(&oldcs->count))
@@ -1194,6 +1255,8 @@ typedef enum {
1194 FILE_NOTIFY_ON_RELEASE, 1255 FILE_NOTIFY_ON_RELEASE,
1195 FILE_MEMORY_PRESSURE_ENABLED, 1256 FILE_MEMORY_PRESSURE_ENABLED,
1196 FILE_MEMORY_PRESSURE, 1257 FILE_MEMORY_PRESSURE,
1258 FILE_SPREAD_PAGE,
1259 FILE_SPREAD_SLAB,
1197 FILE_TASKLIST, 1260 FILE_TASKLIST,
1198} cpuset_filetype_t; 1261} cpuset_filetype_t;
1199 1262
@@ -1221,7 +1284,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1221 } 1284 }
1222 buffer[nbytes] = 0; /* nul-terminate */ 1285 buffer[nbytes] = 0; /* nul-terminate */
1223 1286
1224 down(&manage_sem); 1287 mutex_lock(&manage_mutex);
1225 1288
1226 if (is_removed(cs)) { 1289 if (is_removed(cs)) {
1227 retval = -ENODEV; 1290 retval = -ENODEV;
@@ -1253,6 +1316,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1253 case FILE_MEMORY_PRESSURE: 1316 case FILE_MEMORY_PRESSURE:
1254 retval = -EACCES; 1317 retval = -EACCES;
1255 break; 1318 break;
1319 case FILE_SPREAD_PAGE:
1320 retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
1321 cs->mems_generation = cpuset_mems_generation++;
1322 break;
1323 case FILE_SPREAD_SLAB:
1324 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1325 cs->mems_generation = cpuset_mems_generation++;
1326 break;
1256 case FILE_TASKLIST: 1327 case FILE_TASKLIST:
1257 retval = attach_task(cs, buffer, &pathbuf); 1328 retval = attach_task(cs, buffer, &pathbuf);
1258 break; 1329 break;
@@ -1264,7 +1335,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1264 if (retval == 0) 1335 if (retval == 0)
1265 retval = nbytes; 1336 retval = nbytes;
1266out2: 1337out2:
1267 up(&manage_sem); 1338 mutex_unlock(&manage_mutex);
1268 cpuset_release_agent(pathbuf); 1339 cpuset_release_agent(pathbuf);
1269out1: 1340out1:
1270 kfree(buffer); 1341 kfree(buffer);
@@ -1304,9 +1375,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1304{ 1375{
1305 cpumask_t mask; 1376 cpumask_t mask;
1306 1377
1307 down(&callback_sem); 1378 mutex_lock(&callback_mutex);
1308 mask = cs->cpus_allowed; 1379 mask = cs->cpus_allowed;
1309 up(&callback_sem); 1380 mutex_unlock(&callback_mutex);
1310 1381
1311 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1382 return cpulist_scnprintf(page, PAGE_SIZE, mask);
1312} 1383}
@@ -1315,9 +1386,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1315{ 1386{
1316 nodemask_t mask; 1387 nodemask_t mask;
1317 1388
1318 down(&callback_sem); 1389 mutex_lock(&callback_mutex);
1319 mask = cs->mems_allowed; 1390 mask = cs->mems_allowed;
1320 up(&callback_sem); 1391 mutex_unlock(&callback_mutex);
1321 1392
1322 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1393 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1323} 1394}
@@ -1362,6 +1433,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1362 case FILE_MEMORY_PRESSURE: 1433 case FILE_MEMORY_PRESSURE:
1363 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); 1434 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1364 break; 1435 break;
1436 case FILE_SPREAD_PAGE:
1437 *s++ = is_spread_page(cs) ? '1' : '0';
1438 break;
1439 case FILE_SPREAD_SLAB:
1440 *s++ = is_spread_slab(cs) ? '1' : '0';
1441 break;
1365 default: 1442 default:
1366 retval = -EINVAL; 1443 retval = -EINVAL;
1367 goto out; 1444 goto out;
@@ -1598,7 +1675,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1598 * Handle an open on 'tasks' file. Prepare a buffer listing the 1675 * Handle an open on 'tasks' file. Prepare a buffer listing the
1599 * process id's of tasks currently attached to the cpuset being opened. 1676 * process id's of tasks currently attached to the cpuset being opened.
1600 * 1677 *
1601 * Does not require any specific cpuset semaphores, and does not take any. 1678 * Does not require any specific cpuset mutexes, and does not take any.
1602 */ 1679 */
1603static int cpuset_tasks_open(struct inode *unused, struct file *file) 1680static int cpuset_tasks_open(struct inode *unused, struct file *file)
1604{ 1681{
@@ -1725,6 +1802,16 @@ static struct cftype cft_memory_pressure = {
1725 .private = FILE_MEMORY_PRESSURE, 1802 .private = FILE_MEMORY_PRESSURE,
1726}; 1803};
1727 1804
1805static struct cftype cft_spread_page = {
1806 .name = "memory_spread_page",
1807 .private = FILE_SPREAD_PAGE,
1808};
1809
1810static struct cftype cft_spread_slab = {
1811 .name = "memory_spread_slab",
1812 .private = FILE_SPREAD_SLAB,
1813};
1814
1728static int cpuset_populate_dir(struct dentry *cs_dentry) 1815static int cpuset_populate_dir(struct dentry *cs_dentry)
1729{ 1816{
1730 int err; 1817 int err;
@@ -1743,6 +1830,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1743 return err; 1830 return err;
1744 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) 1831 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
1745 return err; 1832 return err;
1833 if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
1834 return err;
1835 if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
1836 return err;
1746 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1837 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1747 return err; 1838 return err;
1748 return 0; 1839 return 0;
@@ -1754,7 +1845,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1754 * name: name of the new cpuset. Will be strcpy'ed. 1845 * name: name of the new cpuset. Will be strcpy'ed.
1755 * mode: mode to set on new inode 1846 * mode: mode to set on new inode
1756 * 1847 *
1757 * Must be called with the semaphore on the parent inode held 1848 * Must be called with the mutex on the parent inode held
1758 */ 1849 */
1759 1850
1760static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1851static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,44 +1857,47 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1766 if (!cs) 1857 if (!cs)
1767 return -ENOMEM; 1858 return -ENOMEM;
1768 1859
1769 down(&manage_sem); 1860 mutex_lock(&manage_mutex);
1770 cpuset_update_task_memory_state(); 1861 cpuset_update_task_memory_state();
1771 cs->flags = 0; 1862 cs->flags = 0;
1772 if (notify_on_release(parent)) 1863 if (notify_on_release(parent))
1773 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1864 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1865 if (is_spread_page(parent))
1866 set_bit(CS_SPREAD_PAGE, &cs->flags);
1867 if (is_spread_slab(parent))
1868 set_bit(CS_SPREAD_SLAB, &cs->flags);
1774 cs->cpus_allowed = CPU_MASK_NONE; 1869 cs->cpus_allowed = CPU_MASK_NONE;
1775 cs->mems_allowed = NODE_MASK_NONE; 1870 cs->mems_allowed = NODE_MASK_NONE;
1776 atomic_set(&cs->count, 0); 1871 atomic_set(&cs->count, 0);
1777 INIT_LIST_HEAD(&cs->sibling); 1872 INIT_LIST_HEAD(&cs->sibling);
1778 INIT_LIST_HEAD(&cs->children); 1873 INIT_LIST_HEAD(&cs->children);
1779 atomic_inc(&cpuset_mems_generation); 1874 cs->mems_generation = cpuset_mems_generation++;
1780 cs->mems_generation = atomic_read(&cpuset_mems_generation);
1781 fmeter_init(&cs->fmeter); 1875 fmeter_init(&cs->fmeter);
1782 1876
1783 cs->parent = parent; 1877 cs->parent = parent;
1784 1878
1785 down(&callback_sem); 1879 mutex_lock(&callback_mutex);
1786 list_add(&cs->sibling, &cs->parent->children); 1880 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++; 1881 number_of_cpusets++;
1788 up(&callback_sem); 1882 mutex_unlock(&callback_mutex);
1789 1883
1790 err = cpuset_create_dir(cs, name, mode); 1884 err = cpuset_create_dir(cs, name, mode);
1791 if (err < 0) 1885 if (err < 0)
1792 goto err; 1886 goto err;
1793 1887
1794 /* 1888 /*
1795 * Release manage_sem before cpuset_populate_dir() because it 1889 * Release manage_mutex before cpuset_populate_dir() because it
1796 * will down() this new directory's i_mutex and if we race with 1890 * will down() this new directory's i_mutex and if we race with
1797 * another mkdir, we might deadlock. 1891 * another mkdir, we might deadlock.
1798 */ 1892 */
1799 up(&manage_sem); 1893 mutex_unlock(&manage_mutex);
1800 1894
1801 err = cpuset_populate_dir(cs->dentry); 1895 err = cpuset_populate_dir(cs->dentry);
1802 /* If err < 0, we have a half-filled directory - oh well ;) */ 1896 /* If err < 0, we have a half-filled directory - oh well ;) */
1803 return 0; 1897 return 0;
1804err: 1898err:
1805 list_del(&cs->sibling); 1899 list_del(&cs->sibling);
1806 up(&manage_sem); 1900 mutex_unlock(&manage_mutex);
1807 kfree(cs); 1901 kfree(cs);
1808 return err; 1902 return err;
1809} 1903}
@@ -1825,18 +1919,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1825 1919
1826 /* the vfs holds both inode->i_mutex already */ 1920 /* the vfs holds both inode->i_mutex already */
1827 1921
1828 down(&manage_sem); 1922 mutex_lock(&manage_mutex);
1829 cpuset_update_task_memory_state(); 1923 cpuset_update_task_memory_state();
1830 if (atomic_read(&cs->count) > 0) { 1924 if (atomic_read(&cs->count) > 0) {
1831 up(&manage_sem); 1925 mutex_unlock(&manage_mutex);
1832 return -EBUSY; 1926 return -EBUSY;
1833 } 1927 }
1834 if (!list_empty(&cs->children)) { 1928 if (!list_empty(&cs->children)) {
1835 up(&manage_sem); 1929 mutex_unlock(&manage_mutex);
1836 return -EBUSY; 1930 return -EBUSY;
1837 } 1931 }
1838 parent = cs->parent; 1932 parent = cs->parent;
1839 down(&callback_sem); 1933 mutex_lock(&callback_mutex);
1840 set_bit(CS_REMOVED, &cs->flags); 1934 set_bit(CS_REMOVED, &cs->flags);
1841 if (is_cpu_exclusive(cs)) 1935 if (is_cpu_exclusive(cs))
1842 update_cpu_domains(cs); 1936 update_cpu_domains(cs);
@@ -1848,10 +1942,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1848 cpuset_d_remove_dir(d); 1942 cpuset_d_remove_dir(d);
1849 dput(d); 1943 dput(d);
1850 number_of_cpusets--; 1944 number_of_cpusets--;
1851 up(&callback_sem); 1945 mutex_unlock(&callback_mutex);
1852 if (list_empty(&parent->children)) 1946 if (list_empty(&parent->children))
1853 check_for_release(parent, &pathbuf); 1947 check_for_release(parent, &pathbuf);
1854 up(&manage_sem); 1948 mutex_unlock(&manage_mutex);
1855 cpuset_release_agent(pathbuf); 1949 cpuset_release_agent(pathbuf);
1856 return 0; 1950 return 0;
1857} 1951}
@@ -1867,7 +1961,7 @@ int __init cpuset_init_early(void)
1867 struct task_struct *tsk = current; 1961 struct task_struct *tsk = current;
1868 1962
1869 tsk->cpuset = &top_cpuset; 1963 tsk->cpuset = &top_cpuset;
1870 tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); 1964 tsk->cpuset->mems_generation = cpuset_mems_generation++;
1871 return 0; 1965 return 0;
1872} 1966}
1873 1967
@@ -1886,8 +1980,7 @@ int __init cpuset_init(void)
1886 top_cpuset.mems_allowed = NODE_MASK_ALL; 1980 top_cpuset.mems_allowed = NODE_MASK_ALL;
1887 1981
1888 fmeter_init(&top_cpuset.fmeter); 1982 fmeter_init(&top_cpuset.fmeter);
1889 atomic_inc(&cpuset_mems_generation); 1983 top_cpuset.mems_generation = cpuset_mems_generation++;
1890 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
1891 1984
1892 init_task.cpuset = &top_cpuset; 1985 init_task.cpuset = &top_cpuset;
1893 1986
@@ -1960,23 +2053,56 @@ void cpuset_fork(struct task_struct *child)
1960 * Description: Detach cpuset from @tsk and release it. 2053 * Description: Detach cpuset from @tsk and release it.
1961 * 2054 *
1962 * Note that cpusets marked notify_on_release force every task in 2055 * Note that cpusets marked notify_on_release force every task in
1963 * them to take the global manage_sem semaphore when exiting. 2056 * them to take the global manage_mutex mutex when exiting.
1964 * This could impact scaling on very large systems. Be reluctant to 2057 * This could impact scaling on very large systems. Be reluctant to
1965 * use notify_on_release cpusets where very high task exit scaling 2058 * use notify_on_release cpusets where very high task exit scaling
1966 * is required on large systems. 2059 * is required on large systems.
1967 * 2060 *
1968 * Don't even think about derefencing 'cs' after the cpuset use count 2061 * Don't even think about derefencing 'cs' after the cpuset use count
1969 * goes to zero, except inside a critical section guarded by manage_sem 2062 * goes to zero, except inside a critical section guarded by manage_mutex
1970 * or callback_sem. Otherwise a zero cpuset use count is a license to 2063 * or callback_mutex. Otherwise a zero cpuset use count is a license to
1971 * any other task to nuke the cpuset immediately, via cpuset_rmdir(). 2064 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1972 * 2065 *
1973 * This routine has to take manage_sem, not callback_sem, because 2066 * This routine has to take manage_mutex, not callback_mutex, because
1974 * it is holding that semaphore while calling check_for_release(), 2067 * it is holding that mutex while calling check_for_release(),
1975 * which calls kmalloc(), so can't be called holding callback__sem(). 2068 * which calls kmalloc(), so can't be called holding callback_mutex().
1976 * 2069 *
1977 * We don't need to task_lock() this reference to tsk->cpuset, 2070 * We don't need to task_lock() this reference to tsk->cpuset,
1978 * because tsk is already marked PF_EXITING, so attach_task() won't 2071 * because tsk is already marked PF_EXITING, so attach_task() won't
1979 * mess with it, or task is a failed fork, never visible to attach_task. 2072 * mess with it, or task is a failed fork, never visible to attach_task.
2073 *
2074 * the_top_cpuset_hack:
2075 *
2076 * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
2077 *
2078 * Don't leave a task unable to allocate memory, as that is an
2079 * accident waiting to happen should someone add a callout in
2080 * do_exit() after the cpuset_exit() call that might allocate.
2081 * If a task tries to allocate memory with an invalid cpuset,
2082 * it will oops in cpuset_update_task_memory_state().
2083 *
2084 * We call cpuset_exit() while the task is still competent to
2085 * handle notify_on_release(), then leave the task attached to
2086 * the root cpuset (top_cpuset) for the remainder of its exit.
2087 *
2088 * To do this properly, we would increment the reference count on
2089 * top_cpuset, and near the very end of the kernel/exit.c do_exit()
2090 * code we would add a second cpuset function call, to drop that
2091 * reference. This would just create an unnecessary hot spot on
2092 * the top_cpuset reference count, to no avail.
2093 *
2094 * Normally, holding a reference to a cpuset without bumping its
2095 * count is unsafe. The cpuset could go away, or someone could
2096 * attach us to a different cpuset, decrementing the count on
2097 * the first cpuset that we never incremented. But in this case,
2098 * top_cpuset isn't going away, and either task has PF_EXITING set,
2099 * which wards off any attach_task() attempts, or task is a failed
2100 * fork, never visible to attach_task.
2101 *
2102 * Another way to do this would be to set the cpuset pointer
2103 * to NULL here, and check in cpuset_update_task_memory_state()
2104 * for a NULL pointer. This hack avoids that NULL check, for no
2105 * cost (other than this way too long comment ;).
1980 **/ 2106 **/
1981 2107
1982void cpuset_exit(struct task_struct *tsk) 2108void cpuset_exit(struct task_struct *tsk)
@@ -1984,15 +2110,15 @@ void cpuset_exit(struct task_struct *tsk)
1984 struct cpuset *cs; 2110 struct cpuset *cs;
1985 2111
1986 cs = tsk->cpuset; 2112 cs = tsk->cpuset;
1987 tsk->cpuset = NULL; 2113 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
1988 2114
1989 if (notify_on_release(cs)) { 2115 if (notify_on_release(cs)) {
1990 char *pathbuf = NULL; 2116 char *pathbuf = NULL;
1991 2117
1992 down(&manage_sem); 2118 mutex_lock(&manage_mutex);
1993 if (atomic_dec_and_test(&cs->count)) 2119 if (atomic_dec_and_test(&cs->count))
1994 check_for_release(cs, &pathbuf); 2120 check_for_release(cs, &pathbuf);
1995 up(&manage_sem); 2121 mutex_unlock(&manage_mutex);
1996 cpuset_release_agent(pathbuf); 2122 cpuset_release_agent(pathbuf);
1997 } else { 2123 } else {
1998 atomic_dec(&cs->count); 2124 atomic_dec(&cs->count);
@@ -2013,11 +2139,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2013{ 2139{
2014 cpumask_t mask; 2140 cpumask_t mask;
2015 2141
2016 down(&callback_sem); 2142 mutex_lock(&callback_mutex);
2017 task_lock(tsk); 2143 task_lock(tsk);
2018 guarantee_online_cpus(tsk->cpuset, &mask); 2144 guarantee_online_cpus(tsk->cpuset, &mask);
2019 task_unlock(tsk); 2145 task_unlock(tsk);
2020 up(&callback_sem); 2146 mutex_unlock(&callback_mutex);
2021 2147
2022 return mask; 2148 return mask;
2023} 2149}
@@ -2041,11 +2167,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2041{ 2167{
2042 nodemask_t mask; 2168 nodemask_t mask;
2043 2169
2044 down(&callback_sem); 2170 mutex_lock(&callback_mutex);
2045 task_lock(tsk); 2171 task_lock(tsk);
2046 guarantee_online_mems(tsk->cpuset, &mask); 2172 guarantee_online_mems(tsk->cpuset, &mask);
2047 task_unlock(tsk); 2173 task_unlock(tsk);
2048 up(&callback_sem); 2174 mutex_unlock(&callback_mutex);
2049 2175
2050 return mask; 2176 return mask;
2051} 2177}
@@ -2071,7 +2197,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
2071 2197
2072/* 2198/*
2073 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 2199 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
2074 * ancestor to the specified cpuset. Call holding callback_sem. 2200 * ancestor to the specified cpuset. Call holding callback_mutex.
2075 * If no ancestor is mem_exclusive (an unusual configuration), then 2201 * If no ancestor is mem_exclusive (an unusual configuration), then
2076 * returns the root cpuset. 2202 * returns the root cpuset.
2077 */ 2203 */
@@ -2098,37 +2224,44 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2098 * GFP_KERNEL allocations are not so marked, so can escape to the 2224 * GFP_KERNEL allocations are not so marked, so can escape to the
2099 * nearest mem_exclusive ancestor cpuset. 2225 * nearest mem_exclusive ancestor cpuset.
2100 * 2226 *
2101 * Scanning up parent cpusets requires callback_sem. The __alloc_pages() 2227 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
2102 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2228 * routine only calls here with __GFP_HARDWALL bit _not_ set if
2103 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2229 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
2104 * mems_allowed came up empty on the first pass over the zonelist. 2230 * mems_allowed came up empty on the first pass over the zonelist.
2105 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2231 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2106 * short of memory, might require taking the callback_sem semaphore. 2232 * short of memory, might require taking the callback_mutex mutex.
2107 * 2233 *
2108 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 2234 * The first call here from mm/page_alloc:get_page_from_freelist()
2109 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 2235 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
2110 * hardwall cpusets - no allocation on a node outside the cpuset is 2236 * no allocation on a node outside the cpuset is allowed (unless in
2111 * allowed (unless in interrupt, of course). 2237 * interrupt, of course).
2112 * 2238 *
2113 * The second loop doesn't even call here for GFP_ATOMIC requests 2239 * The second pass through get_page_from_freelist() doesn't even call
2114 * (if the __alloc_pages() local variable 'wait' is set). That check 2240 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
2115 * and the checks below have the combined affect in the second loop of 2241 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
2116 * the __alloc_pages() routine that: 2242 * in alloc_flags. That logic and the checks below have the combined
2243 * effect that:
2117 * in_interrupt - any node ok (current task context irrelevant) 2244 * in_interrupt - any node ok (current task context irrelevant)
2118 * GFP_ATOMIC - any node ok 2245 * GFP_ATOMIC - any node ok
2119 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2246 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
2120 * GFP_USER - only nodes in current tasks mems allowed ok. 2247 * GFP_USER - only nodes in current tasks mems allowed ok.
2248 *
2249 * Rule:
2250 * Don't call cpuset_zone_allowed() if you can't sleep, unless you
2251 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2252 * the code that might scan up ancestor cpusets and sleep.
2121 **/ 2253 **/
2122 2254
2123int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2255int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2124{ 2256{
2125 int node; /* node that zone z is on */ 2257 int node; /* node that zone z is on */
2126 const struct cpuset *cs; /* current cpuset ancestors */ 2258 const struct cpuset *cs; /* current cpuset ancestors */
2127 int allowed = 1; /* is allocation in zone z allowed? */ 2259 int allowed; /* is allocation in zone z allowed? */
2128 2260
2129 if (in_interrupt()) 2261 if (in_interrupt())
2130 return 1; 2262 return 1;
2131 node = z->zone_pgdat->node_id; 2263 node = z->zone_pgdat->node_id;
2264 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2132 if (node_isset(node, current->mems_allowed)) 2265 if (node_isset(node, current->mems_allowed))
2133 return 1; 2266 return 1;
2134 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2267 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
@@ -2138,31 +2271,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2138 return 1; 2271 return 1;
2139 2272
2140 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2273 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2141 down(&callback_sem); 2274 mutex_lock(&callback_mutex);
2142 2275
2143 task_lock(current); 2276 task_lock(current);
2144 cs = nearest_exclusive_ancestor(current->cpuset); 2277 cs = nearest_exclusive_ancestor(current->cpuset);
2145 task_unlock(current); 2278 task_unlock(current);
2146 2279
2147 allowed = node_isset(node, cs->mems_allowed); 2280 allowed = node_isset(node, cs->mems_allowed);
2148 up(&callback_sem); 2281 mutex_unlock(&callback_mutex);
2149 return allowed; 2282 return allowed;
2150} 2283}
2151 2284
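The new might_sleep_if() check enforces the Rule stated in the comment above. A hedged sketch of the caller-side view, using the cpuset_zone_allowed() wrapper from include/linux/cpuset.h; try_this_zone() is a made-up stand-in for the allocator's per-zone attempt, and this is illustrative rather than the actual page allocator logic:

static struct page *pick_zone_page(struct zone *zone, unsigned int order,
                                   gfp_t gfp_mask)
{
        /* hardwall check: never sleeps, safe in any context */
        if (cpuset_zone_allowed(zone, gfp_mask | __GFP_HARDWALL))
                return try_this_zone(zone, order);

        /* only sleepable (__GFP_WAIT) callers may use the full check,
         * which can take callback_mutex and scan ancestor cpusets */
        if ((gfp_mask & __GFP_WAIT) && cpuset_zone_allowed(zone, gfp_mask))
                return try_this_zone(zone, order);

        return NULL;
}

This mirrors the table above: GFP_ATOMIC (no __GFP_WAIT) never reaches the sleeping path, while GFP_KERNEL may escape to the nearest mem_exclusive ancestor.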
2152/** 2285/**
2153 * cpuset_lock - lock out any changes to cpuset structures 2286 * cpuset_lock - lock out any changes to cpuset structures
2154 * 2287 *
2155 * The out of memory (oom) code needs to lock down cpusets 2288 * The out of memory (oom) code needs to keep cpusets
2156 * from being changed while it scans the tasklist looking for a 2289 * from being changed while it scans the tasklist looking for a
2157 * task in an overlapping cpuset. Expose callback_sem via this 2290 * task in an overlapping cpuset. Expose callback_mutex via this
2158 * cpuset_lock() routine, so the oom code can lock it, before 2291 * cpuset_lock() routine, so the oom code can lock it, before
2159 * locking the task list. The tasklist_lock is a spinlock, so 2292 * locking the task list. The tasklist_lock is a spinlock, so
2160 * must be taken inside callback_sem. 2293 * must be taken inside callback_mutex.
2161 */ 2294 */
2162 2295
2163void cpuset_lock(void) 2296void cpuset_lock(void)
2164{ 2297{
2165 down(&callback_sem); 2298 mutex_lock(&callback_mutex);
2166} 2299}
2167 2300
2168/** 2301/**
@@ -2173,10 +2306,48 @@ void cpuset_lock(void)
2173 2306
2174void cpuset_unlock(void) 2307void cpuset_unlock(void)
2175{ 2308{
2176 up(&callback_sem); 2309 mutex_unlock(&callback_mutex);
2177} 2310}
2178 2311
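The ordering spelled out above (callback_mutex first, then the tasklist_lock spinlock inside it) is easiest to see from the caller's side. A rough sketch of how an OOM-style scan is expected to nest the two, based on the comment rather than lifted from mm/oom_kill.c; consider_victim() is a made-up helper:

/* illustrative nesting only */
static void scan_overlapping_tasks(void)
{
        struct task_struct *p;

        cpuset_lock();                  /* callback_mutex: may sleep */
        read_lock(&tasklist_lock);      /* spinlock: nests inside the mutex */

        for_each_process(p) {
                if (cpuset_excl_nodes_overlap(p))
                        consider_victim(p);
        }

        read_unlock(&tasklist_lock);
        cpuset_unlock();
}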
2179/** 2312/**
2313 * cpuset_mem_spread_node() - On which node to begin search for a page
2314 *
2315 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2316 * tasks in a cpuset with is_spread_page or is_spread_slab set),
2317 * and if the memory allocation used cpuset_mem_spread_node()
2318 * to determine on which node to start looking, as it will for
2319 * certain page cache or slab cache pages such as used for file
2320 * system buffers and inode caches, then instead of starting on the
2321 * local node to look for a free page, rather spread the starting
2322 * node around the tasks mems_allowed nodes.
2323 *
2324 * We don't have to worry about the returned node being offline
2325 * because "it can't happen", and even if it did, it would be ok.
2326 *
2327 * The routines calling guarantee_online_mems() are careful to
2328 * only set nodes in task->mems_allowed that are online. So it
2329 * should not be possible for the following code to return an
2330 * offline node. But if it did, that would be ok, as this routine
2331 * is not returning the node where the allocation must be, only
2332 * the node where the search should start. The zonelist passed to
2333 * __alloc_pages() will include all nodes. If the slab allocator
2334 * is passed an offline node, it will fall back to the local node.
2335 * See kmem_cache_alloc_node().
2336 */
2337
2338int cpuset_mem_spread_node(void)
2339{
2340 int node;
2341
2342 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
2343 if (node == MAX_NUMNODES)
2344 node = first_node(current->mems_allowed);
2345 current->cpuset_mem_spread_rotor = node;
2346 return node;
2347}
2348EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2349
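The rotor walk above is easy to check with a small userspace model: a plain bitmask stands in for nodemask_t and the next_node()/first_node() pair is open-coded. Assuming mems_allowed covers nodes 0, 2 and 3, successive calls start the search on nodes 2, 3, 0, 2, and so on:

#include <stdio.h>

#define MAX_NODES 8

/* next allowed node strictly after 'prev', wrapping around, i.e. the
 * next_node()/first_node() pair from cpuset_mem_spread_node() above */
static int next_allowed(unsigned int mask, int prev)
{
        for (int n = prev + 1; n < MAX_NODES; n++)
                if (mask & (1u << n))
                        return n;
        for (int n = 0; n < MAX_NODES; n++)
                if (mask & (1u << n))
                        return n;
        return -1;
}

int main(void)
{
        unsigned int mems_allowed = 0x0d;   /* nodes 0, 2, 3 */
        int rotor = 0;

        for (int i = 0; i < 6; i++) {
                rotor = next_allowed(mems_allowed, rotor);
                printf("allocation %d starts on node %d\n", i, rotor);
        }
        return 0;                           /* prints nodes 2 3 0 2 3 0 */
}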
2350/**
2180 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? 2351 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
2181 * @p: pointer to task_struct of some other task. 2352 * @p: pointer to task_struct of some other task.
2182 * 2353 *
@@ -2185,7 +2356,7 @@ void cpuset_unlock(void)
2185 * determine if task @p's memory usage might impact the memory 2356 * determine if task @p's memory usage might impact the memory
2186 * available to the current task. 2357 * available to the current task.
2187 * 2358 *
2188 * Call while holding callback_sem. 2359 * Call while holding callback_mutex.
2189 **/ 2360 **/
2190 2361
2191int cpuset_excl_nodes_overlap(const struct task_struct *p) 2362int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2256,13 +2427,13 @@ void __cpuset_memory_pressure_bump(void)
2256 * - Used for /proc/<pid>/cpuset. 2427 * - Used for /proc/<pid>/cpuset.
2257 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2428 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2258 * doesn't really matter if tsk->cpuset changes after we read it, 2429 * doesn't really matter if tsk->cpuset changes after we read it,
2259 * and we take manage_sem, keeping attach_task() from changing it 2430 * and we take manage_mutex, keeping attach_task() from changing it
2260 * anyway. 2431 * anyway. No need to check that tsk->cpuset != NULL, thanks to
2432 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2433 * cpuset to top_cpuset.
2261 */ 2434 */
2262
2263static int proc_cpuset_show(struct seq_file *m, void *v) 2435static int proc_cpuset_show(struct seq_file *m, void *v)
2264{ 2436{
2265 struct cpuset *cs;
2266 struct task_struct *tsk; 2437 struct task_struct *tsk;
2267 char *buf; 2438 char *buf;
2268 int retval = 0; 2439 int retval = 0;
@@ -2272,20 +2443,14 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2272 return -ENOMEM; 2443 return -ENOMEM;
2273 2444
2274 tsk = m->private; 2445 tsk = m->private;
2275 down(&manage_sem); 2446 mutex_lock(&manage_mutex);
2276 cs = tsk->cpuset; 2447 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2277 if (!cs) {
2278 retval = -EINVAL;
2279 goto out;
2280 }
2281
2282 retval = cpuset_path(cs, buf, PAGE_SIZE);
2283 if (retval < 0) 2448 if (retval < 0)
2284 goto out; 2449 goto out;
2285 seq_puts(m, buf); 2450 seq_puts(m, buf);
2286 seq_putc(m, '\n'); 2451 seq_putc(m, '\n');
2287out: 2452out:
2288 up(&manage_sem); 2453 mutex_unlock(&manage_mutex);
2289 kfree(buf); 2454 kfree(buf);
2290 return retval; 2455 return retval;
2291} 2456}
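Since proc_cpuset_show() backs /proc/<pid>/cpuset, its output can be checked from userspace with a trivial reader; this assumes CONFIG_CPUSETS and that the file prints a single cpuset path, as the code above does:

#include <stdio.h>

int main(int argc, char **argv)
{
        char path[64], line[256];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%s/cpuset",
                 argc > 1 ? argv[1] : "self");
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                printf("cpuset path: %s", line);   /* e.g. "/" for top_cpuset */
        fclose(f);
        return 0;
}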
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 867d6dbeb5..c01cead2cf 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -140,6 +140,7 @@ __set_personality(u_long personality)
140 ep = lookup_exec_domain(personality); 140 ep = lookup_exec_domain(personality);
141 if (ep == current_thread_info()->exec_domain) { 141 if (ep == current_thread_info()->exec_domain) {
142 current->personality = personality; 142 current->personality = personality;
143 module_put(ep->module);
143 return 0; 144 return 0;
144 } 145 }
145 146
diff --git a/kernel/exit.c b/kernel/exit.c
index 93cee36713..e95b932822 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,8 +29,13 @@
29#include <linux/cpuset.h> 29#include <linux/cpuset.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/posix-timers.h>
32#include <linux/cn_proc.h> 33#include <linux/cn_proc.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/futex.h>
36#include <linux/compat.h>
37#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */
34 39
35#include <asm/uaccess.h> 40#include <asm/uaccess.h>
36#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -48,15 +53,85 @@ static void __unhash_process(struct task_struct *p)
48{ 53{
49 nr_threads--; 54 nr_threads--;
50 detach_pid(p, PIDTYPE_PID); 55 detach_pid(p, PIDTYPE_PID);
51 detach_pid(p, PIDTYPE_TGID);
52 if (thread_group_leader(p)) { 56 if (thread_group_leader(p)) {
53 detach_pid(p, PIDTYPE_PGID); 57 detach_pid(p, PIDTYPE_PGID);
54 detach_pid(p, PIDTYPE_SID); 58 detach_pid(p, PIDTYPE_SID);
55 if (p->pid) 59
56 __get_cpu_var(process_counts)--; 60 list_del_rcu(&p->tasks);
61 __get_cpu_var(process_counts)--;
62 }
63 list_del_rcu(&p->thread_group);
64 remove_parent(p);
65}
66
67/*
68 * This function expects the tasklist_lock write-locked.
69 */
70static void __exit_signal(struct task_struct *tsk)
71{
72 struct signal_struct *sig = tsk->signal;
73 struct sighand_struct *sighand;
74
75 BUG_ON(!sig);
76 BUG_ON(!atomic_read(&sig->count));
77
78 rcu_read_lock();
79 sighand = rcu_dereference(tsk->sighand);
80 spin_lock(&sighand->siglock);
81
82 posix_cpu_timers_exit(tsk);
83 if (atomic_dec_and_test(&sig->count))
84 posix_cpu_timers_exit_group(tsk);
85 else {
86 /*
87 * If there is any task waiting for the group exit
88 * then notify it:
89 */
90 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
91 wake_up_process(sig->group_exit_task);
92 sig->group_exit_task = NULL;
93 }
94 if (tsk == sig->curr_target)
95 sig->curr_target = next_thread(tsk);
96 /*
97 * Accumulate here the counters for all threads but the
98 * group leader as they die, so they can be added into
99 * the process-wide totals when those are taken.
100 * The group leader stays around as a zombie as long
101 * as there are other threads. When it gets reaped,
102 * the exit.c code will add its counts into these totals.
103 * We won't ever get here for the group leader, since it
104 * will have been the last reference on the signal_struct.
105 */
106 sig->utime = cputime_add(sig->utime, tsk->utime);
107 sig->stime = cputime_add(sig->stime, tsk->stime);
108 sig->min_flt += tsk->min_flt;
109 sig->maj_flt += tsk->maj_flt;
110 sig->nvcsw += tsk->nvcsw;
111 sig->nivcsw += tsk->nivcsw;
112 sig->sched_time += tsk->sched_time;
113 sig = NULL; /* Marker for below. */
57 } 114 }
58 115
59 REMOVE_LINKS(p); 116 __unhash_process(tsk);
117
118 tsk->signal = NULL;
119 tsk->sighand = NULL;
120 spin_unlock(&sighand->siglock);
121 rcu_read_unlock();
122
123 __cleanup_sighand(sighand);
124 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
125 flush_sigqueue(&tsk->pending);
126 if (sig) {
127 flush_sigqueue(&sig->shared_pending);
128 __cleanup_signal(sig);
129 }
130}
131
132static void delayed_put_task_struct(struct rcu_head *rhp)
133{
134 put_task_struct(container_of(rhp, struct task_struct, rcu));
60} 135}
61 136
62void release_task(struct task_struct * p) 137void release_task(struct task_struct * p)
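delayed_put_task_struct() is an instance of the usual call_rcu() deferred-free idiom: embed a struct rcu_head, hand call_rcu() a callback, and recover the enclosing object with container_of() once a grace period has passed. A generic sketch of the same idiom with invented names (needs <linux/rcupdate.h> and <linux/slab.h>):

struct widget {
        int data;
        struct rcu_head rcu;
};

static void widget_free_rcu(struct rcu_head *rhp)
{
        /* runs only after every pre-existing RCU read-side section ends */
        kfree(container_of(rhp, struct widget, rcu));
}

static void widget_release(struct widget *w)
{
        /* readers that found w under rcu_read_lock() may keep using it
         * until their read-side critical section closes */
        call_rcu(&w->rcu, widget_free_rcu);
}

Here it lets tasklist walkers that reached the task under rcu_read_lock() (the lists are now manipulated with list_del_rcu/list_add_tail_rcu) keep dereferencing it briefly after release_task() has run.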
@@ -65,21 +140,14 @@ void release_task(struct task_struct * p)
65 task_t *leader; 140 task_t *leader;
66 struct dentry *proc_dentry; 141 struct dentry *proc_dentry;
67 142
68repeat: 143repeat:
69 atomic_dec(&p->user->processes); 144 atomic_dec(&p->user->processes);
70 spin_lock(&p->proc_lock); 145 spin_lock(&p->proc_lock);
71 proc_dentry = proc_pid_unhash(p); 146 proc_dentry = proc_pid_unhash(p);
72 write_lock_irq(&tasklist_lock); 147 write_lock_irq(&tasklist_lock);
73 if (unlikely(p->ptrace)) 148 ptrace_unlink(p);
74 __ptrace_unlink(p);
75 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
76 __exit_signal(p); 150 __exit_signal(p);
77 /*
78 * Note that the fastpath in sys_times depends on __exit_signal having
79 * updated the counters before a task is removed from the tasklist of
80 * the process by __unhash_process.
81 */
82 __unhash_process(p);
83 151
84 /* 152 /*
85 * If we are the last non-leader member of the thread 153 * If we are the last non-leader member of the thread
@@ -107,28 +175,13 @@ repeat:
107 spin_unlock(&p->proc_lock); 175 spin_unlock(&p->proc_lock);
108 proc_pid_flush(proc_dentry); 176 proc_pid_flush(proc_dentry);
109 release_thread(p); 177 release_thread(p);
110 put_task_struct(p); 178 call_rcu(&p->rcu, delayed_put_task_struct);
111 179
112 p = leader; 180 p = leader;
113 if (unlikely(zap_leader)) 181 if (unlikely(zap_leader))
114 goto repeat; 182 goto repeat;
115} 183}
116 184
117/* we are using it only for SMP init */
118
119void unhash_process(struct task_struct *p)
120{
121 struct dentry *proc_dentry;
122
123 spin_lock(&p->proc_lock);
124 proc_dentry = proc_pid_unhash(p);
125 write_lock_irq(&tasklist_lock);
126 __unhash_process(p);
127 write_unlock_irq(&tasklist_lock);
128 spin_unlock(&p->proc_lock);
129 proc_pid_flush(proc_dentry);
130}
131
132/* 185/*
133 * This checks not only the pgrp, but falls back on the pid if no 186 * This checks not only the pgrp, but falls back on the pid if no
134 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 187 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
@@ -236,10 +289,10 @@ static void reparent_to_init(void)
236 289
237 ptrace_unlink(current); 290 ptrace_unlink(current);
238 /* Reparent to init */ 291 /* Reparent to init */
239 REMOVE_LINKS(current); 292 remove_parent(current);
240 current->parent = child_reaper; 293 current->parent = child_reaper;
241 current->real_parent = child_reaper; 294 current->real_parent = child_reaper;
242 SET_LINKS(current); 295 add_parent(current);
243 296
244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 297 /* Set the exit signal to SIGCHLD so we signal init on exit */
245 current->exit_signal = SIGCHLD; 298 current->exit_signal = SIGCHLD;
@@ -345,9 +398,9 @@ void daemonize(const char *name, ...)
345 exit_mm(current); 398 exit_mm(current);
346 399
347 set_special_pids(1, 1); 400 set_special_pids(1, 1);
348 down(&tty_sem); 401 mutex_lock(&tty_mutex);
349 current->signal->tty = NULL; 402 current->signal->tty = NULL;
350 up(&tty_sem); 403 mutex_unlock(&tty_mutex);
351 404
352 /* Block and flush all signals */ 405 /* Block and flush all signals */
353 sigfillset(&blocked); 406 sigfillset(&blocked);
@@ -360,6 +413,9 @@ void daemonize(const char *name, ...)
360 fs = init_task.fs; 413 fs = init_task.fs;
361 current->fs = fs; 414 current->fs = fs;
362 atomic_inc(&fs->count); 415 atomic_inc(&fs->count);
416 exit_namespace(current);
417 current->namespace = init_task.namespace;
418 get_namespace(current->namespace);
363 exit_files(current); 419 exit_files(current);
364 current->files = init_task.files; 420 current->files = init_task.files;
365 atomic_inc(&current->files->count); 421 atomic_inc(&current->files->count);
@@ -533,13 +589,13 @@ static void exit_mm(struct task_struct * tsk)
533 mmput(mm); 589 mmput(mm);
534} 590}
535 591
536static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) 592static inline void choose_new_parent(task_t *p, task_t *reaper)
537{ 593{
538 /* 594 /*
539 * Make sure we're not reparenting to ourselves and that 595 * Make sure we're not reparenting to ourselves and that
540 * the parent is not a zombie. 596 * the parent is not a zombie.
541 */ 597 */
542 BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); 598 BUG_ON(p == reaper || reaper->exit_state);
543 p->real_parent = reaper; 599 p->real_parent = reaper;
544} 600}
545 601
@@ -564,9 +620,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
564 * anyway, so let go of it. 620 * anyway, so let go of it.
565 */ 621 */
566 p->ptrace = 0; 622 p->ptrace = 0;
567 list_del_init(&p->sibling); 623 remove_parent(p);
568 p->parent = p->real_parent; 624 p->parent = p->real_parent;
569 list_add_tail(&p->sibling, &p->parent->children); 625 add_parent(p);
570 626
571 /* If we'd notified the old parent about this child's death, 627 /* If we'd notified the old parent about this child's death,
572 * also notify the new parent. 628 * also notify the new parent.
@@ -640,7 +696,7 @@ static void forget_original_parent(struct task_struct * father,
640 696
641 if (father == p->real_parent) { 697 if (father == p->real_parent) {
642 /* reparent with a reaper, real father it's us */ 698 /* reparent with a reaper, real father it's us */
643 choose_new_parent(p, reaper, child_reaper); 699 choose_new_parent(p, reaper);
644 reparent_thread(p, father, 0); 700 reparent_thread(p, father, 0);
645 } else { 701 } else {
646 /* reparent ptraced task to its real parent */ 702 /* reparent ptraced task to its real parent */
@@ -661,7 +717,7 @@ static void forget_original_parent(struct task_struct * father,
661 } 717 }
662 list_for_each_safe(_p, _n, &father->ptrace_children) { 718 list_for_each_safe(_p, _n, &father->ptrace_children) {
663 p = list_entry(_p,struct task_struct,ptrace_list); 719 p = list_entry(_p,struct task_struct,ptrace_list);
664 choose_new_parent(p, reaper, child_reaper); 720 choose_new_parent(p, reaper);
665 reparent_thread(p, father, 1); 721 reparent_thread(p, father, 1);
666 } 722 }
667} 723}
@@ -802,10 +858,8 @@ fastcall NORET_TYPE void do_exit(long code)
802 panic("Aiee, killing interrupt handler!"); 858 panic("Aiee, killing interrupt handler!");
803 if (unlikely(!tsk->pid)) 859 if (unlikely(!tsk->pid))
804 panic("Attempted to kill the idle task!"); 860 panic("Attempted to kill the idle task!");
805 if (unlikely(tsk->pid == 1)) 861 if (unlikely(tsk == child_reaper))
806 panic("Attempted to kill init!"); 862 panic("Attempted to kill init!");
807 if (tsk->io_context)
808 exit_io_context();
809 863
810 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 864 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
811 current->ptrace_message = code; 865 current->ptrace_message = code;
@@ -819,6 +873,8 @@ fastcall NORET_TYPE void do_exit(long code)
819 if (unlikely(tsk->flags & PF_EXITING)) { 873 if (unlikely(tsk->flags & PF_EXITING)) {
820 printk(KERN_ALERT 874 printk(KERN_ALERT
821 "Fixing recursive fault but reboot is needed!\n"); 875 "Fixing recursive fault but reboot is needed!\n");
876 if (tsk->io_context)
877 exit_io_context();
822 set_current_state(TASK_UNINTERRUPTIBLE); 878 set_current_state(TASK_UNINTERRUPTIBLE);
823 schedule(); 879 schedule();
824 } 880 }
@@ -849,6 +905,14 @@ fastcall NORET_TYPE void do_exit(long code)
849 exit_itimers(tsk->signal); 905 exit_itimers(tsk->signal);
850 acct_process(code); 906 acct_process(code);
851 } 907 }
908 if (unlikely(tsk->robust_list))
909 exit_robust_list(tsk);
910#ifdef CONFIG_COMPAT
911 if (unlikely(tsk->compat_robust_list))
912 compat_exit_robust_list(tsk);
913#endif
914 if (unlikely(tsk->audit_context))
915 audit_free(tsk);
852 exit_mm(tsk); 916 exit_mm(tsk);
853 917
854 exit_sem(tsk); 918 exit_sem(tsk);
@@ -878,6 +942,12 @@ fastcall NORET_TYPE void do_exit(long code)
878 */ 942 */
879 mutex_debug_check_no_locks_held(tsk); 943 mutex_debug_check_no_locks_held(tsk);
880 944
945 if (tsk->io_context)
946 exit_io_context();
947
948 if (tsk->splice_pipe)
949 __free_pipe_info(tsk->splice_pipe);
950
881 /* PF_DEAD causes final put_task_struct after we schedule. */ 951 /* PF_DEAD causes final put_task_struct after we schedule. */
882 preempt_disable(); 952 preempt_disable();
883 BUG_ON(tsk->flags & PF_DEAD); 953 BUG_ON(tsk->flags & PF_DEAD);
@@ -906,13 +976,6 @@ asmlinkage long sys_exit(int error_code)
906 do_exit((error_code&0xff)<<8); 976 do_exit((error_code&0xff)<<8);
907} 977}
908 978
909task_t fastcall *next_thread(const task_t *p)
910{
911 return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
912}
913
914EXPORT_SYMBOL(next_thread);
915
916/* 979/*
917 * Take down every thread in the group. This is called by fatal signals 980 * Take down every thread in the group. This is called by fatal signals
918 * as well as by sys_exit_group (below). 981 * as well as by sys_exit_group (below).
@@ -927,7 +990,6 @@ do_group_exit(int exit_code)
927 else if (!thread_group_empty(current)) { 990 else if (!thread_group_empty(current)) {
928 struct signal_struct *const sig = current->signal; 991 struct signal_struct *const sig = current->signal;
929 struct sighand_struct *const sighand = current->sighand; 992 struct sighand_struct *const sighand = current->sighand;
930 read_lock(&tasklist_lock);
931 spin_lock_irq(&sighand->siglock); 993 spin_lock_irq(&sighand->siglock);
932 if (sig->flags & SIGNAL_GROUP_EXIT) 994 if (sig->flags & SIGNAL_GROUP_EXIT)
933 /* Another thread got here before we took the lock. */ 995 /* Another thread got here before we took the lock. */
@@ -937,7 +999,6 @@ do_group_exit(int exit_code)
937 zap_other_threads(current); 999 zap_other_threads(current);
938 } 1000 }
939 spin_unlock_irq(&sighand->siglock); 1001 spin_unlock_irq(&sighand->siglock);
940 read_unlock(&tasklist_lock);
941 } 1002 }
942 1003
943 do_exit(exit_code); 1004 do_exit(exit_code);
@@ -1267,7 +1328,7 @@ bail_ref:
1267 1328
1268 /* move to end of parent's list to avoid starvation */ 1329 /* move to end of parent's list to avoid starvation */
1269 remove_parent(p); 1330 remove_parent(p);
1270 add_parent(p, p->parent); 1331 add_parent(p);
1271 1332
1272 write_unlock_irq(&tasklist_lock); 1333 write_unlock_irq(&tasklist_lock);
1273 1334
diff --git a/kernel/extable.c b/kernel/extable.c
index 7501b531ce..7fe2628553 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,7 +40,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 40 return e;
41} 41}
42 42
43static int core_kernel_text(unsigned long addr) 43int core_kernel_text(unsigned long addr)
44{ 44{
45 if (addr >= (unsigned long)_stext && 45 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 46 addr <= (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e88b374ce..ac8100e308 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
84#endif 84#endif
85 85
86/* SLAB cache for signal_struct structures (tsk->signal) */ 86/* SLAB cache for signal_struct structures (tsk->signal) */
87kmem_cache_t *signal_cachep; 87static kmem_cache_t *signal_cachep;
88 88
89/* SLAB cache for sighand_struct structures (tsk->sighand) */ 89/* SLAB cache for sighand_struct structures (tsk->sighand) */
90kmem_cache_t *sighand_cachep; 90kmem_cache_t *sighand_cachep;
@@ -114,8 +114,6 @@ void __put_task_struct(struct task_struct *tsk)
114 WARN_ON(atomic_read(&tsk->usage)); 114 WARN_ON(atomic_read(&tsk->usage));
115 WARN_ON(tsk == current); 115 WARN_ON(tsk == current);
116 116
117 if (unlikely(tsk->audit_context))
118 audit_free(tsk);
119 security_task_free(tsk); 117 security_task_free(tsk);
120 free_uid(tsk->user); 118 free_uid(tsk->user);
121 put_group_info(tsk->group_info); 119 put_group_info(tsk->group_info);
@@ -179,6 +177,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
179 /* One for us, one for whoever does the "release_task()" (usually parent) */ 177 /* One for us, one for whoever does the "release_task()" (usually parent) */
180 atomic_set(&tsk->usage,2); 178 atomic_set(&tsk->usage,2);
181 atomic_set(&tsk->fs_excl, 0); 179 atomic_set(&tsk->fs_excl, 0);
180 tsk->btrace_seq = 0;
181 tsk->splice_pipe = NULL;
182 return tsk; 182 return tsk;
183} 183}
184 184
@@ -605,12 +605,12 @@ static struct files_struct *alloc_files(void)
605 atomic_set(&newf->count, 1); 605 atomic_set(&newf->count, 1);
606 606
607 spin_lock_init(&newf->file_lock); 607 spin_lock_init(&newf->file_lock);
608 newf->next_fd = 0;
608 fdt = &newf->fdtab; 609 fdt = &newf->fdtab;
609 fdt->next_fd = 0;
610 fdt->max_fds = NR_OPEN_DEFAULT; 610 fdt->max_fds = NR_OPEN_DEFAULT;
611 fdt->max_fdset = __FD_SETSIZE; 611 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
612 fdt->close_on_exec = &newf->close_on_exec_init; 612 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
613 fdt->open_fds = &newf->open_fds_init; 613 fdt->open_fds = (fd_set *)&newf->open_fds_init;
614 fdt->fd = &newf->fd_array[0]; 614 fdt->fd = &newf->fd_array[0];
615 INIT_RCU_HEAD(&fdt->rcu); 615 INIT_RCU_HEAD(&fdt->rcu);
616 fdt->free_files = NULL; 616 fdt->free_files = NULL;
@@ -718,7 +718,7 @@ out_release:
718 free_fdset (new_fdt->open_fds, new_fdt->max_fdset); 718 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
719 free_fd_array(new_fdt->fd, new_fdt->max_fds); 719 free_fd_array(new_fdt->fd, new_fdt->max_fds);
720 kmem_cache_free(files_cachep, newf); 720 kmem_cache_free(files_cachep, newf);
721 goto out; 721 return NULL;
722} 722}
723 723
724static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 724static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
@@ -766,8 +766,7 @@ int unshare_files(void)
766 struct files_struct *files = current->files; 766 struct files_struct *files = current->files;
767 int rc; 767 int rc;
768 768
769 if(!files) 769 BUG_ON(!files);
770 BUG();
771 770
772 /* This can race but the race causes us to copy when we don't 771 /* This can race but the race causes us to copy when we don't
773 need to and drop the copy */ 772 need to and drop the copy */
@@ -784,14 +783,6 @@ int unshare_files(void)
784 783
785EXPORT_SYMBOL(unshare_files); 784EXPORT_SYMBOL(unshare_files);
786 785
787void sighand_free_cb(struct rcu_head *rhp)
788{
789 struct sighand_struct *sp;
790
791 sp = container_of(rhp, struct sighand_struct, rcu);
792 kmem_cache_free(sighand_cachep, sp);
793}
794
795static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 786static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
796{ 787{
797 struct sighand_struct *sig; 788 struct sighand_struct *sig;
@@ -804,12 +795,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
804 rcu_assign_pointer(tsk->sighand, sig); 795 rcu_assign_pointer(tsk->sighand, sig);
805 if (!sig) 796 if (!sig)
806 return -ENOMEM; 797 return -ENOMEM;
807 spin_lock_init(&sig->siglock);
808 atomic_set(&sig->count, 1); 798 atomic_set(&sig->count, 1);
809 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 799 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
810 return 0; 800 return 0;
811} 801}
812 802
803void __cleanup_sighand(struct sighand_struct *sighand)
804{
805 if (atomic_dec_and_test(&sighand->count))
806 kmem_cache_free(sighand_cachep, sighand);
807}
808
813static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 809static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
814{ 810{
815 struct signal_struct *sig; 811 struct signal_struct *sig;
@@ -845,7 +841,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
845 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 841 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
846 sig->it_real_incr.tv64 = 0; 842 sig->it_real_incr.tv64 = 0;
847 sig->real_timer.function = it_real_fn; 843 sig->real_timer.function = it_real_fn;
848 sig->real_timer.data = tsk; 844 sig->tsk = tsk;
849 845
850 sig->it_virt_expires = cputime_zero; 846 sig->it_virt_expires = cputime_zero;
851 sig->it_virt_incr = cputime_zero; 847 sig->it_virt_incr = cputime_zero;
@@ -879,6 +875,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
879 return 0; 875 return 0;
880} 876}
881 877
878void __cleanup_signal(struct signal_struct *sig)
879{
880 exit_thread_group_keys(sig);
881 kmem_cache_free(signal_cachep, sig);
882}
883
884static inline void cleanup_signal(struct task_struct *tsk)
885{
886 struct signal_struct *sig = tsk->signal;
887
888 atomic_dec(&sig->live);
889
890 if (atomic_dec_and_test(&sig->count))
891 __cleanup_signal(sig);
892}
893
882static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 894static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
883{ 895{
884 unsigned long new_flags = p->flags; 896 unsigned long new_flags = p->flags;
@@ -1018,6 +1030,7 @@ static task_t *copy_process(unsigned long clone_flags,
1018 p->mempolicy = NULL; 1030 p->mempolicy = NULL;
1019 goto bad_fork_cleanup_cpuset; 1031 goto bad_fork_cleanup_cpuset;
1020 } 1032 }
1033 mpol_fix_fork_child_flag(p);
1021#endif 1034#endif
1022 1035
1023#ifdef CONFIG_DEBUG_MUTEXES 1036#ifdef CONFIG_DEBUG_MUTEXES
@@ -1058,6 +1071,15 @@ static task_t *copy_process(unsigned long clone_flags,
1058 * Clear TID on mm_release()? 1071 * Clear TID on mm_release()?
1059 */ 1072 */
1060 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1073 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1074 p->robust_list = NULL;
1075#ifdef CONFIG_COMPAT
1076 p->compat_robust_list = NULL;
1077#endif
1078 /*
1079 * sigaltstack should be cleared when sharing the same VM
1080 */
1081 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
1082 p->sas_ss_sp = p->sas_ss_size = 0;
1061 1083
1062 /* 1084 /*
1063 * Syscall tracing should be turned off in the child regardless 1085 * Syscall tracing should be turned off in the child regardless
@@ -1083,6 +1105,7 @@ static task_t *copy_process(unsigned long clone_flags,
1083 * We dont wake it up yet. 1105 * We dont wake it up yet.
1084 */ 1106 */
1085 p->group_leader = p; 1107 p->group_leader = p;
1108 INIT_LIST_HEAD(&p->thread_group);
1086 INIT_LIST_HEAD(&p->ptrace_children); 1109 INIT_LIST_HEAD(&p->ptrace_children);
1087 INIT_LIST_HEAD(&p->ptrace_list); 1110 INIT_LIST_HEAD(&p->ptrace_list);
1088 1111
@@ -1106,16 +1129,6 @@ static task_t *copy_process(unsigned long clone_flags,
1106 !cpu_online(task_cpu(p)))) 1129 !cpu_online(task_cpu(p))))
1107 set_task_cpu(p, smp_processor_id()); 1130 set_task_cpu(p, smp_processor_id());
1108 1131
1109 /*
1110 * Check for pending SIGKILL! The new thread should not be allowed
1111 * to slip out of an OOM kill. (or normal SIGKILL.)
1112 */
1113 if (sigismember(&current->pending.signal, SIGKILL)) {
1114 write_unlock_irq(&tasklist_lock);
1115 retval = -EINTR;
1116 goto bad_fork_cleanup_namespace;
1117 }
1118
1119 /* CLONE_PARENT re-uses the old parent */ 1132 /* CLONE_PARENT re-uses the old parent */
1120 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1133 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1121 p->real_parent = current->real_parent; 1134 p->real_parent = current->real_parent;
@@ -1123,8 +1136,25 @@ static task_t *copy_process(unsigned long clone_flags,
1123 p->real_parent = current; 1136 p->real_parent = current;
1124 p->parent = p->real_parent; 1137 p->parent = p->real_parent;
1125 1138
1139 spin_lock(&current->sighand->siglock);
1140
1141 /*
1142 * Process group and session signals need to be delivered to just the
1143 * parent before the fork or both the parent and the child after the
1144 * fork. Restart if a signal comes in before we add the new process to
1145 * its process group.
1146 * A fatal signal pending means that current will exit, so the new
1147 * thread can't slip out of an OOM kill (or normal SIGKILL).
1148 */
1149 recalc_sigpending();
1150 if (signal_pending(current)) {
1151 spin_unlock(&current->sighand->siglock);
1152 write_unlock_irq(&tasklist_lock);
1153 retval = -ERESTARTNOINTR;
1154 goto bad_fork_cleanup_namespace;
1155 }
1156
1126 if (clone_flags & CLONE_THREAD) { 1157 if (clone_flags & CLONE_THREAD) {
1127 spin_lock(&current->sighand->siglock);
1128 /* 1158 /*
1129 * Important: if an exit-all has been started then 1159 * Important: if an exit-all has been started then
1130 * do not create this new thread - the whole thread 1160 * do not create this new thread - the whole thread
@@ -1136,17 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags,
1136 retval = -EAGAIN; 1166 retval = -EAGAIN;
1137 goto bad_fork_cleanup_namespace; 1167 goto bad_fork_cleanup_namespace;
1138 } 1168 }
1139 p->group_leader = current->group_leader;
1140 1169
1141 if (current->signal->group_stop_count > 0) { 1170 p->group_leader = current->group_leader;
1142 /* 1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1143 * There is an all-stop in progress for the group.
1144 * We ourselves will stop as soon as we check signals.
1145 * Make the new thread part of that group stop too.
1146 */
1147 current->signal->group_stop_count++;
1148 set_tsk_thread_flag(p, TIF_SIGPENDING);
1149 }
1150 1172
1151 if (!cputime_eq(current->signal->it_virt_expires, 1173 if (!cputime_eq(current->signal->it_virt_expires,
1152 cputime_zero) || 1174 cputime_zero) ||
@@ -1162,8 +1184,6 @@ static task_t *copy_process(unsigned long clone_flags,
1162 */ 1184 */
1163 p->it_prof_expires = jiffies_to_cputime(1); 1185 p->it_prof_expires = jiffies_to_cputime(1);
1164 } 1186 }
1165
1166 spin_unlock(&current->sighand->siglock);
1167 } 1187 }
1168 1188
1169 /* 1189 /*
@@ -1171,24 +1191,27 @@ static task_t *copy_process(unsigned long clone_flags,
1171 */ 1191 */
1172 p->ioprio = current->ioprio; 1192 p->ioprio = current->ioprio;
1173 1193
1174 SET_LINKS(p); 1194 if (likely(p->pid)) {
1175 if (unlikely(p->ptrace & PT_PTRACED)) 1195 add_parent(p);
1176 __ptrace_link(p, current->parent); 1196 if (unlikely(p->ptrace & PT_PTRACED))
1177 1197 __ptrace_link(p, current->parent);
1178 attach_pid(p, PIDTYPE_PID, p->pid); 1198
1179 attach_pid(p, PIDTYPE_TGID, p->tgid); 1199 if (thread_group_leader(p)) {
1180 if (thread_group_leader(p)) { 1200 p->signal->tty = current->signal->tty;
1181 p->signal->tty = current->signal->tty; 1201 p->signal->pgrp = process_group(current);
1182 p->signal->pgrp = process_group(current); 1202 p->signal->session = current->signal->session;
1183 p->signal->session = current->signal->session; 1203 attach_pid(p, PIDTYPE_PGID, process_group(p));
1184 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1204 attach_pid(p, PIDTYPE_SID, p->signal->session);
1185 attach_pid(p, PIDTYPE_SID, p->signal->session); 1205
1186 if (p->pid) 1206 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1187 __get_cpu_var(process_counts)++; 1207 __get_cpu_var(process_counts)++;
1208 }
1209 attach_pid(p, PIDTYPE_PID, p->pid);
1210 nr_threads++;
1188 } 1211 }
1189 1212
1190 nr_threads++;
1191 total_forks++; 1213 total_forks++;
1214 spin_unlock(&current->sighand->siglock);
1192 write_unlock_irq(&tasklist_lock); 1215 write_unlock_irq(&tasklist_lock);
1193 proc_fork_connector(p); 1216 proc_fork_connector(p);
1194 return p; 1217 return p;
@@ -1201,9 +1224,9 @@ bad_fork_cleanup_mm:
1201 if (p->mm) 1224 if (p->mm)
1202 mmput(p->mm); 1225 mmput(p->mm);
1203bad_fork_cleanup_signal: 1226bad_fork_cleanup_signal:
1204 exit_signal(p); 1227 cleanup_signal(p);
1205bad_fork_cleanup_sighand: 1228bad_fork_cleanup_sighand:
1206 exit_sighand(p); 1229 __cleanup_sighand(p->sighand);
1207bad_fork_cleanup_fs: 1230bad_fork_cleanup_fs:
1208 exit_fs(p); /* blocking */ 1231 exit_fs(p); /* blocking */
1209bad_fork_cleanup_files: 1232bad_fork_cleanup_files:
@@ -1250,7 +1273,7 @@ task_t * __devinit fork_idle(int cpu)
1250 if (!task) 1273 if (!task)
1251 return ERR_PTR(-ENOMEM); 1274 return ERR_PTR(-ENOMEM);
1252 init_idle(task, cpu); 1275 init_idle(task, cpu);
1253 unhash_process(task); 1276
1254 return task; 1277 return task;
1255} 1278}
1256 1279
@@ -1285,17 +1308,19 @@ long do_fork(unsigned long clone_flags,
1285{ 1308{
1286 struct task_struct *p; 1309 struct task_struct *p;
1287 int trace = 0; 1310 int trace = 0;
1288 long pid = alloc_pidmap(); 1311 struct pid *pid = alloc_pid();
1312 long nr;
1289 1313
1290 if (pid < 0) 1314 if (!pid)
1291 return -EAGAIN; 1315 return -EAGAIN;
1316 nr = pid->nr;
1292 if (unlikely(current->ptrace)) { 1317 if (unlikely(current->ptrace)) {
1293 trace = fork_traceflag (clone_flags); 1318 trace = fork_traceflag (clone_flags);
1294 if (trace) 1319 if (trace)
1295 clone_flags |= CLONE_PTRACE; 1320 clone_flags |= CLONE_PTRACE;
1296 } 1321 }
1297 1322
1298 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1323 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1299 /* 1324 /*
1300 * Do this prior waking up the new thread - the thread pointer 1325 * Do this prior waking up the new thread - the thread pointer
1301 * might get invalid after that point, if the thread exits quickly. 1326 * might get invalid after that point, if the thread exits quickly.
@@ -1322,7 +1347,7 @@ long do_fork(unsigned long clone_flags,
1322 p->state = TASK_STOPPED; 1347 p->state = TASK_STOPPED;
1323 1348
1324 if (unlikely (trace)) { 1349 if (unlikely (trace)) {
1325 current->ptrace_message = pid; 1350 current->ptrace_message = nr;
1326 ptrace_notify ((trace << 8) | SIGTRAP); 1351 ptrace_notify ((trace << 8) | SIGTRAP);
1327 } 1352 }
1328 1353
@@ -1332,21 +1357,31 @@ long do_fork(unsigned long clone_flags,
1332 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1357 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1333 } 1358 }
1334 } else { 1359 } else {
1335 free_pidmap(pid); 1360 free_pid(pid);
1336 pid = PTR_ERR(p); 1361 nr = PTR_ERR(p);
1337 } 1362 }
1338 return pid; 1363 return nr;
1339} 1364}
1340 1365
1341#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1366#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1342#define ARCH_MIN_MMSTRUCT_ALIGN 0 1367#define ARCH_MIN_MMSTRUCT_ALIGN 0
1343#endif 1368#endif
1344 1369
1370static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
1371{
1372 struct sighand_struct *sighand = data;
1373
1374 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1375 SLAB_CTOR_CONSTRUCTOR)
1376 spin_lock_init(&sighand->siglock);
1377}
1378
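Because the cache is created with SLAB_DESTROY_BY_RCU, a freed sighand_struct may be reused for another thread before a grace period ends; only the backing page is RCU-deferred, which is why siglock is initialized once here in the ctor rather than per allocation. Readers therefore lock and then re-check identity. A condensed sketch of that revalidation pattern (irq disabling elided; the exact helpers used in the signal code may differ):

static struct sighand_struct *lock_sighand_sketch(struct task_struct *tsk)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        for (;;) {
                sighand = rcu_dereference(tsk->sighand);
                if (!sighand)
                        break;                   /* task already tore it down */
                spin_lock(&sighand->siglock);    /* valid even if object reused */
                if (likely(sighand == tsk->sighand))
                        break;                   /* still ours: return locked */
                spin_unlock(&sighand->siglock);  /* recycled under us: retry */
        }
        rcu_read_unlock();
        return sighand;
}

The caller drops siglock when done; a NULL return means the task has already passed through __exit_signal().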
1345void __init proc_caches_init(void) 1379void __init proc_caches_init(void)
1346{ 1380{
1347 sighand_cachep = kmem_cache_create("sighand_cache", 1381 sighand_cachep = kmem_cache_create("sighand_cache",
1348 sizeof(struct sighand_struct), 0, 1382 sizeof(struct sighand_struct), 0,
1349 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1383 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1384 sighand_ctor, NULL);
1350 signal_cachep = kmem_cache_create("signal_cache", 1385 signal_cachep = kmem_cache_create("signal_cache",
1351 sizeof(struct signal_struct), 0, 1386 sizeof(struct signal_struct), 0,
1352 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1387 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
@@ -1471,9 +1506,7 @@ static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1471 1506
1472 if ((unshare_flags & CLONE_VM) && 1507 if ((unshare_flags & CLONE_VM) &&
1473 (mm && atomic_read(&mm->mm_users) > 1)) { 1508 (mm && atomic_read(&mm->mm_users) > 1)) {
1474 *new_mmp = dup_mm(current); 1509 return -EINVAL;
1475 if (!*new_mmp)
1476 return -ENOMEM;
1477 } 1510 }
1478 1511
1479 return 0; 1512 return 0;
@@ -1529,6 +1562,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1529 1562
1530 check_unshare_flags(&unshare_flags); 1563 check_unshare_flags(&unshare_flags);
1531 1564
1565 /* Return -EINVAL for all unsupported flags */
1566 err = -EINVAL;
1567 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1568 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
1569 goto bad_unshare_out;
1570
1532 if ((err = unshare_thread(unshare_flags))) 1571 if ((err = unshare_thread(unshare_flags)))
1533 goto bad_unshare_out; 1572 goto bad_unshare_out;
1534 if ((err = unshare_fs(unshare_flags, &new_fs))) 1573 if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1562,7 +1601,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1562 1601
1563 if (new_sigh) { 1602 if (new_sigh) {
1564 sigh = current->sighand; 1603 sigh = current->sighand;
1565 current->sighand = new_sigh; 1604 rcu_assign_pointer(current->sighand, new_sigh);
1566 new_sigh = sigh; 1605 new_sigh = sigh;
1567 } 1606 }
1568 1607
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f9780..5699c51205 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
8 * Removed page pinning, fix privately mapped COW pages and other cleanups 8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier 9 * (C) Copyright 2003, 2004 Jamie Lokier
10 * 10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
11 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
12 * enough at me, Linus for the original (flawed) idea, Matthew 16 * enough at me, Linus for the original (flawed) idea, Matthew
13 * Kirkwood for proof-of-concept implementation. 17 * Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,172 @@ error:
829 goto out; 833 goto out;
830} 834}
831 835
836/*
837 * Support for robust futexes: the kernel cleans up held futexes at
838 * thread exit time.
839 *
840 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after
847 * acquiring the lock, but just before it could have added itself to
848 * the list. There can only be one such pending lock.
849 */
850
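The comment above describes the userspace half of the contract. A minimal registration sketch, assuming the struct robust_list_head layout from <linux/futex.h> and a libc that defines SYS_set_robust_list (there is no glibc wrapper); a threads library will normally register its own head per thread, so this is purely illustrative:

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* empty list: head->list.next points back at the head itself, which is
 * exactly the termination condition exit_robust_list() checks for */
static struct robust_list_head robust_head = {
        .list            = { &robust_head.list },
        .futex_offset    = 0,     /* lock word offset inside each entry */
        .list_op_pending = NULL,
};

int main(void)
{
        if (syscall(SYS_set_robust_list, &robust_head,
                    sizeof(robust_head)) != 0) {
                perror("set_robust_list");
                return 1;
        }
        printf("robust futex list registered for this thread\n");
        return 0;
}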
851/**
852 * sys_set_robust_list - set the robust-futex list head of a task
853 * @head: pointer to the list-head
854 * @len: length of the list-head, as userspace expects
855 */
856asmlinkage long
857sys_set_robust_list(struct robust_list_head __user *head,
858 size_t len)
859{
860 /*
861 * The kernel knows only one size for now:
862 */
863 if (unlikely(len != sizeof(*head)))
864 return -EINVAL;
865
866 current->robust_list = head;
867
868 return 0;
869}
870
871/**
872 * sys_get_robust_list - get the robust-futex list head of a task
873 * @pid: pid of the process [zero for current task]
874 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
875 * @len_ptr: pointer to a length field, the kernel fills in the header size
876 */
877asmlinkage long
878sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
879 size_t __user *len_ptr)
880{
881 struct robust_list_head *head;
882 unsigned long ret;
883
884 if (!pid)
885 head = current->robust_list;
886 else {
887 struct task_struct *p;
888
889 ret = -ESRCH;
890 read_lock(&tasklist_lock);
891 p = find_task_by_pid(pid);
892 if (!p)
893 goto err_unlock;
894 ret = -EPERM;
895 if ((current->euid != p->euid) && (current->euid != p->uid) &&
896 !capable(CAP_SYS_PTRACE))
897 goto err_unlock;
898 head = p->robust_list;
899 read_unlock(&tasklist_lock);
900 }
901
902 if (put_user(sizeof(*head), len_ptr))
903 return -EFAULT;
904 return put_user(head, head_ptr);
905
906err_unlock:
907 read_unlock(&tasklist_lock);
908
909 return ret;
910}
911
912/*
913 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so:
915 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{
918 u32 uval;
919
920retry:
921 if (get_user(uval, uaddr))
922 return -1;
923
924 if ((uval & FUTEX_TID_MASK) == curr->pid) {
925 /*
926 * Ok, this dying thread is truly holding a futex
927 * of interest. Set the OWNER_DIED bit atomically
928 * via cmpxchg, and if the value had FUTEX_WAITERS
929 * set, wake up a waiter (if any). (We have to do a
930 * futex_wake() even if OWNER_DIED is already set -
931 * to handle the rare but possible case of recursive
932 * thread-death.) The rest of the cleanup is done in
933 * userspace.
934 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval)
937 goto retry;
938
939 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1);
941 }
942 return 0;
943}
944
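handle_futex_death() only acts on lock words whose TID field matches the dying thread. A small userspace model of how a robust lock word is interpreted, using the FUTEX_TID_MASK / FUTEX_WAITERS / FUTEX_OWNER_DIED definitions from <linux/futex.h> (the actual lock/unlock protocol is not shown):

#include <linux/futex.h>
#include <stdio.h>

/* one robust lock word: owner TID in the low bits, FUTEX_WAITERS and
 * FUTEX_OWNER_DIED flags in the top bits */
static void describe(unsigned int uval, unsigned int dead_tid)
{
        if ((uval & FUTEX_TID_MASK) != dead_tid) {
                printf("0x%08x: not owned by dying tid %u, left untouched\n",
                       uval, dead_tid);
                return;
        }
        printf("0x%08x: owned by dying tid %u -> set FUTEX_OWNER_DIED%s\n",
               uval, dead_tid,
               (uval & FUTEX_WAITERS) ? ", wake one waiter" : "");
}

int main(void)
{
        describe(1234, 1234);                  /* held, no waiters */
        describe(1234 | FUTEX_WAITERS, 1234);  /* held, with waiters */
        describe(5678, 1234);                  /* owned by someone else */
        return 0;
}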
945/*
946 * Walk curr->robust_list (very carefully, it's a userspace list!)
947 * and mark any locks found there dead, and notify any waiters.
948 *
949 * We silently return on any sign of list-walking problem.
950 */
951void exit_robust_list(struct task_struct *curr)
952{
953 struct robust_list_head __user *head = curr->robust_list;
954 struct robust_list __user *entry, *pending;
955 unsigned int limit = ROBUST_LIST_LIMIT;
956 unsigned long futex_offset;
957
958 /*
959 * Fetch the list head (which was registered earlier, via
960 * sys_set_robust_list()):
961 */
962 if (get_user(entry, &head->list.next))
963 return;
964 /*
965 * Fetch the relative futex offset:
966 */
967 if (get_user(futex_offset, &head->futex_offset))
968 return;
969 /*
970 * Fetch any possibly pending lock-add first, and handle it
971 * if it exists:
972 */
973 if (get_user(pending, &head->list_op_pending))
974 return;
975 if (pending)
976 handle_futex_death((void *)pending + futex_offset, curr);
977
978 while (entry != &head->list) {
979 /*
980 * A pending lock might already be on the list, so
981 * don't process it twice:
982 */
983 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset,
985 curr))
986 return;
987 /*
988 * Fetch the next entry in the list:
989 */
990 if (get_user(entry, &entry->next))
991 return;
992 /*
993 * Avoid excessively long or circular lists:
994 */
995 if (!--limit)
996 break;
997
998 cond_resched();
999 }
1000}
1001
832long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
833 unsigned long uaddr2, int val2, int val3) 1003 unsigned long uaddr2, int val2, int val3)
834{ 1004{
@@ -869,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
869 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
870 int val2 = 0; 1040 int val2 = 0;
871 1041
872 if ((op == FUTEX_WAIT) && utime) { 1042 if (utime && (op == FUTEX_WAIT)) {
873 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1043 if (copy_from_user(&t, utime, sizeof(t)) != 0)
874 return -EFAULT; 1044 return -EFAULT;
1045 if (!timespec_valid(&t))
1046 return -EINVAL;
875 timeout = timespec_to_jiffies(&t) + 1; 1047 timeout = timespec_to_jiffies(&t) + 1;
876 } 1048 }
877 /* 1049 /*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 0000000000..1ab6a0ea3d
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,144 @@
1/*
2 * linux/kernel/futex_compat.c
3 *
4 * Futex compatibility routines.
5 *
6 * Copyright 2006, Red Hat, Inc., Ingo Molnar
7 */
8
9#include <linux/linkage.h>
10#include <linux/compat.h>
11#include <linux/futex.h>
12
13#include <asm/uaccess.h>
14
15/*
16 * Walk curr->robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters.
18 *
19 * We silently return on any sign of list-walking problem.
20 */
21void compat_exit_robust_list(struct task_struct *curr)
22{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending;
25 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset;
28
29 /*
30 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()):
32 */
33 if (get_user(uentry, &head->list.next))
34 return;
35 entry = compat_ptr(uentry);
36 /*
37 * Fetch the relative futex offset:
38 */
39 if (get_user(futex_offset, &head->futex_offset))
40 return;
41 /*
42 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists:
44 */
45 if (get_user(upending, &head->list_op_pending))
46 return;
47 pending = compat_ptr(upending);
48 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr);
50
51 while (compat_ptr(uentry) != &head->list) {
52 /*
53 * A pending lock might already be on the list, so
54 * don't process it twice:
55 */
56 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset,
58 curr))
59 return;
60
61 /*
62 * Fetch the next entry in the list:
63 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next))
65 return;
66 entry = compat_ptr(uentry);
67 /*
68 * Avoid excessively long or circular lists:
69 */
70 if (!--limit)
71 break;
72
73 cond_resched();
74 }
75}
76
77asmlinkage long
78compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
79 compat_size_t len)
80{
81 if (unlikely(len != sizeof(*head)))
82 return -EINVAL;
83
84 current->compat_robust_list = head;
85
86 return 0;
87}
88
89asmlinkage long
90compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
91 compat_size_t __user *len_ptr)
92{
93 struct compat_robust_list_head *head;
94 unsigned long ret;
95
96 if (!pid)
97 head = current->compat_robust_list;
98 else {
99 struct task_struct *p;
100
101 ret = -ESRCH;
102 read_lock(&tasklist_lock);
103 p = find_task_by_pid(pid);
104 if (!p)
105 goto err_unlock;
106 ret = -EPERM;
107 if ((current->euid != p->euid) && (current->euid != p->uid) &&
108 !capable(CAP_SYS_PTRACE))
109 goto err_unlock;
110 head = p->compat_robust_list;
111 read_unlock(&tasklist_lock);
112 }
113
114 if (put_user(sizeof(*head), len_ptr))
115 return -EFAULT;
116 return put_user(ptr_to_compat(head), head_ptr);
117
118err_unlock:
119 read_unlock(&tasklist_lock);
120
121 return ret;
122}
123
124asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
125 struct compat_timespec __user *utime, u32 __user *uaddr2,
126 u32 val3)
127{
128 struct timespec t;
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0;
131
132 if (utime && (op == FUTEX_WAIT)) {
133 if (get_compat_timespec(&t, utime))
134 return -EFAULT;
135 if (!timespec_valid(&t))
136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1;
138 }
139 if (op >= FUTEX_REQUEUE)
140 val2 = (int) (unsigned long) utime;
141
142 return do_futex((unsigned long)uaddr, op, val, timeout,
143 (unsigned long)uaddr2, val2, val3);
144}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b6e1757ae..01fa2ae98a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -123,6 +123,26 @@ void ktime_get_ts(struct timespec *ts)
123EXPORT_SYMBOL_GPL(ktime_get_ts); 123EXPORT_SYMBOL_GPL(ktime_get_ts);
124 124
125/* 125/*
126 * Get the coarse grained time at the softirq based on xtime and
127 * wall_to_monotonic.
128 */
129static void hrtimer_get_softirq_time(struct hrtimer_base *base)
130{
131 ktime_t xtim, tomono;
132 unsigned long seq;
133
134 do {
135 seq = read_seqbegin(&xtime_lock);
136 xtim = timespec_to_ktime(xtime);
137 tomono = timespec_to_ktime(wall_to_monotonic);
138
139 } while (read_seqretry(&xtime_lock, seq));
140
141 base[CLOCK_REALTIME].softirq_time = xtim;
142 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
143}
144
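The do/while above is the standard seqlock read loop: sample the sequence, read the protected values, retry if a writer intervened. The same pattern in a generic, hedged form (invented names; needs <linux/seqlock.h>):

static seqlock_t sample_lock = SEQLOCK_UNLOCKED;
static u64 sample_a, sample_b;

static void read_samples(u64 *a, u64 *b)
{
        unsigned long seq;

        do {
                seq = read_seqbegin(&sample_lock);   /* snapshot sequence */
                *a = sample_a;                       /* read a consistent pair */
                *b = sample_b;
        } while (read_seqretry(&sample_lock, seq));  /* writer ran: retry */
}

Writers bracket their updates with write_seqlock()/write_sequnlock(), just as the timekeeping code does for xtime_lock.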
145/*
126 * Functions and macros which are different for UP/SMP systems are kept in a 146 * Functions and macros which are different for UP/SMP systems are kept in a
127 * single place 147 * single place
128 */ 148 */
@@ -246,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
246/* 266/*
247 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
248 */ 268 */
249static unsigned long ktime_divns(const ktime_t kt, nsec_t div) 269static unsigned long ktime_divns(const ktime_t kt, s64 div)
250{ 270{
251 u64 dclc, inc, dns; 271 u64 dclc, inc, dns;
252 int sft = 0; 272 int sft = 0;
@@ -281,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
281 * hrtimer_forward - forward the timer expiry 301 * hrtimer_forward - forward the timer expiry
282 * 302 *
283 * @timer: hrtimer to forward 303 * @timer: hrtimer to forward
304 * @now: forward past this time
284 * @interval: the interval to forward 305 * @interval: the interval to forward
285 * 306 *
286 * Forward the timer expiry so it will expire in the future. 307 * Forward the timer expiry so it will expire in the future.
287 * Returns the number of overruns. 308 * Returns the number of overruns.
288 */ 309 */
289unsigned long 310unsigned long
290hrtimer_forward(struct hrtimer *timer, ktime_t interval) 311hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
291{ 312{
292 unsigned long orun = 1; 313 unsigned long orun = 1;
293 ktime_t delta, now; 314 ktime_t delta;
294
295 now = timer->base->get_time();
296 315
297 delta = ktime_sub(now, timer->expires); 316 delta = ktime_sub(now, timer->expires);
298 317
@@ -303,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t interval)
303 interval.tv64 = timer->base->resolution.tv64; 322 interval.tv64 = timer->base->resolution.tv64;
304 323
305 if (unlikely(delta.tv64 >= interval.tv64)) { 324 if (unlikely(delta.tv64 >= interval.tv64)) {
306 nsec_t incr = ktime_to_ns(interval); 325 s64 incr = ktime_to_ns(interval);
307 326
308 orun = ktime_divns(delta, incr); 327 orun = ktime_divns(delta, incr);
309 timer->expires = ktime_add_ns(timer->expires, incr * orun); 328 timer->expires = ktime_add_ns(timer->expires, incr * orun);
@@ -355,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
355 rb_link_node(&timer->node, parent, link); 374 rb_link_node(&timer->node, parent, link);
356 rb_insert_color(&timer->node, &base->active); 375 rb_insert_color(&timer->node, &base->active);
357 376
358 timer->state = HRTIMER_PENDING;
359
360 if (!base->first || timer->expires.tv64 < 377 if (!base->first || timer->expires.tv64 <
361 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 378 rb_entry(base->first, struct hrtimer, node)->expires.tv64)
362 base->first = &timer->node; 379 base->first = &timer->node;
@@ -376,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
376 if (base->first == &timer->node) 393 if (base->first == &timer->node)
377 base->first = rb_next(&timer->node); 394 base->first = rb_next(&timer->node);
378 rb_erase(&timer->node, &base->active); 395 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE;
379} 397}
380 398
381/* 399/*
@@ -386,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
386{ 404{
387 if (hrtimer_active(timer)) { 405 if (hrtimer_active(timer)) {
388 __remove_hrtimer(timer, base); 406 __remove_hrtimer(timer, base);
389 timer->state = HRTIMER_INACTIVE;
390 return 1; 407 return 1;
391 } 408 }
392 return 0; 409 return 0;
@@ -418,8 +435,19 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
418 /* Switch the timer base, if necessary: */ 435 /* Switch the timer base, if necessary: */
419 new_base = switch_hrtimer_base(timer, base); 436 new_base = switch_hrtimer_base(timer, base);
420 437
421 if (mode == HRTIMER_REL) 438 if (mode == HRTIMER_REL) {
422 tim = ktime_add(tim, new_base->get_time()); 439 tim = ktime_add(tim, new_base->get_time());
440 /*
441 * CONFIG_TIME_LOW_RES is a temporary way for architectures
442 * to signal that they simply return xtime in
443 * do_gettimeoffset(). In this case we want to round up by
444 * resolution when starting a relative timer, to avoid short
445 * timeouts. This will go away with the GTOD framework.
446 */
447#ifdef CONFIG_TIME_LOW_RES
448 tim = ktime_add(tim, base->resolution);
449#endif
450 }
423 timer->expires = tim; 451 timer->expires = tim;
424 452
425 enqueue_hrtimer(timer, new_base); 453 enqueue_hrtimer(timer, new_base);
@@ -428,6 +456,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
428 456
429 return ret; 457 return ret;
430} 458}
459EXPORT_SYMBOL_GPL(hrtimer_start);
431 460
432/** 461/**
433 * hrtimer_try_to_cancel - try to deactivate a timer 462 * hrtimer_try_to_cancel - try to deactivate a timer
@@ -456,6 +485,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
456 return ret; 485 return ret;
457 486
458} 487}
488EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
459 489
460/** 490/**
461 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 491 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
@@ -473,8 +503,10 @@ int hrtimer_cancel(struct hrtimer *timer)
473 503
474 if (ret >= 0) 504 if (ret >= 0)
475 return ret; 505 return ret;
506 cpu_relax();
476 } 507 }
477} 508}
509EXPORT_SYMBOL_GPL(hrtimer_cancel);
478 510
479/** 511/**
480 * hrtimer_get_remaining - get remaining time for the timer 512 * hrtimer_get_remaining - get remaining time for the timer
@@ -493,6 +525,42 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
493 525
494 return rem; 526 return rem;
495} 527}
528EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
529
530#ifdef CONFIG_NO_IDLE_HZ
531/**
532 * hrtimer_get_next_event - get the time until next expiry event
533 *
534 * Returns the delta to the next expiry event or KTIME_MAX if no timer
535 * is pending.
536 */
537ktime_t hrtimer_get_next_event(void)
538{
539 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
540 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
541 unsigned long flags;
542 int i;
543
544 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
545 struct hrtimer *timer;
546
547 spin_lock_irqsave(&base->lock, flags);
548 if (!base->first) {
549 spin_unlock_irqrestore(&base->lock, flags);
550 continue;
551 }
552 timer = rb_entry(base->first, struct hrtimer, node);
553 delta.tv64 = timer->expires.tv64;
554 spin_unlock_irqrestore(&base->lock, flags);
555 delta = ktime_sub(delta, base->get_time());
556 if (delta.tv64 < mindelta.tv64)
557 mindelta.tv64 = delta.tv64;
558 }
559 if (mindelta.tv64 < 0)
560 mindelta.tv64 = 0;
561 return mindelta;
562}
563#endif
496 564
497/** 565/**
498 * hrtimer_init - initialize a timer to the given clock 566 * hrtimer_init - initialize a timer to the given clock
@@ -514,7 +582,9 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
514 clock_id = CLOCK_MONOTONIC; 582 clock_id = CLOCK_MONOTONIC;
515 583
516 timer->base = &bases[clock_id]; 584 timer->base = &bases[clock_id];
585 timer->node.rb_parent = HRTIMER_INACTIVE;
517} 586}
587EXPORT_SYMBOL_GPL(hrtimer_init);
518 588
519/** 589/**
520 * hrtimer_get_res - get the timer resolution for a clock 590 * hrtimer_get_res - get the timer resolution for a clock
@@ -534,54 +604,45 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
534 604
535 return 0; 605 return 0;
536} 606}
607EXPORT_SYMBOL_GPL(hrtimer_get_res);
537 608
538/* 609/*
539 * Expire the per base hrtimer-queue: 610 * Expire the per base hrtimer-queue:
540 */ 611 */
541static inline void run_hrtimer_queue(struct hrtimer_base *base) 612static inline void run_hrtimer_queue(struct hrtimer_base *base)
542{ 613{
543 ktime_t now = base->get_time();
544 struct rb_node *node; 614 struct rb_node *node;
545 615
616 if (!base->first)
617 return;
618
619 if (base->get_softirq_time)
620 base->softirq_time = base->get_softirq_time();
621
546 spin_lock_irq(&base->lock); 622 spin_lock_irq(&base->lock);
547 623
548 while ((node = base->first)) { 624 while ((node = base->first)) {
549 struct hrtimer *timer; 625 struct hrtimer *timer;
550 int (*fn)(void *); 626 int (*fn)(struct hrtimer *);
551 int restart; 627 int restart;
552 void *data;
553 628
554 timer = rb_entry(node, struct hrtimer, node); 629 timer = rb_entry(node, struct hrtimer, node);
555 if (now.tv64 <= timer->expires.tv64) 630 if (base->softirq_time.tv64 <= timer->expires.tv64)
556 break; 631 break;
557 632
558 fn = timer->function; 633 fn = timer->function;
559 data = timer->data;
560 set_curr_timer(base, timer); 634 set_curr_timer(base, timer);
561 timer->state = HRTIMER_RUNNING;
562 __remove_hrtimer(timer, base); 635 __remove_hrtimer(timer, base);
563 spin_unlock_irq(&base->lock); 636 spin_unlock_irq(&base->lock);
564 637
565 /* 638 restart = fn(timer);
566 * fn == NULL is special case for the simplest timer
567 * variant - wake up process and do not restart:
568 */
569 if (!fn) {
570 wake_up_process(data);
571 restart = HRTIMER_NORESTART;
572 } else
573 restart = fn(data);
574 639
575 spin_lock_irq(&base->lock); 640 spin_lock_irq(&base->lock);
576 641
577 /* Another CPU has added back the timer */ 642 if (restart != HRTIMER_NORESTART) {
578 if (timer->state != HRTIMER_RUNNING) 643 BUG_ON(hrtimer_active(timer));
579 continue;
580
581 if (restart == HRTIMER_RESTART)
582 enqueue_hrtimer(timer, base); 644 enqueue_hrtimer(timer, base);
583 else 645 }
584 timer->state = HRTIMER_EXPIRED;
585 } 646 }
586 set_curr_timer(base, NULL); 647 set_curr_timer(base, NULL);
587 spin_unlock_irq(&base->lock); 648 spin_unlock_irq(&base->lock);
@@ -595,6 +656,8 @@ void hrtimer_run_queues(void)
595 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 656 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
596 int i; 657 int i;
597 658
659 hrtimer_get_softirq_time(base);
660
598 for (i = 0; i < MAX_HRTIMER_BASES; i++) 661 for (i = 0; i < MAX_HRTIMER_BASES; i++)
599 run_hrtimer_queue(&base[i]); 662 run_hrtimer_queue(&base[i]);
600} 663}
@@ -602,80 +665,69 @@ void hrtimer_run_queues(void)
602/* 665/*
603 * Sleep related functions: 666 * Sleep related functions:
604 */ 667 */
605 668static int hrtimer_wakeup(struct hrtimer *timer)
606/**
607 * schedule_hrtimer - sleep until timeout
608 *
609 * @timer: hrtimer variable initialized with the correct clock base
610 * @mode: timeout value is abs/rel
611 *
612 * Make the current task sleep until @timeout is
613 * elapsed.
614 *
615 * You can set the task state as follows -
616 *
617 * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
618 * pass before the routine returns. The routine will return 0
619 *
620 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
621 * delivered to the current task. In this case the remaining time
622 * will be returned
623 *
624 * The current task state is guaranteed to be TASK_RUNNING when this
625 * routine returns.
626 */
627static ktime_t __sched
628schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
629{ 669{
630 /* fn stays NULL, meaning single-shot wakeup: */ 670 struct hrtimer_sleeper *t =
631 timer->data = current; 671 container_of(timer, struct hrtimer_sleeper, timer);
672 struct task_struct *task = t->task;
632 673
633 hrtimer_start(timer, timer->expires, mode); 674 t->task = NULL;
675 if (task)
676 wake_up_process(task);
634 677
635 schedule(); 678 return HRTIMER_NORESTART;
636 hrtimer_cancel(timer); 679}
637 680
638 /* Return the remaining time: */ 681void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
639 if (timer->state != HRTIMER_EXPIRED) 682{
640 return ktime_sub(timer->expires, timer->base->get_time()); 683 sl->timer.function = hrtimer_wakeup;
641 else 684 sl->task = task;
642 return (ktime_t) {.tv64 = 0 };
643} 685}
644 686
645static inline ktime_t __sched 687static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
646schedule_hrtimer_interruptible(struct hrtimer *timer,
647 const enum hrtimer_mode mode)
648{ 688{
649 set_current_state(TASK_INTERRUPTIBLE); 689 hrtimer_init_sleeper(t, current);
690
691 do {
692 set_current_state(TASK_INTERRUPTIBLE);
693 hrtimer_start(&t->timer, t->timer.expires, mode);
694
695 schedule();
696
697 hrtimer_cancel(&t->timer);
698 mode = HRTIMER_ABS;
650 699
651 return schedule_hrtimer(timer, mode); 700 } while (t->task && !signal_pending(current));
701
702 return t->task == NULL;
652} 703}
653 704
654static long __sched nanosleep_restart(struct restart_block *restart) 705static long __sched nanosleep_restart(struct restart_block *restart)
655{ 706{
707 struct hrtimer_sleeper t;
656 struct timespec __user *rmtp; 708 struct timespec __user *rmtp;
657 struct timespec tu; 709 struct timespec tu;
658 void *rfn_save = restart->fn; 710 ktime_t time;
659 struct hrtimer timer;
660 ktime_t rem;
661 711
662 restart->fn = do_no_restart_syscall; 712 restart->fn = do_no_restart_syscall;
663 713
664 hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); 714 hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
665 715 t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
666 timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
667
668 rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
669 716
670 if (rem.tv64 <= 0) 717 if (do_nanosleep(&t, HRTIMER_ABS))
671 return 0; 718 return 0;
672 719
673 rmtp = (struct timespec __user *) restart->arg2; 720 rmtp = (struct timespec __user *) restart->arg2;
674 tu = ktime_to_timespec(rem); 721 if (rmtp) {
675 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) 722 time = ktime_sub(t.timer.expires, t.timer.base->get_time());
676 return -EFAULT; 723 if (time.tv64 <= 0)
724 return 0;
725 tu = ktime_to_timespec(time);
726 if (copy_to_user(rmtp, &tu, sizeof(tu)))
727 return -EFAULT;
728 }
677 729
678 restart->fn = rfn_save; 730 restart->fn = nanosleep_restart;
679 731
680 /* The other values in restart are already filled in */ 732 /* The other values in restart are already filled in */
681 return -ERESTART_RESTARTBLOCK; 733 return -ERESTART_RESTARTBLOCK;
@@ -685,33 +737,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
685 const enum hrtimer_mode mode, const clockid_t clockid) 737 const enum hrtimer_mode mode, const clockid_t clockid)
686{ 738{
687 struct restart_block *restart; 739 struct restart_block *restart;
688 struct hrtimer timer; 740 struct hrtimer_sleeper t;
689 struct timespec tu; 741 struct timespec tu;
690 ktime_t rem; 742 ktime_t rem;
691 743
692 hrtimer_init(&timer, clockid, mode); 744 hrtimer_init(&t.timer, clockid, mode);
693 745 t.timer.expires = timespec_to_ktime(*rqtp);
694 timer.expires = timespec_to_ktime(*rqtp); 746 if (do_nanosleep(&t, mode))
695
696 rem = schedule_hrtimer_interruptible(&timer, mode);
697 if (rem.tv64 <= 0)
698 return 0; 747 return 0;
699 748
700 /* Absolute timers do not update the rmtp value and restart: */ 749 /* Absolute timers do not update the rmtp value and restart: */
701 if (mode == HRTIMER_ABS) 750 if (mode == HRTIMER_ABS)
702 return -ERESTARTNOHAND; 751 return -ERESTARTNOHAND;
703 752
704 tu = ktime_to_timespec(rem); 753 if (rmtp) {
705 754 rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
706 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) 755 if (rem.tv64 <= 0)
707 return -EFAULT; 756 return 0;
757 tu = ktime_to_timespec(rem);
758 if (copy_to_user(rmtp, &tu, sizeof(tu)))
759 return -EFAULT;
760 }
708 761
709 restart = &current_thread_info()->restart_block; 762 restart = &current_thread_info()->restart_block;
710 restart->fn = nanosleep_restart; 763 restart->fn = nanosleep_restart;
711 restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; 764 restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
712 restart->arg1 = timer.expires.tv64 >> 32; 765 restart->arg1 = t.timer.expires.tv64 >> 32;
713 restart->arg2 = (unsigned long) rmtp; 766 restart->arg2 = (unsigned long) rmtp;
714 restart->arg3 = (unsigned long) timer.base->index; 767 restart->arg3 = (unsigned long) t.timer.base->index;
715 768
716 return -ERESTART_RESTARTBLOCK; 769 return -ERESTART_RESTARTBLOCK;
717} 770}
@@ -789,7 +842,7 @@ static void migrate_hrtimers(int cpu)
789} 842}
790#endif /* CONFIG_HOTPLUG_CPU */ 843#endif /* CONFIG_HOTPLUG_CPU */
791 844
792static int __devinit hrtimer_cpu_notify(struct notifier_block *self, 845static int hrtimer_cpu_notify(struct notifier_block *self,
793 unsigned long action, void *hcpu) 846 unsigned long action, void *hcpu)
794{ 847{
795 long cpu = (long)hcpu; 848 long cpu = (long)hcpu;
@@ -813,7 +866,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
813 return NOTIFY_OK; 866 return NOTIFY_OK;
814} 867}
815 868
816static struct notifier_block __devinitdata hrtimers_nb = { 869static struct notifier_block hrtimers_nb = {
817 .notifier_call = hrtimer_cpu_notify, 870 .notifier_call = hrtimer_cpu_notify,
818}; 871};
819 872
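
Both hrtimer_nanosleep() and nanosleep_restart() above squeeze the 64-bit ktime expiry into the two 32-bit arg slots of the restart block and reassemble it on restart. A self-contained sketch of that packing, assuming nothing beyond standard C:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t expires = 0x123456789abcdef0ULL;       /* ktime expiry in ns */

    /* store: two 32-bit restart_block slots */
    unsigned long arg0 = expires & 0xFFFFFFFFUL;    /* low half  */
    unsigned long arg1 = expires >> 32;             /* high half */

    /* restart: put the halves back together */
    uint64_t again = ((uint64_t)arg1 << 32) | (uint64_t)arg0;

    printf("%s\n", again == expires ? "expiry round-trips" : "broken");
    return 0;
}
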
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 49378738ff..9f77f50d81 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,4 +2,4 @@
2obj-y := handle.o manage.o spurious.o 2obj-y := handle.o manage.o spurious.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 97d5559997..1279e34995 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -204,10 +204,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
204 p = &desc->action; 204 p = &desc->action;
205 if ((old = *p) != NULL) { 205 if ((old = *p) != NULL) {
206 /* Can't share interrupts unless both agree to */ 206 /* Can't share interrupts unless both agree to */
207 if (!(old->flags & new->flags & SA_SHIRQ)) { 207 if (!(old->flags & new->flags & SA_SHIRQ))
208 spin_unlock_irqrestore(&desc->lock,flags); 208 goto mismatch;
209 return -EBUSY; 209
210 } 210#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
211 /* All handlers must agree on per-cpuness */
212 if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
213 goto mismatch;
214#endif
211 215
212 /* add new interrupt at end of irq queue */ 216 /* add new interrupt at end of irq queue */
213 do { 217 do {
@@ -218,7 +222,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
218 } 222 }
219 223
220 *p = new; 224 *p = new;
221 225#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
226 if (new->flags & SA_PERCPU_IRQ)
227 desc->status |= IRQ_PER_CPU;
228#endif
222 if (!shared) { 229 if (!shared) {
223 desc->depth = 0; 230 desc->depth = 0;
224 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | 231 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
@@ -236,6 +243,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
236 register_handler_proc(irq, new); 243 register_handler_proc(irq, new);
237 244
238 return 0; 245 return 0;
246
247mismatch:
248 spin_unlock_irqrestore(&desc->lock, flags);
249 if (!(new->flags & SA_PROBEIRQ)) {
250 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
251 dump_stack();
252 }
253 return -EBUSY;
239} 254}
240 255
241/** 256/**
@@ -258,6 +273,7 @@ void free_irq(unsigned int irq, void *dev_id)
258 struct irqaction **p; 273 struct irqaction **p;
259 unsigned long flags; 274 unsigned long flags;
260 275
276 WARN_ON(in_interrupt());
261 if (irq >= NR_IRQS) 277 if (irq >= NR_IRQS)
262 return; 278 return;
263 279
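
The new mismatch path formalizes the rule that handlers sharing an IRQ line must agree on their flags: every one of them asks for SA_SHIRQ, and all of them agree on per-CPU-ness. A standalone sketch of that agreement check; the flag bit values below are placeholders, not the kernel's:

#include <stdio.h>

#define SA_SHIRQ    0x04000000u     /* placeholder bit values */
#define IRQ_PER_CPU 0x00000100u

static int flags_compatible(unsigned int oldf, unsigned int newf)
{
    if (!(oldf & newf & SA_SHIRQ))  /* both must agree to share */
        return 0;
    /* and both must agree on per-CPU-ness */
    if ((oldf & IRQ_PER_CPU) != (newf & IRQ_PER_CPU))
        return 0;
    return 1;
}

int main(void)
{
    printf("%d\n", flags_compatible(SA_SHIRQ, SA_SHIRQ));               /* 1 */
    printf("%d\n", flags_compatible(SA_SHIRQ, SA_SHIRQ | IRQ_PER_CPU)); /* 0 */
    return 0;
}
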
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
new file mode 100644
index 0000000000..134f9f2e0e
--- /dev/null
+++ b/kernel/irq/migration.c
@@ -0,0 +1,62 @@
1
2#include <linux/irq.h>
3
4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{
6 irq_desc_t *desc = irq_desc + irq;
7 unsigned long flags;
8
9 spin_lock_irqsave(&desc->lock, flags);
10 desc->move_irq = 1;
11 pending_irq_cpumask[irq] = mask;
12 spin_unlock_irqrestore(&desc->lock, flags);
13}
14
15void move_native_irq(int irq)
16{
17 cpumask_t tmp;
18 irq_desc_t *desc = irq_descp(irq);
19
20 if (likely(!desc->move_irq))
21 return;
22
23 /*
24 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
25 */
26 if (CHECK_IRQ_PER_CPU(desc->status)) {
27 WARN_ON(1);
28 return;
29 }
30
31 desc->move_irq = 0;
32
33 if (likely(cpus_empty(pending_irq_cpumask[irq])))
34 return;
35
36 if (!desc->handler->set_affinity)
37 return;
38
39 assert_spin_locked(&desc->lock);
40
41 cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
42
43 /*
44 * If there was a valid mask to work with, please
45 * do the disable, re-program, enable sequence.
46 * This is *not* particularly important for level-triggered
47 * sources, but in an edge-triggered case we might be setting the
48 * RTE while an active trigger is coming in. This could
49 * cause some IO-APICs to malfunction.
50 * Being paranoid, I guess!
51 */
52 if (unlikely(!cpus_empty(tmp))) {
53 if (likely(!(desc->status & IRQ_DISABLED)))
54 desc->handler->disable(irq);
55
56 desc->handler->set_affinity(irq,tmp);
57
58 if (likely(!(desc->status & IRQ_DISABLED)))
59 desc->handler->enable(irq);
60 }
61 cpus_clear(pending_irq_cpumask[irq]);
62}
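
move_native_irq() only reprograms the interrupt when the pending mask intersected with the online CPUs is non-empty, and it brackets set_affinity() with disable/enable to stay safe for edge-triggered sources. A userspace sketch of that decision, with plain bitmaps standing in for cpumask_t:

#include <stdio.h>

static void migrate_if_possible(unsigned long pending, unsigned long online)
{
    unsigned long target = pending & online;    /* cpus_and() */

    if (!target) {
        puts("no online CPU in the pending mask; leave the IRQ alone");
        return;
    }
    /* disable -> set_affinity -> enable: the edge-trigger-safe order */
    printf("disable irq, set affinity to 0x%lx, re-enable irq\n", target);
}

int main(void)
{
    migrate_if_possible(0x6, 0x5);  /* CPUs {1,2} requested, {0,2} online */
    migrate_if_possible(0x8, 0x7);  /* requested CPU is offline: skipped */
    return 0;
}
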
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 379be2f8c8..204ed7939e 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,21 +128,75 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(void *data) 131int it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct task_struct *tsk = (struct task_struct *) data; 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer);
134 135
135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
136
137 if (tsk->signal->it_real_incr.tv64 != 0) {
138 hrtimer_forward(&tsk->signal->real_timer,
139 tsk->signal->it_real_incr);
140 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART; 141 return HRTIMER_RESTART;
142 } 142 }
143 return HRTIMER_NORESTART; 143 return HRTIMER_NORESTART;
144} 144}
145 145
146/*
147 * We do not care about correctness. We just sanitize the values so
148 * the ktime_t operations which expect normalized values do not
149 * break. This converts negative values to long timeouts similar to
150 * the code in kernel versions < 2.6.16
151 *
152 * Print a limited number of warning messages when an invalid timeval
153 * is detected.
154 */
155static void fixup_timeval(struct timeval *tv, int interval)
156{
157 static int warnlimit = 10;
158 unsigned long tmp;
159
160 if (warnlimit > 0) {
161 warnlimit--;
162 printk(KERN_WARNING
163 "setitimer: %s (pid = %d) provided "
164 "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
165 current->comm, current->pid,
166 interval ? "it_interval" : "it_value",
167 tv->tv_sec, (long) tv->tv_usec);
168 }
169
170 tmp = tv->tv_usec;
171 if (tmp >= USEC_PER_SEC) {
172 tv->tv_usec = tmp % USEC_PER_SEC;
173 tv->tv_sec += tmp / USEC_PER_SEC;
174 }
175
176 tmp = tv->tv_sec;
177 if (tmp > LONG_MAX)
178 tv->tv_sec = LONG_MAX;
179}
180
181/*
182 * Returns true if the timeval is in canonical form
183 */
184#define timeval_valid(t) \
185 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
186
187/*
188 * Check for invalid timevals, sanitize them and print a limited
189 * number of warnings.
190 */
191static void check_itimerval(struct itimerval *value) {
192
193 if (unlikely(!timeval_valid(&value->it_value)))
194 fixup_timeval(&value->it_value, 0);
195
196 if (unlikely(!timeval_valid(&value->it_interval)))
197 fixup_timeval(&value->it_interval, 1);
198}
199
146int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) 200int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
147{ 201{
148 struct task_struct *tsk = current; 202 struct task_struct *tsk = current;
@@ -150,6 +204,18 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
150 ktime_t expires; 204 ktime_t expires;
151 cputime_t cval, cinterval, nval, ninterval; 205 cputime_t cval, cinterval, nval, ninterval;
152 206
207 /*
208 * Validate the timevals in value.
209 *
210 * Note: Although the spec requires that invalid values shall
211 * return -EINVAL, we just fixup the value and print a limited
212 * number of warnings in order not to break users of this
213 * historical misfeature.
214 *
215 * Scheduled for replacement in March 2007
216 */
217 check_itimerval(value);
218
153 switch (which) { 219 switch (which) {
154 case ITIMER_REAL: 220 case ITIMER_REAL:
155again: 221again:
@@ -226,6 +292,43 @@ again:
226 return 0; 292 return 0;
227} 293}
228 294
295/**
296 * alarm_setitimer - set alarm in seconds
297 *
298 * @seconds: number of seconds until alarm
299 * 0 disables the alarm
300 *
301 * Returns the remaining time in seconds of a pending timer or 0 when
302 * the timer is not active.
303 *
304 * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
305 * negative timeval settings which would cause immediate expiry.
306 */
307unsigned int alarm_setitimer(unsigned int seconds)
308{
309 struct itimerval it_new, it_old;
310
311#if BITS_PER_LONG < 64
312 if (seconds > INT_MAX)
313 seconds = INT_MAX;
314#endif
315 it_new.it_value.tv_sec = seconds;
316 it_new.it_value.tv_usec = 0;
317 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
318
319 do_setitimer(ITIMER_REAL, &it_new, &it_old);
320
321 /*
322 * We can't return 0 if we have an alarm pending ... And we'd
323 * better return too much rather than too little anyway
324 */
325 if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
326 it_old.it_value.tv_usec >= 500000)
327 it_old.it_value.tv_sec++;
328
329 return it_old.it_value.tv_sec;
330}
331
229asmlinkage long sys_setitimer(int which, 332asmlinkage long sys_setitimer(int which,
230 struct itimerval __user *value, 333 struct itimerval __user *value,
231 struct itimerval __user *ovalue) 334 struct itimerval __user *ovalue)
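
check_itimerval() keeps the historical behaviour of accepting denormalized timevals: excess microseconds are folded into the seconds field and absurd values are clamped rather than rejected with -EINVAL. A userspace sketch of that sanitizing step (USEC_PER_SEC and the LONG_MAX clamp mirror the patch; the rest is illustrative):

#include <limits.h>
#include <stdio.h>
#include <sys/time.h>

#define USEC_PER_SEC 1000000L

static int timeval_valid(const struct timeval *t)
{
    return t->tv_sec >= 0 && (unsigned long)t->tv_usec < USEC_PER_SEC;
}

static void fixup_timeval(struct timeval *tv)
{
    unsigned long tmp = tv->tv_usec;

    if (tmp >= USEC_PER_SEC) {              /* carry whole seconds over */
        tv->tv_usec = tmp % USEC_PER_SEC;
        tv->tv_sec += tmp / USEC_PER_SEC;
    }
    if ((unsigned long)tv->tv_sec > LONG_MAX)
        tv->tv_sec = LONG_MAX;              /* clamp negative/huge values */
}

int main(void)
{
    struct timeval tv = { .tv_sec = 1, .tv_usec = 2500000 };  /* 2.5s of usec */

    if (!timeval_valid(&tv))
        fixup_timeval(&tv);
    printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);  /* 3.500000 */
    return 0;
}
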
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 51a892063a..20a997c73c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data)
170 sa.sa.sa_handler = SIG_IGN; 170 sa.sa.sa_handler = SIG_IGN;
171 sa.sa.sa_flags = 0; 171 sa.sa.sa_flags = 0;
172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); 172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
173 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); 173 do_sigaction(SIGCHLD, &sa, NULL);
174 allow_signal(SIGCHLD); 174 allow_signal(SIGCHLD);
175 175
176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73..1fbf466a29 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
@@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
323} 323}
324 324
325/* 325/*
326 * This function is called from exit_thread or flush_thread when task tk's 326 * This function is called from finish_task_switch when task tk becomes dead,
327 * stack is being recycled so that we can recycle any function-return probe 327 * so that we can recycle any function-return probe instances associated
328 * instances associated with this task. These left over instances represent 328 * with this task. These left over instances represent probed functions
329 * probed functions that have been called but will never return. 329 * that have been called but will never return.
330 */ 330 */
331void __kprobes kprobe_flush_task(struct task_struct *tk) 331void __kprobes kprobe_flush_task(struct task_struct *tk)
332{ 332{
@@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
336 unsigned long flags = 0; 336 unsigned long flags = 0;
337 337
338 spin_lock_irqsave(&kretprobe_lock, flags); 338 spin_lock_irqsave(&kretprobe_lock, flags);
339 head = kretprobe_inst_table_head(current); 339 head = kretprobe_inst_table_head(tk);
340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
341 if (ri->task == tk) 341 if (ri->task == tk)
342 recycle_rp_inst(ri); 342 recycle_rp_inst(ri);
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
460 } 460 }
461 461
462 p->nmissed = 0; 462 p->nmissed = 0;
463 down(&kprobe_mutex); 463 mutex_lock(&kprobe_mutex);
464 old_p = get_kprobe(p->addr); 464 old_p = get_kprobe(p->addr);
465 if (old_p) { 465 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 466 ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
477 arch_arm_kprobe(p); 477 arch_arm_kprobe(p);
478 478
479out: 479out:
480 up(&kprobe_mutex); 480 mutex_unlock(&kprobe_mutex);
481 481
482 if (ret && probed_mod) 482 if (ret && probed_mod)
483 module_put(probed_mod); 483 module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
496 struct kprobe *old_p, *list_p; 496 struct kprobe *old_p, *list_p;
497 int cleanup_p; 497 int cleanup_p;
498 498
499 down(&kprobe_mutex); 499 mutex_lock(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 500 old_p = get_kprobe(p->addr);
501 if (unlikely(!old_p)) { 501 if (unlikely(!old_p)) {
502 up(&kprobe_mutex); 502 mutex_unlock(&kprobe_mutex);
503 return; 503 return;
504 } 504 }
505 if (p != old_p) { 505 if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
507 if (list_p == p) 507 if (list_p == p)
508 /* kprobe p is a valid probe */ 508 /* kprobe p is a valid probe */
509 goto valid_p; 509 goto valid_p;
510 up(&kprobe_mutex); 510 mutex_unlock(&kprobe_mutex);
511 return; 511 return;
512 } 512 }
513valid_p: 513valid_p:
@@ -523,7 +523,7 @@ valid_p:
523 cleanup_p = 0; 523 cleanup_p = 0;
524 } 524 }
525 525
526 up(&kprobe_mutex); 526 mutex_unlock(&kprobe_mutex);
527 527
528 synchronize_sched(); 528 synchronize_sched();
529 if (p->mod_refcounted && 529 if (p->mod_refcounted &&
@@ -585,6 +585,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
585 int i; 585 int i;
586 586
587 rp->kp.pre_handler = pre_handler_kretprobe; 587 rp->kp.pre_handler = pre_handler_kretprobe;
588 rp->kp.post_handler = NULL;
589 rp->kp.fault_handler = NULL;
590 rp->kp.break_handler = NULL;
588 591
589 /* Pre-allocate memory for max kretprobe instances */ 592 /* Pre-allocate memory for max kretprobe instances */
590 if (rp->maxactive <= 0) { 593 if (rp->maxactive <= 0) {
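
The kprobe_flush_task() fix walks the bucket of the task being torn down (tk) rather than current, recycling every return-probe instance that belongs to it. A simplified sketch of that per-task flush, with a plain singly linked list standing in for the hlist and free() for recycle_rp_inst():

#include <stdio.h>
#include <stdlib.h>

struct rp_inst {
    int             task_id;    /* stands in for ri->task */
    struct rp_inst *next;
};

static struct rp_inst *flush_task(struct rp_inst *head, int tk)
{
    struct rp_inst **pp = &head, *ri;

    while ((ri = *pp)) {
        if (ri->task_id == tk) {    /* belongs to the dying task */
            *pp = ri->next;         /* unlink ... */
            free(ri);               /* ... and recycle it */
        } else {
            pp = &ri->next;
        }
    }
    return head;
}

int main(void)
{
    struct rp_inst *head = NULL;
    int left = 0;

    for (int pid = 1; pid <= 3; pid++) {    /* instances for tasks 1, 0, 1 */
        struct rp_inst *ri = malloc(sizeof(*ri));

        if (!ri)
            return 1;
        ri->task_id = pid % 2;
        ri->next = head;
        head = ri;
    }

    head = flush_task(head, 1);             /* task 1 is being torn down */

    for (struct rp_inst *ri = head; ri; ri = ri->next)
        left++;
    printf("%d instance(s) left\n", left);  /* only the task-0 instance */
    return 0;
}
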
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d5eeae0fa5..f119e098e6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,9 +15,6 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/init.h> 16#include <linux/init.h>
17 17
18u64 uevent_seqnum;
19char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
20
21#define KERNEL_ATTR_RO(_name) \ 18#define KERNEL_ATTR_RO(_name) \
22static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 19static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
23 20
@@ -25,7 +22,7 @@ static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
25static struct subsys_attribute _name##_attr = \ 22static struct subsys_attribute _name##_attr = \
26 __ATTR(_name, 0644, _name##_show, _name##_store) 23 __ATTR(_name, 0644, _name##_show, _name##_store)
27 24
28#ifdef CONFIG_HOTPLUG 25#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
29/* current uevent sequence number */ 26/* current uevent sequence number */
30static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) 27static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
31{ 28{
@@ -55,7 +52,7 @@ decl_subsys(kernel, NULL, NULL);
55EXPORT_SYMBOL_GPL(kernel_subsys); 52EXPORT_SYMBOL_GPL(kernel_subsys);
56 53
57static struct attribute * kernel_attrs[] = { 54static struct attribute * kernel_attrs[] = {
58#ifdef CONFIG_HOTPLUG 55#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
59 &uevent_seqnum_attr.attr, 56 &uevent_seqnum_attr.attr,
60 &uevent_helper_attr.attr, 57 &uevent_helper_attr.attr,
61#endif 58#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a109..c5f3c6613b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,6 +12,7 @@
12#include <linux/unistd.h> 12#include <linux/unistd.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h>
15#include <asm/semaphore.h> 16#include <asm/semaphore.h>
16 17
17/* 18/*
@@ -41,7 +42,7 @@ struct kthread_stop_info
41 42
42/* Thread stopping is done by setting this var: lock serializes 43 * multiple kthread_stop calls. */
43 * multiple kthread_stop calls. */ 44 * multiple kthread_stop calls. */
44static DECLARE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
45static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
46 47
47int kthread_should_stop(void) 48int kthread_should_stop(void)
@@ -114,7 +115,9 @@ static void keventd_create_kthread(void *_create)
114 create->result = ERR_PTR(pid); 115 create->result = ERR_PTR(pid);
115 } else { 116 } else {
116 wait_for_completion(&create->started); 117 wait_for_completion(&create->started);
118 read_lock(&tasklist_lock);
117 create->result = find_task_by_pid(pid); 119 create->result = find_task_by_pid(pid);
120 read_unlock(&tasklist_lock);
118 } 121 }
119 complete(&create->done); 122 complete(&create->done);
120} 123}
@@ -173,7 +176,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
173{ 176{
174 int ret; 177 int ret;
175 178
176 down(&kthread_stop_lock); 179 mutex_lock(&kthread_stop_lock);
177 180
178 /* It could exit after stop_info.k set, but before wake_up_process. */ 181 /* It could exit after stop_info.k set, but before wake_up_process. */
179 get_task_struct(k); 182 get_task_struct(k);
@@ -194,7 +197,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
194 wait_for_completion(&kthread_stop_info.done); 197 wait_for_completion(&kthread_stop_info.done);
195 kthread_stop_info.k = NULL; 198 kthread_stop_info.k = NULL;
196 ret = kthread_stop_info.err; 199 ret = kthread_stop_info.err;
197 up(&kthread_stop_lock); 200 mutex_unlock(&kthread_stop_lock);
198 201
199 return ret; 202 return ret;
200} 203}
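
keventd_create_kthread() now does the pid-to-task lookup under a read lock on tasklist_lock, so the freshly created thread cannot vanish between creation and lookup. A userspace analogue of that guarded lookup, using a pthread rwlock and a small array as stand-ins for the tasklist:

#include <pthread.h>
#include <stdio.h>

#define NTASKS 4

static pthread_rwlock_t tasklist_lock = PTHREAD_RWLOCK_INITIALIZER;
static int tasks[NTASKS] = { 100, 101, 102, 103 };   /* fake pid table */

static int *find_task_by_pid(int pid)
{
    for (int i = 0; i < NTASKS; i++)
        if (tasks[i] == pid)
            return &tasks[i];
    return NULL;
}

int main(void)
{
    int *p;

    pthread_rwlock_rdlock(&tasklist_lock);
    p = find_task_by_pid(102);               /* safe: writers are excluded */
    pthread_rwlock_unlock(&tasklist_lock);

    printf("found: %s\n", p ? "yes" : "no");
    return 0;
}
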
diff --git a/kernel/module.c b/kernel/module.c
index 5aad477ddc..bbe04862e1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -39,6 +39,7 @@
39#include <linux/device.h> 39#include <linux/device.h>
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/semaphore.h> 44#include <asm/semaphore.h>
44#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
@@ -60,29 +61,20 @@
60static DEFINE_SPINLOCK(modlist_lock); 61static DEFINE_SPINLOCK(modlist_lock);
61 62
62/* List of modules, protected by module_mutex AND modlist_lock */ 63/* List of modules, protected by module_mutex AND modlist_lock */
63static DECLARE_MUTEX(module_mutex); 64static DEFINE_MUTEX(module_mutex);
64static LIST_HEAD(modules); 65static LIST_HEAD(modules);
65 66
66static DECLARE_MUTEX(notify_mutex); 67static BLOCKING_NOTIFIER_HEAD(module_notify_list);
67static struct notifier_block * module_notify_list;
68 68
69int register_module_notifier(struct notifier_block * nb) 69int register_module_notifier(struct notifier_block * nb)
70{ 70{
71 int err; 71 return blocking_notifier_chain_register(&module_notify_list, nb);
72 down(&notify_mutex);
73 err = notifier_chain_register(&module_notify_list, nb);
74 up(&notify_mutex);
75 return err;
76} 72}
77EXPORT_SYMBOL(register_module_notifier); 73EXPORT_SYMBOL(register_module_notifier);
78 74
79int unregister_module_notifier(struct notifier_block * nb) 75int unregister_module_notifier(struct notifier_block * nb)
80{ 76{
81 int err; 77 return blocking_notifier_chain_unregister(&module_notify_list, nb);
82 down(&notify_mutex);
83 err = notifier_chain_unregister(&module_notify_list, nb);
84 up(&notify_mutex);
85 return err;
86} 78}
87EXPORT_SYMBOL(unregister_module_notifier); 79EXPORT_SYMBOL(unregister_module_notifier);
88 80
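
Switching to BLOCKING_NOTIFIER_HEAD() lets the notifier core own the locking that module.c previously did by hand with notify_mutex. A minimal sketch of what such a chain amounts to, a mutex-serialized list of callbacks, using pthreads rather than the kernel API:

#include <pthread.h>
#include <stdio.h>

struct notifier_block {
    int (*notifier_call)(unsigned long action, void *data);
    struct notifier_block *next;
};

static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;
static struct notifier_block *chain_head;

static void chain_register(struct notifier_block *nb)
{
    pthread_mutex_lock(&chain_lock);
    nb->next = chain_head;
    chain_head = nb;
    pthread_mutex_unlock(&chain_lock);
}

static void chain_call(unsigned long action, void *data)
{
    pthread_mutex_lock(&chain_lock);          /* callbacks may sleep */
    for (struct notifier_block *nb = chain_head; nb; nb = nb->next)
        nb->notifier_call(action, data);
    pthread_mutex_unlock(&chain_lock);
}

static int on_module_event(unsigned long action, void *data)
{
    printf("module event %lu for %s\n", action, (const char *)data);
    return 0;
}

int main(void)
{
    struct notifier_block nb = { .notifier_call = on_module_event };

    chain_register(&nb);
    chain_call(1, "example_mod");
    return 0;
}
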
@@ -126,15 +118,30 @@ extern const struct kernel_symbol __start___ksymtab[];
126extern const struct kernel_symbol __stop___ksymtab[]; 118extern const struct kernel_symbol __stop___ksymtab[];
127extern const struct kernel_symbol __start___ksymtab_gpl[]; 119extern const struct kernel_symbol __start___ksymtab_gpl[];
128extern const struct kernel_symbol __stop___ksymtab_gpl[]; 120extern const struct kernel_symbol __stop___ksymtab_gpl[];
121extern const struct kernel_symbol __start___ksymtab_gpl_future[];
122extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
129extern const unsigned long __start___kcrctab[]; 123extern const unsigned long __start___kcrctab[];
130extern const unsigned long __start___kcrctab_gpl[]; 124extern const unsigned long __start___kcrctab_gpl[];
125extern const unsigned long __start___kcrctab_gpl_future[];
131 126
132#ifndef CONFIG_MODVERSIONS 127#ifndef CONFIG_MODVERSIONS
133#define symversion(base, idx) NULL 128#define symversion(base, idx) NULL
134#else 129#else
135#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) 130#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
136#endif 131#endif
137 132
133/* lookup symbol in given range of kernel_symbols */
134static const struct kernel_symbol *lookup_symbol(const char *name,
135 const struct kernel_symbol *start,
136 const struct kernel_symbol *stop)
137{
138 const struct kernel_symbol *ks = start;
139 for (; ks < stop; ks++)
140 if (strcmp(ks->name, name) == 0)
141 return ks;
142 return NULL;
143}
144
138/* Find a symbol, return value, crc and module which owns it */ 145/* Find a symbol, return value, crc and module which owns it */
139static unsigned long __find_symbol(const char *name, 146static unsigned long __find_symbol(const char *name,
140 struct module **owner, 147 struct module **owner,
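
The new lookup_symbol() helper replaces four open-coded scans; the matching crc is then recovered by pointer subtraction, since the crc table is parallel to the symbol table. A self-contained sketch of both steps with simplified types:

#include <stdio.h>
#include <string.h>

struct kernel_symbol { unsigned long value; const char *name; };

static const struct kernel_symbol *
lookup_symbol(const char *name, const struct kernel_symbol *start,
              const struct kernel_symbol *stop)
{
    for (const struct kernel_symbol *ks = start; ks < stop; ks++)
        if (strcmp(ks->name, name) == 0)
            return ks;
    return NULL;
}

int main(void)
{
    static const struct kernel_symbol tab[] = {
        { 0x1000, "printk" }, { 0x2000, "kmalloc" },
    };
    static const unsigned long crcs[] = { 0xaaaa, 0xbbbb };

    const struct kernel_symbol *ks = lookup_symbol("kmalloc", tab, tab + 2);

    if (ks)    /* ks - tab indexes the parallel crc array */
        printf("value=0x%lx crc=0x%lx\n", ks->value, crcs[ks - tab]);
    return 0;
}
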
@@ -142,64 +149,81 @@ static unsigned long __find_symbol(const char *name,
142 int gplok) 149 int gplok)
143{ 150{
144 struct module *mod; 151 struct module *mod;
145 unsigned int i; 152 const struct kernel_symbol *ks;
146 153
147 /* Core kernel first. */ 154 /* Core kernel first. */
148 *owner = NULL; 155 *owner = NULL;
149 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) { 156 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
150 if (strcmp(__start___ksymtab[i].name, name) == 0) { 157 if (ks) {
151 *crc = symversion(__start___kcrctab, i); 158 *crc = symversion(__start___kcrctab, (ks - __start___ksymtab));
152 return __start___ksymtab[i].value; 159 return ks->value;
153 }
154 } 160 }
155 if (gplok) { 161 if (gplok) {
156 for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++) 162 ks = lookup_symbol(name, __start___ksymtab_gpl,
157 if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) { 163 __stop___ksymtab_gpl);
158 *crc = symversion(__start___kcrctab_gpl, i); 164 if (ks) {
159 return __start___ksymtab_gpl[i].value; 165 *crc = symversion(__start___kcrctab_gpl,
160 } 166 (ks - __start___ksymtab_gpl));
167 return ks->value;
168 }
169 }
170 ks = lookup_symbol(name, __start___ksymtab_gpl_future,
171 __stop___ksymtab_gpl_future);
172 if (ks) {
173 if (!gplok) {
174 printk(KERN_WARNING "Symbol %s is being used "
175 "by a non-GPL module, which will not "
176 "be allowed in the future\n", name);
177 printk(KERN_WARNING "Please see the file "
178 "Documentation/feature-removal-schedule.txt "
179 "in the kernel source tree for more "
180 "details.\n");
181 }
182 *crc = symversion(__start___kcrctab_gpl_future,
183 (ks - __start___ksymtab_gpl_future));
184 return ks->value;
161 } 185 }
162 186
163 /* Now try modules. */ 187 /* Now try modules. */
164 list_for_each_entry(mod, &modules, list) { 188 list_for_each_entry(mod, &modules, list) {
165 *owner = mod; 189 *owner = mod;
166 for (i = 0; i < mod->num_syms; i++) 190 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
167 if (strcmp(mod->syms[i].name, name) == 0) { 191 if (ks) {
168 *crc = symversion(mod->crcs, i); 192 *crc = symversion(mod->crcs, (ks - mod->syms));
169 return mod->syms[i].value; 193 return ks->value;
170 } 194 }
171 195
172 if (gplok) { 196 if (gplok) {
173 for (i = 0; i < mod->num_gpl_syms; i++) { 197 ks = lookup_symbol(name, mod->gpl_syms,
174 if (strcmp(mod->gpl_syms[i].name, name) == 0) { 198 mod->gpl_syms + mod->num_gpl_syms);
175 *crc = symversion(mod->gpl_crcs, i); 199 if (ks) {
176 return mod->gpl_syms[i].value; 200 *crc = symversion(mod->gpl_crcs,
177 } 201 (ks - mod->gpl_syms));
202 return ks->value;
178 } 203 }
179 } 204 }
205 ks = lookup_symbol(name, mod->gpl_future_syms,
206 (mod->gpl_future_syms +
207 mod->num_gpl_future_syms));
208 if (ks) {
209 if (!gplok) {
210 printk(KERN_WARNING "Symbol %s is being used "
211 "by a non-GPL module, which will not "
212 "be allowed in the future\n", name);
213 printk(KERN_WARNING "Please see the file "
214 "Documentation/feature-removal-schedule.txt "
215 "in the kernel source tree for more "
216 "details.\n");
217 }
218 *crc = symversion(mod->gpl_future_crcs,
219 (ks - mod->gpl_future_syms));
220 return ks->value;
221 }
180 } 222 }
181 DEBUGP("Failed to find symbol %s\n", name); 223 DEBUGP("Failed to find symbol %s\n", name);
182 return 0; 224 return 0;
183} 225}
184 226
185/* Find a symbol in this elf symbol table */
186static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
187 unsigned int symindex,
188 const char *strtab,
189 const char *name)
190{
191 unsigned int i;
192 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
193
194 /* Search (defined) internal symbols first. */
195 for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
196 if (sym[i].st_shndx != SHN_UNDEF
197 && strcmp(name, strtab + sym[i].st_name) == 0)
198 return sym[i].st_value;
199 }
200 return 0;
201}
202
203/* Search for module by name: must hold module_mutex. */ 227/* Search for module by name: must hold module_mutex. */
204static struct module *find_module(const char *name) 228static struct module *find_module(const char *name)
205{ 229{
@@ -379,7 +403,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
379} 403}
380#endif /* CONFIG_SMP */ 404#endif /* CONFIG_SMP */
381 405
382#ifdef CONFIG_MODULE_UNLOAD
383#define MODINFO_ATTR(field) \ 406#define MODINFO_ATTR(field) \
384static void setup_modinfo_##field(struct module *mod, const char *s) \ 407static void setup_modinfo_##field(struct module *mod, const char *s) \
385{ \ 408{ \
@@ -411,12 +434,7 @@ static struct module_attribute modinfo_##field = { \
411MODINFO_ATTR(version); 434MODINFO_ATTR(version);
412MODINFO_ATTR(srcversion); 435MODINFO_ATTR(srcversion);
413 436
414static struct module_attribute *modinfo_attrs[] = { 437#ifdef CONFIG_MODULE_UNLOAD
415 &modinfo_version,
416 &modinfo_srcversion,
417 NULL,
418};
419
420/* Init the unload section of the module. */ 438/* Init the unload section of the module. */
421static void module_unload_init(struct module *mod) 439static void module_unload_init(struct module *mod)
422{ 440{
@@ -557,7 +575,7 @@ static void free_module(struct module *mod);
557static void wait_for_zero_refcount(struct module *mod) 575static void wait_for_zero_refcount(struct module *mod)
558{ 576{
559 /* Since we might sleep for some time, drop the semaphore first */ 577 /* Since we might sleep for some time, drop the semaphore first */
560 up(&module_mutex); 578 mutex_unlock(&module_mutex);
561 for (;;) { 579 for (;;) {
562 DEBUGP("Looking at refcount...\n"); 580 DEBUGP("Looking at refcount...\n");
563 set_current_state(TASK_UNINTERRUPTIBLE); 581 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -566,7 +584,7 @@ static void wait_for_zero_refcount(struct module *mod)
566 schedule(); 584 schedule();
567 } 585 }
568 current->state = TASK_RUNNING; 586 current->state = TASK_RUNNING;
569 down(&module_mutex); 587 mutex_lock(&module_mutex);
570} 588}
571 589
572asmlinkage long 590asmlinkage long
@@ -583,7 +601,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
583 return -EFAULT; 601 return -EFAULT;
584 name[MODULE_NAME_LEN-1] = '\0'; 602 name[MODULE_NAME_LEN-1] = '\0';
585 603
586 if (down_interruptible(&module_mutex) != 0) 604 if (mutex_lock_interruptible(&module_mutex) != 0)
587 return -EINTR; 605 return -EINTR;
588 606
589 mod = find_module(name); 607 mod = find_module(name);
@@ -632,14 +650,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
632 650
633 /* Final destruction now no one is using it. */ 651
634 if (mod->exit != NULL) { 652 if (mod->exit != NULL) {
635 up(&module_mutex); 653 mutex_unlock(&module_mutex);
636 mod->exit(); 654 mod->exit();
637 down(&module_mutex); 655 mutex_lock(&module_mutex);
638 } 656 }
639 free_module(mod); 657 free_module(mod);
640 658
641 out: 659 out:
642 up(&module_mutex); 660 mutex_unlock(&module_mutex);
643 return ret; 661 return ret;
644} 662}
645 663
@@ -687,14 +705,14 @@ EXPORT_SYMBOL(__symbol_put);
687 705
688void symbol_put_addr(void *addr) 706void symbol_put_addr(void *addr)
689{ 707{
690 unsigned long flags; 708 struct module *modaddr;
691 709
692 spin_lock_irqsave(&modlist_lock, flags); 710 if (core_kernel_text((unsigned long)addr))
693 if (!kernel_text_address((unsigned long)addr)) 711 return;
694 BUG();
695 712
696 module_put(module_text_address((unsigned long)addr)); 713 if (!(modaddr = module_text_address((unsigned long)addr)))
697 spin_unlock_irqrestore(&modlist_lock, flags); 714 BUG();
715 module_put(modaddr);
698} 716}
699EXPORT_SYMBOL_GPL(symbol_put_addr); 717EXPORT_SYMBOL_GPL(symbol_put_addr);
700 718
@@ -731,138 +749,14 @@ static inline void module_unload_init(struct module *mod)
731} 749}
732#endif /* CONFIG_MODULE_UNLOAD */ 750#endif /* CONFIG_MODULE_UNLOAD */
733 751
734#ifdef CONFIG_OBSOLETE_MODPARM 752static struct module_attribute *modinfo_attrs[] = {
735/* Bounds checking done below */ 753 &modinfo_version,
736static int obsparm_copy_string(const char *val, struct kernel_param *kp) 754 &modinfo_srcversion,
737{ 755#ifdef CONFIG_MODULE_UNLOAD
738 strcpy(kp->arg, val); 756 &refcnt,
739 return 0; 757#endif
740} 758 NULL,
741 759};
742static int set_obsolete(const char *val, struct kernel_param *kp)
743{
744 unsigned int min, max;
745 unsigned int size, maxsize;
746 int dummy;
747 char *endp;
748 const char *p;
749 struct obsolete_modparm *obsparm = kp->arg;
750
751 if (!val) {
752 printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
753 return -EINVAL;
754 }
755
756 /* type is: [min[-max]]{b,h,i,l,s} */
757 p = obsparm->type;
758 min = simple_strtol(p, &endp, 10);
759 if (endp == obsparm->type)
760 min = max = 1;
761 else if (*endp == '-') {
762 p = endp+1;
763 max = simple_strtol(p, &endp, 10);
764 } else
765 max = min;
766 switch (*endp) {
767 case 'b':
768 return param_array(kp->name, val, min, max, obsparm->addr,
769 1, param_set_byte, &dummy);
770 case 'h':
771 return param_array(kp->name, val, min, max, obsparm->addr,
772 sizeof(short), param_set_short, &dummy);
773 case 'i':
774 return param_array(kp->name, val, min, max, obsparm->addr,
775 sizeof(int), param_set_int, &dummy);
776 case 'l':
777 return param_array(kp->name, val, min, max, obsparm->addr,
778 sizeof(long), param_set_long, &dummy);
779 case 's':
780 return param_array(kp->name, val, min, max, obsparm->addr,
781 sizeof(char *), param_set_charp, &dummy);
782
783 case 'c':
784 /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
785 and the decl is "char xxx[5][50];" */
786 p = endp+1;
787 maxsize = simple_strtol(p, &endp, 10);
788 /* We check lengths here (yes, this is a hack). */
789 p = val;
790 while (p[size = strcspn(p, ",")]) {
791 if (size >= maxsize)
792 goto oversize;
793 p += size+1;
794 }
795 if (size >= maxsize)
796 goto oversize;
797 return param_array(kp->name, val, min, max, obsparm->addr,
798 maxsize, obsparm_copy_string, &dummy);
799 }
800 printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
801 return -EINVAL;
802 oversize:
803 printk(KERN_ERR
804 "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
805 return -EINVAL;
806}
807
808static int obsolete_params(const char *name,
809 char *args,
810 struct obsolete_modparm obsparm[],
811 unsigned int num,
812 Elf_Shdr *sechdrs,
813 unsigned int symindex,
814 const char *strtab)
815{
816 struct kernel_param *kp;
817 unsigned int i;
818 int ret;
819
820 kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
821 if (!kp)
822 return -ENOMEM;
823
824 for (i = 0; i < num; i++) {
825 char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
826
827 snprintf(sym_name, sizeof(sym_name), "%s%s",
828 MODULE_SYMBOL_PREFIX, obsparm[i].name);
829
830 kp[i].name = obsparm[i].name;
831 kp[i].perm = 000;
832 kp[i].set = set_obsolete;
833 kp[i].get = NULL;
834 obsparm[i].addr
835 = (void *)find_local_symbol(sechdrs, symindex, strtab,
836 sym_name);
837 if (!obsparm[i].addr) {
838 printk("%s: falsely claims to have parameter %s\n",
839 name, obsparm[i].name);
840 ret = -EINVAL;
841 goto out;
842 }
843 kp[i].arg = &obsparm[i];
844 }
845
846 ret = parse_args(name, args, kp, num, NULL);
847 out:
848 kfree(kp);
849 return ret;
850}
851#else
852static int obsolete_params(const char *name,
853 char *args,
854 struct obsolete_modparm obsparm[],
855 unsigned int num,
856 Elf_Shdr *sechdrs,
857 unsigned int symindex,
858 const char *strtab)
859{
860 if (num != 0)
861 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
862 name);
863 return 0;
864}
865#endif /* CONFIG_OBSOLETE_MODPARM */
866 760
867static const char vermagic[] = VERMAGIC_STRING; 761static const char vermagic[] = VERMAGIC_STRING;
868 762
@@ -1056,37 +950,28 @@ static inline void remove_sect_attrs(struct module *mod)
1056} 950}
1057#endif /* CONFIG_KALLSYMS */ 951#endif /* CONFIG_KALLSYMS */
1058 952
1059
1060#ifdef CONFIG_MODULE_UNLOAD
1061static inline int module_add_refcnt_attr(struct module *mod)
1062{
1063 return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
1064}
1065static void module_remove_refcnt_attr(struct module *mod)
1066{
1067 return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
1068}
1069#else
1070static inline int module_add_refcnt_attr(struct module *mod)
1071{
1072 return 0;
1073}
1074static void module_remove_refcnt_attr(struct module *mod)
1075{
1076}
1077#endif
1078
1079#ifdef CONFIG_MODULE_UNLOAD
1080static int module_add_modinfo_attrs(struct module *mod) 953static int module_add_modinfo_attrs(struct module *mod)
1081{ 954{
1082 struct module_attribute *attr; 955 struct module_attribute *attr;
956 struct module_attribute *temp_attr;
1083 int error = 0; 957 int error = 0;
1084 int i; 958 int i;
1085 959
960 mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
961 (ARRAY_SIZE(modinfo_attrs) + 1)),
962 GFP_KERNEL);
963 if (!mod->modinfo_attrs)
964 return -ENOMEM;
965
966 temp_attr = mod->modinfo_attrs;
1086 for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { 967 for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
1087 if (!attr->test || 968 if (!attr->test ||
1088 (attr->test && attr->test(mod))) 969 (attr->test && attr->test(mod))) {
1089 error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr); 970 memcpy(temp_attr, attr, sizeof(*temp_attr));
971 temp_attr->attr.owner = mod;
972 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
973 ++temp_attr;
974 }
1090 } 975 }
1091 return error; 976 return error;
1092} 977}
@@ -1096,12 +981,16 @@ static void module_remove_modinfo_attrs(struct module *mod)
1096 struct module_attribute *attr; 981 struct module_attribute *attr;
1097 int i; 982 int i;
1098 983
1099 for (i = 0; (attr = modinfo_attrs[i]); i++) { 984 for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
985 /* pick a field to test for end of list */
986 if (!attr->attr.name)
987 break;
1100 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); 988 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
1101 attr->free(mod); 989 if (attr->free)
990 attr->free(mod);
1102 } 991 }
992 kfree(mod->modinfo_attrs);
1103} 993}
1104#endif
1105 994
1106static int mod_sysfs_setup(struct module *mod, 995static int mod_sysfs_setup(struct module *mod,
1107 struct kernel_param *kparam, 996 struct kernel_param *kparam,
@@ -1119,19 +1008,13 @@ static int mod_sysfs_setup(struct module *mod,
1119 if (err) 1008 if (err)
1120 goto out; 1009 goto out;
1121 1010
1122 err = module_add_refcnt_attr(mod);
1123 if (err)
1124 goto out_unreg;
1125
1126 err = module_param_sysfs_setup(mod, kparam, num_params); 1011 err = module_param_sysfs_setup(mod, kparam, num_params);
1127 if (err) 1012 if (err)
1128 goto out_unreg; 1013 goto out_unreg;
1129 1014
1130#ifdef CONFIG_MODULE_UNLOAD
1131 err = module_add_modinfo_attrs(mod); 1015 err = module_add_modinfo_attrs(mod);
1132 if (err) 1016 if (err)
1133 goto out_unreg; 1017 goto out_unreg;
1134#endif
1135 1018
1136 return 0; 1019 return 0;
1137 1020
@@ -1143,10 +1026,7 @@ out:
1143 1026
1144static void mod_kobject_remove(struct module *mod) 1027static void mod_kobject_remove(struct module *mod)
1145{ 1028{
1146#ifdef CONFIG_MODULE_UNLOAD
1147 module_remove_modinfo_attrs(mod); 1029 module_remove_modinfo_attrs(mod);
1148#endif
1149 module_remove_refcnt_attr(mod);
1150 module_param_sysfs_remove(mod); 1030 module_param_sysfs_remove(mod);
1151 1031
1152 kobject_unregister(&mod->mkobj.kobj); 1032 kobject_unregister(&mod->mkobj.kobj);
@@ -1374,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
1374 || strcmp(license, "GPL v2") == 0 1254 || strcmp(license, "GPL v2") == 0
1375 || strcmp(license, "GPL and additional rights") == 0 1255 || strcmp(license, "GPL and additional rights") == 0
1376 || strcmp(license, "Dual BSD/GPL") == 0 1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1377 || strcmp(license, "Dual MPL/GPL") == 0); 1258 || strcmp(license, "Dual MPL/GPL") == 0);
1378} 1259}
1379 1260
@@ -1424,7 +1305,6 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
1424 return NULL; 1305 return NULL;
1425} 1306}
1426 1307
1427#ifdef CONFIG_MODULE_UNLOAD
1428static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1308static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1429 unsigned int infoindex) 1309 unsigned int infoindex)
1430{ 1310{
@@ -1439,23 +1319,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1439 attr->attr.name)); 1319 attr->attr.name));
1440 } 1320 }
1441} 1321}
1442#endif
1443 1322
1444#ifdef CONFIG_KALLSYMS 1323#ifdef CONFIG_KALLSYMS
1445int is_exported(const char *name, const struct module *mod) 1324int is_exported(const char *name, const struct module *mod)
1446{ 1325{
1447 unsigned int i; 1326 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1448 1327 return 1;
1449 if (!mod) { 1328 else
1450 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) 1329 if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
1451 if (strcmp(__start___ksymtab[i].name, name) == 0)
1452 return 1;
1453 return 0;
1454 }
1455 for (i = 0; i < mod->num_syms; i++)
1456 if (strcmp(mod->syms[i].name, name) == 0)
1457 return 1; 1330 return 1;
1458 return 0; 1331 else
1332 return 0;
1459} 1333}
1460 1334
1461/* As per nm */ 1335/* As per nm */
@@ -1537,8 +1411,8 @@ static struct module *load_module(void __user *umod,
1537 char *secstrings, *args, *modmagic, *strtab = NULL; 1411 char *secstrings, *args, *modmagic, *strtab = NULL;
1538 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1412 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
1539 exportindex, modindex, obsparmindex, infoindex, gplindex, 1413 exportindex, modindex, obsparmindex, infoindex, gplindex,
1540 crcindex, gplcrcindex, versindex, pcpuindex; 1414 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
1541 long arglen; 1415 gplfuturecrcindex;
1542 struct module *mod; 1416 struct module *mod;
1543 long err = 0; 1417 long err = 0;
1544 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1418 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1618,8 +1492,10 @@ static struct module *load_module(void __user *umod,
1618 /* Optional sections */ 1492 /* Optional sections */
1619 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1493 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1620 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1494 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1495 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1621 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1496 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1622 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1497 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1498 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1623 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1499 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1624 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1500 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1625 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1501 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1655,23 +1531,11 @@ static struct module *load_module(void __user *umod,
1655 } 1531 }
1656 1532
1657 /* Now copy in args */ 1533 /* Now copy in args */
1658 arglen = strlen_user(uargs); 1534 args = strndup_user(uargs, ~0UL >> 1);
1659 if (!arglen) { 1535 if (IS_ERR(args)) {
1660 err = -EFAULT; 1536 err = PTR_ERR(args);
1661 goto free_hdr; 1537 goto free_hdr;
1662 } 1538 }
1663 args = kmalloc(arglen, GFP_KERNEL);
1664 if (!args) {
1665 err = -ENOMEM;
1666 goto free_hdr;
1667 }
1668 if (copy_from_user(args, uargs, arglen) != 0) {
1669 err = -EFAULT;
1670 goto free_mod;
1671 }
1672
1673 /* Userspace could have altered the string after the strlen_user() */
1674 args[arglen - 1] = '\0';
1675 1539
1676 if (find_module(mod->name)) { 1540 if (find_module(mod->name)) {
1677 err = -EEXIST; 1541 err = -EEXIST;
@@ -1755,10 +1619,8 @@ static struct module *load_module(void __user *umod,
1755 if (strcmp(mod->name, "driverloader") == 0) 1619 if (strcmp(mod->name, "driverloader") == 0)
1756 add_taint(TAINT_PROPRIETARY_MODULE); 1620 add_taint(TAINT_PROPRIETARY_MODULE);
1757 1621
1758#ifdef CONFIG_MODULE_UNLOAD
1759 /* Set up MODINFO_ATTR fields */ 1622 /* Set up MODINFO_ATTR fields */
1760 setup_modinfo(mod, sechdrs, infoindex); 1623 setup_modinfo(mod, sechdrs, infoindex);
1761#endif
1762 1624
1763 /* Fix up syms, so that st_value is a pointer to location. */ 1625 /* Fix up syms, so that st_value is a pointer to location. */
1764 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, 1626 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
@@ -1775,10 +1637,16 @@ static struct module *load_module(void __user *umod,
1775 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; 1637 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
1776 if (gplcrcindex) 1638 if (gplcrcindex)
1777 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 1639 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1640 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
1641 sizeof(*mod->gpl_future_syms);
1642 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
1643 if (gplfuturecrcindex)
1644 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
1778 1645
1779#ifdef CONFIG_MODVERSIONS 1646#ifdef CONFIG_MODVERSIONS
1780 if ((mod->num_syms && !crcindex) || 1647 if ((mod->num_syms && !crcindex) ||
1781 (mod->num_gpl_syms && !gplcrcindex)) { 1648 (mod->num_gpl_syms && !gplcrcindex) ||
1649 (mod->num_gpl_future_syms && !gplfuturecrcindex)) {
1782 printk(KERN_WARNING "%s: No versions for exported symbols." 1650 printk(KERN_WARNING "%s: No versions for exported symbols."
1783 " Tainting kernel.\n", mod->name); 1651 " Tainting kernel.\n", mod->name);
1784 add_taint(TAINT_FORCED_MODULE); 1652 add_taint(TAINT_FORCED_MODULE);
@@ -1847,27 +1715,17 @@ static struct module *load_module(void __user *umod,
1847 set_fs(old_fs); 1715 set_fs(old_fs);
1848 1716
1849 mod->args = args; 1717 mod->args = args;
1850 if (obsparmindex) { 1718 if (obsparmindex)
1851 err = obsolete_params(mod->name, mod->args, 1719 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
1852 (struct obsolete_modparm *) 1720 mod->name);
1853 sechdrs[obsparmindex].sh_addr, 1721
1854 sechdrs[obsparmindex].sh_size 1722 /* Size of section 0 is 0, so this works well if no params */
1855 / sizeof(struct obsolete_modparm), 1723 err = parse_args(mod->name, mod->args,
1856 sechdrs, symindex, 1724 (struct kernel_param *)
1857 (char *)sechdrs[strindex].sh_addr); 1725 sechdrs[setupindex].sh_addr,
1858 if (setupindex) 1726 sechdrs[setupindex].sh_size
1859 printk(KERN_WARNING "%s: Ignoring new-style " 1727 / sizeof(struct kernel_param),
1860 "parameters in presence of obsolete ones\n", 1728 NULL);
1861 mod->name);
1862 } else {
1863 /* Size of section 0 is 0, so this works well if no params */
1864 err = parse_args(mod->name, mod->args,
1865 (struct kernel_param *)
1866 sechdrs[setupindex].sh_addr,
1867 sechdrs[setupindex].sh_size
1868 / sizeof(struct kernel_param),
1869 NULL);
1870 }
1871 if (err < 0) 1729 if (err < 0)
1872 goto arch_cleanup; 1730 goto arch_cleanup;
1873 1731
@@ -1933,13 +1791,13 @@ sys_init_module(void __user *umod,
1933 return -EPERM; 1791 return -EPERM;
1934 1792
1935 /* Only one module load at a time, please */ 1793 /* Only one module load at a time, please */
1936 if (down_interruptible(&module_mutex) != 0) 1794 if (mutex_lock_interruptible(&module_mutex) != 0)
1937 return -EINTR; 1795 return -EINTR;
1938 1796
1939 /* Do all the hard work */ 1797 /* Do all the hard work */
1940 mod = load_module(umod, len, uargs); 1798 mod = load_module(umod, len, uargs);
1941 if (IS_ERR(mod)) { 1799 if (IS_ERR(mod)) {
1942 up(&module_mutex); 1800 mutex_unlock(&module_mutex);
1943 return PTR_ERR(mod); 1801 return PTR_ERR(mod);
1944 } 1802 }
1945 1803
@@ -1948,11 +1806,10 @@ sys_init_module(void __user *umod,
1948 stop_machine_run(__link_module, mod, NR_CPUS); 1806 stop_machine_run(__link_module, mod, NR_CPUS);
1949 1807
1950 /* Drop lock so they can recurse */ 1808 /* Drop lock so they can recurse */
1951 up(&module_mutex); 1809 mutex_unlock(&module_mutex);
1952 1810
1953 down(&notify_mutex); 1811 blocking_notifier_call_chain(&module_notify_list,
1954 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1812 MODULE_STATE_COMING, mod);
1955 up(&notify_mutex);
1956 1813
1957 /* Start the module */ 1814 /* Start the module */
1958 if (mod->init != NULL) 1815 if (mod->init != NULL)
@@ -1967,15 +1824,15 @@ sys_init_module(void __user *umod,
1967 mod->name); 1824 mod->name);
1968 else { 1825 else {
1969 module_put(mod); 1826 module_put(mod);
1970 down(&module_mutex); 1827 mutex_lock(&module_mutex);
1971 free_module(mod); 1828 free_module(mod);
1972 up(&module_mutex); 1829 mutex_unlock(&module_mutex);
1973 } 1830 }
1974 return ret; 1831 return ret;
1975 } 1832 }
1976 1833
1977 /* Now it's a first class citizen! */ 1834 /* Now it's a first class citizen! */
1978 down(&module_mutex); 1835 mutex_lock(&module_mutex);
1979 mod->state = MODULE_STATE_LIVE; 1836 mod->state = MODULE_STATE_LIVE;
1980 /* Drop initial reference. */ 1837 /* Drop initial reference. */
1981 module_put(mod); 1838 module_put(mod);
@@ -1983,7 +1840,7 @@ sys_init_module(void __user *umod,
1983 mod->module_init = NULL; 1840 mod->module_init = NULL;
1984 mod->init_size = 0; 1841 mod->init_size = 0;
1985 mod->init_text_size = 0; 1842 mod->init_text_size = 0;
1986 up(&module_mutex); 1843 mutex_unlock(&module_mutex);
1987 1844
1988 return 0; 1845 return 0;
1989} 1846}
@@ -2073,7 +1930,7 @@ struct module *module_get_kallsym(unsigned int symnum,
2073{ 1930{
2074 struct module *mod; 1931 struct module *mod;
2075 1932
2076 down(&module_mutex); 1933 mutex_lock(&module_mutex);
2077 list_for_each_entry(mod, &modules, list) { 1934 list_for_each_entry(mod, &modules, list) {
2078 if (symnum < mod->num_symtab) { 1935 if (symnum < mod->num_symtab) {
2079 *value = mod->symtab[symnum].st_value; 1936 *value = mod->symtab[symnum].st_value;
@@ -2081,12 +1938,12 @@ struct module *module_get_kallsym(unsigned int symnum,
2081 strncpy(namebuf, 1938 strncpy(namebuf,
2082 mod->strtab + mod->symtab[symnum].st_name, 1939 mod->strtab + mod->symtab[symnum].st_name,
2083 127); 1940 127);
2084 up(&module_mutex); 1941 mutex_unlock(&module_mutex);
2085 return mod; 1942 return mod;
2086 } 1943 }
2087 symnum -= mod->num_symtab; 1944 symnum -= mod->num_symtab;
2088 } 1945 }
2089 up(&module_mutex); 1946 mutex_unlock(&module_mutex);
2090 return NULL; 1947 return NULL;
2091} 1948}
2092 1949
@@ -2129,7 +1986,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
2129 struct list_head *i; 1986 struct list_head *i;
2130 loff_t n = 0; 1987 loff_t n = 0;
2131 1988
2132 down(&module_mutex); 1989 mutex_lock(&module_mutex);
2133 list_for_each(i, &modules) { 1990 list_for_each(i, &modules) {
2134 if (n++ == *pos) 1991 if (n++ == *pos)
2135 break; 1992 break;
@@ -2150,7 +2007,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2150 2007
2151static void m_stop(struct seq_file *m, void *p) 2008static void m_stop(struct seq_file *m, void *p)
2152{ 2009{
2153 up(&module_mutex); 2010 mutex_unlock(&module_mutex);
2154} 2011}
2155 2012
2156static int m_show(struct seq_file *m, void *p) 2013static int m_show(struct seq_file *m, void *p)
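The module.c hunks above make three related conversions: the args copy is collapsed into strndup_user(), the module_mutex semaphore calls (down/up) become mutex_lock/mutex_unlock (with mutex_lock_interruptible in sys_init_module), and the notify_mutex plus notifier_call_chain() pair becomes a single blocking_notifier_call_chain(). A minimal sketch of the same pattern outside module.c, using only the generic <linux/mutex.h>, <linux/err.h> and <linux/notifier.h> interfaces; the demo_* names and the PAGE_SIZE cap are illustrative, not taken from the patch.

```c
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/string.h>
#include <linux/err.h>
#include <linux/slab.h>

static DEFINE_MUTEX(demo_mutex);                 /* replaces DECLARE_MUTEX + down()/up() */
static BLOCKING_NOTIFIER_HEAD(demo_notify_list); /* replaces notify_mutex + notifier_call_chain() */

/* Hypothetical helper mirroring the converted sys_init_module() flow. */
static int demo_load(const char __user *uargs)
{
	char *args;
	int err;

	/* strndup_user() bundles strlen_user() + kmalloc() + copy_from_user()
	 * and returns an ERR_PTR() on failure (module.c caps at ~0UL >> 1). */
	args = strndup_user(uargs, PAGE_SIZE);
	if (IS_ERR(args))
		return PTR_ERR(args);

	/* Sleeping lock; bail out with -EINTR if a signal arrives while waiting. */
	if (mutex_lock_interruptible(&demo_mutex) != 0) {
		kfree(args);
		return -EINTR;
	}

	err = 0;	/* ... work that needs the lock ... */
	mutex_unlock(&demo_mutex);

	/* One call now provides the locking the old notify_mutex supplied. */
	blocking_notifier_call_chain(&demo_notify_list, 0, args);

	kfree(args);
	return err;
}
```

The nonzero return of mutex_lock_interruptible() maps to -EINTR here exactly as in the converted sys_init_module() hunk above.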
diff --git a/kernel/panic.c b/kernel/panic.c
index c5c4ab2558..cc2a4c9c36 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,13 +20,15 @@
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h> 21#include <linux/kexec.h>
22 22
23int panic_timeout;
24int panic_on_oops; 23int panic_on_oops;
25int tainted; 24int tainted;
25static int pause_on_oops;
26static int pause_on_oops_flag;
27static DEFINE_SPINLOCK(pause_on_oops_lock);
26 28
27EXPORT_SYMBOL(panic_timeout); 29int panic_timeout;
28 30
29struct notifier_block *panic_notifier_list; 31ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
30 32
31EXPORT_SYMBOL(panic_notifier_list); 33EXPORT_SYMBOL(panic_notifier_list);
32 34
@@ -94,7 +96,7 @@ NORET_TYPE void panic(const char * fmt, ...)
94 smp_send_stop(); 96 smp_send_stop();
95#endif 97#endif
96 98
97 notifier_call_chain(&panic_notifier_list, 0, buf); 99 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
98 100
99 if (!panic_blink) 101 if (!panic_blink)
100 panic_blink = no_blink; 102 panic_blink = no_blink;
@@ -130,6 +132,7 @@ NORET_TYPE void panic(const char * fmt, ...)
130#endif 132#endif
131 local_irq_enable(); 133 local_irq_enable();
132 for (i = 0;;) { 134 for (i = 0;;) {
135 touch_softlockup_watchdog();
133 i += panic_blink(i); 136 i += panic_blink(i);
134 mdelay(1); 137 mdelay(1);
135 i++; 138 i++;
@@ -173,3 +176,95 @@ void add_taint(unsigned flag)
173 tainted |= flag; 176 tainted |= flag;
174} 177}
175EXPORT_SYMBOL(add_taint); 178EXPORT_SYMBOL(add_taint);
179
180static int __init pause_on_oops_setup(char *str)
181{
182 pause_on_oops = simple_strtoul(str, NULL, 0);
183 return 1;
184}
185__setup("pause_on_oops=", pause_on_oops_setup);
186
187static void spin_msec(int msecs)
188{
189 int i;
190
191 for (i = 0; i < msecs; i++) {
192 touch_nmi_watchdog();
193 mdelay(1);
194 }
195}
196
197/*
198 * It just happens that oops_enter() and oops_exit() are identically
199 * implemented...
200 */
201static void do_oops_enter_exit(void)
202{
203 unsigned long flags;
204 static int spin_counter;
205
206 if (!pause_on_oops)
207 return;
208
209 spin_lock_irqsave(&pause_on_oops_lock, flags);
210 if (pause_on_oops_flag == 0) {
211 /* This CPU may now print the oops message */
212 pause_on_oops_flag = 1;
213 } else {
214 /* We need to stall this CPU */
215 if (!spin_counter) {
216 /* This CPU gets to do the counting */
217 spin_counter = pause_on_oops;
218 do {
219 spin_unlock(&pause_on_oops_lock);
220 spin_msec(MSEC_PER_SEC);
221 spin_lock(&pause_on_oops_lock);
222 } while (--spin_counter);
223 pause_on_oops_flag = 0;
224 } else {
225 /* This CPU waits for a different one */
226 while (spin_counter) {
227 spin_unlock(&pause_on_oops_lock);
228 spin_msec(1);
229 spin_lock(&pause_on_oops_lock);
230 }
231 }
232 }
233 spin_unlock_irqrestore(&pause_on_oops_lock, flags);
234}
235
236/*
237 * Return true if the calling CPU is allowed to print oops-related info. This
238 * is a bit racy..
239 */
240int oops_may_print(void)
241{
242 return pause_on_oops_flag == 0;
243}
244
245/*
246 * Called when the architecture enters its oops handler, before it prints
247 * anything. If this is the first CPU to oops, and it's oopsing the first time
248 * then let it proceed.
249 *
250 * This is all enabled by the pause_on_oops kernel boot option. We do all this
251 * to ensure that oopses don't scroll off the screen. It has the side-effect
252 * of preventing later-oopsing CPUs from mucking up the display, too.
253 *
254 * It turns out that the CPU which is allowed to print ends up pausing for the
255 * right duration, whereas all the other CPUs pause for twice as long: once in
256 * oops_enter(), once in oops_exit().
257 */
258void oops_enter(void)
259{
260 do_oops_enter_exit();
261}
262
263/*
264 * Called when the architecture exits its oops handler, after printing
265 * everything.
266 */
267void oops_exit(void)
268{
269 do_oops_enter_exit();
270}
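The panic.c hunk converts panic_notifier_list from a bare notifier_block pointer into an ATOMIC_NOTIFIER_HEAD, so panic() now calls atomic_notifier_call_chain() and users register through atomic_notifier_chain_register(); it also adds the pause_on_oops= boot option via __setup(). A hedged sketch of how a hook would attach to the chain after this change and parse an option in the same style; mydrv and its messages are invented, and only the symbols shown in the hunk (panic_notifier_list is exported there) are relied on.

```c
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

extern struct atomic_notifier_head panic_notifier_list;	/* exported by kernel/panic.c above */

static int mydrv_panic_event(struct notifier_block *nb,
			     unsigned long event, void *buf)
{
	/* buf is the formatted panic string passed by panic() */
	printk(KERN_EMERG "mydrv: panic: %s\n", (char *)buf);
	return NOTIFY_DONE;
}

static struct notifier_block mydrv_panic_nb = {
	.notifier_call = mydrv_panic_event,
};

/* Boot-option parsing in the same style as pause_on_oops_setup();
 * __setup() only takes effect for built-in code, like panic.c itself. */
static int mydrv_verbose;
static int __init mydrv_verbose_setup(char *str)
{
	mydrv_verbose = simple_strtoul(str, NULL, 0);
	return 1;
}
__setup("mydrv_verbose=", mydrv_verbose_setup);

static int __init mydrv_init(void)
{
	/* Atomic chains may be registered on and called from atomic context. */
	atomic_notifier_chain_register(&panic_notifier_list, &mydrv_panic_nb);
	return 0;
}

static void __exit mydrv_exit(void)
{
	atomic_notifier_chain_unregister(&panic_notifier_list, &mydrv_panic_nb);
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");
```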
diff --git a/kernel/params.c b/kernel/params.c
index c76ad25e6a..af43ecdc8d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,7 +31,7 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34static inline int dash2underscore(char c) 34static inline char dash2underscore(char c)
35{ 35{
36 if (c == '-') 36 if (c == '-')
37 return '_'; 37 return '_';
@@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
265} 265}
266 266
267/* We cheat here and temporarily mangle the string. */ 267/* We cheat here and temporarily mangle the string. */
268int param_array(const char *name, 268static int param_array(const char *name,
269 const char *val, 269 const char *val,
270 unsigned int min, unsigned int max, 270 unsigned int min, unsigned int max,
271 void *elem, int elemsize, 271 void *elem, int elemsize,
272 int (*set)(const char *, struct kernel_param *kp), 272 int (*set)(const char *, struct kernel_param *kp),
273 int *num) 273 int *num)
274{ 274{
275 int ret; 275 int ret;
276 struct kernel_param kp; 276 struct kernel_param kp;
@@ -638,13 +638,8 @@ static ssize_t module_attr_show(struct kobject *kobj,
638 if (!attribute->show) 638 if (!attribute->show)
639 return -EIO; 639 return -EIO;
640 640
641 if (!try_module_get(mk->mod))
642 return -ENODEV;
643
644 ret = attribute->show(attribute, mk->mod, buf); 641 ret = attribute->show(attribute, mk->mod, buf);
645 642
646 module_put(mk->mod);
647
648 return ret; 643 return ret;
649} 644}
650 645
@@ -662,13 +657,8 @@ static ssize_t module_attr_store(struct kobject *kobj,
662 if (!attribute->store) 657 if (!attribute->store)
663 return -EIO; 658 return -EIO;
664 659
665 if (!try_module_get(mk->mod))
666 return -ENODEV;
667
668 ret = attribute->store(attribute, mk->mod, buf, len); 660 ret = attribute->store(attribute, mk->mod, buf, len);
669 661
670 module_put(mk->mod);
671
672 return ret; 662 return ret;
673} 663}
674 664
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc072469..eeb836b65c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
28#include <linux/hash.h> 28#include <linux/hash.h>
29 29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX]; 31static struct hlist_head *pid_hash;
32static int pidhash_shift; 32static int pidhash_shift;
33static kmem_cache_t *pid_cachep;
33 34
34int pid_max = PID_MAX_DEFAULT; 35int pid_max = PID_MAX_DEFAULT;
35int last_pid; 36int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 61static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; 62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62 63
64/*
65 * Note: disable interrupts while the pidmap_lock is held as an
66 * interrupt might come in and do read_lock(&tasklist_lock).
67 *
68 * If we don't disable interrupts there is a nasty deadlock between
69 * detach_pid()->free_pid() and another cpu that does
70 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
71 * read_lock(&tasklist_lock);
72 *
73 * After we clean up the tasklist_lock and know there are no
74 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen.
76 */
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64 78
65fastcall void free_pidmap(int pid) 79static fastcall void free_pidmap(int pid)
66{ 80{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK; 82 int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
71 atomic_inc(&map->nr_free); 85 atomic_inc(&map->nr_free);
72} 86}
73 87
74int alloc_pidmap(void) 88static int alloc_pidmap(void)
75{ 89{
76 int i, offset, max_scan, pid, last = last_pid; 90 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map; 91 pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
89 * Free the page if someone raced with us 103 * Free the page if someone raced with us
90 * installing it: 104 * installing it:
91 */ 105 */
92 spin_lock(&pidmap_lock); 106 spin_lock_irq(&pidmap_lock);
93 if (map->page) 107 if (map->page)
94 free_page(page); 108 free_page(page);
95 else 109 else
96 map->page = (void *)page; 110 map->page = (void *)page;
97 spin_unlock(&pidmap_lock); 111 spin_unlock_irq(&pidmap_lock);
98 if (unlikely(!map->page)) 112 if (unlikely(!map->page))
99 break; 113 break;
100 } 114 }
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
131 return -1; 145 return -1;
132} 146}
133 147
134struct pid * fastcall find_pid(enum pid_type type, int nr) 148fastcall void put_pid(struct pid *pid)
149{
150 if (!pid)
151 return;
152 if ((atomic_read(&pid->count) == 1) ||
153 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid);
155}
156
157static void delayed_put_pid(struct rcu_head *rhp)
158{
159 struct pid *pid = container_of(rhp, struct pid, rcu);
160 put_pid(pid);
161}
162
163fastcall void free_pid(struct pid *pid)
164{
165 /* We can be called with write_lock_irq(&tasklist_lock) held */
166 unsigned long flags;
167
168 spin_lock_irqsave(&pidmap_lock, flags);
169 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags);
171
172 free_pidmap(pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid);
174}
175
176struct pid *alloc_pid(void)
177{
178 struct pid *pid;
179 enum pid_type type;
180 int nr = -1;
181
182 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
183 if (!pid)
184 goto out;
185
186 nr = alloc_pidmap();
187 if (nr < 0)
188 goto out_free;
189
190 atomic_set(&pid->count, 1);
191 pid->nr = nr;
192 for (type = 0; type < PIDTYPE_MAX; ++type)
193 INIT_HLIST_HEAD(&pid->tasks[type]);
194
195 spin_lock_irq(&pidmap_lock);
196 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
197 spin_unlock_irq(&pidmap_lock);
198
199out:
200 return pid;
201
202out_free:
203 kmem_cache_free(pid_cachep, pid);
204 pid = NULL;
205 goto out;
206}
207
208struct pid * fastcall find_pid(int nr)
135{ 209{
136 struct hlist_node *elem; 210 struct hlist_node *elem;
137 struct pid *pid; 211 struct pid *pid;
138 212
139 hlist_for_each_entry_rcu(pid, elem, 213 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 214 &pid_hash[pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 215 if (pid->nr == nr)
142 return pid; 216 return pid;
143 } 217 }
@@ -146,105 +220,80 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
146 220
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{ 222{
149 struct pid *pid, *task_pid; 223 struct pid_link *link;
150 224 struct pid *pid;
151 task_pid = &task->pids[type];
152 pid = find_pid(type, nr);
153 task_pid->nr = nr;
154 if (pid == NULL) {
155 INIT_LIST_HEAD(&task_pid->pid_list);
156 hlist_add_head_rcu(&task_pid->pid_chain,
157 &pid_hash[type][pid_hashfn(nr)]);
158 } else {
159 INIT_HLIST_NODE(&task_pid->pid_chain);
160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
161 }
162
163 return 0;
164}
165
166static fastcall int __detach_pid(task_t *task, enum pid_type type)
167{
168 struct pid *pid, *pid_next;
169 int nr = 0;
170
171 pid = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) {
173 225
174 if (list_empty(&pid->pid_list)) { 226 WARN_ON(!task->pid); /* to be removed soon */
175 nr = pid->nr; 227 WARN_ON(!nr); /* to be removed soon */
176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_next->pid_chain);
183 }
184 }
185 228
186 list_del_rcu(&pid->pid_list); 229 link = &task->pids[type];
187 pid->nr = 0; 230 link->pid = pid = find_pid(nr);
231 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
188 232
189 return nr; 233 return 0;
190} 234}
191 235
192void fastcall detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(task_t *task, enum pid_type type)
193{ 237{
194 int tmp, nr; 238 struct pid_link *link;
239 struct pid *pid;
240 int tmp;
195 241
196 nr = __detach_pid(task, type); 242 link = &task->pids[type];
197 if (!nr) 243 pid = link->pid;
198 return; 244
245 hlist_del_rcu(&link->node);
246 link->pid = NULL;
199 247
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 248 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
201 if (tmp != type && find_pid(tmp, nr)) 249 if (!hlist_empty(&pid->tasks[tmp]))
202 return; 250 return;
203 251
204 free_pidmap(nr); 252 free_pid(pid);
205} 253}
206 254
207task_t *find_task_by_pid_type(int type, int nr) 255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
208{ 256{
209 struct pid *pid; 257 struct task_struct *result = NULL;
210 258 if (pid) {
211 pid = find_pid(type, nr); 259 struct hlist_node *first;
212 if (!pid) 260 first = rcu_dereference(pid->tasks[type].first);
213 return NULL; 261 if (first)
262 result = hlist_entry(first, struct task_struct, pids[(type)].node);
263 }
264 return result;
265}
214 266
215 return pid_task(&pid->pid_list, type); 267/*
268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
269 */
270task_t *find_task_by_pid_type(int type, int nr)
271{
272 return pid_task(find_pid(nr), type);
216} 273}
217 274
218EXPORT_SYMBOL(find_task_by_pid_type); 275EXPORT_SYMBOL(find_task_by_pid_type);
219 276
220/* 277struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
221 * This function switches the PIDs if a non-leader thread calls 278{
222 * sys_execve() - this must be done without releasing the PID. 279 struct task_struct *result;
223 * (which a detach_pid() would eventually do.) 280 rcu_read_lock();
224 */ 281 result = pid_task(pid, type);
225void switch_exec_pids(task_t *leader, task_t *thread) 282 if (result)
283 get_task_struct(result);
284 rcu_read_unlock();
285 return result;
286}
287
288struct pid *find_get_pid(pid_t nr)
226{ 289{
227 __detach_pid(leader, PIDTYPE_PID); 290 struct pid *pid;
228 __detach_pid(leader, PIDTYPE_TGID); 291
229 __detach_pid(leader, PIDTYPE_PGID); 292 rcu_read_lock();
230 __detach_pid(leader, PIDTYPE_SID); 293 pid = get_pid(find_pid(nr));
231 294 rcu_read_unlock();
232 __detach_pid(thread, PIDTYPE_PID); 295
233 __detach_pid(thread, PIDTYPE_TGID); 296 return pid;
234
235 leader->pid = leader->tgid = thread->pid;
236 thread->pid = thread->tgid;
237
238 attach_pid(thread, PIDTYPE_PID, thread->pid);
239 attach_pid(thread, PIDTYPE_TGID, thread->tgid);
240 attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
241 attach_pid(thread, PIDTYPE_SID, thread->signal->session);
242 list_add_tail(&thread->tasks, &init_task.tasks);
243
244 attach_pid(leader, PIDTYPE_PID, leader->pid);
245 attach_pid(leader, PIDTYPE_TGID, leader->tgid);
246 attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
247 attach_pid(leader, PIDTYPE_SID, leader->signal->session);
248} 297}
249 298
250/* 299/*
@@ -254,7 +303,7 @@ void switch_exec_pids(task_t *leader, task_t *thread)
254 */ 303 */
255void __init pidhash_init(void) 304void __init pidhash_init(void)
256{ 305{
257 int i, j, pidhash_size; 306 int i, pidhash_size;
258 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 307 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
259 308
260 pidhash_shift = max(4, fls(megabytes * 4)); 309 pidhash_shift = max(4, fls(megabytes * 4));
@@ -263,30 +312,23 @@ void __init pidhash_init(void)
263 312
264 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 313 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
265 pidhash_size, pidhash_shift, 314 pidhash_size, pidhash_shift,
266 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); 315 pidhash_size * sizeof(struct hlist_head));
267 316
268 for (i = 0; i < PIDTYPE_MAX; i++) { 317 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
269 pid_hash[i] = alloc_bootmem(pidhash_size * 318 if (!pid_hash)
270 sizeof(*(pid_hash[i]))); 319 panic("Could not alloc pidhash!\n");
271 if (!pid_hash[i]) 320 for (i = 0; i < pidhash_size; i++)
272 panic("Could not alloc pidhash!\n"); 321 INIT_HLIST_HEAD(&pid_hash[i]);
273 for (j = 0; j < pidhash_size; j++)
274 INIT_HLIST_HEAD(&pid_hash[i][j]);
275 }
276} 322}
277 323
278void __init pidmap_init(void) 324void __init pidmap_init(void)
279{ 325{
280 int i;
281
282 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); 326 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
327 /* Reserve PID 0. We never call free_pidmap(0) */
283 set_bit(0, pidmap_array->page); 328 set_bit(0, pidmap_array->page);
284 atomic_dec(&pidmap_array->nr_free); 329 atomic_dec(&pidmap_array->nr_free);
285 330
286 /* 331 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
287 * Allocate PID 0, and hash it via all PID types: 332 __alignof__(struct pid),
288 */ 333 SLAB_PANIC, NULL, NULL);
289
290 for (i = 0; i < PIDTYPE_MAX; i++)
291 attach_pid(current, i, 0);
292} 334}
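The pid.c rewrite above replaces the per-type hash tables with a single refcounted struct pid that tasks attach to through pid_link, and it introduces the helpers visible in the hunks: find_pid(), find_get_pid(), pid_task(), get_pid_task() and put_pid(). A minimal sketch of resolving a numeric pid to a task with this API, using only functions defined in the diff plus put_task_struct(); the function name is hypothetical.

```c
#include <linux/pid.h>
#include <linux/sched.h>

/*
 * Resolve a numeric pid to the task using it as its PID, take a task
 * reference, and drop the struct pid reference taken for the lookup.
 * The caller must put_task_struct() the result when done with it.
 */
static struct task_struct *demo_task_from_nr(pid_t nr)
{
	struct pid *pid;
	struct task_struct *task;

	pid = find_get_pid(nr);		/* RCU lookup + get_pid(); may return NULL */
	if (!pid)
		return NULL;

	/* get_pid_task() does the rcu_read_lock()/get_task_struct() dance. */
	task = get_pid_task(pid, PIDTYPE_PID);

	put_pid(pid);			/* struct pid is freed once its last user is gone */
	return task;
}
```

The point of the refcount is visible in the removed code: callers no longer walk pid_list or re-lookup by number, and a held struct pid cannot disappear even if the numeric pid is recycled.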
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 216f574b5f..ac6dc87444 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,6 +35,7 @@
35#include <linux/interrupt.h> 35#include <linux/interrupt.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/mutex.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40#include <asm/semaphore.h> 41#include <asm/semaphore.h>
@@ -144,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
144 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
145static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
146 147
147static int posix_timer_fn(void *data); 148static int posix_timer_fn(struct hrtimer *data);
148 149
149static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
150 151
@@ -250,15 +251,18 @@ __initcall(init_posix_timers);
250 251
251static void schedule_next_timer(struct k_itimer *timr) 252static void schedule_next_timer(struct k_itimer *timr)
252{ 253{
254 struct hrtimer *timer = &timr->it.real.timer;
255
253 if (timr->it.real.interval.tv64 == 0) 256 if (timr->it.real.interval.tv64 == 0)
254 return; 257 return;
255 258
256 timr->it_overrun += hrtimer_forward(&timr->it.real.timer, 259 timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
257 timr->it.real.interval); 260 timr->it.real.interval);
261
258 timr->it_overrun_last = timr->it_overrun; 262 timr->it_overrun_last = timr->it_overrun;
259 timr->it_overrun = -1; 263 timr->it_overrun = -1;
260 ++timr->it_requeue_pending; 264 ++timr->it_requeue_pending;
261 hrtimer_restart(&timr->it.real.timer); 265 hrtimer_restart(timer);
262} 266}
263 267
264/* 268/*
@@ -330,13 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
330 334
331 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
332 */ 336 */
333static int posix_timer_fn(void *data) 337static int posix_timer_fn(struct hrtimer *timer)
334{ 338{
335 struct k_itimer *timr = data; 339 struct k_itimer *timr;
336 unsigned long flags; 340 unsigned long flags;
337 int si_private = 0; 341 int si_private = 0;
338 int ret = HRTIMER_NORESTART; 342 int ret = HRTIMER_NORESTART;
339 343
344 timr = container_of(timer, struct k_itimer, it.real.timer);
340 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
341 346
342 if (timr->it.real.interval.tv64 != 0) 347 if (timr->it.real.interval.tv64 != 0)
@@ -350,9 +355,11 @@ static int posix_timer_fn(void *data)
350 */ 355 */
351 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
352 timr->it_overrun += 357 timr->it_overrun +=
353 hrtimer_forward(&timr->it.real.timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time,
354 timr->it.real.interval); 360 timr->it.real.interval);
355 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
362 ++timr->it_requeue_pending;
356 } 363 }
357 } 364 }
358 365
@@ -601,38 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
601static void 608static void
602common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) 609common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
603{ 610{
604 ktime_t remaining; 611 ktime_t now, remaining, iv;
605 struct hrtimer *timer = &timr->it.real.timer; 612 struct hrtimer *timer = &timr->it.real.timer;
606 613
607 memset(cur_setting, 0, sizeof(struct itimerspec)); 614 memset(cur_setting, 0, sizeof(struct itimerspec));
608 remaining = hrtimer_get_remaining(timer);
609 615
610 /* Time left ? or timer pending */ 616 iv = timr->it.real.interval;
611 if (remaining.tv64 > 0 || hrtimer_active(timer)) 617
612 goto calci;
613 /* interval timer ? */ 618 /* interval timer ? */
614 if (timr->it.real.interval.tv64 == 0) 619 if (iv.tv64)
620 cur_setting->it_interval = ktime_to_timespec(iv);
621 else if (!hrtimer_active(timer) &&
622 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
615 return; 623 return;
624
625 now = timer->base->get_time();
626
616 /* 627 /*
617 * When a requeue is pending or this is a SIGEV_NONE timer 628 * When a requeue is pending or this is a SIGEV_NONE
618 * move the expiry time forward by intervals, so expiry is > 629 * timer move the expiry time forward by intervals, so
619 * now. 630 * expiry is > now.
620 */ 631 */
621 if (timr->it_requeue_pending & REQUEUE_PENDING || 632 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
622 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { 633 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
623 timr->it_overrun += 634 timr->it_overrun += hrtimer_forward(timer, now, iv);
624 hrtimer_forward(timer, timr->it.real.interval); 635
625 remaining = hrtimer_get_remaining(timer); 636 remaining = ktime_sub(timer->expires, now);
626 }
627 calci:
628 /* interval timer ? */
629 if (timr->it.real.interval.tv64 != 0)
630 cur_setting->it_interval =
631 ktime_to_timespec(timr->it.real.interval);
632 /* Return 0 only, when the timer is expired and not pending */ 637 /* Return 0 only, when the timer is expired and not pending */
633 if (remaining.tv64 <= 0) 638 if (remaining.tv64 <= 0) {
634 cur_setting->it_value.tv_nsec = 1; 639 /*
635 else 640 * A single shot SIGEV_NONE timer must return 0, when
641 * it is expired !
642 */
643 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
644 cur_setting->it_value.tv_nsec = 1;
645 } else
636 cur_setting->it_value = ktime_to_timespec(remaining); 646 cur_setting->it_value = ktime_to_timespec(remaining);
637} 647}
638 648
@@ -715,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
715 725
716 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 726 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
717 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 727 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
718 timr->it.real.timer.data = timr;
719 timr->it.real.timer.function = posix_timer_fn; 728 timr->it.real.timer.function = posix_timer_fn;
720 729
721 timer->expires = timespec_to_ktime(new_setting->it_value); 730 timer->expires = timespec_to_ktime(new_setting->it_value);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9fd8d4f035..ce0dfb8f4a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -41,7 +41,7 @@ config SOFTWARE_SUSPEND
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
42 ---help--- 42 ---help---
43 Enable the possibility of suspending the machine. 43 Enable the possibility of suspending the machine.
44 It doesn't need APM. 44 It doesn't need ACPI or APM.
45 You may suspend your machine by 'swsusp' or 'shutdown -z <time>' 45 You may suspend your machine by 'swsusp' or 'shutdown -z <time>'
46 (patch for sysvinit needed). 46 (patch for sysvinit needed).
47 47
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 04be7d0d96..8d0af3d37a 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
5 5
6obj-y := main.o process.o console.o 6obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o 10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11 11
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc9..81d4d982f3 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
22#include "power.h" 22#include "power.h"
23 23
24 24
25extern suspend_disk_method_t pm_disk_mode;
26
27extern int swsusp_shrink_memory(void);
28extern int swsusp_suspend(void);
29extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
30extern int swsusp_check(void);
31extern int swsusp_read(struct pbe **pblist_ptr);
32extern void swsusp_close(void);
33extern int swsusp_resume(void);
34
35
36static int noresume = 0; 25static int noresume = 0;
37char resume_file[256] = CONFIG_PM_STD_PARTITION; 26char resume_file[256] = CONFIG_PM_STD_PARTITION;
38dev_t swsusp_resume_device; 27dev_t swsusp_resume_device;
@@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode)
70 while(1); 59 while(1);
71} 60}
72 61
73
74static int in_suspend __nosavedata = 0;
75
76
77static inline void platform_finish(void) 62static inline void platform_finish(void)
78{ 63{
79 if (pm_disk_mode == PM_DISK_PLATFORM) { 64 if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -87,7 +72,6 @@ static int prepare_processes(void)
87 int error; 72 int error;
88 73
89 pm_prepare_console(); 74 pm_prepare_console();
90 sys_sync();
91 disable_nonboot_cpus(); 75 disable_nonboot_cpus();
92 76
93 if (freeze_processes()) { 77 if (freeze_processes()) {
@@ -145,7 +129,7 @@ int pm_suspend_disk(void)
145 if (in_suspend) { 129 if (in_suspend) {
146 device_resume(); 130 device_resume();
147 pr_debug("PM: writing image.\n"); 131 pr_debug("PM: writing image.\n");
148 error = swsusp_write(pagedir_nosave, nr_copy_pages); 132 error = swsusp_write();
149 if (!error) 133 if (!error)
150 power_down(pm_disk_mode); 134 power_down(pm_disk_mode);
151 else { 135 else {
@@ -216,7 +200,7 @@ static int software_resume(void)
216 200
217 pr_debug("PM: Reading swsusp image.\n"); 201 pr_debug("PM: Reading swsusp image.\n");
218 202
219 if ((error = swsusp_read(&pagedir_nosave))) { 203 if ((error = swsusp_read())) {
220 swsusp_free(); 204 swsusp_free();
221 goto Thaw; 205 goto Thaw;
222 } 206 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9cb235cba4..a6d9ef4600 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state)
103} 103}
104 104
105 105
106static int suspend_enter(suspend_state_t state) 106int suspend_enter(suspend_state_t state)
107{ 107{
108 int error = 0; 108 int error = 0;
109 unsigned long flags; 109 unsigned long flags;
@@ -272,7 +272,7 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
272 if (*s && !strncmp(buf, *s, len)) 272 if (*s && !strncmp(buf, *s, len))
273 break; 273 break;
274 } 274 }
275 if (*s) 275 if (state < PM_SUSPEND_MAX && *s)
276 error = enter_state(state); 276 error = enter_state(state);
277 else 277 else
278 error = -EINVAL; 278 error = -EINVAL;
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 33c508e857..84063ac8fc 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -25,6 +25,7 @@
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/pm_legacy.h> 26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/mutex.h>
28 29
29int pm_active; 30int pm_active;
30 31
@@ -40,7 +41,7 @@ int pm_active;
40 * until a resume but that will be fine. 41 * until a resume but that will be fine.
41 */ 42 */
42 43
43static DECLARE_MUTEX(pm_devs_lock); 44static DEFINE_MUTEX(pm_devs_lock);
44static LIST_HEAD(pm_devs); 45static LIST_HEAD(pm_devs);
45 46
46/** 47/**
@@ -67,32 +68,13 @@ struct pm_dev *pm_register(pm_dev_t type,
67 dev->id = id; 68 dev->id = id;
68 dev->callback = callback; 69 dev->callback = callback;
69 70
70 down(&pm_devs_lock); 71 mutex_lock(&pm_devs_lock);
71 list_add(&dev->entry, &pm_devs); 72 list_add(&dev->entry, &pm_devs);
72 up(&pm_devs_lock); 73 mutex_unlock(&pm_devs_lock);
73 } 74 }
74 return dev; 75 return dev;
75} 76}
76 77
77/**
78 * pm_unregister - unregister a device with power management
79 * @dev: device to unregister
80 *
81 * Remove a device from the power management notification lists. The
82 * dev passed must be a handle previously returned by pm_register.
83 */
84
85void pm_unregister(struct pm_dev *dev)
86{
87 if (dev) {
88 down(&pm_devs_lock);
89 list_del(&dev->entry);
90 up(&pm_devs_lock);
91
92 kfree(dev);
93 }
94}
95
96static void __pm_unregister(struct pm_dev *dev) 78static void __pm_unregister(struct pm_dev *dev)
97{ 79{
98 if (dev) { 80 if (dev) {
@@ -118,7 +100,7 @@ void pm_unregister_all(pm_callback callback)
118 if (!callback) 100 if (!callback)
119 return; 101 return;
120 102
121 down(&pm_devs_lock); 103 mutex_lock(&pm_devs_lock);
122 entry = pm_devs.next; 104 entry = pm_devs.next;
123 while (entry != &pm_devs) { 105 while (entry != &pm_devs) {
124 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 106 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -126,7 +108,7 @@ void pm_unregister_all(pm_callback callback)
126 if (dev->callback == callback) 108 if (dev->callback == callback)
127 __pm_unregister(dev); 109 __pm_unregister(dev);
128 } 110 }
129 up(&pm_devs_lock); 111 mutex_unlock(&pm_devs_lock);
130} 112}
131 113
132/** 114/**
@@ -234,7 +216,7 @@ int pm_send_all(pm_request_t rqst, void *data)
234{ 216{
235 struct list_head *entry; 217 struct list_head *entry;
236 218
237 down(&pm_devs_lock); 219 mutex_lock(&pm_devs_lock);
238 entry = pm_devs.next; 220 entry = pm_devs.next;
239 while (entry != &pm_devs) { 221 while (entry != &pm_devs) {
240 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 222 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -246,18 +228,17 @@ int pm_send_all(pm_request_t rqst, void *data)
246 */ 228 */
247 if (rqst == PM_SUSPEND) 229 if (rqst == PM_SUSPEND)
248 pm_undo_all(dev); 230 pm_undo_all(dev);
249 up(&pm_devs_lock); 231 mutex_unlock(&pm_devs_lock);
250 return status; 232 return status;
251 } 233 }
252 } 234 }
253 entry = entry->next; 235 entry = entry->next;
254 } 236 }
255 up(&pm_devs_lock); 237 mutex_unlock(&pm_devs_lock);
256 return 0; 238 return 0;
257} 239}
258 240
259EXPORT_SYMBOL(pm_register); 241EXPORT_SYMBOL(pm_register);
260EXPORT_SYMBOL(pm_unregister);
261EXPORT_SYMBOL(pm_unregister_all); 242EXPORT_SYMBOL(pm_unregister_all);
262EXPORT_SYMBOL(pm_send_all); 243EXPORT_SYMBOL(pm_send_all);
263EXPORT_SYMBOL(pm_active); 244EXPORT_SYMBOL(pm_active);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba6808..f06f12f217 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
8 int cpus; 8 int cpus;
9 unsigned long image_pages; 9 unsigned long image_pages;
10 unsigned long pages; 10 unsigned long pages;
11 unsigned long size;
11} __attribute__((aligned(PAGE_SIZE))); 12} __attribute__((aligned(PAGE_SIZE)));
12 13
13 14
@@ -37,21 +38,79 @@ extern struct subsystem power_subsys;
37/* References to section boundaries */ 38/* References to section boundaries */
38extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
39 40
40extern unsigned int nr_copy_pages;
41extern struct pbe *pagedir_nosave; 41extern struct pbe *pagedir_nosave;
42 42
43/* Preferred image size in bytes (default 500 MB) */ 43/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 44extern unsigned long image_size;
45extern int in_suspend;
46extern dev_t swsusp_resume_device;
45 47
46extern asmlinkage int swsusp_arch_suspend(void); 48extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 49extern asmlinkage int swsusp_arch_resume(void);
48 50
49extern unsigned int count_data_pages(void); 51extern unsigned int count_data_pages(void);
50extern void free_pagedir(struct pbe *pblist); 52
51extern void release_eaten_pages(void); 53struct snapshot_handle {
52extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); 54 loff_t offset;
55 unsigned int page;
56 unsigned int page_offset;
57 unsigned int prev;
58 struct pbe *pbe;
59 void *buffer;
60 unsigned int buf_offset;
61};
62
63#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle);
68
69#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
71#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
72#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
73#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
74#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
75#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
76#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
77#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
78#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
79#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
80#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
81#define SNAPSHOT_IOC_MAXNR 11
82
83/**
84 * The bitmap is used for tracing allocated swap pages
85 *
86 * The entire bitmap consists of a number of bitmap_page
87 * structures linked with the help of the .next member.
88 * Thus each page can be allocated individually, so we only
89 * need to make 0-order memory allocations to create
90 * the bitmap.
91 */
92
93#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
94#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
95#define BITS_PER_CHUNK (sizeof(long) * 8)
96#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
97
98struct bitmap_page {
99 unsigned long chunks[BITMAP_PAGE_CHUNKS];
100 struct bitmap_page *next;
101};
102
103extern void free_bitmap(struct bitmap_page *bitmap);
104extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
105extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
106extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
107
108extern int swsusp_check(void);
109extern int swsusp_shrink_memory(void);
53extern void swsusp_free(void); 110extern void swsusp_free(void);
54extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); 111extern int swsusp_suspend(void);
55extern unsigned int snapshot_nr_pages(void); 112extern int swsusp_resume(void);
56extern struct pbe *snapshot_pblist(void); 113extern int swsusp_read(void);
57extern void snapshot_pblist_set(struct pbe *pblist); 114extern int swsusp_write(void);
115extern void swsusp_close(void);
116extern int suspend_enter(suspend_state_t state);
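power.h now exposes the snapshot streaming interface (struct snapshot_handle, snapshot_read_next(), snapshot_write_next(), data_of()) and the SNAPSHOT_* ioctls that the new kernel/power/user.c and swap.c are built on. A hedged sketch of how a consumer is expected to drain the image with snapshot_read_next(), following only the contract documented in the snapshot.c hunk below (positive return = bytes valid at data_of(), 0 = end of stream, negative = error); demo_sink() stands in for whatever actually stores the data, such as the swap writer or the user-space read path.

```c
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include "power.h"

/* Hypothetical sink; real callers push the bytes to swap or to user space. */
static int demo_sink(void *buf, size_t len)
{
	return 0;
}

static int demo_drain_image(void)
{
	struct snapshot_handle handle;
	int ret;

	memset(&handle, 0, sizeof(handle));	/* must start zeroed, per the API comment */

	while ((ret = snapshot_read_next(&handle, PAGE_SIZE)) > 0) {
		/* data_of() yields the current buffer position; ret bytes are valid
		 * and never cross a page boundary. */
		ret = demo_sink(data_of(handle), ret);
		if (ret)
			return ret;
	}
	return ret;	/* 0 once the whole image has been handed out, < 0 on error */
}
```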
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 28de118f7a..b2a5f671d6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,11 +12,12 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h>
15 16
16/* 17/*
17 * Timeout for stopping processes 18 * Timeout for stopping processes
18 */ 19 */
19#define TIMEOUT (6 * HZ) 20#define TIMEOUT (20 * HZ)
20 21
21 22
22static inline int freezeable(struct task_struct * p) 23static inline int freezeable(struct task_struct * p)
@@ -25,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
25 (p->flags & PF_NOFREEZE) || 26 (p->flags & PF_NOFREEZE) ||
26 (p->exit_state == EXIT_ZOMBIE) || 27 (p->exit_state == EXIT_ZOMBIE) ||
27 (p->exit_state == EXIT_DEAD) || 28 (p->exit_state == EXIT_DEAD) ||
28 (p->state == TASK_STOPPED) || 29 (p->state == TASK_STOPPED))
29 (p->state == TASK_TRACED))
30 return 0; 30 return 0;
31 return 1; 31 return 1;
32} 32}
@@ -54,38 +54,62 @@ void refrigerator(void)
54 current->state = save; 54 current->state = save;
55} 55}
56 56
57static inline void freeze_process(struct task_struct *p)
58{
59 unsigned long flags;
60
61 if (!freezing(p)) {
62 freeze(p);
63 spin_lock_irqsave(&p->sighand->siglock, flags);
64 signal_wake_up(p, 0);
65 spin_unlock_irqrestore(&p->sighand->siglock, flags);
66 }
67}
68
57/* 0 = success, else # of processes that we failed to stop */ 69/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 70int freeze_processes(void)
59{ 71{
60 int todo; 72 int todo, nr_user, user_frozen;
61 unsigned long start_time; 73 unsigned long start_time;
62 struct task_struct *g, *p; 74 struct task_struct *g, *p;
63 unsigned long flags; 75 unsigned long flags;
64 76
65 printk( "Stopping tasks: " ); 77 printk( "Stopping tasks: " );
66 start_time = jiffies; 78 start_time = jiffies;
79 user_frozen = 0;
67 do { 80 do {
68 todo = 0; 81 nr_user = todo = 0;
69 read_lock(&tasklist_lock); 82 read_lock(&tasklist_lock);
70 do_each_thread(g, p) { 83 do_each_thread(g, p) {
71 if (!freezeable(p)) 84 if (!freezeable(p))
72 continue; 85 continue;
73 if (frozen(p)) 86 if (frozen(p))
74 continue; 87 continue;
75 88 if (p->mm && !(p->flags & PF_BORROWED_MM)) {
76 freeze(p); 89 /* The task is a user-space one.
77 spin_lock_irqsave(&p->sighand->siglock, flags); 90 * Freeze it unless there's a vfork completion
78 signal_wake_up(p, 0); 91 * pending
79 spin_unlock_irqrestore(&p->sighand->siglock, flags); 92 */
80 todo++; 93 if (!p->vfork_done)
94 freeze_process(p);
95 nr_user++;
96 } else {
97 /* Freeze only if the user space is frozen */
98 if (user_frozen)
99 freeze_process(p);
100 todo++;
101 }
81 } while_each_thread(g, p); 102 } while_each_thread(g, p);
82 read_unlock(&tasklist_lock); 103 read_unlock(&tasklist_lock);
104 todo += nr_user;
105 if (!user_frozen && !nr_user) {
106 sys_sync();
107 start_time = jiffies;
108 }
109 user_frozen = !nr_user;
83 yield(); /* Yield is okay here */ 110 yield(); /* Yield is okay here */
84 if (todo && time_after(jiffies, start_time + TIMEOUT)) { 111 if (todo && time_after(jiffies, start_time + TIMEOUT))
85 printk( "\n" );
86 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
87 break; 112 break;
88 }
89 } while(todo); 113 } while(todo);
90 114
91 /* This does not unfreeze processes that are already frozen 115 /* This does not unfreeze processes that are already frozen
@@ -94,8 +118,14 @@ int freeze_processes(void)
94 * but it cleans up leftover PF_FREEZE requests. 118 * but it cleans up leftover PF_FREEZE requests.
95 */ 119 */
96 if (todo) { 120 if (todo) {
121 printk( "\n" );
122 printk(KERN_ERR " stopping tasks timed out "
123 "after %d seconds (%d tasks remaining):\n",
124 TIMEOUT / HZ, todo);
97 read_lock(&tasklist_lock); 125 read_lock(&tasklist_lock);
98 do_each_thread(g, p) 126 do_each_thread(g, p) {
127 if (freezeable(p) && !frozen(p))
128 printk(KERN_ERR " %s\n", p->comm);
99 if (freezing(p)) { 129 if (freezing(p)) {
100 pr_debug(" clean up: %s\n", p->comm); 130 pr_debug(" clean up: %s\n", p->comm);
101 p->flags &= ~PF_FREEZE; 131 p->flags &= ~PF_FREEZE;
@@ -103,7 +133,7 @@ int freeze_processes(void)
103 recalc_sigpending_tsk(p); 133 recalc_sigpending_tsk(p);
104 spin_unlock_irqrestore(&p->sighand->siglock, flags); 134 spin_unlock_irqrestore(&p->sighand->siglock, flags);
105 } 135 }
106 while_each_thread(g, p); 136 } while_each_thread(g, p);
107 read_unlock(&tasklist_lock); 137 read_unlock(&tasklist_lock);
108 return todo; 138 return todo;
109 } 139 }
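The process.c change makes freezing two-phase: user-space tasks are frozen first (skipping tasks with a vfork completion pending), sys_sync() runs once no unfrozen user tasks remain, and only then are kernel threads frozen; the timeout grows to 20 seconds and a failure now lists the tasks that would not stop. For this to work, a freezable kernel thread still has to poll the freezer itself. A minimal sketch of that cooperation, using only the freezing()/refrigerator() helpers referenced in the hunk; the thread body is hypothetical.

```c
#include <linux/sched.h>
#include <linux/kthread.h>

static int demo_kthread(void *unused)
{
	while (!kthread_should_stop()) {
		/* Park in the refrigerator whenever freeze_processes() asks;
		 * threads that must never be frozen set PF_NOFREEZE instead. */
		if (freezing(current))
			refrigerator();

		/* ... one unit of work, then sleep ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
```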
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 911fc62b82..5957312b2d 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -49,9 +49,7 @@ void enable_nonboot_cpus(void)
49 49
50 printk("Thawing cpus ...\n"); 50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) { 51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = smp_prepare_cpu(cpu); 52 error = cpu_up(cpu);
53 if (!error)
54 error = cpu_up(cpu);
55 if (!error) { 53 if (!error) {
56 printk("CPU%d is up\n", cpu); 54 printk("CPU%d is up\n", cpu);
57 continue; 55 continue;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 41f66365f0..3eeedbb13b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12 12
13#include <linux/version.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/suspend.h> 16#include <linux/suspend.h>
@@ -34,7 +35,9 @@
34#include "power.h" 35#include "power.h"
35 36
36struct pbe *pagedir_nosave; 37struct pbe *pagedir_nosave;
37unsigned int nr_copy_pages; 38static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages;
40static unsigned long *buffer;
38 41
39#ifdef CONFIG_HIGHMEM 42#ifdef CONFIG_HIGHMEM
40unsigned int count_highmem_pages(void) 43unsigned int count_highmem_pages(void)
@@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone)
80 void *kaddr; 83 void *kaddr;
81 unsigned long pfn = zone_pfn + zone->zone_start_pfn; 84 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
82 85
83 if (!(pfn%1000)) 86 if (!(pfn%10000))
84 printk("."); 87 printk(".");
85 if (!pfn_valid(pfn)) 88 if (!pfn_valid(pfn))
86 continue; 89 continue;
@@ -91,10 +94,8 @@ static int save_highmem_zone(struct zone *zone)
91 * corrected eventually when the cases giving rise to this 94 * corrected eventually when the cases giving rise to this
92 * are better understood. 95 * are better understood.
93 */ 96 */
94 if (PageReserved(page)) { 97 if (PageReserved(page))
95 printk("highmem reserved page?!\n");
96 continue; 98 continue;
97 }
98 BUG_ON(PageNosave(page)); 99 BUG_ON(PageNosave(page));
99 if (PageNosaveFree(page)) 100 if (PageNosaveFree(page))
100 continue; 101 continue;
@@ -121,13 +122,15 @@ int save_highmem(void)
121 struct zone *zone; 122 struct zone *zone;
122 int res = 0; 123 int res = 0;
123 124
124 pr_debug("swsusp: Saving Highmem\n"); 125 pr_debug("swsusp: Saving Highmem");
126 drain_local_pages();
125 for_each_zone (zone) { 127 for_each_zone (zone) {
126 if (is_highmem(zone)) 128 if (is_highmem(zone))
127 res = save_highmem_zone(zone); 129 res = save_highmem_zone(zone);
128 if (res) 130 if (res)
129 return res; 131 return res;
130 } 132 }
133 printk("\n");
131 return 0; 134 return 0;
132} 135}
133 136
@@ -237,14 +240,15 @@ static void copy_data_pages(struct pbe *pblist)
237 * free_pagedir - free pages allocated with alloc_pagedir() 240 * free_pagedir - free pages allocated with alloc_pagedir()
238 */ 241 */
239 242
240void free_pagedir(struct pbe *pblist) 243static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
241{ 244{
242 struct pbe *pbe; 245 struct pbe *pbe;
243 246
244 while (pblist) { 247 while (pblist) {
245 pbe = (pblist + PB_PAGE_SKIP)->next; 248 pbe = (pblist + PB_PAGE_SKIP)->next;
246 ClearPageNosave(virt_to_page(pblist)); 249 ClearPageNosave(virt_to_page(pblist));
247 ClearPageNosaveFree(virt_to_page(pblist)); 250 if (clear_nosave_free)
251 ClearPageNosaveFree(virt_to_page(pblist));
248 free_page((unsigned long)pblist); 252 free_page((unsigned long)pblist);
249 pblist = pbe; 253 pblist = pbe;
250 } 254 }
@@ -303,7 +307,7 @@ struct eaten_page {
303 307
304static struct eaten_page *eaten_pages = NULL; 308static struct eaten_page *eaten_pages = NULL;
305 309
306void release_eaten_pages(void) 310static void release_eaten_pages(void)
307{ 311{
308 struct eaten_page *p, *q; 312 struct eaten_page *p, *q;
309 313
@@ -378,7 +382,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
378 if (!nr_pages) 382 if (!nr_pages)
379 return NULL; 383 return NULL;
380 384
381 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
382 pblist = alloc_image_page(gfp_mask, safe_needed); 385 pblist = alloc_image_page(gfp_mask, safe_needed);
383 /* FIXME: rewrite this ugly loop */ 386 /* FIXME: rewrite this ugly loop */
384 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 387 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -387,10 +390,10 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
387 pbe->next = alloc_image_page(gfp_mask, safe_needed); 390 pbe->next = alloc_image_page(gfp_mask, safe_needed);
388 } 391 }
389 if (!pbe) { /* get_zeroed_page() failed */ 392 if (!pbe) { /* get_zeroed_page() failed */
390 free_pagedir(pblist); 393 free_pagedir(pblist, 1);
391 pblist = NULL; 394 pblist = NULL;
392 } else 395 } else
393 create_pbe_list(pblist, nr_pages); 396 create_pbe_list(pblist, nr_pages);
394 return pblist; 397 return pblist;
395} 398}
396 399
@@ -416,6 +419,10 @@ void swsusp_free(void)
416 } 419 }
417 } 420 }
418 } 421 }
422 nr_copy_pages = 0;
423 nr_meta_pages = 0;
424 pagedir_nosave = NULL;
425 buffer = NULL;
419} 426}
420 427
421 428
@@ -439,7 +446,7 @@ static int enough_free_mem(unsigned int nr_pages)
439 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 446 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
440} 447}
441 448
442int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 449static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
443{ 450{
444 struct pbe *p; 451 struct pbe *p;
445 452
@@ -506,7 +513,318 @@ asmlinkage int swsusp_save(void)
506 */ 513 */
507 514
508 nr_copy_pages = nr_pages; 515 nr_copy_pages = nr_pages;
516 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
509 517
510 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 518 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
511 return 0; 519 return 0;
512} 520}
521
522static void init_header(struct swsusp_info *info)
523{
524 memset(info, 0, sizeof(struct swsusp_info));
525 info->version_code = LINUX_VERSION_CODE;
526 info->num_physpages = num_physpages;
527 memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
528 info->cpus = num_online_cpus();
529 info->image_pages = nr_copy_pages;
530 info->pages = nr_copy_pages + nr_meta_pages + 1;
531 info->size = info->pages;
532 info->size <<= PAGE_SHIFT;
533}
534
535/**
536 * pack_orig_addresses - the .orig_address fields of the PBEs from the
537 * list starting at @pbe are stored in the array @buf[] (1 page)
538 */
539
540static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
541{
542 int j;
543
544 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
545 buf[j] = pbe->orig_address;
546 pbe = pbe->next;
547 }
548 if (!pbe)
549 for (; j < PAGE_SIZE / sizeof(long); j++)
550 buf[j] = 0;
551 return pbe;
552}
553
554/**
555 * snapshot_read_next - used for reading the system memory snapshot.
556 *
557 * On the first call to it @handle should point to a zeroed
558 * snapshot_handle structure. The structure gets updated and a pointer
559 * to it should be passed to this function every next time.
560 *
561 * The @count parameter should contain the number of bytes the caller
562 * wants to read from the snapshot. It must not be zero.
563 *
564 * On success the function returns a positive number. Then, the caller
565 * is allowed to read up to the returned number of bytes from the memory
566 * location computed by the data_of() macro. The number returned
567 * may be smaller than @count, but this only happens if the read would
568 * cross a page boundary otherwise.
569 *
570 * The function returns 0 to indicate the end of data stream condition,
571 * and a negative number is returned on error. In such cases the
572 * structure pointed to by @handle is not updated and should not be used
573 * any more.
574 */
575
576int snapshot_read_next(struct snapshot_handle *handle, size_t count)
577{
578 if (handle->page > nr_meta_pages + nr_copy_pages)
579 return 0;
580 if (!buffer) {
581 /* This makes the buffer be freed by swsusp_free() */
582 buffer = alloc_image_page(GFP_ATOMIC, 0);
583 if (!buffer)
584 return -ENOMEM;
585 }
586 if (!handle->offset) {
587 init_header((struct swsusp_info *)buffer);
588 handle->buffer = buffer;
589 handle->pbe = pagedir_nosave;
590 }
591 if (handle->prev < handle->page) {
592 if (handle->page <= nr_meta_pages) {
593 handle->pbe = pack_orig_addresses(buffer, handle->pbe);
594 if (!handle->pbe)
595 handle->pbe = pagedir_nosave;
596 } else {
597 handle->buffer = (void *)handle->pbe->address;
598 handle->pbe = handle->pbe->next;
599 }
600 handle->prev = handle->page;
601 }
602 handle->buf_offset = handle->page_offset;
603 if (handle->page_offset + count >= PAGE_SIZE) {
604 count = PAGE_SIZE - handle->page_offset;
605 handle->page_offset = 0;
606 handle->page++;
607 } else {
608 handle->page_offset += count;
609 }
610 handle->offset += count;
611 return count;
612}
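
The read side of this handle protocol is a simple loop: call snapshot_read_next() with the chunk size, consume up to the returned number of bytes at data_of(handle), and stop on 0 (end of image) or a negative value (error). A minimal consumer sketch, assuming a caller-supplied writer() callback and the declarations from kernel/power/power.h:

	/* Sketch only: writer() is a hypothetical callback that stores one chunk. */
	static int dump_snapshot(int (*writer)(void *buf, size_t len))
	{
		struct snapshot_handle handle;
		int ret;

		memset(&handle, 0, sizeof(handle));	/* first call needs a zeroed handle */
		do {
			ret = snapshot_read_next(&handle, PAGE_SIZE);
			if (ret > 0) {
				int error = writer(data_of(handle), ret);
				if (error)
					return error;
			}
		} while (ret > 0);
		return ret < 0 ? ret : 0;	/* 0: the whole image has been read */
	}

This is the same pattern save_image() in swap.c follows below, with swap_write_page() as the writer.
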
613
614/**
615 * mark_unsafe_pages - mark the pages that cannot be used for storing
616 * the image during resume, because they conflict with the pages that
617 * had been used before suspend
618 */
619
620static int mark_unsafe_pages(struct pbe *pblist)
621{
622 struct zone *zone;
623 unsigned long zone_pfn;
624 struct pbe *p;
625
626 if (!pblist) /* a sanity check */
627 return -EINVAL;
628
629 /* Clear page flags */
630 for_each_zone (zone) {
631 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
632 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
633 ClearPageNosaveFree(pfn_to_page(zone_pfn +
634 zone->zone_start_pfn));
635 }
636
637 /* Mark orig addresses */
638 for_each_pbe (p, pblist) {
639 if (virt_addr_valid(p->orig_address))
640 SetPageNosaveFree(virt_to_page(p->orig_address));
641 else
642 return -EFAULT;
643 }
644
645 return 0;
646}
647
648static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
649{
650 /* We assume both lists contain the same number of elements */
651 while (src) {
652 dst->orig_address = src->orig_address;
653 dst = dst->next;
654 src = src->next;
655 }
656}
657
658static int check_header(struct swsusp_info *info)
659{
660 char *reason = NULL;
661
662 if (info->version_code != LINUX_VERSION_CODE)
663 reason = "kernel version";
664 if (info->num_physpages != num_physpages)
665 reason = "memory size";
666 if (strcmp(info->uts.sysname,system_utsname.sysname))
667 reason = "system type";
668 if (strcmp(info->uts.release,system_utsname.release))
669 reason = "kernel release";
670 if (strcmp(info->uts.version,system_utsname.version))
671 reason = "version";
672 if (strcmp(info->uts.machine,system_utsname.machine))
673 reason = "machine";
674 if (reason) {
675 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
676 return -EPERM;
677 }
678 return 0;
679}
680
681/**
682 * load header - check the image header and copy data from it
683 */
684
685static int load_header(struct snapshot_handle *handle,
686 struct swsusp_info *info)
687{
688 int error;
689 struct pbe *pblist;
690
691 error = check_header(info);
692 if (!error) {
693 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
694 if (!pblist)
695 return -ENOMEM;
696 pagedir_nosave = pblist;
697 handle->pbe = pblist;
698 nr_copy_pages = info->image_pages;
699 nr_meta_pages = info->pages - info->image_pages - 1;
700 }
701 return error;
702}
703
704/**
705 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
706 * the PBEs in the list starting at @pbe
707 */
708
709static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
710 struct pbe *pbe)
711{
712 int j;
713
714 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
715 pbe->orig_address = buf[j];
716 pbe = pbe->next;
717 }
718 return pbe;
719}
720
721/**
722 * create_image - use metadata contained in the PBE list
723 * pointed to by pagedir_nosave to mark the pages that will
724 * be overwritten in the process of restoring the system
725 * memory state from the image and allocate memory for
726 * the image avoiding these pages
727 */
728
729static int create_image(struct snapshot_handle *handle)
730{
731 int error = 0;
732 struct pbe *p, *pblist;
733
734 p = pagedir_nosave;
735 error = mark_unsafe_pages(p);
736 if (!error) {
737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
738 if (pblist)
739 copy_page_backup_list(pblist, p);
740 free_pagedir(p, 0);
741 if (!pblist)
742 error = -ENOMEM;
743 }
744 if (!error)
745 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
746 if (!error) {
747 release_eaten_pages();
748 pagedir_nosave = pblist;
749 } else {
750 pagedir_nosave = NULL;
751 handle->pbe = NULL;
752 nr_copy_pages = 0;
753 nr_meta_pages = 0;
754 }
755 return error;
756}
757
758/**
759 * snapshot_write_next - used for writing the system memory snapshot.
760 *
761 * On the first call to it @handle should point to a zeroed
762 * snapshot_handle structure. The structure gets updated and a pointer
763 * to it should be passed to this function every next time.
764 *
765 * The @count parameter should contain the number of bytes the caller
766 * wants to write to the image. It must not be zero.
767 *
768 * On success the function returns a positive number. Then, the caller
769 * is allowed to write up to the returned number of bytes to the memory
770 * location computed by the data_of() macro. The number returned
771 * may be smaller than @count, but this only happens if the write would
772 * cross a page boundary otherwise.
773 *
774 * The function returns 0 to indicate the "end of file" condition,
775 * and a negative number is returned on error. In such cases the
776 * structure pointed to by @handle is not updated and should not be used
777 * any more.
778 */
779
780int snapshot_write_next(struct snapshot_handle *handle, size_t count)
781{
782 int error = 0;
783
784 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
785 return 0;
786 if (!buffer) {
787 /* This makes the buffer be freed by swsusp_free() */
788 buffer = alloc_image_page(GFP_ATOMIC, 0);
789 if (!buffer)
790 return -ENOMEM;
791 }
792 if (!handle->offset)
793 handle->buffer = buffer;
794 if (handle->prev < handle->page) {
795 if (!handle->prev) {
796 error = load_header(handle, (struct swsusp_info *)buffer);
797 if (error)
798 return error;
799 } else if (handle->prev <= nr_meta_pages) {
800 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
801 if (!handle->pbe) {
802 error = create_image(handle);
803 if (error)
804 return error;
805 handle->pbe = pagedir_nosave;
806 handle->buffer = (void *)handle->pbe->address;
807 }
808 } else {
809 handle->pbe = handle->pbe->next;
810 handle->buffer = (void *)handle->pbe->address;
811 }
812 handle->prev = handle->page;
813 }
814 handle->buf_offset = handle->page_offset;
815 if (handle->page_offset + count >= PAGE_SIZE) {
816 count = PAGE_SIZE - handle->page_offset;
817 handle->page_offset = 0;
818 handle->page++;
819 } else {
820 handle->page_offset += count;
821 }
822 handle->offset += count;
823 return count;
824}
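
The write side mirrors this: snapshot_write_next() exposes a buffer through data_of(handle), the caller fills it with the next chunk of the image, and once the stream ends snapshot_image_loaded() tells whether a complete image was fed in. A minimal producer sketch, assuming a hypothetical reader() callback that fetches each chunk:

	/* Sketch only: reader() is a hypothetical callback that fills one chunk. */
	static int restore_snapshot(int (*reader)(void *buf, size_t len))
	{
		struct snapshot_handle handle;
		int ret;

		memset(&handle, 0, sizeof(handle));
		do {
			ret = snapshot_write_next(&handle, PAGE_SIZE);
			if (ret > 0) {
				int error = reader(data_of(handle), ret);
				if (error)
					return error;
			}
		} while (ret > 0);
		if (ret < 0)
			return ret;
		return snapshot_image_loaded(&handle) ? 0 : -ENODATA;
	}

load_image() in swap.c below uses the same loop with swap_read_page() as the reader.
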
825
826int snapshot_image_loaded(struct snapshot_handle *handle)
827{
828 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
829 handle->page <= nr_meta_pages + nr_copy_pages);
830}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 0000000000..044b8e0c10
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,545 @@
1/*
2 * linux/kernel/power/swap.c
3 *
4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition.
6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 *
10 * This file is released under the GPLv2.
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/version.h>
19#include <linux/delay.h>
20#include <linux/bitops.h>
21#include <linux/genhd.h>
22#include <linux/device.h>
23#include <linux/buffer_head.h>
24#include <linux/bio.h>
25#include <linux/swap.h>
26#include <linux/swapops.h>
27#include <linux/pm.h>
28
29#include "power.h"
30
31extern char resume_file[];
32
33#define SWSUSP_SIG "S1SUSPEND"
34
35static struct swsusp_header {
36 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
37 swp_entry_t image;
38 char orig_sig[10];
39 char sig[10];
40} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
41
42/*
43 * Saving part...
44 */
45
46static unsigned short root_swap = 0xffff;
47
48static int mark_swapfiles(swp_entry_t start)
49{
50 int error;
51
52 rw_swap_page_sync(READ,
53 swp_entry(root_swap, 0),
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE,
61 swp_entry(root_swap, 0),
62 virt_to_page((unsigned long)
63 &swsusp_header));
64 } else {
65 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV;
67 }
68 return error;
69}
70
71/**
72 * swsusp_swap_check - check if the resume device is a swap device
73 * and get its index (if so)
74 */
75
76static int swsusp_swap_check(void) /* This is called before saving image */
77{
78 int res = swap_type_of(swsusp_resume_device);
79
80 if (res >= 0) {
81 root_swap = res;
82 return 0;
83 }
84 return res;
85}
86
87/**
88 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to.
91 */
92
93static int write_page(void *buf, unsigned long offset)
94{
95 swp_entry_t entry;
96 int error = -ENOSPC;
97
98 if (offset) {
99 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
101 }
102 return error;
103}
104
105/*
106 * The swap map is a data structure used for keeping track of each page
107 * written to a swap partition. It consists of many swap_map_page
 108 * structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
109 * These structures are stored on the swap and linked together with the
110 * help of the .next_swap member.
111 *
112 * The swap map is created during suspend. The swap map pages are
113 * allocated and populated one at a time, so we only need one memory
114 * page to set up the entire structure.
115 *
116 * During resume we also only need to use one swap_map_page structure
117 * at a time.
118 */
119
120#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)
121
122struct swap_map_page {
123 unsigned long entries[MAP_PAGE_ENTRIES];
124 unsigned long next_swap;
125};
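
Each swap_map_page fills exactly one page: MAP_PAGE_ENTRIES data-page offsets plus the offset of the next map page. With 4 KB pages and 8-byte longs that is 4096/8 - 1 = 511 entries, so an image of nr_pages data pages needs an extra ceil(nr_pages/511) map pages; a 256 MB image (65536 pages) costs 129 of them. A small userspace sketch of that arithmetic, with the page and word sizes as stated assumptions:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL	/* assumed page size */
	#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)

	int main(void)
	{
		unsigned long nr_pages = 65536;	/* e.g. a 256 MB image */
		unsigned long map_pages =
			(nr_pages + MAP_PAGE_ENTRIES - 1) / MAP_PAGE_ENTRIES;

		printf("%lu data pages need %lu swap map pages\n",
		       nr_pages, map_pages);
		return 0;
	}
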
126
127/**
128 * The swap_map_handle structure is used for handling swap in
129 * a file-alike way
130 */
131
132struct swap_map_handle {
133 struct swap_map_page *cur;
134 unsigned long cur_swap;
135 struct bitmap_page *bitmap;
136 unsigned int k;
137};
138
139static void release_swap_writer(struct swap_map_handle *handle)
140{
141 if (handle->cur)
142 free_page((unsigned long)handle->cur);
143 handle->cur = NULL;
144 if (handle->bitmap)
145 free_bitmap(handle->bitmap);
146 handle->bitmap = NULL;
147}
148
149static int get_swap_writer(struct swap_map_handle *handle)
150{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
152 if (!handle->cur)
153 return -ENOMEM;
154 handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
155 if (!handle->bitmap) {
156 release_swap_writer(handle);
157 return -ENOMEM;
158 }
159 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
160 if (!handle->cur_swap) {
161 release_swap_writer(handle);
162 return -ENOSPC;
163 }
164 handle->k = 0;
165 return 0;
166}
167
168static int swap_write_page(struct swap_map_handle *handle, void *buf)
169{
170 int error;
171 unsigned long offset;
172
173 if (!handle->cur)
174 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset);
177 if (error)
178 return error;
179 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) {
181 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset)
183 return -ENOSPC;
184 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap);
186 if (error)
187 return error;
188 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset;
190 handle->k = 0;
191 }
192 return 0;
193}
194
195static int flush_swap_writer(struct swap_map_handle *handle)
196{
197 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap);
199 else
200 return -EINVAL;
201}
202
203/**
204 * save_image - save the suspend image data
205 */
206
207static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot,
209 unsigned int nr_pages)
210{
211 unsigned int m;
212 int ret;
213 int error = 0;
214
215 printk("Saving image data pages (%u pages) ... ", nr_pages);
216 m = nr_pages / 100;
217 if (!m)
218 m = 1;
219 nr_pages = 0;
220 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot));
224 if (error)
225 break;
226 if (!(nr_pages % m))
227 printk("\b\b\b\b%3d%%", nr_pages / m);
228 nr_pages++;
229 }
230 } while (ret > 0);
231 if (!error)
232 printk("\b\b\b\bdone\n");
233 return error;
234}
235
236/**
237 * enough_swap - Make sure we have enough swap to save the image.
238 *
239 * Returns TRUE or FALSE after checking the total amount of swap
 240 * space available from the resume partition.
241 */
242
243static int enough_swap(unsigned int nr_pages)
244{
245 unsigned int free_swap = count_swap_pages(root_swap, 1);
246
247 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO +
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250}
251
252/**
253 * swsusp_write - Write entire image and metadata.
254 *
255 * It is important _NOT_ to umount filesystems at this point. We want
256 * them synced (in case something goes wrong) but we DO not want to mark
257 * filesystem clean: it is not. (And it does not matter, if we resume
258 * correctly, we'll mark system clean, anyway.)
259 */
260
261int swsusp_write(void)
262{
263 struct swap_map_handle handle;
264 struct snapshot_handle snapshot;
265 struct swsusp_info *header;
266 unsigned long start;
267 int error;
268
269 if ((error = swsusp_swap_check())) {
270 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
271 return error;
272 }
273 memset(&snapshot, 0, sizeof(struct snapshot_handle));
274 error = snapshot_read_next(&snapshot, PAGE_SIZE);
275 if (error < PAGE_SIZE)
276 return error < 0 ? error : -EFAULT;
277 header = (struct swsusp_info *)data_of(snapshot);
278 if (!enough_swap(header->pages)) {
279 printk(KERN_ERR "swsusp: Not enough free swap\n");
280 return -ENOSPC;
281 }
282 error = get_swap_writer(&handle);
283 if (!error) {
284 start = handle.cur_swap;
285 error = swap_write_page(&handle, header);
286 }
287 if (!error)
288 error = save_image(&handle, &snapshot, header->pages - 1);
289 if (!error) {
290 flush_swap_writer(&handle);
291 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n");
294 }
295 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap);
297 release_swap_writer(&handle);
298 return error;
299}
300
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
 304 * but it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
315 panic("I/O error reading memory image");
316 atomic_set(&io_done, 0);
317 return 0;
318}
319
320static struct block_device *resume_bdev;
321
322/**
323 * submit - submit BIO request.
324 * @rw: READ or WRITE.
325 * @off physical offset of page.
326 * @page: page we're reading or writing.
327 *
328 * Straight from the textbook - allocate and initialize the bio.
329 * If we're writing, make sure the page is marked as dirty.
330 * Then submit it and wait.
331 */
332
333static int submit(int rw, pgoff_t page_off, void *page)
334{
335 int error = 0;
336 struct bio *bio;
337
338 bio = bio_alloc(GFP_ATOMIC, 1);
339 if (!bio)
340 return -ENOMEM;
341 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
342 bio->bi_bdev = resume_bdev;
343 bio->bi_end_io = end_io;
344
345 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
346 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
347 error = -EFAULT;
348 goto Done;
349 }
350
351 atomic_set(&io_done, 1);
352 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
353 while (atomic_read(&io_done))
354 yield();
355 if (rw == READ)
356 bio_set_pages_dirty(bio);
357 Done:
358 bio_put(bio);
359 return error;
360}
361
362static int bio_read_page(pgoff_t page_off, void *page)
363{
364 return submit(READ, page_off, page);
365}
366
367static int bio_write_page(pgoff_t page_off, void *page)
368{
369 return submit(WRITE, page_off, page);
370}
371
372/**
373 * The following functions allow us to read data using a swap map
374 * in a file-alike way
375 */
376
377static void release_swap_reader(struct swap_map_handle *handle)
378{
379 if (handle->cur)
380 free_page((unsigned long)handle->cur);
381 handle->cur = NULL;
382}
383
384static int get_swap_reader(struct swap_map_handle *handle,
385 swp_entry_t start)
386{
387 int error;
388
389 if (!swp_offset(start))
390 return -EINVAL;
391 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
392 if (!handle->cur)
393 return -ENOMEM;
394 error = bio_read_page(swp_offset(start), handle->cur);
395 if (error) {
396 release_swap_reader(handle);
397 return error;
398 }
399 handle->k = 0;
400 return 0;
401}
402
403static int swap_read_page(struct swap_map_handle *handle, void *buf)
404{
405 unsigned long offset;
406 int error;
407
408 if (!handle->cur)
409 return -EINVAL;
410 offset = handle->cur->entries[handle->k];
411 if (!offset)
412 return -EFAULT;
413 error = bio_read_page(offset, buf);
414 if (error)
415 return error;
416 if (++handle->k >= MAP_PAGE_ENTRIES) {
417 handle->k = 0;
418 offset = handle->cur->next_swap;
419 if (!offset)
420 release_swap_reader(handle);
421 else
422 error = bio_read_page(offset, handle->cur);
423 }
424 return error;
425}
426
427/**
428 * load_image - load the image using the swap map handle
429 * @handle and the snapshot handle @snapshot
430 * (assume there are @nr_pages pages to load)
431 */
432
433static int load_image(struct swap_map_handle *handle,
434 struct snapshot_handle *snapshot,
435 unsigned int nr_pages)
436{
437 unsigned int m;
438 int ret;
439 int error = 0;
440
441 printk("Loading image data pages (%u pages) ... ", nr_pages);
442 m = nr_pages / 100;
443 if (!m)
444 m = 1;
445 nr_pages = 0;
446 do {
447 ret = snapshot_write_next(snapshot, PAGE_SIZE);
448 if (ret > 0) {
449 error = swap_read_page(handle, data_of(*snapshot));
450 if (error)
451 break;
452 if (!(nr_pages % m))
453 printk("\b\b\b\b%3d%%", nr_pages / m);
454 nr_pages++;
455 }
456 } while (ret > 0);
457 if (!error) {
458 printk("\b\b\b\bdone\n");
459 if (!snapshot_image_loaded(snapshot))
460 error = -ENODATA;
461 }
462 return error;
463}
464
465int swsusp_read(void)
466{
467 int error;
468 struct swap_map_handle handle;
469 struct snapshot_handle snapshot;
470 struct swsusp_info *header;
471
472 if (IS_ERR(resume_bdev)) {
473 pr_debug("swsusp: block device not initialised\n");
474 return PTR_ERR(resume_bdev);
475 }
476
477 memset(&snapshot, 0, sizeof(struct snapshot_handle));
478 error = snapshot_write_next(&snapshot, PAGE_SIZE);
479 if (error < PAGE_SIZE)
480 return error < 0 ? error : -EFAULT;
481 header = (struct swsusp_info *)data_of(snapshot);
482 error = get_swap_reader(&handle, swsusp_header.image);
483 if (!error)
484 error = swap_read_page(&handle, header);
485 if (!error)
486 error = load_image(&handle, &snapshot, header->pages - 1);
487 release_swap_reader(&handle);
488
489 blkdev_put(resume_bdev);
490
491 if (!error)
492 pr_debug("swsusp: Reading resume file was successful\n");
493 else
494 pr_debug("swsusp: Error %d resuming\n", error);
495 return error;
496}
497
498/**
499 * swsusp_check - Check for swsusp signature in the resume device
500 */
501
502int swsusp_check(void)
503{
504 int error;
505
506 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
507 if (!IS_ERR(resume_bdev)) {
508 set_blocksize(resume_bdev, PAGE_SIZE);
509 memset(&swsusp_header, 0, sizeof(swsusp_header));
510 if ((error = bio_read_page(0, &swsusp_header)))
511 return error;
512 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
513 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
514 /* Reset swap signature now */
515 error = bio_write_page(0, &swsusp_header);
516 } else {
517 return -EINVAL;
518 }
519 if (error)
520 blkdev_put(resume_bdev);
521 else
522 pr_debug("swsusp: Signature found, resuming\n");
523 } else {
524 error = PTR_ERR(resume_bdev);
525 }
526
527 if (error)
528 pr_debug("swsusp: Error %d check for resume file\n", error);
529
530 return error;
531}
532
533/**
534 * swsusp_close - close swap device.
535 */
536
537void swsusp_close(void)
538{
539 if (IS_ERR(resume_bdev)) {
540 pr_debug("swsusp: block device not initialised\n");
541 return;
542 }
543
544 blkdev_put(resume_bdev);
545}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4e90905f0e..c4016cbbd3 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,41 +31,24 @@
31 * Fixed runaway init 31 * Fixed runaway init
32 * 32 *
33 * Rafael J. Wysocki <rjw@sisk.pl> 33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Added the swap map data structure and reworked the handling of swap 34 * Reworked the freeing of memory and the handling of swap
35 * 35 *
36 * More state savers are welcome. Especially for the scsi layer... 36 * More state savers are welcome. Especially for the scsi layer...
37 * 37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt 38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */ 39 */
40 40
41#include <linux/module.h>
42#include <linux/mm.h> 41#include <linux/mm.h>
43#include <linux/suspend.h> 42#include <linux/suspend.h>
44#include <linux/smp_lock.h>
45#include <linux/file.h>
46#include <linux/utsname.h>
47#include <linux/version.h>
48#include <linux/delay.h>
49#include <linux/bitops.h>
50#include <linux/spinlock.h> 43#include <linux/spinlock.h>
51#include <linux/genhd.h>
52#include <linux/kernel.h> 44#include <linux/kernel.h>
53#include <linux/major.h> 45#include <linux/major.h>
54#include <linux/swap.h> 46#include <linux/swap.h>
55#include <linux/pm.h> 47#include <linux/pm.h>
56#include <linux/device.h>
57#include <linux/buffer_head.h>
58#include <linux/swapops.h> 48#include <linux/swapops.h>
59#include <linux/bootmem.h> 49#include <linux/bootmem.h>
60#include <linux/syscalls.h> 50#include <linux/syscalls.h>
61#include <linux/highmem.h> 51#include <linux/highmem.h>
62#include <linux/bio.h>
63
64#include <asm/uaccess.h>
65#include <asm/mmu_context.h>
66#include <asm/pgtable.h>
67#include <asm/tlbflush.h>
68#include <asm/io.h>
69 52
70#include "power.h" 53#include "power.h"
71 54
@@ -77,6 +60,8 @@
77 */ 60 */
78unsigned long image_size = 500 * 1024 * 1024; 61unsigned long image_size = 500 * 1024 * 1024;
79 62
63int in_suspend __nosavedata = 0;
64
80#ifdef CONFIG_HIGHMEM 65#ifdef CONFIG_HIGHMEM
81unsigned int count_highmem_pages(void); 66unsigned int count_highmem_pages(void);
82int save_highmem(void); 67int save_highmem(void);
@@ -87,473 +72,97 @@ static int restore_highmem(void) { return 0; }
87static unsigned int count_highmem_pages(void) { return 0; } 72static unsigned int count_highmem_pages(void) { return 0; }
88#endif 73#endif
89 74
90extern char resume_file[];
91
92#define SWSUSP_SIG "S1SUSPEND"
93
94static struct swsusp_header {
95 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
96 swp_entry_t image;
97 char orig_sig[10];
98 char sig[10];
99} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
100
101static struct swsusp_info swsusp_info;
102
103/*
104 * Saving part...
105 */
106
107static unsigned short root_swap = 0xffff;
108
109static int mark_swapfiles(swp_entry_t start)
110{
111 int error;
112
113 rw_swap_page_sync(READ,
114 swp_entry(root_swap, 0),
115 virt_to_page((unsigned long)&swsusp_header));
116 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
117 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
118 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
119 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
120 swsusp_header.image = start;
121 error = rw_swap_page_sync(WRITE,
122 swp_entry(root_swap, 0),
123 virt_to_page((unsigned long)
124 &swsusp_header));
125 } else {
126 pr_debug("swsusp: Partition is not swap space.\n");
127 error = -ENODEV;
128 }
129 return error;
130}
131
132/*
133 * Check whether the swap device is the specified resume
134 * device, irrespective of whether they are specified by
135 * identical names.
136 *
137 * (Thus, device inode aliasing is allowed. You can say /dev/hda4
138 * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
139 * and they'll be considered the same device. This is *necessary* for
140 * devfs, since the resume code can only recognize the form /dev/hda4,
141 * but the suspend code would see the long name.)
142 */
143static inline int is_resume_device(const struct swap_info_struct *swap_info)
144{
145 struct file *file = swap_info->swap_file;
146 struct inode *inode = file->f_dentry->d_inode;
147
148 return S_ISBLK(inode->i_mode) &&
149 swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
150}
151
152static int swsusp_swap_check(void) /* This is called before saving image */
153{
154 int i;
155
156 if (!swsusp_resume_device)
157 return -ENODEV;
158 spin_lock(&swap_lock);
159 for (i = 0; i < MAX_SWAPFILES; i++) {
160 if (!(swap_info[i].flags & SWP_WRITEOK))
161 continue;
162 if (is_resume_device(swap_info + i)) {
163 spin_unlock(&swap_lock);
164 root_swap = i;
165 return 0;
166 }
167 }
168 spin_unlock(&swap_lock);
169 return -ENODEV;
170}
171
172/**
173 * write_page - Write one page to a fresh swap location.
174 * @addr: Address we're writing.
175 * @loc: Place to store the entry we used.
176 *
177 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
178 * errors. That is an artifact left over from swsusp. It did not
179 * check the return of rw_swap_page_sync() at all, since most pages
180 * written back to swap would return -EIO.
181 * This is a partial improvement, since we will at least return other
182 * errors, though we need to eventually fix the damn code.
183 */
184static int write_page(unsigned long addr, swp_entry_t *loc)
185{
186 swp_entry_t entry;
187 int error = -ENOSPC;
188
189 entry = get_swap_page_of_type(root_swap);
190 if (swp_offset(entry)) {
191 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
192 if (!error || error == -EIO)
193 *loc = entry;
194 }
195 return error;
196}
197
198/** 75/**
199 * Swap map-handling functions 76 * The following functions are used for tracing the allocated
200 * 77 * swap pages, so that they can be freed in case of an error.
201 * The swap map is a data structure used for keeping track of each page
202 * written to the swap. It consists of many swap_map_page structures
203 * that contain each an array of MAP_PAGE_SIZE swap entries.
204 * These structures are linked together with the help of either the
205 * .next (in memory) or the .next_swap (in swap) member.
206 * 78 *
207 * The swap map is created during suspend. At that time we need to keep 79 * The functions operate on a linked bitmap structure defined
208 * it in memory, because we have to free all of the allocated swap 80 * in power.h
209 * entries if an error occurs. The memory needed is preallocated
210 * so that we know in advance if there's enough of it.
211 *
212 * The first swap_map_page structure is filled with the swap entries that
213 * correspond to the first MAP_PAGE_SIZE data pages written to swap and
214 * so on. After the all of the data pages have been written, the order
215 * of the swap_map_page structures in the map is reversed so that they
216 * can be read from swap in the original order. This causes the data
217 * pages to be loaded in exactly the same order in which they have been
218 * saved.
219 *
220 * During resume we only need to use one swap_map_page structure
221 * at a time, which means that we only need to use two memory pages for
222 * reading the image - one for reading the swap_map_page structures
223 * and the second for reading the data pages from swap.
224 */ 81 */
225 82
226#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ 83void free_bitmap(struct bitmap_page *bitmap)
227 / sizeof(swp_entry_t))
228
229struct swap_map_page {
230 swp_entry_t entries[MAP_PAGE_SIZE];
231 swp_entry_t next_swap;
232 struct swap_map_page *next;
233};
234
235static inline void free_swap_map(struct swap_map_page *swap_map)
236{ 84{
237 struct swap_map_page *swp; 85 struct bitmap_page *bp;
238 86
239 while (swap_map) { 87 while (bitmap) {
240 swp = swap_map->next; 88 bp = bitmap->next;
241 free_page((unsigned long)swap_map); 89 free_page((unsigned long)bitmap);
242 swap_map = swp; 90 bitmap = bp;
243 } 91 }
244} 92}
245 93
246static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) 94struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
247{ 95{
248 struct swap_map_page *swap_map, *swp; 96 struct bitmap_page *bitmap, *bp;
249 unsigned n = 0; 97 unsigned int n;
250 98
251 if (!nr_pages) 99 if (!nr_bits)
252 return NULL; 100 return NULL;
253 101
254 pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); 102 bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
255 swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 103 bp = bitmap;
256 swp = swap_map; 104 for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
257 for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { 105 bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
258 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 106 bp = bp->next;
259 swp = swp->next; 107 if (!bp) {
260 if (!swp) { 108 free_bitmap(bitmap);
261 free_swap_map(swap_map);
262 return NULL; 109 return NULL;
263 } 110 }
264 } 111 }
265 return swap_map; 112 return bitmap;
266} 113}
267 114
268/** 115static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
269 * reverse_swap_map - reverse the order of pages in the swap map
270 * @swap_map
271 */
272
273static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
274{
275 struct swap_map_page *prev, *next;
276
277 prev = NULL;
278 while (swap_map) {
279 next = swap_map->next;
280 swap_map->next = prev;
281 prev = swap_map;
282 swap_map = next;
283 }
284 return prev;
285}
286
287/**
288 * free_swap_map_entries - free the swap entries allocated to store
289 * the swap map @swap_map (this is only called in case of an error)
290 */
291static inline void free_swap_map_entries(struct swap_map_page *swap_map)
292{
293 while (swap_map) {
294 if (swap_map->next_swap.val)
295 swap_free(swap_map->next_swap);
296 swap_map = swap_map->next;
297 }
298}
299
300/**
301 * save_swap_map - save the swap map used for tracing the data pages
302 * stored in the swap
303 */
304
305static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
306{
307 swp_entry_t entry = (swp_entry_t){0};
308 int error;
309
310 while (swap_map) {
311 swap_map->next_swap = entry;
312 if ((error = write_page((unsigned long)swap_map, &entry)))
313 return error;
314 swap_map = swap_map->next;
315 }
316 *start = entry;
317 return 0;
318}
319
320/**
321 * free_image_entries - free the swap entries allocated to store
322 * the image data pages (this is only called in case of an error)
323 */
324
325static inline void free_image_entries(struct swap_map_page *swp)
326{ 116{
327 unsigned k; 117 unsigned int n;
328 118
329 while (swp) { 119 n = BITMAP_PAGE_BITS;
330 for (k = 0; k < MAP_PAGE_SIZE; k++) 120 while (bitmap && n <= bit) {
331 if (swp->entries[k].val) 121 n += BITMAP_PAGE_BITS;
332 swap_free(swp->entries[k]); 122 bitmap = bitmap->next;
333 swp = swp->next;
334 } 123 }
335} 124 if (!bitmap)
336 125 return -EINVAL;
337/** 126 n -= BITMAP_PAGE_BITS;
338 * The swap_map_handle structure is used for handling the swap map in 127 bit -= n;
339 * a file-alike way 128 n = 0;
340 */ 129 while (bit >= BITS_PER_CHUNK) {
341 130 bit -= BITS_PER_CHUNK;
342struct swap_map_handle { 131 n++;
343 struct swap_map_page *cur;
344 unsigned int k;
345};
346
347static inline void init_swap_map_handle(struct swap_map_handle *handle,
348 struct swap_map_page *map)
349{
350 handle->cur = map;
351 handle->k = 0;
352}
353
354static inline int swap_map_write_page(struct swap_map_handle *handle,
355 unsigned long addr)
356{
357 int error;
358
359 error = write_page(addr, handle->cur->entries + handle->k);
360 if (error)
361 return error;
362 if (++handle->k >= MAP_PAGE_SIZE) {
363 handle->cur = handle->cur->next;
364 handle->k = 0;
365 } 132 }
133 bitmap->chunks[n] |= (1UL << bit);
366 return 0; 134 return 0;
367} 135}
368 136
369/** 137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
370 * save_image_data - save the data pages pointed to by the PBEs
371 * from the list @pblist using the swap map handle @handle
372 * (assume there are @nr_pages data pages to save)
373 */
374
375static int save_image_data(struct pbe *pblist,
376 struct swap_map_handle *handle,
377 unsigned int nr_pages)
378{
379 unsigned int m;
380 struct pbe *p;
381 int error = 0;
382
383 printk("Saving image data pages (%u pages) ... ", nr_pages);
384 m = nr_pages / 100;
385 if (!m)
386 m = 1;
387 nr_pages = 0;
388 for_each_pbe (p, pblist) {
389 error = swap_map_write_page(handle, p->address);
390 if (error)
391 break;
392 if (!(nr_pages % m))
393 printk("\b\b\b\b%3d%%", nr_pages / m);
394 nr_pages++;
395 }
396 if (!error)
397 printk("\b\b\b\bdone\n");
398 return error;
399}
400
401static void dump_info(void)
402{
403 pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
404 pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
405 pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
406 pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
407 pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
408 pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
409 pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
410 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
411 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
412 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
413 pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
414}
415
416static void init_header(unsigned int nr_pages)
417{
418 memset(&swsusp_info, 0, sizeof(swsusp_info));
419 swsusp_info.version_code = LINUX_VERSION_CODE;
420 swsusp_info.num_physpages = num_physpages;
421 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
422
423 swsusp_info.cpus = num_online_cpus();
424 swsusp_info.image_pages = nr_pages;
425 swsusp_info.pages = nr_pages +
426 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
427}
428
429/**
430 * pack_orig_addresses - the .orig_address fields of the PBEs from the
431 * list starting at @pbe are stored in the array @buf[] (1 page)
432 */
433
434static inline struct pbe *pack_orig_addresses(unsigned long *buf,
435 struct pbe *pbe)
436{
437 int j;
438
439 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
440 buf[j] = pbe->orig_address;
441 pbe = pbe->next;
442 }
443 if (!pbe)
444 for (; j < PAGE_SIZE / sizeof(long); j++)
445 buf[j] = 0;
446 return pbe;
447}
448
449/**
450 * save_image_metadata - save the .orig_address fields of the PBEs
451 * from the list @pblist using the swap map handle @handle
452 */
453
454static int save_image_metadata(struct pbe *pblist,
455 struct swap_map_handle *handle)
456{ 138{
457 unsigned long *buf; 139 unsigned long offset;
458 unsigned int n = 0;
459 struct pbe *p;
460 int error = 0;
461 140
462 printk("Saving image metadata ... "); 141 offset = swp_offset(get_swap_page_of_type(swap));
463 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); 142 if (offset) {
464 if (!buf) 143 if (bitmap_set(bitmap, offset)) {
465 return -ENOMEM; 144 swap_free(swp_entry(swap, offset));
466 p = pblist; 145 offset = 0;
467 while (p) { 146 }
468 p = pack_orig_addresses(buf, p);
469 error = swap_map_write_page(handle, (unsigned long)buf);
470 if (error)
471 break;
472 n++;
473 } 147 }
474 free_page((unsigned long)buf); 148 return offset;
475 if (!error)
476 printk("done (%u pages saved)\n", n);
477 return error;
478} 149}
479 150
480/** 151void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
481 * enough_swap - Make sure we have enough swap to save the image.
482 *
483 * Returns TRUE or FALSE after checking the total amount of swap
484 * space avaiable from the resume partition.
485 */
486
487static int enough_swap(unsigned int nr_pages)
488{ 152{
489 unsigned int free_swap = swap_info[root_swap].pages - 153 unsigned int bit, n;
490 swap_info[root_swap].inuse_pages; 154 unsigned long test;
491
492 pr_debug("swsusp: free swap pages: %u\n", free_swap);
493 return free_swap > (nr_pages + PAGES_FOR_IO +
494 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
495}
496 155
497/** 156 bit = 0;
498 * swsusp_write - Write entire image and metadata. 157 while (bitmap) {
499 * 158 for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
500 * It is important _NOT_ to umount filesystems at this point. We want 159 for (test = 1UL; test; test <<= 1) {
501 * them synced (in case something goes wrong) but we DO not want to mark 160 if (bitmap->chunks[n] & test)
502 * filesystem clean: it is not. (And it does not matter, if we resume 161 swap_free(swp_entry(swap, bit));
503 * correctly, we'll mark system clean, anyway.) 162 bit++;
504 */ 163 }
505 164 bitmap = bitmap->next;
506int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
507{
508 struct swap_map_page *swap_map;
509 struct swap_map_handle handle;
510 swp_entry_t start;
511 int error;
512
513 if ((error = swsusp_swap_check())) {
514 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
515 return error;
516 }
517 if (!enough_swap(nr_pages)) {
518 printk(KERN_ERR "swsusp: Not enough free swap\n");
519 return -ENOSPC;
520 } 165 }
521
522 init_header(nr_pages);
523 swap_map = alloc_swap_map(swsusp_info.pages);
524 if (!swap_map)
525 return -ENOMEM;
526 init_swap_map_handle(&handle, swap_map);
527
528 error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
529 if (!error)
530 error = save_image_metadata(pblist, &handle);
531 if (!error)
532 error = save_image_data(pblist, &handle, nr_pages);
533 if (error)
534 goto Free_image_entries;
535
536 swap_map = reverse_swap_map(swap_map);
537 error = save_swap_map(swap_map, &start);
538 if (error)
539 goto Free_map_entries;
540
541 dump_info();
542 printk( "S" );
543 error = mark_swapfiles(start);
544 printk( "|\n" );
545 if (error)
546 goto Free_map_entries;
547
548Free_swap_map:
549 free_swap_map(swap_map);
550 return error;
551
552Free_map_entries:
553 free_swap_map_entries(swap_map);
554Free_image_entries:
555 free_image_entries(swap_map);
556 goto Free_swap_map;
557} 166}
558 167
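
The calling pattern for this bitmap API, as used by swap.c and user.c elsewhere in this patch, is: size the bitmap from count_swap_pages(), take offsets with alloc_swap_page(), and on failure give everything back with free_all_swap_pages() followed by free_bitmap(). A minimal sketch under those assumptions:

	/* Sketch only: reserve n swap pages of type swap, tracked in *bitmap_p. */
	static int reserve_image_swap(int swap, unsigned int n,
	                              struct bitmap_page **bitmap_p)
	{
		struct bitmap_page *bitmap;
		unsigned int i;

		bitmap = alloc_bitmap(count_swap_pages(swap, 0));
		if (!bitmap)
			return -ENOMEM;
		for (i = 0; i < n; i++) {
			if (!alloc_swap_page(swap, bitmap)) {
				/* out of swap: free every page tracked so far */
				free_all_swap_pages(swap, bitmap);
				free_bitmap(bitmap);
				return -ENOSPC;
			}
		}
		*bitmap_p = bitmap;	/* caller keeps it for later cleanup */
		return 0;
	}
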
559/** 168/**
@@ -662,379 +271,3 @@ int swsusp_resume(void)
662 local_irq_enable(); 271 local_irq_enable();
663 return error; 272 return error;
664} 273}
665
666/**
667 * mark_unsafe_pages - mark the pages that cannot be used for storing
668 * the image during resume, because they conflict with the pages that
669 * had been used before suspend
670 */
671
672static void mark_unsafe_pages(struct pbe *pblist)
673{
674 struct zone *zone;
675 unsigned long zone_pfn;
676 struct pbe *p;
677
678 if (!pblist) /* a sanity check */
679 return;
680
681 /* Clear page flags */
682 for_each_zone (zone) {
683 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
684 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
685 ClearPageNosaveFree(pfn_to_page(zone_pfn +
686 zone->zone_start_pfn));
687 }
688
689 /* Mark orig addresses */
690 for_each_pbe (p, pblist)
691 SetPageNosaveFree(virt_to_page(p->orig_address));
692
693}
694
695static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
696{
697 /* We assume both lists contain the same number of elements */
698 while (src) {
699 dst->orig_address = src->orig_address;
700 dst = dst->next;
701 src = src->next;
702 }
703}
704
705/*
706 * Using bio to read from swap.
707 * This code requires a bit more work than just using buffer heads
708 * but, it is the recommended way for 2.5/2.6.
709 * The following are to signal the beginning and end of I/O. Bios
710 * finish asynchronously, while we want them to happen synchronously.
711 * A simple atomic_t, and a wait loop take care of this problem.
712 */
713
714static atomic_t io_done = ATOMIC_INIT(0);
715
716static int end_io(struct bio *bio, unsigned int num, int err)
717{
718 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
719 panic("I/O error reading memory image");
720 atomic_set(&io_done, 0);
721 return 0;
722}
723
724static struct block_device *resume_bdev;
725
726/**
727 * submit - submit BIO request.
728 * @rw: READ or WRITE.
729 * @off physical offset of page.
730 * @page: page we're reading or writing.
731 *
732 * Straight from the textbook - allocate and initialize the bio.
733 * If we're writing, make sure the page is marked as dirty.
734 * Then submit it and wait.
735 */
736
737static int submit(int rw, pgoff_t page_off, void *page)
738{
739 int error = 0;
740 struct bio *bio;
741
742 bio = bio_alloc(GFP_ATOMIC, 1);
743 if (!bio)
744 return -ENOMEM;
745 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
746 bio->bi_bdev = resume_bdev;
747 bio->bi_end_io = end_io;
748
749 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
750 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
751 error = -EFAULT;
752 goto Done;
753 }
754
755
756 atomic_set(&io_done, 1);
757 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
758 while (atomic_read(&io_done))
759 yield();
760 if (rw == READ)
761 bio_set_pages_dirty(bio);
762 Done:
763 bio_put(bio);
764 return error;
765}
766
767static int bio_read_page(pgoff_t page_off, void *page)
768{
769 return submit(READ, page_off, page);
770}
771
772static int bio_write_page(pgoff_t page_off, void *page)
773{
774 return submit(WRITE, page_off, page);
775}
776
777/**
778 * The following functions allow us to read data using a swap map
779 * in a file-alike way
780 */
781
782static inline void release_swap_map_reader(struct swap_map_handle *handle)
783{
784 if (handle->cur)
785 free_page((unsigned long)handle->cur);
786 handle->cur = NULL;
787}
788
789static inline int get_swap_map_reader(struct swap_map_handle *handle,
790 swp_entry_t start)
791{
792 int error;
793
794 if (!swp_offset(start))
795 return -EINVAL;
796 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
797 if (!handle->cur)
798 return -ENOMEM;
799 error = bio_read_page(swp_offset(start), handle->cur);
800 if (error) {
801 release_swap_map_reader(handle);
802 return error;
803 }
804 handle->k = 0;
805 return 0;
806}
807
808static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
809{
810 unsigned long offset;
811 int error;
812
813 if (!handle->cur)
814 return -EINVAL;
815 offset = swp_offset(handle->cur->entries[handle->k]);
816 if (!offset)
817 return -EINVAL;
818 error = bio_read_page(offset, buf);
819 if (error)
820 return error;
821 if (++handle->k >= MAP_PAGE_SIZE) {
822 handle->k = 0;
823 offset = swp_offset(handle->cur->next_swap);
824 if (!offset)
825 release_swap_map_reader(handle);
826 else
827 error = bio_read_page(offset, handle->cur);
828 }
829 return error;
830}
831
832static int check_header(void)
833{
834 char *reason = NULL;
835
836 dump_info();
837 if (swsusp_info.version_code != LINUX_VERSION_CODE)
838 reason = "kernel version";
839 if (swsusp_info.num_physpages != num_physpages)
840 reason = "memory size";
841 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
842 reason = "system type";
843 if (strcmp(swsusp_info.uts.release,system_utsname.release))
844 reason = "kernel release";
845 if (strcmp(swsusp_info.uts.version,system_utsname.version))
846 reason = "version";
847 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
848 reason = "machine";
849 if (reason) {
850 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
851 return -EPERM;
852 }
853 return 0;
854}
855
856/**
857 * load_image_data - load the image data using the swap map handle
858 * @handle and store them using the page backup list @pblist
859 * (assume there are @nr_pages pages to load)
860 */
861
862static int load_image_data(struct pbe *pblist,
863 struct swap_map_handle *handle,
864 unsigned int nr_pages)
865{
866 int error;
867 unsigned int m;
868 struct pbe *p;
869
870 if (!pblist)
871 return -EINVAL;
872 printk("Loading image data pages (%u pages) ... ", nr_pages);
873 m = nr_pages / 100;
874 if (!m)
875 m = 1;
876 nr_pages = 0;
877 p = pblist;
878 while (p) {
879 error = swap_map_read_page(handle, (void *)p->address);
880 if (error)
881 break;
882 p = p->next;
883 if (!(nr_pages % m))
884 printk("\b\b\b\b%3d%%", nr_pages / m);
885 nr_pages++;
886 }
887 if (!error)
888 printk("\b\b\b\bdone\n");
889 return error;
890}
891
892/**
893 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
894 * the PBEs in the list starting at @pbe
895 */
896
897static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
898 struct pbe *pbe)
899{
900 int j;
901
902 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
903 pbe->orig_address = buf[j];
904 pbe = pbe->next;
905 }
906 return pbe;
907}
908
909/**
910 * load_image_metadata - load the image metadata using the swap map
911 * handle @handle and put them into the PBEs in the list @pblist
912 */
913
914static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
915{
916 struct pbe *p;
917 unsigned long *buf;
918 unsigned int n = 0;
919 int error = 0;
920
921 printk("Loading image metadata ... ");
922 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
923 if (!buf)
924 return -ENOMEM;
925 p = pblist;
926 while (p) {
927 error = swap_map_read_page(handle, buf);
928 if (error)
929 break;
930 p = unpack_orig_addresses(buf, p);
931 n++;
932 }
933 free_page((unsigned long)buf);
934 if (!error)
935 printk("done (%u pages loaded)\n", n);
936 return error;
937}
938
939int swsusp_read(struct pbe **pblist_ptr)
940{
941 int error;
942 struct pbe *p, *pblist;
943 struct swap_map_handle handle;
944 unsigned int nr_pages;
945
946 if (IS_ERR(resume_bdev)) {
947 pr_debug("swsusp: block device not initialised\n");
948 return PTR_ERR(resume_bdev);
949 }
950
951 error = get_swap_map_reader(&handle, swsusp_header.image);
952 if (!error)
953 error = swap_map_read_page(&handle, &swsusp_info);
954 if (!error)
955 error = check_header();
956 if (error)
957 return error;
958 nr_pages = swsusp_info.image_pages;
959 p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
960 if (!p)
961 return -ENOMEM;
962 error = load_image_metadata(p, &handle);
963 if (!error) {
964 mark_unsafe_pages(p);
965 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
966 if (pblist)
967 copy_page_backup_list(pblist, p);
968 free_pagedir(p);
969 if (!pblist)
970 error = -ENOMEM;
971
972 /* Allocate memory for the image and read the data from swap */
973 if (!error)
974 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
975 if (!error) {
976 release_eaten_pages();
977 error = load_image_data(pblist, &handle, nr_pages);
978 }
979 if (!error)
980 *pblist_ptr = pblist;
981 }
982 release_swap_map_reader(&handle);
983
984 blkdev_put(resume_bdev);
985
986 if (!error)
987 pr_debug("swsusp: Reading resume file was successful\n");
988 else
989 pr_debug("swsusp: Error %d resuming\n", error);
990 return error;
991}
992
993/**
994 * swsusp_check - Check for swsusp signature in the resume device
995 */
996
997int swsusp_check(void)
998{
999 int error;
1000
1001 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1002 if (!IS_ERR(resume_bdev)) {
1003 set_blocksize(resume_bdev, PAGE_SIZE);
1004 memset(&swsusp_header, 0, sizeof(swsusp_header));
1005 if ((error = bio_read_page(0, &swsusp_header)))
1006 return error;
1007 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1008 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1009 /* Reset swap signature now */
1010 error = bio_write_page(0, &swsusp_header);
1011 } else {
1012 return -EINVAL;
1013 }
1014 if (error)
1015 blkdev_put(resume_bdev);
1016 else
1017 pr_debug("swsusp: Signature found, resuming\n");
1018 } else {
1019 error = PTR_ERR(resume_bdev);
1020 }
1021
1022 if (error)
1023 pr_debug("swsusp: Error %d check for resume file\n", error);
1024
1025 return error;
1026}
1027
1028/**
1029 * swsusp_close - close swap device.
1030 */
1031
1032void swsusp_close(void)
1033{
1034 if (IS_ERR(resume_bdev)) {
1035 pr_debug("swsusp: block device not initialised\n");
1036 return;
1037 }
1038
1039 blkdev_put(resume_bdev);
1040}
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 0000000000..3f1539fbe4
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,333 @@
1/*
2 * linux/kernel/power/user.c
3 *
4 * This file provides the user space interface for software suspend/resume.
5 *
6 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This file is released under the GPLv2.
9 *
10 */
11
12#include <linux/suspend.h>
13#include <linux/syscalls.h>
14#include <linux/string.h>
15#include <linux/device.h>
16#include <linux/miscdevice.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/swapops.h>
20#include <linux/pm.h>
21#include <linux/fs.h>
22
23#include <asm/uaccess.h>
24
25#include "power.h"
26
27#define SNAPSHOT_MINOR 231
28
29static struct snapshot_data {
30 struct snapshot_handle handle;
31 int swap;
32 struct bitmap_page *bitmap;
33 int mode;
34 char frozen;
35 char ready;
36} snapshot_state;
37
38static atomic_t device_available = ATOMIC_INIT(1);
39
40static int snapshot_open(struct inode *inode, struct file *filp)
41{
42 struct snapshot_data *data;
43
44 if (!atomic_add_unless(&device_available, -1, 0))
45 return -EBUSY;
46
47 if ((filp->f_flags & O_ACCMODE) == O_RDWR)
48 return -ENOSYS;
49
50 nonseekable_open(inode, filp);
51 data = &snapshot_state;
52 filp->private_data = data;
53 memset(&data->handle, 0, sizeof(struct snapshot_handle));
54 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
55 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
56 data->mode = O_RDONLY;
57 } else {
58 data->swap = -1;
59 data->mode = O_WRONLY;
60 }
61 data->bitmap = NULL;
62 data->frozen = 0;
63 data->ready = 0;
64
65 return 0;
66}
67
68static int snapshot_release(struct inode *inode, struct file *filp)
69{
70 struct snapshot_data *data;
71
72 swsusp_free();
73 data = filp->private_data;
74 free_all_swap_pages(data->swap, data->bitmap);
75 free_bitmap(data->bitmap);
76 if (data->frozen) {
77 down(&pm_sem);
78 thaw_processes();
79 enable_nonboot_cpus();
80 up(&pm_sem);
81 }
82 atomic_inc(&device_available);
83 return 0;
84}
85
86static ssize_t snapshot_read(struct file *filp, char __user *buf,
87 size_t count, loff_t *offp)
88{
89 struct snapshot_data *data;
90 ssize_t res;
91
92 data = filp->private_data;
93 res = snapshot_read_next(&data->handle, count);
94 if (res > 0) {
95 if (copy_to_user(buf, data_of(data->handle), res))
96 res = -EFAULT;
97 else
98 *offp = data->handle.offset;
99 }
100 return res;
101}
102
103static ssize_t snapshot_write(struct file *filp, const char __user *buf,
104 size_t count, loff_t *offp)
105{
106 struct snapshot_data *data;
107 ssize_t res;
108
109 data = filp->private_data;
110 res = snapshot_write_next(&data->handle, count);
111 if (res > 0) {
112 if (copy_from_user(data_of(data->handle), buf, res))
113 res = -EFAULT;
114 else
115 *offp = data->handle.offset;
116 }
117 return res;
118}
119
120static int snapshot_ioctl(struct inode *inode, struct file *filp,
121 unsigned int cmd, unsigned long arg)
122{
123 int error = 0;
124 struct snapshot_data *data;
125 loff_t offset, avail;
126
127 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
128 return -ENOTTY;
129 if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
130 return -ENOTTY;
131 if (!capable(CAP_SYS_ADMIN))
132 return -EPERM;
133
134 data = filp->private_data;
135
136 switch (cmd) {
137
138 case SNAPSHOT_FREEZE:
139 if (data->frozen)
140 break;
141 down(&pm_sem);
142 disable_nonboot_cpus();
143 if (freeze_processes()) {
144 thaw_processes();
145 enable_nonboot_cpus();
146 error = -EBUSY;
147 }
148 up(&pm_sem);
149 if (!error)
150 data->frozen = 1;
151 break;
152
153 case SNAPSHOT_UNFREEZE:
154 if (!data->frozen)
155 break;
156 down(&pm_sem);
157 thaw_processes();
158 enable_nonboot_cpus();
159 up(&pm_sem);
160 data->frozen = 0;
161 break;
162
163 case SNAPSHOT_ATOMIC_SNAPSHOT:
164 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
165 error = -EPERM;
166 break;
167 }
168 down(&pm_sem);
169 /* Free memory before shutting down devices. */
170 error = swsusp_shrink_memory();
171 if (!error) {
172 error = device_suspend(PMSG_FREEZE);
173 if (!error) {
174 in_suspend = 1;
175 error = swsusp_suspend();
176 device_resume();
177 }
178 }
179 up(&pm_sem);
180 if (!error)
181 error = put_user(in_suspend, (unsigned int __user *)arg);
182 if (!error)
183 data->ready = 1;
184 break;
185
186 case SNAPSHOT_ATOMIC_RESTORE:
187 if (data->mode != O_WRONLY || !data->frozen ||
188 !snapshot_image_loaded(&data->handle)) {
189 error = -EPERM;
190 break;
191 }
192 down(&pm_sem);
193 pm_prepare_console();
194 error = device_suspend(PMSG_FREEZE);
195 if (!error) {
196 error = swsusp_resume();
197 device_resume();
198 }
199 pm_restore_console();
200 up(&pm_sem);
201 break;
202
203 case SNAPSHOT_FREE:
204 swsusp_free();
205 memset(&data->handle, 0, sizeof(struct snapshot_handle));
206 data->ready = 0;
207 break;
208
209 case SNAPSHOT_SET_IMAGE_SIZE:
210 image_size = arg;
211 break;
212
213 case SNAPSHOT_AVAIL_SWAP:
214 avail = count_swap_pages(data->swap, 1);
215 avail <<= PAGE_SHIFT;
216 error = put_user(avail, (loff_t __user *)arg);
217 break;
218
219 case SNAPSHOT_GET_SWAP_PAGE:
220 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
221 error = -ENODEV;
222 break;
223 }
224 if (!data->bitmap) {
225 data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
226 if (!data->bitmap) {
227 error = -ENOMEM;
228 break;
229 }
230 }
231 offset = alloc_swap_page(data->swap, data->bitmap);
232 if (offset) {
233 offset <<= PAGE_SHIFT;
234 error = put_user(offset, (loff_t __user *)arg);
235 } else {
236 error = -ENOSPC;
237 }
238 break;
239
240 case SNAPSHOT_FREE_SWAP_PAGES:
241 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
242 error = -ENODEV;
243 break;
244 }
245 free_all_swap_pages(data->swap, data->bitmap);
246 free_bitmap(data->bitmap);
247 data->bitmap = NULL;
248 break;
249
250 case SNAPSHOT_SET_SWAP_FILE:
251 if (!data->bitmap) {
252 /*
253 * User space encodes device types as two-byte values,
254 * so we need to recode them
255 */
256 if (old_decode_dev(arg)) {
257 data->swap = swap_type_of(old_decode_dev(arg));
258 if (data->swap < 0)
259 error = -ENODEV;
260 } else {
261 data->swap = -1;
262 error = -EINVAL;
263 }
264 } else {
265 error = -EPERM;
266 }
267 break;
268
269 case SNAPSHOT_S2RAM:
270 if (!data->frozen) {
271 error = -EPERM;
272 break;
273 }
274
275 if (down_trylock(&pm_sem)) {
276 error = -EBUSY;
277 break;
278 }
279
280 if (pm_ops->prepare) {
281 error = pm_ops->prepare(PM_SUSPEND_MEM);
282 if (error)
283 goto OutS3;
284 }
285
286 /* Put devices to sleep */
287 error = device_suspend(PMSG_SUSPEND);
288 if (error) {
289 printk(KERN_ERR "Failed to suspend some devices.\n");
290 } else {
291 /* Enter S3, system is already frozen */
292 suspend_enter(PM_SUSPEND_MEM);
293
294 /* Wake up devices */
295 device_resume();
296 }
297
298 if (pm_ops->finish)
299 pm_ops->finish(PM_SUSPEND_MEM);
300
301OutS3:
302 up(&pm_sem);
303 break;
304
305 default:
306 error = -ENOTTY;
307
308 }
309
310 return error;
311}
312
313static struct file_operations snapshot_fops = {
314 .open = snapshot_open,
315 .release = snapshot_release,
316 .read = snapshot_read,
317 .write = snapshot_write,
318 .llseek = no_llseek,
319 .ioctl = snapshot_ioctl,
320};
321
322static struct miscdevice snapshot_device = {
323 .minor = SNAPSHOT_MINOR,
324 .name = "snapshot",
325 .fops = &snapshot_fops,
326};
327
328static int __init snapshot_device_init(void)
329{
330 return misc_register(&snapshot_device);
331};
332
333device_initcall(snapshot_device_init);
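
Taken together, these ioctls define a small userspace protocol: open /dev/snapshot read-only, SNAPSHOT_FREEZE, SNAPSHOT_ATOMIC_SNAPSHOT, then read() the image page by page (the write-only side works the other way round for resume, ending in SNAPSHOT_ATOMIC_RESTORE). A minimal suspend-side sketch, assuming the SNAPSHOT_* ioctl numbers have been copied from kernel/power/power.h into a local "snapshot_ioctls.h":

	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include "snapshot_ioctls.h"	/* assumed copy of the SNAPSHOT_* definitions */

	#define PAGE_SIZE 4096		/* assumed */

	int main(void)
	{
		char buf[PAGE_SIZE];
		unsigned int in_suspend;
		ssize_t n;
		int fd = open("/dev/snapshot", O_RDONLY);

		if (fd < 0 || ioctl(fd, SNAPSHOT_FREEZE))
			return 1;
		if (ioctl(fd, SNAPSHOT_ATOMIC_SNAPSHOT, &in_suspend))
			goto thaw;
		if (in_suspend)		/* this instance took the image */
			while ((n = read(fd, buf, PAGE_SIZE)) > 0)
				fwrite(buf, 1, n, stdout);	/* stand-in for writing to swap */
		ioctl(fd, SNAPSHOT_FREE);
	thaw:
		ioctl(fd, SNAPSHOT_UNFREEZE);
		close(fd);
		return 0;
	}
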
diff --git a/kernel/printk.c b/kernel/printk.c
index 13ced0f782..c056f33244 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -122,44 +122,6 @@ static char *log_buf = __log_buf;
122static int log_buf_len = __LOG_BUF_LEN; 122static int log_buf_len = __LOG_BUF_LEN;
123static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ 123static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
124 124
125/*
126 * Setup a list of consoles. Called from init/main.c
127 */
128static int __init console_setup(char *str)
129{
130 char name[sizeof(console_cmdline[0].name)];
131 char *s, *options;
132 int idx;
133
134 /*
135 * Decode str into name, index, options.
136 */
137 if (str[0] >= '0' && str[0] <= '9') {
138 strcpy(name, "ttyS");
139 strncpy(name + 4, str, sizeof(name) - 5);
140 } else
141 strncpy(name, str, sizeof(name) - 1);
142 name[sizeof(name) - 1] = 0;
143 if ((options = strchr(str, ',')) != NULL)
144 *(options++) = 0;
145#ifdef __sparc__
146 if (!strcmp(str, "ttya"))
147 strcpy(name, "ttyS0");
148 if (!strcmp(str, "ttyb"))
149 strcpy(name, "ttyS1");
150#endif
151 for (s = name; *s; s++)
152 if ((*s >= '0' && *s <= '9') || *s == ',')
153 break;
154 idx = simple_strtoul(s, NULL, 10);
155 *s = 0;
156
157 add_preferred_console(name, idx, options);
158 return 1;
159}
160
161__setup("console=", console_setup);
162
163static int __init log_buf_len_setup(char *str) 125static int __init log_buf_len_setup(char *str)
164{ 126{
165 unsigned long size = memparse(str, &str); 127 unsigned long size = memparse(str, &str);
@@ -398,8 +360,7 @@ static void call_console_drivers(unsigned long start, unsigned long end)
398 unsigned long cur_index, start_print; 360 unsigned long cur_index, start_print;
399 static int msg_level = -1; 361 static int msg_level = -1;
400 362
401 if (((long)(start - end)) > 0) 363 BUG_ON(((long)(start - end)) > 0);
402 BUG();
403 364
404 cur_index = start; 365 cur_index = start;
405 start_print = start; 366 start_print = start;
@@ -659,6 +620,44 @@ static void call_console_drivers(unsigned long start, unsigned long end)
659 620
660#endif 621#endif
661 622
623/*
624 * Set up a list of consoles. Called from init/main.c
625 */
626static int __init console_setup(char *str)
627{
628 char name[sizeof(console_cmdline[0].name)];
629 char *s, *options;
630 int idx;
631
632 /*
633 * Decode str into name, index, options.
634 */
635 if (str[0] >= '0' && str[0] <= '9') {
636 strcpy(name, "ttyS");
637 strncpy(name + 4, str, sizeof(name) - 5);
638 } else {
639 strncpy(name, str, sizeof(name) - 1);
640 }
641 name[sizeof(name) - 1] = 0;
642 if ((options = strchr(str, ',')) != NULL)
643 *(options++) = 0;
644#ifdef __sparc__
645 if (!strcmp(str, "ttya"))
646 strcpy(name, "ttyS0");
647 if (!strcmp(str, "ttyb"))
648 strcpy(name, "ttyS1");
649#endif
650 for (s = name; *s; s++)
651 if ((*s >= '0' && *s <= '9') || *s == ',')
652 break;
653 idx = simple_strtoul(s, NULL, 10);
654 *s = 0;
655
656 add_preferred_console(name, idx, options);
657 return 1;
658}
659__setup("console=", console_setup);
660
662/** 661/**
663 * add_preferred_console - add a device to the list of preferred consoles. 662 * add_preferred_console - add a device to the list of preferred consoles.
664 * @name: device name 663 * @name: device name
@@ -708,8 +707,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
708 */ 707 */
709void acquire_console_sem(void) 708void acquire_console_sem(void)
710{ 709{
711 if (in_interrupt()) 710 BUG_ON(in_interrupt());
712 BUG();
713 down(&console_sem); 711 down(&console_sem);
714 console_locked = 1; 712 console_locked = 1;
715 console_may_schedule = 1; 713 console_may_schedule = 1;
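
For reference, console_setup() above just splits the console= string: "console=ttyS0,115200n8" becomes name "ttyS", index 0 and options "115200n8", and the bare-digit legacy form "console=1,9600" is normalised to "ttyS" index 1 with options "9600" before being handed to add_preferred_console(). Platform code can register the same preference directly; a minimal, hypothetical board-setup sketch:

        #include <linux/init.h>
        #include <linux/console.h>

        /* Equivalent to booting with "console=ttyS1,57600n8". */
        void __init board_setup_console(void)
        {
                add_preferred_console("ttyS", 1, "57600n8");
        }
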
diff --git a/kernel/profile.c b/kernel/profile.c
index f89248e6d7..68afe121e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/profile.h> 24#include <linux/profile.h>
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/mutex.h>
26#include <asm/sections.h> 27#include <asm/sections.h>
27#include <asm/semaphore.h> 28#include <asm/semaphore.h>
28 29
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
44#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
45static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
46static DEFINE_PER_CPU(int, cpu_profile_flip); 47static DEFINE_PER_CPU(int, cpu_profile_flip);
47static DECLARE_MUTEX(profile_flip_mutex); 48static DEFINE_MUTEX(profile_flip_mutex);
48#endif /* CONFIG_SMP */ 49#endif /* CONFIG_SMP */
49 50
50static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
@@ -86,72 +87,52 @@ void __init profile_init(void)
86 87
87#ifdef CONFIG_PROFILING 88#ifdef CONFIG_PROFILING
88 89
89static DECLARE_RWSEM(profile_rwsem); 90static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
90static DEFINE_RWLOCK(handoff_lock); 91static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
91static struct notifier_block * task_exit_notifier; 92static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
92static struct notifier_block * task_free_notifier;
93static struct notifier_block * munmap_notifier;
94 93
95void profile_task_exit(struct task_struct * task) 94void profile_task_exit(struct task_struct * task)
96{ 95{
97 down_read(&profile_rwsem); 96 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
98 notifier_call_chain(&task_exit_notifier, 0, task);
99 up_read(&profile_rwsem);
100} 97}
101 98
102int profile_handoff_task(struct task_struct * task) 99int profile_handoff_task(struct task_struct * task)
103{ 100{
104 int ret; 101 int ret;
105 read_lock(&handoff_lock); 102 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
106 ret = notifier_call_chain(&task_free_notifier, 0, task);
107 read_unlock(&handoff_lock);
108 return (ret == NOTIFY_OK) ? 1 : 0; 103 return (ret == NOTIFY_OK) ? 1 : 0;
109} 104}
110 105
111void profile_munmap(unsigned long addr) 106void profile_munmap(unsigned long addr)
112{ 107{
113 down_read(&profile_rwsem); 108 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
114 notifier_call_chain(&munmap_notifier, 0, (void *)addr);
115 up_read(&profile_rwsem);
116} 109}
117 110
118int task_handoff_register(struct notifier_block * n) 111int task_handoff_register(struct notifier_block * n)
119{ 112{
120 int err = -EINVAL; 113 return atomic_notifier_chain_register(&task_free_notifier, n);
121
122 write_lock(&handoff_lock);
123 err = notifier_chain_register(&task_free_notifier, n);
124 write_unlock(&handoff_lock);
125 return err;
126} 114}
127 115
128int task_handoff_unregister(struct notifier_block * n) 116int task_handoff_unregister(struct notifier_block * n)
129{ 117{
130 int err = -EINVAL; 118 return atomic_notifier_chain_unregister(&task_free_notifier, n);
131
132 write_lock(&handoff_lock);
133 err = notifier_chain_unregister(&task_free_notifier, n);
134 write_unlock(&handoff_lock);
135 return err;
136} 119}
137 120
138int profile_event_register(enum profile_type type, struct notifier_block * n) 121int profile_event_register(enum profile_type type, struct notifier_block * n)
139{ 122{
140 int err = -EINVAL; 123 int err = -EINVAL;
141 124
142 down_write(&profile_rwsem);
143
144 switch (type) { 125 switch (type) {
145 case PROFILE_TASK_EXIT: 126 case PROFILE_TASK_EXIT:
146 err = notifier_chain_register(&task_exit_notifier, n); 127 err = blocking_notifier_chain_register(
128 &task_exit_notifier, n);
147 break; 129 break;
148 case PROFILE_MUNMAP: 130 case PROFILE_MUNMAP:
149 err = notifier_chain_register(&munmap_notifier, n); 131 err = blocking_notifier_chain_register(
132 &munmap_notifier, n);
150 break; 133 break;
151 } 134 }
152 135
153 up_write(&profile_rwsem);
154
155 return err; 136 return err;
156} 137}
157 138
@@ -160,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
160{ 141{
161 int err = -EINVAL; 142 int err = -EINVAL;
162 143
163 down_write(&profile_rwsem);
164
165 switch (type) { 144 switch (type) {
166 case PROFILE_TASK_EXIT: 145 case PROFILE_TASK_EXIT:
167 err = notifier_chain_unregister(&task_exit_notifier, n); 146 err = blocking_notifier_chain_unregister(
147 &task_exit_notifier, n);
168 break; 148 break;
169 case PROFILE_MUNMAP: 149 case PROFILE_MUNMAP:
170 err = notifier_chain_unregister(&munmap_notifier, n); 150 err = blocking_notifier_chain_unregister(
151 &munmap_notifier, n);
171 break; 152 break;
172 } 153 }
173 154
174 up_write(&profile_rwsem);
175 return err; 155 return err;
176} 156}
177 157
@@ -243,7 +223,7 @@ static void profile_flip_buffers(void)
243{ 223{
244 int i, j, cpu; 224 int i, j, cpu;
245 225
246 down(&profile_flip_mutex); 226 mutex_lock(&profile_flip_mutex);
247 j = per_cpu(cpu_profile_flip, get_cpu()); 227 j = per_cpu(cpu_profile_flip, get_cpu());
248 put_cpu(); 228 put_cpu();
249 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 229 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -259,14 +239,14 @@ static void profile_flip_buffers(void)
259 hits[i].hits = hits[i].pc = 0; 239 hits[i].hits = hits[i].pc = 0;
260 } 240 }
261 } 241 }
262 up(&profile_flip_mutex); 242 mutex_unlock(&profile_flip_mutex);
263} 243}
264 244
265static void profile_discard_flip_buffers(void) 245static void profile_discard_flip_buffers(void)
266{ 246{
267 int i, cpu; 247 int i, cpu;
268 248
269 down(&profile_flip_mutex); 249 mutex_lock(&profile_flip_mutex);
270 i = per_cpu(cpu_profile_flip, get_cpu()); 250 i = per_cpu(cpu_profile_flip, get_cpu());
271 put_cpu(); 251 put_cpu();
272 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 252 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -274,7 +254,7 @@ static void profile_discard_flip_buffers(void)
274 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; 254 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
275 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); 255 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
276 } 256 }
277 up(&profile_flip_mutex); 257 mutex_unlock(&profile_flip_mutex);
278} 258}
279 259
280void profile_hit(int type, void *__pc) 260void profile_hit(int type, void *__pc)
@@ -319,7 +299,7 @@ out:
319} 299}
320 300
321#ifdef CONFIG_HOTPLUG_CPU 301#ifdef CONFIG_HOTPLUG_CPU
322static int __devinit profile_cpu_callback(struct notifier_block *info, 302static int profile_cpu_callback(struct notifier_block *info,
323 unsigned long action, void *__cpu) 303 unsigned long action, void *__cpu)
324{ 304{
325 int node, cpu = (unsigned long)__cpu; 305 int node, cpu = (unsigned long)__cpu;
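
The conversion above swaps the open-coded notifier chains and their private locks for the self-locking blocking/atomic notifier heads; callers of the profile-event API are unchanged. As a reminder of what such a caller looks like (a minimal sketch with made-up names):

        #include <linux/module.h>
        #include <linux/profile.h>
        #include <linux/notifier.h>

        /* Hypothetical client: log every munmap() address. */
        static int my_munmap_event(struct notifier_block *nb,
                                   unsigned long val, void *data)
        {
                unsigned long addr = (unsigned long)data;

                printk(KERN_DEBUG "munmap at %#lx\n", addr);
                return NOTIFY_OK;
        }

        static struct notifier_block my_munmap_nb = {
                .notifier_call = my_munmap_event,
        };

        static int __init my_client_init(void)
        {
                /* now backed by the BLOCKING_NOTIFIER_HEAD declared above */
                return profile_event_register(PROFILE_MUNMAP, &my_munmap_nb);
        }

        static void __exit my_client_exit(void)
        {
                profile_event_unregister(PROFILE_MUNMAP, &my_munmap_nb);
        }

        module_init(my_client_init);
        module_exit(my_client_exit);
        MODULE_LICENSE("GPL");
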
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5f33cdb6ff..921c22ad16 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -30,14 +30,13 @@
30 */ 30 */
31void __ptrace_link(task_t *child, task_t *new_parent) 31void __ptrace_link(task_t *child, task_t *new_parent)
32{ 32{
33 if (!list_empty(&child->ptrace_list)) 33 BUG_ON(!list_empty(&child->ptrace_list));
34 BUG();
35 if (child->parent == new_parent) 34 if (child->parent == new_parent)
36 return; 35 return;
37 list_add(&child->ptrace_list, &child->parent->ptrace_children); 36 list_add(&child->ptrace_list, &child->parent->ptrace_children);
38 REMOVE_LINKS(child); 37 remove_parent(child);
39 child->parent = new_parent; 38 child->parent = new_parent;
40 SET_LINKS(child); 39 add_parent(child);
41} 40}
42 41
43/* 42/*
@@ -57,10 +56,6 @@ void ptrace_untrace(task_t *child)
57 signal_wake_up(child, 1); 56 signal_wake_up(child, 1);
58 } 57 }
59 } 58 }
60 if (child->signal->flags & SIGNAL_GROUP_EXIT) {
61 sigaddset(&child->pending.signal, SIGKILL);
62 signal_wake_up(child, 1);
63 }
64 spin_unlock(&child->sighand->siglock); 59 spin_unlock(&child->sighand->siglock);
65} 60}
66 61
@@ -72,17 +67,18 @@ void ptrace_untrace(task_t *child)
72 */ 67 */
73void __ptrace_unlink(task_t *child) 68void __ptrace_unlink(task_t *child)
74{ 69{
75 if (!child->ptrace) 70 BUG_ON(!child->ptrace);
76 BUG(); 71
77 child->ptrace = 0; 72 child->ptrace = 0;
78 if (!list_empty(&child->ptrace_list)) { 73 if (!list_empty(&child->ptrace_list)) {
79 list_del_init(&child->ptrace_list); 74 list_del_init(&child->ptrace_list);
80 REMOVE_LINKS(child); 75 remove_parent(child);
81 child->parent = child->real_parent; 76 child->parent = child->real_parent;
82 SET_LINKS(child); 77 add_parent(child);
83 } 78 }
84 79
85 ptrace_untrace(child); 80 if (child->state == TASK_TRACED)
81 ptrace_untrace(child);
86} 82}
87 83
88/* 84/*
@@ -152,12 +148,34 @@ int ptrace_may_attach(struct task_struct *task)
152int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
153{ 149{
154 int retval; 150 int retval;
155 task_lock(task); 151
156 retval = -EPERM; 152 retval = -EPERM;
157 if (task->pid <= 1) 153 if (task->pid <= 1)
158 goto bad; 154 goto out;
159 if (task->tgid == current->tgid) 155 if (task->tgid == current->tgid)
160 goto bad; 156 goto out;
157
158repeat:
159 /*
160 * Nasty, nasty.
161 *
162 * We want to hold both the task-lock and the
163 * tasklist_lock for writing at the same time.
164 * But that's against the rules (tasklist_lock
165 * is taken for reading by interrupts on other
166 * cpu's that may have task_lock).
167 */
168 task_lock(task);
169 local_irq_disable();
170 if (!write_trylock(&tasklist_lock)) {
171 local_irq_enable();
172 task_unlock(task);
173 do {
174 cpu_relax();
175 } while (!write_can_lock(&tasklist_lock));
176 goto repeat;
177 }
178
161 /* the same process cannot be attached many times */ 179 /* the same process cannot be attached many times */
162 if (task->ptrace & PT_PTRACED) 180 if (task->ptrace & PT_PTRACED)
163 goto bad; 181 goto bad;
@@ -170,36 +188,39 @@ int ptrace_attach(struct task_struct *task)
170 ? PT_ATTACHED : 0); 188 ? PT_ATTACHED : 0);
171 if (capable(CAP_SYS_PTRACE)) 189 if (capable(CAP_SYS_PTRACE))
172 task->ptrace |= PT_PTRACE_CAP; 190 task->ptrace |= PT_PTRACE_CAP;
173 task_unlock(task);
174 191
175 write_lock_irq(&tasklist_lock);
176 __ptrace_link(task, current); 192 __ptrace_link(task, current);
177 write_unlock_irq(&tasklist_lock);
178 193
179 force_sig_specific(SIGSTOP, task); 194 force_sig_specific(SIGSTOP, task);
180 return 0;
181 195
182bad: 196bad:
197 write_unlock_irq(&tasklist_lock);
183 task_unlock(task); 198 task_unlock(task);
199out:
184 return retval; 200 return retval;
185} 201}
186 202
203void __ptrace_detach(struct task_struct *child, unsigned int data)
204{
205 child->exit_code = data;
206 /* .. re-parent .. */
207 __ptrace_unlink(child);
208 /* .. and wake it up. */
209 if (child->exit_state != EXIT_ZOMBIE)
210 wake_up_process(child);
211}
212
187int ptrace_detach(struct task_struct *child, unsigned int data) 213int ptrace_detach(struct task_struct *child, unsigned int data)
188{ 214{
189 if (!valid_signal(data)) 215 if (!valid_signal(data))
190 return -EIO; 216 return -EIO;
191 217
192 /* Architecture-specific hardware disable .. */ 218 /* Architecture-specific hardware disable .. */
193 ptrace_disable(child); 219 ptrace_disable(child);
194 220
195 /* .. re-parent .. */
196 child->exit_code = data;
197
198 write_lock_irq(&tasklist_lock); 221 write_lock_irq(&tasklist_lock);
199 __ptrace_unlink(child); 222 if (child->ptrace)
200 /* .. and wake it up. */ 223 __ptrace_detach(child, data);
201 if (child->exit_state != EXIT_ZOMBIE)
202 wake_up_process(child);
203 write_unlock_irq(&tasklist_lock); 224 write_unlock_irq(&tasklist_lock);
204 225
205 return 0; 226 return 0;
@@ -242,8 +263,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
242 if (write) { 263 if (write) {
243 copy_to_user_page(vma, page, addr, 264 copy_to_user_page(vma, page, addr,
244 maddr + offset, buf, bytes); 265 maddr + offset, buf, bytes);
245 if (!PageCompound(page)) 266 set_page_dirty_lock(page);
246 set_page_dirty_lock(page);
247 } else { 267 } else {
248 copy_from_user_page(vma, page, addr, 268 copy_from_user_page(vma, page, addr,
249 buf, maddr + offset, bytes); 269 buf, maddr + offset, bytes);
@@ -417,21 +437,22 @@ int ptrace_request(struct task_struct *child, long request,
417 */ 437 */
418int ptrace_traceme(void) 438int ptrace_traceme(void)
419{ 439{
420 int ret; 440 int ret = -EPERM;
421 441
422 /* 442 /*
423 * Are we already being traced? 443 * Are we already being traced?
424 */ 444 */
425 if (current->ptrace & PT_PTRACED) 445 task_lock(current);
426 return -EPERM; 446 if (!(current->ptrace & PT_PTRACED)) {
427 ret = security_ptrace(current->parent, current); 447 ret = security_ptrace(current->parent, current);
428 if (ret) 448 /*
429 return -EPERM; 449 * Set the ptrace bit in the process ptrace flags.
430 /* 450 */
431 * Set the ptrace bit in the process ptrace flags. 451 if (!ret)
432 */ 452 current->ptrace |= PT_PTRACED;
433 current->ptrace |= PT_PTRACED; 453 }
434 return 0; 454 task_unlock(current);
455 return ret;
435} 456}
436 457
437/** 458/**
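
The lock-ordering trick in ptrace_attach() above is worth spelling out: it needs task_lock() and the tasklist_lock write lock together, but tasklist_lock is taken for reading from interrupts on other CPUs that may already hold a task lock, so a straightforward nested write_lock_irq() could deadlock. Schematically, the pattern is a trylock with back-off and retry (generic names, not from this patch):

        #include <linux/spinlock.h>
        #include <linux/interrupt.h>

        static DEFINE_SPINLOCK(a);      /* stands in for the per-task alloc_lock */
        static DEFINE_RWLOCK(b);        /* stands in for tasklist_lock */

        static void take_both(void)
        {
        repeat:
                spin_lock(&a);
                local_irq_disable();
                if (!write_trylock(&b)) {
                        /* back off completely, wait until b looks free, retry */
                        local_irq_enable();
                        spin_unlock(&a);
                        while (!write_can_lock(&b))
                                cpu_relax();
                        goto repeat;
                }

                /* ... both locks held, interrupts off ... */

                write_unlock(&b);
                local_irq_enable();
                spin_unlock(&a);
        }
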
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 0cf8146bd5..2058f88c7b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -47,15 +47,16 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/mutex.h>
50 51
51/* Definition for rcupdate control block. */ 52/* Definition for rcupdate control block. */
52struct rcu_ctrlblk rcu_ctrlblk = { 53static struct rcu_ctrlblk rcu_ctrlblk = {
53 .cur = -300, 54 .cur = -300,
54 .completed = -300, 55 .completed = -300,
55 .lock = SPIN_LOCK_UNLOCKED, 56 .lock = SPIN_LOCK_UNLOCKED,
56 .cpumask = CPU_MASK_NONE, 57 .cpumask = CPU_MASK_NONE,
57}; 58};
58struct rcu_ctrlblk rcu_bh_ctrlblk = { 59static struct rcu_ctrlblk rcu_bh_ctrlblk = {
59 .cur = -300, 60 .cur = -300,
60 .completed = -300, 61 .completed = -300,
61 .lock = SPIN_LOCK_UNLOCKED, 62 .lock = SPIN_LOCK_UNLOCKED,
@@ -67,7 +68,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
67 68
68/* Fake initialization required by compiler */ 69/* Fake initialization required by compiler */
69static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 70static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
70static int maxbatch = 10000; 71static int blimit = 10;
72static int qhimark = 10000;
73static int qlowmark = 100;
74#ifdef CONFIG_SMP
75static int rsinterval = 1000;
76#endif
77
78static atomic_t rcu_barrier_cpu_count;
79static DEFINE_MUTEX(rcu_barrier_mutex);
80static struct completion rcu_barrier_completion;
81
82#ifdef CONFIG_SMP
83static void force_quiescent_state(struct rcu_data *rdp,
84 struct rcu_ctrlblk *rcp)
85{
86 int cpu;
87 cpumask_t cpumask;
88 set_need_resched();
89 if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
90 rdp->last_rs_qlen = rdp->qlen;
91 /*
92 * Don't send IPI to itself. With irqs disabled,
93 * rdp->cpu is the current cpu.
94 */
95 cpumask = rcp->cpumask;
96 cpu_clear(rdp->cpu, cpumask);
97 for_each_cpu_mask(cpu, cpumask)
98 smp_send_reschedule(cpu);
99 }
100}
101#else
102static inline void force_quiescent_state(struct rcu_data *rdp,
103 struct rcu_ctrlblk *rcp)
104{
105 set_need_resched();
106}
107#endif
71 108
72/** 109/**
73 * call_rcu - Queue an RCU callback for invocation after a grace period. 110 * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -92,17 +129,13 @@ void fastcall call_rcu(struct rcu_head *head,
92 rdp = &__get_cpu_var(rcu_data); 129 rdp = &__get_cpu_var(rcu_data);
93 *rdp->nxttail = head; 130 *rdp->nxttail = head;
94 rdp->nxttail = &head->next; 131 rdp->nxttail = &head->next;
95 132 if (unlikely(++rdp->qlen > qhimark)) {
96 if (unlikely(++rdp->count > 10000)) 133 rdp->blimit = INT_MAX;
97 set_need_resched(); 134 force_quiescent_state(rdp, &rcu_ctrlblk);
98 135 }
99 local_irq_restore(flags); 136 local_irq_restore(flags);
100} 137}
101 138
102static atomic_t rcu_barrier_cpu_count;
103static struct semaphore rcu_barrier_sema;
104static struct completion rcu_barrier_completion;
105
106/** 139/**
107 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. 140 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
108 * @head: structure to be used for queueing the RCU updates. 141 * @head: structure to be used for queueing the RCU updates.
@@ -131,12 +164,12 @@ void fastcall call_rcu_bh(struct rcu_head *head,
131 rdp = &__get_cpu_var(rcu_bh_data); 164 rdp = &__get_cpu_var(rcu_bh_data);
132 *rdp->nxttail = head; 165 *rdp->nxttail = head;
133 rdp->nxttail = &head->next; 166 rdp->nxttail = &head->next;
134 rdp->count++; 167
135/* 168 if (unlikely(++rdp->qlen > qhimark)) {
136 * Should we directly call rcu_do_batch() here ? 169 rdp->blimit = INT_MAX;
137 * if (unlikely(rdp->count > 10000)) 170 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
138 * rcu_do_batch(rdp); 171 }
139 */ 172
140 local_irq_restore(flags); 173 local_irq_restore(flags);
141} 174}
142 175
@@ -175,13 +208,13 @@ static void rcu_barrier_func(void *notused)
175void rcu_barrier(void) 208void rcu_barrier(void)
176{ 209{
177 BUG_ON(in_interrupt()); 210 BUG_ON(in_interrupt());
178 /* Take cpucontrol semaphore to protect against CPU hotplug */ 211 /* Take cpucontrol mutex to protect against CPU hotplug */
179 down(&rcu_barrier_sema); 212 mutex_lock(&rcu_barrier_mutex);
180 init_completion(&rcu_barrier_completion); 213 init_completion(&rcu_barrier_completion);
181 atomic_set(&rcu_barrier_cpu_count, 0); 214 atomic_set(&rcu_barrier_cpu_count, 0);
182 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 215 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
183 wait_for_completion(&rcu_barrier_completion); 216 wait_for_completion(&rcu_barrier_completion);
184 up(&rcu_barrier_sema); 217 mutex_unlock(&rcu_barrier_mutex);
185} 218}
186EXPORT_SYMBOL_GPL(rcu_barrier); 219EXPORT_SYMBOL_GPL(rcu_barrier);
187 220
@@ -199,10 +232,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
199 next = rdp->donelist = list->next; 232 next = rdp->donelist = list->next;
200 list->func(list); 233 list->func(list);
201 list = next; 234 list = next;
202 rdp->count--; 235 rdp->qlen--;
203 if (++count >= maxbatch) 236 if (++count >= rdp->blimit)
204 break; 237 break;
205 } 238 }
239 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
240 rdp->blimit = blimit;
206 if (!rdp->donelist) 241 if (!rdp->donelist)
207 rdp->donetail = &rdp->donelist; 242 rdp->donetail = &rdp->donelist;
208 else 243 else
@@ -381,8 +416,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
381 rdp->curtail = &rdp->curlist; 416 rdp->curtail = &rdp->curlist;
382 } 417 }
383 418
384 local_irq_disable();
385 if (rdp->nxtlist && !rdp->curlist) { 419 if (rdp->nxtlist && !rdp->curlist) {
420 local_irq_disable();
386 rdp->curlist = rdp->nxtlist; 421 rdp->curlist = rdp->nxtlist;
387 rdp->curtail = rdp->nxttail; 422 rdp->curtail = rdp->nxttail;
388 rdp->nxtlist = NULL; 423 rdp->nxtlist = NULL;
@@ -407,9 +442,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
407 rcu_start_batch(rcp); 442 rcu_start_batch(rcp);
408 spin_unlock(&rcp->lock); 443 spin_unlock(&rcp->lock);
409 } 444 }
410 } else {
411 local_irq_enable();
412 } 445 }
446
413 rcu_check_quiescent_state(rcp, rdp); 447 rcu_check_quiescent_state(rcp, rdp);
414 if (rdp->donelist) 448 if (rdp->donelist)
415 rcu_do_batch(rdp); 449 rcu_do_batch(rdp);
@@ -445,12 +479,31 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
445 return 0; 479 return 0;
446} 480}
447 481
482/*
483 * Check to see if there is any immediate RCU-related work to be done
484 * by the current CPU, returning 1 if so. This function is part of the
485 * RCU implementation; it is -not- an exported member of the RCU API.
486 */
448int rcu_pending(int cpu) 487int rcu_pending(int cpu)
449{ 488{
450 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || 489 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
451 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); 490 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
452} 491}
453 492
493/*
494 * Check to see if any future RCU-related work will need to be done
495 * by the current CPU, even if none need be done immediately, returning
496 * 1 if so. This function is part of the RCU implementation; it is -not-
497 * an exported member of the RCU API.
498 */
499int rcu_needs_cpu(int cpu)
500{
501 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
502 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
503
504 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
505}
506
454void rcu_check_callbacks(int cpu, int user) 507void rcu_check_callbacks(int cpu, int user)
455{ 508{
456 if (user || 509 if (user ||
@@ -473,6 +526,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
473 rdp->quiescbatch = rcp->completed; 526 rdp->quiescbatch = rcp->completed;
474 rdp->qs_pending = 0; 527 rdp->qs_pending = 0;
475 rdp->cpu = cpu; 528 rdp->cpu = cpu;
529 rdp->blimit = blimit;
476} 530}
477 531
478static void __devinit rcu_online_cpu(int cpu) 532static void __devinit rcu_online_cpu(int cpu)
@@ -485,7 +539,7 @@ static void __devinit rcu_online_cpu(int cpu)
485 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
486} 540}
487 541
488static int __devinit rcu_cpu_notify(struct notifier_block *self, 542static int rcu_cpu_notify(struct notifier_block *self,
489 unsigned long action, void *hcpu) 543 unsigned long action, void *hcpu)
490{ 544{
491 long cpu = (long)hcpu; 545 long cpu = (long)hcpu;
@@ -502,7 +556,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
502 return NOTIFY_OK; 556 return NOTIFY_OK;
503} 557}
504 558
505static struct notifier_block __devinitdata rcu_nb = { 559static struct notifier_block rcu_nb = {
506 .notifier_call = rcu_cpu_notify, 560 .notifier_call = rcu_cpu_notify,
507}; 561};
508 562
@@ -514,7 +568,6 @@ static struct notifier_block __devinitdata rcu_nb = {
514 */ 568 */
515void __init rcu_init(void) 569void __init rcu_init(void)
516{ 570{
517 sema_init(&rcu_barrier_sema, 1);
518 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 571 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
519 (void *)(long)smp_processor_id()); 572 (void *)(long)smp_processor_id());
520 /* Register notifier for non-boot CPUs */ 573 /* Register notifier for non-boot CPUs */
@@ -567,9 +620,14 @@ void synchronize_kernel(void)
567 synchronize_rcu(); 620 synchronize_rcu();
568} 621}
569 622
570module_param(maxbatch, int, 0); 623module_param(blimit, int, 0);
624module_param(qhimark, int, 0);
625module_param(qlowmark, int, 0);
626#ifdef CONFIG_SMP
627module_param(rsinterval, int, 0);
628#endif
571EXPORT_SYMBOL_GPL(rcu_batches_completed); 629EXPORT_SYMBOL_GPL(rcu_batches_completed);
572EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ 630EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */
573EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 631EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */
574EXPORT_SYMBOL_GPL(synchronize_rcu); 632EXPORT_SYMBOL_GPL(synchronize_rcu);
575EXPORT_SYMBOL(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ 633EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
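
The queue-length changes above replace the single maxbatch throttle with blimit/qhimark/qlowmark and the force_quiescent_state() kick, but the call_rcu() contract seen by callers is untouched. For reference, a generic caller (not from this patch) still embeds an rcu_head in its object and frees it from the callback:

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct my_node {
                int key;
                struct rcu_head rcu;    /* storage for the deferred free */
        };

        static void my_node_free_rcu(struct rcu_head *head)
        {
                struct my_node *node = container_of(head, struct my_node, rcu);

                kfree(node);
        }

        static void my_node_retire(struct my_node *node)
        {
                /*
                 * Queue the object for freeing after a grace period.  If this
                 * CPU's callback queue grows past qhimark (10000 by default),
                 * the new force_quiescent_state() path nudges the other CPUs.
                 */
                call_rcu(&node->rcu, my_node_free_rcu);
        }
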
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7712912dbc..8154e7589d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -54,15 +54,15 @@ static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56 56
57MODULE_PARM(nreaders, "i"); 57module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
59MODULE_PARM(stat_interval, "i"); 59module_param(stat_interval, int, 0);
60MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 60MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
61MODULE_PARM(verbose, "i"); 61module_param(verbose, bool, 0);
62MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 62MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
63MODULE_PARM(test_no_idle_hz, "i"); 63module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65MODULE_PARM(shuffle_interval, "i"); 65module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 67#define TORTURE_FLAG "rcutorture: "
68#define PRINTK_STRING(s) \ 68#define PRINTK_STRING(s) \
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
303 303
304 for_each_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -441,6 +441,16 @@ rcu_torture_shuffle(void *arg)
441 return 0; 441 return 0;
442} 442}
443 443
444static inline void
445rcu_torture_print_module_parms(char *tag)
446{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
451 shuffle_interval);
452}
453
444static void 454static void
445rcu_torture_cleanup(void) 455rcu_torture_cleanup(void)
446{ 456{
@@ -483,9 +493,10 @@ rcu_torture_cleanup(void)
483 rcu_barrier(); 493 rcu_barrier();
484 494
485 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
486 printk(KERN_ALERT TORTURE_FLAG 496 if (atomic_read(&n_rcu_torture_error))
487 "--- End of test: %s\n", 497 rcu_torture_print_module_parms("End of test: FAILURE");
488 atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); 498 else
499 rcu_torture_print_module_parms("End of test: SUCCESS");
489} 500}
490 501
491static int 502static int
@@ -501,11 +512,7 @@ rcu_torture_init(void)
501 nrealreaders = nreaders; 512 nrealreaders = nreaders;
502 else 513 else
503 nrealreaders = 2 * num_online_cpus(); 514 nrealreaders = 2 * num_online_cpus();
504 printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " 515 rcu_torture_print_module_parms("Start of test");
505 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
506 "shuffle_interval = %d\n",
507 nrealreaders, stat_interval, verbose, test_no_idle_hz,
508 shuffle_interval);
509 fullstop = 0; 516 fullstop = 0;
510 517
511 /* Set up the freelist. */ 518 /* Set up the freelist. */
@@ -528,7 +535,7 @@ rcu_torture_init(void)
528 atomic_set(&n_rcu_torture_error, 0); 535 atomic_set(&n_rcu_torture_error, 0);
529 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
530 atomic_set(&rcu_torture_wcount[i], 0); 537 atomic_set(&rcu_torture_wcount[i], 0);
531 for_each_cpu(cpu) { 538 for_each_possible_cpu(cpu) {
532 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
533 per_cpu(rcu_torture_count, cpu)[i] = 0; 540 per_cpu(rcu_torture_count, cpu)[i] = 0;
534 per_cpu(rcu_torture_batch, cpu)[i] = 0; 541 per_cpu(rcu_torture_batch, cpu)[i] = 0;
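
The MODULE_PARM() to module_param() conversion above is mechanical: the old "i" format string becomes an explicit type (int, or bool for flag-style parameters) plus a sysfs permission mask, with 0 meaning the parameter is not exposed under /sys/module/. In general (illustrative names, not from this file):

        #include <linux/module.h>
        #include <linux/moduleparam.h>

        /* Old style (removed):  MODULE_PARM(nloops, "i"); */
        static int nloops = 10;
        module_param(nloops, int, 0);           /* no sysfs file */
        MODULE_PARM_DESC(nloops, "Number of test loops per pass");

        static int debug;
        module_param(debug, bool, 0444);        /* readable at /sys/module/<mod>/parameters/debug */
        MODULE_PARM_DESC(debug, "Enable extra debugging output");
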
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 0000000000..33345e7348
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,1012 @@
1/*
2 * Public API and common code for kernel->userspace relay file support.
3 *
4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
5 *
6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
8 *
9 * Moved to kernel/relay.c by Paul Mundt, 2006.
10 *
11 * This file is released under the GPL.
12 */
13#include <linux/errno.h>
14#include <linux/stddef.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/relay.h>
19#include <linux/vmalloc.h>
20#include <linux/mm.h>
21
22/*
23 * close() vm_op implementation for relay file mapping.
24 */
25static void relay_file_mmap_close(struct vm_area_struct *vma)
26{
27 struct rchan_buf *buf = vma->vm_private_data;
28 buf->chan->cb->buf_unmapped(buf, vma->vm_file);
29}
30
31/*
32 * nopage() vm_op implementation for relay file mapping.
33 */
34static struct page *relay_buf_nopage(struct vm_area_struct *vma,
35 unsigned long address,
36 int *type)
37{
38 struct page *page;
39 struct rchan_buf *buf = vma->vm_private_data;
40 unsigned long offset = address - vma->vm_start;
41
42 if (address > vma->vm_end)
43 return NOPAGE_SIGBUS; /* Disallow mremap */
44 if (!buf)
45 return NOPAGE_OOM;
46
47 page = vmalloc_to_page(buf->start + offset);
48 if (!page)
49 return NOPAGE_OOM;
50 get_page(page);
51
52 if (type)
53 *type = VM_FAULT_MINOR;
54
55 return page;
56}
57
58/*
59 * vm_ops for relay file mappings.
60 */
61static struct vm_operations_struct relay_file_mmap_ops = {
62 .nopage = relay_buf_nopage,
63 .close = relay_file_mmap_close,
64};
65
66/**
67 * relay_mmap_buf: - mmap channel buffer to process address space
68 * @buf: relay channel buffer
69 * @vma: vm_area_struct describing memory to be mapped
70 *
71 * Returns 0 if ok, negative on error
72 *
73 * Caller should already have grabbed mmap_sem.
74 */
75int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
76{
77 unsigned long length = vma->vm_end - vma->vm_start;
78 struct file *filp = vma->vm_file;
79
80 if (!buf)
81 return -EBADF;
82
83 if (length != (unsigned long)buf->chan->alloc_size)
84 return -EINVAL;
85
86 vma->vm_ops = &relay_file_mmap_ops;
87 vma->vm_private_data = buf;
88 buf->chan->cb->buf_mapped(buf, filp);
89
90 return 0;
91}
92
93/**
94 * relay_alloc_buf - allocate a channel buffer
95 * @buf: the buffer struct
96 * @size: total size of the buffer
97 *
98 * Returns a pointer to the resulting buffer, NULL if unsuccessful. The
99 * passed in size will get page aligned, if it isn't already.
100 */
101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
102{
103 void *mem;
104 unsigned int i, j, n_pages;
105
106 *size = PAGE_ALIGN(*size);
107 n_pages = *size >> PAGE_SHIFT;
108
109 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
110 if (!buf->page_array)
111 return NULL;
112
113 for (i = 0; i < n_pages; i++) {
114 buf->page_array[i] = alloc_page(GFP_KERNEL);
115 if (unlikely(!buf->page_array[i]))
116 goto depopulate;
117 }
118 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
119 if (!mem)
120 goto depopulate;
121
122 memset(mem, 0, *size);
123 buf->page_count = n_pages;
124 return mem;
125
126depopulate:
127 for (j = 0; j < i; j++)
128 __free_page(buf->page_array[j]);
129 kfree(buf->page_array);
130 return NULL;
131}
132
133/**
134 * relay_create_buf - allocate and initialize a channel buffer
135 * @chan: the channel for which to allocate a buffer (the allocation
136 * size and sub-buffer count are taken from the channel)
137 *
138 * Returns channel buffer if successful, NULL otherwise
139 */
140struct rchan_buf *relay_create_buf(struct rchan *chan)
141{
142 struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
143 if (!buf)
144 return NULL;
145
146 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
147 if (!buf->padding)
148 goto free_buf;
149
150 buf->start = relay_alloc_buf(buf, &chan->alloc_size);
151 if (!buf->start)
152 goto free_buf;
153
154 buf->chan = chan;
155 kref_get(&buf->chan->kref);
156 return buf;
157
158free_buf:
159 kfree(buf->padding);
160 kfree(buf);
161 return NULL;
162}
163
164/**
165 * relay_destroy_channel - free the channel struct
166 *
167 * Should only be called from kref_put().
168 */
169void relay_destroy_channel(struct kref *kref)
170{
171 struct rchan *chan = container_of(kref, struct rchan, kref);
172 kfree(chan);
173}
174
175/**
176 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
177 * @buf: the buffer struct
178 */
179void relay_destroy_buf(struct rchan_buf *buf)
180{
181 struct rchan *chan = buf->chan;
182 unsigned int i;
183
184 if (likely(buf->start)) {
185 vunmap(buf->start);
186 for (i = 0; i < buf->page_count; i++)
187 __free_page(buf->page_array[i]);
188 kfree(buf->page_array);
189 }
190 kfree(buf->padding);
191 kfree(buf);
192 kref_put(&chan->kref, relay_destroy_channel);
193}
194
195/**
196 * relay_remove_buf - remove a channel buffer
197 *
198 * Removes the file from the filesystem, which also frees the
199 * rchan_buf_struct and the channel buffer. Should only be called from
200 * kref_put().
201 */
202void relay_remove_buf(struct kref *kref)
203{
204 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
205 buf->chan->cb->remove_buf_file(buf->dentry);
206 relay_destroy_buf(buf);
207}
208
209/**
210 * relay_buf_empty - boolean, is the channel buffer empty?
211 * @buf: channel buffer
212 *
213 * Returns 1 if the buffer is empty, 0 otherwise.
214 */
215int relay_buf_empty(struct rchan_buf *buf)
216{
217 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
218}
219EXPORT_SYMBOL_GPL(relay_buf_empty);
220
221/**
222 * relay_buf_full - boolean, is the channel buffer full?
223 * @buf: channel buffer
224 *
225 * Returns 1 if the buffer is full, 0 otherwise.
226 */
227int relay_buf_full(struct rchan_buf *buf)
228{
229 size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
230 return (ready >= buf->chan->n_subbufs) ? 1 : 0;
231}
232EXPORT_SYMBOL_GPL(relay_buf_full);
233
234/*
235 * High-level relay kernel API and associated functions.
236 */
237
238/*
239 * rchan_callback implementations defining default channel behavior. Used
240 * in place of corresponding NULL values in client callback struct.
241 */
242
243/*
244 * subbuf_start() default callback. Does nothing.
245 */
246static int subbuf_start_default_callback (struct rchan_buf *buf,
247 void *subbuf,
248 void *prev_subbuf,
249 size_t prev_padding)
250{
251 if (relay_buf_full(buf))
252 return 0;
253
254 return 1;
255}
256
257/*
258 * buf_mapped() default callback. Does nothing.
259 */
260static void buf_mapped_default_callback(struct rchan_buf *buf,
261 struct file *filp)
262{
263}
264
265/*
266 * buf_unmapped() default callback. Does nothing.
267 */
268static void buf_unmapped_default_callback(struct rchan_buf *buf,
269 struct file *filp)
270{
271}
272
273/*
274 * create_buf_file_create() default callback. Does nothing.
275 */
276static struct dentry *create_buf_file_default_callback(const char *filename,
277 struct dentry *parent,
278 int mode,
279 struct rchan_buf *buf,
280 int *is_global)
281{
282 return NULL;
283}
284
285/*
286 * remove_buf_file() default callback. Does nothing.
287 */
288static int remove_buf_file_default_callback(struct dentry *dentry)
289{
290 return -EINVAL;
291}
292
293/* relay channel default callbacks */
294static struct rchan_callbacks default_channel_callbacks = {
295 .subbuf_start = subbuf_start_default_callback,
296 .buf_mapped = buf_mapped_default_callback,
297 .buf_unmapped = buf_unmapped_default_callback,
298 .create_buf_file = create_buf_file_default_callback,
299 .remove_buf_file = remove_buf_file_default_callback,
300};
301
302/**
303 * wakeup_readers - wake up readers waiting on a channel
304 * @private: the channel buffer
305 *
306 * This is the work function used to defer reader waking. The
307 * reason waking is deferred is that calling directly from write
308 * causes problems if you're writing from say the scheduler.
309 */
310static void wakeup_readers(void *private)
311{
312 struct rchan_buf *buf = private;
313 wake_up_interruptible(&buf->read_wait);
314}
315
316/**
317 * __relay_reset - reset a channel buffer
318 * @buf: the channel buffer
319 * @init: 1 if this is a first-time initialization
320 *
321 * See relay_reset for description of effect.
322 */
323static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
324{
325 size_t i;
326
327 if (init) {
328 init_waitqueue_head(&buf->read_wait);
329 kref_init(&buf->kref);
330 INIT_WORK(&buf->wake_readers, NULL, NULL);
331 } else {
332 cancel_delayed_work(&buf->wake_readers);
333 flush_scheduled_work();
334 }
335
336 buf->subbufs_produced = 0;
337 buf->subbufs_consumed = 0;
338 buf->bytes_consumed = 0;
339 buf->finalized = 0;
340 buf->data = buf->start;
341 buf->offset = 0;
342
343 for (i = 0; i < buf->chan->n_subbufs; i++)
344 buf->padding[i] = 0;
345
346 buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
347}
348
349/**
350 * relay_reset - reset the channel
351 * @chan: the channel
352 *
353 * This has the effect of erasing all data from all channel buffers
354 * and restarting the channel in its initial state. The buffers
355 * are not freed, so any mappings are still in effect.
356 *
357 * NOTE: Care should be taken that the channel isn't actually
358 * being used by anything when this call is made.
359 */
360void relay_reset(struct rchan *chan)
361{
362 unsigned int i;
363 struct rchan_buf *prev = NULL;
364
365 if (!chan)
366 return;
367
368 for (i = 0; i < NR_CPUS; i++) {
369 if (!chan->buf[i] || chan->buf[i] == prev)
370 break;
371 __relay_reset(chan->buf[i], 0);
372 prev = chan->buf[i];
373 }
374}
375EXPORT_SYMBOL_GPL(relay_reset);
376
377/**
378 * relay_open_buf - create a new relay channel buffer
379 *
380 * Internal - used by relay_open().
381 */
382static struct rchan_buf *relay_open_buf(struct rchan *chan,
383 const char *filename,
384 struct dentry *parent,
385 int *is_global)
386{
387 struct rchan_buf *buf;
388 struct dentry *dentry;
389
390 if (*is_global)
391 return chan->buf[0];
392
393 buf = relay_create_buf(chan);
394 if (!buf)
395 return NULL;
396
397 /* Create file in fs */
398 dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
399 buf, is_global);
400 if (!dentry) {
401 relay_destroy_buf(buf);
402 return NULL;
403 }
404
405 buf->dentry = dentry;
406 __relay_reset(buf, 1);
407
408 return buf;
409}
410
411/**
412 * relay_close_buf - close a channel buffer
413 * @buf: channel buffer
414 *
415 * Marks the buffer finalized and restores the default callbacks.
416 * The channel buffer and channel buffer data structure are then freed
417 * automatically when the last reference is given up.
418 */
419static inline void relay_close_buf(struct rchan_buf *buf)
420{
421 buf->finalized = 1;
422 cancel_delayed_work(&buf->wake_readers);
423 flush_scheduled_work();
424 kref_put(&buf->kref, relay_remove_buf);
425}
426
427static inline void setup_callbacks(struct rchan *chan,
428 struct rchan_callbacks *cb)
429{
430 if (!cb) {
431 chan->cb = &default_channel_callbacks;
432 return;
433 }
434
435 if (!cb->subbuf_start)
436 cb->subbuf_start = subbuf_start_default_callback;
437 if (!cb->buf_mapped)
438 cb->buf_mapped = buf_mapped_default_callback;
439 if (!cb->buf_unmapped)
440 cb->buf_unmapped = buf_unmapped_default_callback;
441 if (!cb->create_buf_file)
442 cb->create_buf_file = create_buf_file_default_callback;
443 if (!cb->remove_buf_file)
444 cb->remove_buf_file = remove_buf_file_default_callback;
445 chan->cb = cb;
446}
447
448/**
449 * relay_open - create a new relay channel
450 * @base_filename: base name of files to create
451 * @parent: dentry of parent directory, NULL for root directory
452 * @subbuf_size: size of sub-buffers
453 * @n_subbufs: number of sub-buffers
454 * @cb: client callback functions
455 *
456 * Returns channel pointer if successful, NULL otherwise.
457 *
458 * Creates a channel buffer for each cpu using the sizes and
459 * attributes specified. The created channel buffer files
460 * will be named base_filename0...base_filenameN-1. File
461 * permissions will be S_IRUSR.
462 */
463struct rchan *relay_open(const char *base_filename,
464 struct dentry *parent,
465 size_t subbuf_size,
466 size_t n_subbufs,
467 struct rchan_callbacks *cb)
468{
469 unsigned int i;
470 struct rchan *chan;
471 char *tmpname;
472 int is_global = 0;
473
474 if (!base_filename)
475 return NULL;
476
477 if (!(subbuf_size && n_subbufs))
478 return NULL;
479
480 chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
481 if (!chan)
482 return NULL;
483
484 chan->version = RELAYFS_CHANNEL_VERSION;
485 chan->n_subbufs = n_subbufs;
486 chan->subbuf_size = subbuf_size;
487 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
488 setup_callbacks(chan, cb);
489 kref_init(&chan->kref);
490
491 tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
492 if (!tmpname)
493 goto free_chan;
494
495 for_each_online_cpu(i) {
496 sprintf(tmpname, "%s%d", base_filename, i);
497 chan->buf[i] = relay_open_buf(chan, tmpname, parent,
498 &is_global);
499 if (!chan->buf[i])
500 goto free_bufs;
501
502 chan->buf[i]->cpu = i;
503 }
504
505 kfree(tmpname);
506 return chan;
507
508free_bufs:
509 for (i = 0; i < NR_CPUS; i++) {
510 if (!chan->buf[i])
511 break;
512 relay_close_buf(chan->buf[i]);
513 if (is_global)
514 break;
515 }
516 kfree(tmpname);
517
518free_chan:
519 kref_put(&chan->kref, relay_destroy_channel);
520 return NULL;
521}
522EXPORT_SYMBOL_GPL(relay_open);
523
524/**
525 * relay_switch_subbuf - switch to a new sub-buffer
526 * @buf: channel buffer
527 * @length: size of current event
528 *
529 * Returns either the length passed in or 0 if full.
530 *
531 * Performs sub-buffer-switch tasks such as invoking callbacks,
532 * updating padding counts, waking up readers, etc.
533 */
534size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
535{
536 void *old, *new;
537 size_t old_subbuf, new_subbuf;
538
539 if (unlikely(length > buf->chan->subbuf_size))
540 goto toobig;
541
542 if (buf->offset != buf->chan->subbuf_size + 1) {
543 buf->prev_padding = buf->chan->subbuf_size - buf->offset;
544 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
545 buf->padding[old_subbuf] = buf->prev_padding;
546 buf->subbufs_produced++;
547 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
548 buf->padding[old_subbuf];
549 smp_mb();
550 if (waitqueue_active(&buf->read_wait)) {
551 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
552 schedule_delayed_work(&buf->wake_readers, 1);
553 }
554 }
555
556 old = buf->data;
557 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
558 new = buf->start + new_subbuf * buf->chan->subbuf_size;
559 buf->offset = 0;
560 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
561 buf->offset = buf->chan->subbuf_size + 1;
562 return 0;
563 }
564 buf->data = new;
565 buf->padding[new_subbuf] = 0;
566
567 if (unlikely(length + buf->offset > buf->chan->subbuf_size))
568 goto toobig;
569
570 return length;
571
572toobig:
573 buf->chan->last_toobig = length;
574 return 0;
575}
576EXPORT_SYMBOL_GPL(relay_switch_subbuf);
577
578/**
579 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
580 * @chan: the channel
581 * @cpu: the cpu associated with the channel buffer to update
582 * @subbufs_consumed: number of sub-buffers to add to current buf's count
583 *
584 * Adds to the channel buffer's consumed sub-buffer count.
585 * subbufs_consumed should be the number of sub-buffers newly consumed,
586 * not the total consumed.
587 *
588 * NOTE: kernel clients don't need to call this function if the channel
589 * mode is 'overwrite'.
590 */
591void relay_subbufs_consumed(struct rchan *chan,
592 unsigned int cpu,
593 size_t subbufs_consumed)
594{
595 struct rchan_buf *buf;
596
597 if (!chan)
598 return;
599
600 if (cpu >= NR_CPUS || !chan->buf[cpu])
601 return;
602
603 buf = chan->buf[cpu];
604 buf->subbufs_consumed += subbufs_consumed;
605 if (buf->subbufs_consumed > buf->subbufs_produced)
606 buf->subbufs_consumed = buf->subbufs_produced;
607}
608EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
609
610/**
611 * relay_close - close the channel
612 * @chan: the channel
613 *
614 * Closes all channel buffers and frees the channel.
615 */
616void relay_close(struct rchan *chan)
617{
618 unsigned int i;
619 struct rchan_buf *prev = NULL;
620
621 if (!chan)
622 return;
623
624 for (i = 0; i < NR_CPUS; i++) {
625 if (!chan->buf[i] || chan->buf[i] == prev)
626 break;
627 relay_close_buf(chan->buf[i]);
628 prev = chan->buf[i];
629 }
630
631 if (chan->last_toobig)
632 printk(KERN_WARNING "relay: one or more items not logged "
633 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
634 chan->last_toobig, chan->subbuf_size);
635
636 kref_put(&chan->kref, relay_destroy_channel);
637}
638EXPORT_SYMBOL_GPL(relay_close);
639
640/**
641 * relay_flush - flush the channel
642 * @chan: the channel
643 *
644 * Flushes all channel buffers i.e. forces buffer switch.
645 */
646void relay_flush(struct rchan *chan)
647{
648 unsigned int i;
649 struct rchan_buf *prev = NULL;
650
651 if (!chan)
652 return;
653
654 for (i = 0; i < NR_CPUS; i++) {
655 if (!chan->buf[i] || chan->buf[i] == prev)
656 break;
657 relay_switch_subbuf(chan->buf[i], 0);
658 prev = chan->buf[i];
659 }
660}
661EXPORT_SYMBOL_GPL(relay_flush);
662
663/**
664 * relay_file_open - open file op for relay files
665 * @inode: the inode
666 * @filp: the file
667 *
668 * Increments the channel buffer refcount.
669 */
670static int relay_file_open(struct inode *inode, struct file *filp)
671{
672 struct rchan_buf *buf = inode->u.generic_ip;
673 kref_get(&buf->kref);
674 filp->private_data = buf;
675
676 return 0;
677}
678
679/**
680 * relay_file_mmap - mmap file op for relay files
681 * @filp: the file
682 * @vma: the vma describing what to map
683 *
684 * Calls upon relay_mmap_buf to map the file into user space.
685 */
686static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
687{
688 struct rchan_buf *buf = filp->private_data;
689 return relay_mmap_buf(buf, vma);
690}
691
692/**
693 * relay_file_poll - poll file op for relay files
694 * @filp: the file
695 * @wait: poll table
696 *
697 * Poll implementation.
698 */
699static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
700{
701 unsigned int mask = 0;
702 struct rchan_buf *buf = filp->private_data;
703
704 if (buf->finalized)
705 return POLLERR;
706
707 if (filp->f_mode & FMODE_READ) {
708 poll_wait(filp, &buf->read_wait, wait);
709 if (!relay_buf_empty(buf))
710 mask |= POLLIN | POLLRDNORM;
711 }
712
713 return mask;
714}
715
716/**
717 * relay_file_release - release file op for relay files
718 * @inode: the inode
719 * @filp: the file
720 *
721 * Decrements the channel buffer refcount, as the filesystem is
722 * no longer using it.
723 */
724static int relay_file_release(struct inode *inode, struct file *filp)
725{
726 struct rchan_buf *buf = filp->private_data;
727 kref_put(&buf->kref, relay_remove_buf);
728
729 return 0;
730}
731
732/**
733 * relay_file_read_consume - update the consumed count for the buffer
734 */
735static void relay_file_read_consume(struct rchan_buf *buf,
736 size_t read_pos,
737 size_t bytes_consumed)
738{
739 size_t subbuf_size = buf->chan->subbuf_size;
740 size_t n_subbufs = buf->chan->n_subbufs;
741 size_t read_subbuf;
742
743 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
744 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
745 buf->bytes_consumed = 0;
746 }
747
748 buf->bytes_consumed += bytes_consumed;
749 read_subbuf = read_pos / buf->chan->subbuf_size;
750 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
751 if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
752 (buf->offset == subbuf_size))
753 return;
754 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
755 buf->bytes_consumed = 0;
756 }
757}
758
759/**
760 * relay_file_read_avail - boolean, are there unconsumed bytes available?
761 */
762static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
763{
764 size_t subbuf_size = buf->chan->subbuf_size;
765 size_t n_subbufs = buf->chan->n_subbufs;
766 size_t produced = buf->subbufs_produced;
767 size_t consumed = buf->subbufs_consumed;
768
769 relay_file_read_consume(buf, read_pos, 0);
770
771 if (unlikely(buf->offset > subbuf_size)) {
772 if (produced == consumed)
773 return 0;
774 return 1;
775 }
776
777 if (unlikely(produced - consumed >= n_subbufs)) {
778 consumed = (produced / n_subbufs) * n_subbufs;
779 buf->subbufs_consumed = consumed;
780 }
781
782 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
783 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
784
785 if (consumed > produced)
786 produced += n_subbufs * subbuf_size;
787
788 if (consumed == produced)
789 return 0;
790
791 return 1;
792}
793
794/**
795 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
796 */
797static size_t relay_file_read_subbuf_avail(size_t read_pos,
798 struct rchan_buf *buf)
799{
800 size_t padding, avail = 0;
801 size_t read_subbuf, read_offset, write_subbuf, write_offset;
802 size_t subbuf_size = buf->chan->subbuf_size;
803
804 write_subbuf = (buf->data - buf->start) / subbuf_size;
805 write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
806 read_subbuf = read_pos / subbuf_size;
807 read_offset = read_pos % subbuf_size;
808 padding = buf->padding[read_subbuf];
809
810 if (read_subbuf == write_subbuf) {
811 if (read_offset + padding < write_offset)
812 avail = write_offset - (read_offset + padding);
813 } else
814 avail = (subbuf_size - padding) - read_offset;
815
816 return avail;
817}
818
819/**
820 * relay_file_read_start_pos - find the first available byte to read
821 *
822 * If the read_pos is in the middle of padding, return the
823 * position of the first actually available byte, otherwise
824 * return the original value.
825 */
826static size_t relay_file_read_start_pos(size_t read_pos,
827 struct rchan_buf *buf)
828{
829 size_t read_subbuf, padding, padding_start, padding_end;
830 size_t subbuf_size = buf->chan->subbuf_size;
831 size_t n_subbufs = buf->chan->n_subbufs;
832
833 read_subbuf = read_pos / subbuf_size;
834 padding = buf->padding[read_subbuf];
835 padding_start = (read_subbuf + 1) * subbuf_size - padding;
836 padding_end = (read_subbuf + 1) * subbuf_size;
837 if (read_pos >= padding_start && read_pos < padding_end) {
838 read_subbuf = (read_subbuf + 1) % n_subbufs;
839 read_pos = read_subbuf * subbuf_size;
840 }
841
842 return read_pos;
843}
844
845/**
846 * relay_file_read_end_pos - return the new read position
847 */
848static size_t relay_file_read_end_pos(struct rchan_buf *buf,
849 size_t read_pos,
850 size_t count)
851{
852 size_t read_subbuf, padding, end_pos;
853 size_t subbuf_size = buf->chan->subbuf_size;
854 size_t n_subbufs = buf->chan->n_subbufs;
855
856 read_subbuf = read_pos / subbuf_size;
857 padding = buf->padding[read_subbuf];
858 if (read_pos % subbuf_size + count + padding == subbuf_size)
859 end_pos = (read_subbuf + 1) * subbuf_size;
860 else
861 end_pos = read_pos + count;
862 if (end_pos >= subbuf_size * n_subbufs)
863 end_pos = 0;
864
865 return end_pos;
866}
867
868/**
869 * subbuf_read_actor - read up to one subbuf's worth of data
870 */
871static int subbuf_read_actor(size_t read_start,
872 struct rchan_buf *buf,
873 size_t avail,
874 read_descriptor_t *desc,
875 read_actor_t actor)
876{
877 void *from;
878 int ret = 0;
879
880 from = buf->start + read_start;
881 ret = avail;
882 if (copy_to_user(desc->arg.data, from, avail)) {
883 desc->error = -EFAULT;
884 ret = 0;
885 }
886 desc->arg.data += ret;
887 desc->written += ret;
888 desc->count -= ret;
889
890 return ret;
891}
892
893/**
894 * subbuf_send_actor - send up to one subbuf's worth of data
895 */
896static int subbuf_send_actor(size_t read_start,
897 struct rchan_buf *buf,
898 size_t avail,
899 read_descriptor_t *desc,
900 read_actor_t actor)
901{
902 unsigned long pidx, poff;
903 unsigned int subbuf_pages;
904 int ret = 0;
905
906 subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
907 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
908 poff = read_start & ~PAGE_MASK;
909 while (avail) {
910 struct page *p = buf->page_array[pidx];
911 unsigned int len;
912
913 len = PAGE_SIZE - poff;
914 if (len > avail)
915 len = avail;
916
917 len = actor(desc, p, poff, len);
918 if (desc->error)
919 break;
920
921 avail -= len;
922 ret += len;
923 poff = 0;
924 pidx = (pidx + 1) % subbuf_pages;
925 }
926
927 return ret;
928}
929
930typedef int (*subbuf_actor_t) (size_t read_start,
931 struct rchan_buf *buf,
932 size_t avail,
933 read_descriptor_t *desc,
934 read_actor_t actor);
935
936/**
937 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
938 */
939static inline ssize_t relay_file_read_subbufs(struct file *filp,
940 loff_t *ppos,
941 size_t count,
942 subbuf_actor_t subbuf_actor,
943 read_actor_t actor,
944 void *target)
945{
946 struct rchan_buf *buf = filp->private_data;
947 size_t read_start, avail;
948 read_descriptor_t desc;
949 int ret;
950
951 if (!count)
952 return 0;
953
954 desc.written = 0;
955 desc.count = count;
956 desc.arg.data = target;
957 desc.error = 0;
958
959 mutex_lock(&filp->f_dentry->d_inode->i_mutex);
960 do {
961 if (!relay_file_read_avail(buf, *ppos))
962 break;
963
964 read_start = relay_file_read_start_pos(*ppos, buf);
965 avail = relay_file_read_subbuf_avail(read_start, buf);
966 if (!avail)
967 break;
968
969 avail = min(desc.count, avail);
970 ret = subbuf_actor(read_start, buf, avail, &desc, actor);
971 if (desc.error < 0)
972 break;
973
974 if (ret) {
975 relay_file_read_consume(buf, read_start, ret);
976 *ppos = relay_file_read_end_pos(buf, read_start, ret);
977 }
978 } while (desc.count && ret);
979 mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
980
981 return desc.written;
982}
983
984static ssize_t relay_file_read(struct file *filp,
985 char __user *buffer,
986 size_t count,
987 loff_t *ppos)
988{
989 return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
990 NULL, buffer);
991}
992
993static ssize_t relay_file_sendfile(struct file *filp,
994 loff_t *ppos,
995 size_t count,
996 read_actor_t actor,
997 void *target)
998{
999 return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
1000 actor, target);
1001}
1002
1003struct file_operations relay_file_operations = {
1004 .open = relay_file_open,
1005 .poll = relay_file_poll,
1006 .mmap = relay_file_mmap,
1007 .read = relay_file_read,
1008 .llseek = no_llseek,
1009 .release = relay_file_release,
1010 .sendfile = relay_file_sendfile,
1011};
1012EXPORT_SYMBOL_GPL(relay_file_operations);
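
Editor's note: relay_file_operations is the table a relay client wires into the per-CPU buffer files it creates (blktrace is the familiar example), so the read path above is ultimately driven by an ordinary read(2) loop in userspace. A minimal consumer might look like the sketch below; the default path is an assumption for illustration only — the real name and location depend entirely on the client that created the channel. Note that relay reads never block: relay_file_read() returns 0 whenever no complete data is available, so a long-running consumer would poll(2) the descriptor and retry rather than treating 0 as end-of-file.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Drain one per-CPU relay buffer file to stdout.  The default path is a
 * placeholder; real clients choose their own names, usually under debugfs.
 */
int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/sys/kernel/debug/example/cpu0";
	char buf[8192];
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* read() returns 0 when no complete data is currently available. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	if (n < 0)
		perror("read");
	close(fd);
	return n < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}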
diff --git a/kernel/sched.c b/kernel/sched.c
index bc38804e40..c13f1bd2df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,6 +49,7 @@
49#include <linux/syscalls.h> 49#include <linux/syscalls.h>
50#include <linux/times.h> 50#include <linux/times.h>
51#include <linux/acct.h> 51#include <linux/acct.h>
52#include <linux/kprobes.h>
52#include <asm/tlb.h> 53#include <asm/tlb.h>
53 54
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -144,7 +145,8 @@
144 (v1) * (v2_max) / (v1_max) 145 (v1) * (v2_max) / (v1_max)
145 146
146#define DELTA(p) \ 147#define DELTA(p) \
147 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 148 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
149 INTERACTIVE_DELTA)
148 150
149#define TASK_INTERACTIVE(p) \ 151#define TASK_INTERACTIVE(p) \
150 ((p)->prio <= (p)->static_prio - DELTA(p)) 152 ((p)->prio <= (p)->static_prio - DELTA(p))
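
Editor's note: the reworked DELTA() above is easier to see with numbers. The snippet below plugs in the values these constants have in this kernel (MAX_BONUS = 10 and INTERACTIVE_DELTA = 2 — stated as assumptions here, since they are defined outside the hunk) and prints both forms across the whole nice range: the two agree for nice >= 0, but the old form divided a negative operand for negative nice and truncated toward zero, while the new form shifts nice into a non-negative range first so the scaling rounds consistently, giving slightly different TASK_INTERACTIVE() thresholds for negatively reniced tasks.

#include <stdio.h>

/*
 * Assumed values for constants defined elsewhere in this kernel's sched.c:
 * MAX_BONUS = 10, INTERACTIVE_DELTA = 2.  SCALE() is the macro quoted in
 * the hunk above.
 */
#define MAX_BONUS         10
#define INTERACTIVE_DELTA 2
#define SCALE(v1, v1_max, v2_max)  ((v1) * (v2_max) / (v1_max))

static int delta_old(int nice)
{
	return SCALE(nice, 40, MAX_BONUS) + INTERACTIVE_DELTA;
}

static int delta_new(int nice)
{
	return SCALE(nice + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 +
		INTERACTIVE_DELTA;
}

int main(void)
{
	int nice;

	/* The forms agree for nice >= 0 and differ for some negative nice
	 * values, where the old one truncated toward zero instead of
	 * rounding down. */
	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d  old %3d  new %3d%s\n", nice,
		       delta_old(nice), delta_new(nice),
		       delta_old(nice) != delta_new(nice) ? "  <- differs" : "");
	return 0;
}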
@@ -178,13 +180,6 @@ static unsigned int task_timeslice(task_t *p)
178#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
179 < (long long) (sd)->cache_hot_time) 181 < (long long) (sd)->cache_hot_time)
180 182
181void __put_task_struct_cb(struct rcu_head *rhp)
182{
183 __put_task_struct(container_of(rhp, struct task_struct, rcu));
184}
185
186EXPORT_SYMBOL_GPL(__put_task_struct_cb);
187
188/* 183/*
189 * These are the runqueue data structures: 184 * These are the runqueue data structures:
190 */ 185 */
@@ -215,7 +210,6 @@ struct runqueue {
215 */ 210 */
216 unsigned long nr_running; 211 unsigned long nr_running;
217#ifdef CONFIG_SMP 212#ifdef CONFIG_SMP
218 unsigned long prio_bias;
219 unsigned long cpu_load[3]; 213 unsigned long cpu_load[3];
220#endif 214#endif
221 unsigned long long nr_switches; 215 unsigned long long nr_switches;
@@ -245,6 +239,7 @@ struct runqueue {
245 239
246 task_t *migration_thread; 240 task_t *migration_thread;
247 struct list_head migration_queue; 241 struct list_head migration_queue;
242 int cpu;
248#endif 243#endif
249 244
250#ifdef CONFIG_SCHEDSTATS 245#ifdef CONFIG_SCHEDSTATS
@@ -669,68 +664,17 @@ static int effective_prio(task_t *p)
669 return prio; 664 return prio;
670} 665}
671 666
672#ifdef CONFIG_SMP
673static inline void inc_prio_bias(runqueue_t *rq, int prio)
674{
675 rq->prio_bias += MAX_PRIO - prio;
676}
677
678static inline void dec_prio_bias(runqueue_t *rq, int prio)
679{
680 rq->prio_bias -= MAX_PRIO - prio;
681}
682
683static inline void inc_nr_running(task_t *p, runqueue_t *rq)
684{
685 rq->nr_running++;
686 if (rt_task(p)) {
687 if (p != rq->migration_thread)
688 /*
689 * The migration thread does the actual balancing. Do
690 * not bias by its priority as the ultra high priority
691 * will skew balancing adversely.
692 */
693 inc_prio_bias(rq, p->prio);
694 } else
695 inc_prio_bias(rq, p->static_prio);
696}
697
698static inline void dec_nr_running(task_t *p, runqueue_t *rq)
699{
700 rq->nr_running--;
701 if (rt_task(p)) {
702 if (p != rq->migration_thread)
703 dec_prio_bias(rq, p->prio);
704 } else
705 dec_prio_bias(rq, p->static_prio);
706}
707#else
708static inline void inc_prio_bias(runqueue_t *rq, int prio)
709{
710}
711
712static inline void dec_prio_bias(runqueue_t *rq, int prio)
713{
714}
715
716static inline void inc_nr_running(task_t *p, runqueue_t *rq)
717{
718 rq->nr_running++;
719}
720
721static inline void dec_nr_running(task_t *p, runqueue_t *rq)
722{
723 rq->nr_running--;
724}
725#endif
726
727/* 667/*
728 * __activate_task - move a task to the runqueue. 668 * __activate_task - move a task to the runqueue.
729 */ 669 */
730static inline void __activate_task(task_t *p, runqueue_t *rq) 670static void __activate_task(task_t *p, runqueue_t *rq)
731{ 671{
732 enqueue_task(p, rq->active); 672 prio_array_t *target = rq->active;
733 inc_nr_running(p, rq); 673
674 if (batch_task(p))
675 target = rq->expired;
676 enqueue_task(p, target);
677 rq->nr_running++;
734} 678}
735 679
736/* 680/*
@@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
739static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 683static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
740{ 684{
741 enqueue_task_head(p, rq->active); 685 enqueue_task_head(p, rq->active);
742 inc_nr_running(p, rq); 686 rq->nr_running++;
743} 687}
744 688
745static int recalc_task_prio(task_t *p, unsigned long long now) 689static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -748,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
748 unsigned long long __sleep_time = now - p->timestamp; 692 unsigned long long __sleep_time = now - p->timestamp;
749 unsigned long sleep_time; 693 unsigned long sleep_time;
750 694
751 if (unlikely(p->policy == SCHED_BATCH)) 695 if (batch_task(p))
752 sleep_time = 0; 696 sleep_time = 0;
753 else { 697 else {
754 if (__sleep_time > NS_MAX_SLEEP_AVG) 698 if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -760,27 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
760 if (likely(sleep_time > 0)) { 704 if (likely(sleep_time > 0)) {
761 /* 705 /*
762 * User tasks that sleep a long time are categorised as 706 * User tasks that sleep a long time are categorised as
763 * idle and will get just interactive status to stay active & 707 * idle. They will only have their sleep_avg increased to a
764 * prevent them suddenly becoming cpu hogs and starving 708 * level that makes them just interactive priority to stay
765 * other processes. 709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
766 */ 711 */
767 if (p->mm && p->activated != -1 && 712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
768 sleep_time > INTERACTIVE_SLEEP(p)) { 713 unsigned long ceiling;
769 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
770 DEF_TIMESLICE);
771 } else {
772 /*
773 * The lower the sleep avg a task has the more
774 * rapidly it will rise with sleep time.
775 */
776 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
777 714
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
716 DEF_TIMESLICE);
717 if (p->sleep_avg < ceiling)
718 p->sleep_avg = ceiling;
719 } else {
778 /* 720 /*
779 * Tasks waking from uninterruptible sleep are 721 * Tasks waking from uninterruptible sleep are
780 * limited in their sleep_avg rise as they 722 * limited in their sleep_avg rise as they
781 * are likely to be waiting on I/O 723 * are likely to be waiting on I/O
782 */ 724 */
783 if (p->activated == -1 && p->mm) { 725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
784 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
785 sleep_time = 0; 727 sleep_time = 0;
786 else if (p->sleep_avg + sleep_time >= 728 else if (p->sleep_avg + sleep_time >=
@@ -835,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
835 * This checks to make sure it's not an uninterruptible task 777 * This checks to make sure it's not an uninterruptible task
836 * that is now waking up. 778 * that is now waking up.
837 */ 779 */
838 if (!p->activated) { 780 if (p->sleep_type == SLEEP_NORMAL) {
839 /* 781 /*
840 * Tasks which were woken up by interrupts (ie. hw events) 782 * Tasks which were woken up by interrupts (ie. hw events)
841 * are most likely of interactive nature. So we give them 783 * are most likely of interactive nature. So we give them
@@ -844,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
844 * on a CPU, first time around: 786 * on a CPU, first time around:
845 */ 787 */
846 if (in_interrupt()) 788 if (in_interrupt())
847 p->activated = 2; 789 p->sleep_type = SLEEP_INTERRUPTED;
848 else { 790 else {
849 /* 791 /*
850 * Normal first-time wakeups get a credit too for 792 * Normal first-time wakeups get a credit too for
851 * on-runqueue time, but it will be weighted down: 793 * on-runqueue time, but it will be weighted down:
852 */ 794 */
853 p->activated = 1; 795 p->sleep_type = SLEEP_INTERACTIVE;
854 } 796 }
855 } 797 }
856 p->timestamp = now; 798 p->timestamp = now;
@@ -863,7 +805,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
863 */ 805 */
864static void deactivate_task(struct task_struct *p, runqueue_t *rq) 806static void deactivate_task(struct task_struct *p, runqueue_t *rq)
865{ 807{
866 dec_nr_running(p, rq); 808 rq->nr_running--;
867 dequeue_task(p, p->array); 809 dequeue_task(p, p->array);
868 p->array = NULL; 810 p->array = NULL;
869} 811}
@@ -1007,61 +949,27 @@ void kick_process(task_t *p)
1007 * We want to under-estimate the load of migration sources, to 949 * We want to under-estimate the load of migration sources, to
1008 * balance conservatively. 950 * balance conservatively.
1009 */ 951 */
1010static unsigned long __source_load(int cpu, int type, enum idle_type idle) 952static inline unsigned long source_load(int cpu, int type)
1011{ 953{
1012 runqueue_t *rq = cpu_rq(cpu); 954 runqueue_t *rq = cpu_rq(cpu);
1013 unsigned long running = rq->nr_running; 955 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
1014 unsigned long source_load, cpu_load = rq->cpu_load[type-1],
1015 load_now = running * SCHED_LOAD_SCALE;
1016
1017 if (type == 0) 956 if (type == 0)
1018 source_load = load_now; 957 return load_now;
1019 else
1020 source_load = min(cpu_load, load_now);
1021 958
1022 if (running > 1 || (idle == NOT_IDLE && running)) 959 return min(rq->cpu_load[type-1], load_now);
1023 /*
1024 * If we are busy rebalancing the load is biased by
1025 * priority to create 'nice' support across cpus. When
1026 * idle rebalancing we should only bias the source_load if
1027 * there is more than one task running on that queue to
1028 * prevent idle rebalance from trying to pull tasks from a
1029 * queue with only one running task.
1030 */
1031 source_load = source_load * rq->prio_bias / running;
1032
1033 return source_load;
1034}
1035
1036static inline unsigned long source_load(int cpu, int type)
1037{
1038 return __source_load(cpu, type, NOT_IDLE);
1039} 960}
1040 961
1041/* 962/*
1042 * Return a high guess at the load of a migration-target cpu 963 * Return a high guess at the load of a migration-target cpu
1043 */ 964 */
1044static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) 965static inline unsigned long target_load(int cpu, int type)
1045{ 966{
1046 runqueue_t *rq = cpu_rq(cpu); 967 runqueue_t *rq = cpu_rq(cpu);
1047 unsigned long running = rq->nr_running; 968 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
1048 unsigned long target_load, cpu_load = rq->cpu_load[type-1],
1049 load_now = running * SCHED_LOAD_SCALE;
1050
1051 if (type == 0) 969 if (type == 0)
1052 target_load = load_now; 970 return load_now;
1053 else
1054 target_load = max(cpu_load, load_now);
1055 971
1056 if (running > 1 || (idle == NOT_IDLE && running)) 972 return max(rq->cpu_load[type-1], load_now);
1057 target_load = target_load * rq->prio_bias / running;
1058
1059 return target_load;
1060}
1061
1062static inline unsigned long target_load(int cpu, int type)
1063{
1064 return __target_load(cpu, type, NOT_IDLE);
1065} 973}
1066 974
1067/* 975/*
@@ -1294,9 +1202,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1294 } 1202 }
1295 } 1203 }
1296 1204
1297 if (p->last_waker_cpu != this_cpu)
1298 goto out_set_cpu;
1299
1300 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1205 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1301 goto out_set_cpu; 1206 goto out_set_cpu;
1302 1207
@@ -1367,8 +1272,6 @@ out_set_cpu:
1367 cpu = task_cpu(p); 1272 cpu = task_cpu(p);
1368 } 1273 }
1369 1274
1370 p->last_waker_cpu = this_cpu;
1371
1372out_activate: 1275out_activate:
1373#endif /* CONFIG_SMP */ 1276#endif /* CONFIG_SMP */
1374 if (old_state == TASK_UNINTERRUPTIBLE) { 1277 if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1377,19 +1280,19 @@ out_activate:
1377 * Tasks on involuntary sleep don't earn 1280 * Tasks on involuntary sleep don't earn
1378 * sleep_avg beyond just interactive state. 1281 * sleep_avg beyond just interactive state.
1379 */ 1282 */
1380 p->activated = -1; 1283 p->sleep_type = SLEEP_NONINTERACTIVE;
1381 } 1284 } else
1382 1285
1383 /* 1286 /*
1384 * Tasks that have marked their sleep as noninteractive get 1287 * Tasks that have marked their sleep as noninteractive get
1385 * woken up without updating their sleep average. (i.e. their 1288 * woken up with their sleep average not weighted in an
1386 * sleep is handled in a priority-neutral manner, no priority 1289 * interactive way.
1387 * boost and no penalty.)
1388 */ 1290 */
1389 if (old_state & TASK_NONINTERACTIVE) 1291 if (old_state & TASK_NONINTERACTIVE)
1390 __activate_task(p, rq); 1292 p->sleep_type = SLEEP_NONINTERACTIVE;
1391 else 1293
1392 activate_task(p, rq, cpu == this_cpu); 1294
1295 activate_task(p, rq, cpu == this_cpu);
1393 /* 1296 /*
1394 * Sync wakeups (i.e. those types of wakeups where the waker 1297 * Sync wakeups (i.e. those types of wakeups where the waker
1395 * has indicated that it will leave the CPU in short order) 1298 * has indicated that it will leave the CPU in short order)
@@ -1450,12 +1353,9 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1450#ifdef CONFIG_SCHEDSTATS 1353#ifdef CONFIG_SCHEDSTATS
1451 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1354 memset(&p->sched_info, 0, sizeof(p->sched_info));
1452#endif 1355#endif
1453#if defined(CONFIG_SMP) 1356#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1454 p->last_waker_cpu = cpu;
1455#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
1456 p->oncpu = 0; 1357 p->oncpu = 0;
1457#endif 1358#endif
1458#endif
1459#ifdef CONFIG_PREEMPT 1359#ifdef CONFIG_PREEMPT
1460 /* Want to start with kernel preemption disabled. */ 1360 /* Want to start with kernel preemption disabled. */
1461 task_thread_info(p)->preempt_count = 1; 1361 task_thread_info(p)->preempt_count = 1;
@@ -1530,7 +1430,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1530 list_add_tail(&p->run_list, &current->run_list); 1430 list_add_tail(&p->run_list, &current->run_list);
1531 p->array = current->array; 1431 p->array = current->array;
1532 p->array->nr_active++; 1432 p->array->nr_active++;
1533 inc_nr_running(p, rq); 1433 rq->nr_running++;
1534 } 1434 }
1535 set_need_resched(); 1435 set_need_resched();
1536 } else 1436 } else
@@ -1656,8 +1556,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1656 finish_lock_switch(rq, prev); 1556 finish_lock_switch(rq, prev);
1657 if (mm) 1557 if (mm)
1658 mmdrop(mm); 1558 mmdrop(mm);
1659 if (unlikely(prev_task_flags & PF_DEAD)) 1559 if (unlikely(prev_task_flags & PF_DEAD)) {
1560 /*
1561 * Remove function-return probe instances associated with this
1562 * task and put them back on the free list.
1563 */
1564 kprobe_flush_task(prev);
1660 put_task_struct(prev); 1565 put_task_struct(prev);
1566 }
1661} 1567}
1662 1568
1663/** 1569/**
@@ -1727,7 +1633,7 @@ unsigned long nr_uninterruptible(void)
1727{ 1633{
1728 unsigned long i, sum = 0; 1634 unsigned long i, sum = 0;
1729 1635
1730 for_each_cpu(i) 1636 for_each_possible_cpu(i)
1731 sum += cpu_rq(i)->nr_uninterruptible; 1637 sum += cpu_rq(i)->nr_uninterruptible;
1732 1638
1733 /* 1639 /*
@@ -1744,7 +1650,7 @@ unsigned long long nr_context_switches(void)
1744{ 1650{
1745 unsigned long long i, sum = 0; 1651 unsigned long long i, sum = 0;
1746 1652
1747 for_each_cpu(i) 1653 for_each_possible_cpu(i)
1748 sum += cpu_rq(i)->nr_switches; 1654 sum += cpu_rq(i)->nr_switches;
1749 1655
1750 return sum; 1656 return sum;
@@ -1754,17 +1660,35 @@ unsigned long nr_iowait(void)
1754{ 1660{
1755 unsigned long i, sum = 0; 1661 unsigned long i, sum = 0;
1756 1662
1757 for_each_cpu(i) 1663 for_each_possible_cpu(i)
1758 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1664 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1759 1665
1760 return sum; 1666 return sum;
1761} 1667}
1762 1668
1669unsigned long nr_active(void)
1670{
1671 unsigned long i, running = 0, uninterruptible = 0;
1672
1673 for_each_online_cpu(i) {
1674 running += cpu_rq(i)->nr_running;
1675 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1676 }
1677
1678 if (unlikely((long)uninterruptible < 0))
1679 uninterruptible = 0;
1680
1681 return running + uninterruptible;
1682}
1683
1763#ifdef CONFIG_SMP 1684#ifdef CONFIG_SMP
1764 1685
1765/* 1686/*
1766 * double_rq_lock - safely lock two runqueues 1687 * double_rq_lock - safely lock two runqueues
1767 * 1688 *
1689 * We must take them in cpu order to match code in
1690 * dependent_sleeper and wake_dependent_sleeper.
1691 *
1768 * Note this does not disable interrupts like task_rq_lock, 1692 * Note this does not disable interrupts like task_rq_lock,
1769 * you need to do so manually before calling. 1693 * you need to do so manually before calling.
1770 */ 1694 */
@@ -1776,7 +1700,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1776 spin_lock(&rq1->lock); 1700 spin_lock(&rq1->lock);
1777 __acquire(rq2->lock); /* Fake it out ;) */ 1701 __acquire(rq2->lock); /* Fake it out ;) */
1778 } else { 1702 } else {
1779 if (rq1 < rq2) { 1703 if (rq1->cpu < rq2->cpu) {
1780 spin_lock(&rq1->lock); 1704 spin_lock(&rq1->lock);
1781 spin_lock(&rq2->lock); 1705 spin_lock(&rq2->lock);
1782 } else { 1706 } else {
@@ -1812,7 +1736,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1812 __acquires(this_rq->lock) 1736 __acquires(this_rq->lock)
1813{ 1737{
1814 if (unlikely(!spin_trylock(&busiest->lock))) { 1738 if (unlikely(!spin_trylock(&busiest->lock))) {
1815 if (busiest < this_rq) { 1739 if (busiest->cpu < this_rq->cpu) {
1816 spin_unlock(&this_rq->lock); 1740 spin_unlock(&this_rq->lock);
1817 spin_lock(&busiest->lock); 1741 spin_lock(&busiest->lock);
1818 spin_lock(&this_rq->lock); 1742 spin_lock(&this_rq->lock);
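
Editor's note: both hunks above replace ordering-by-pointer with ordering-by-rq->cpu, but the deadlock-avoidance rule itself is unchanged: when two runqueue locks may be taken by different CPUs in either order, always acquire the one with the lower index first so no AB-BA cycle can form, and use an index that other paths (the dependent_sleeper code named in the new comment) can apply too. A self-contained pthread sketch of that rule — nothing here is kernel API, the names are invented:

#include <pthread.h>
#include <stdio.h>

/* Toy stand-in for a runqueue: a lock plus a stable ordering index. */
struct toy_rq {
	pthread_mutex_t lock;
	int cpu;
};

/*
 * Lock two "runqueues" in a global order (lowest cpu index first), the same
 * discipline double_rq_lock() enforces above.  Threads that all follow this
 * rule can never deadlock against each other on these two locks.
 */
static void toy_double_lock(struct toy_rq *a, struct toy_rq *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
		return;
	}
	if (a->cpu < b->cpu) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void toy_double_unlock(struct toy_rq *a, struct toy_rq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct toy_rq rq0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct toy_rq rq1 = { PTHREAD_MUTEX_INITIALIZER, 1 };

	/* Either argument order acquires rq0.lock before rq1.lock. */
	toy_double_lock(&rq1, &rq0);
	printf("locked rq%d and rq%d in cpu order\n", rq0.cpu, rq1.cpu);
	toy_double_unlock(&rq1, &rq0);
	return 0;
}

The kernel versions additionally expect the caller to have disabled interrupts, and double_lock_balance() uses spin_trylock() so this_rq->lock only has to be released when busiest is actually contended; the toy above illustrates just the ordering rule.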
@@ -1875,9 +1799,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1875 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1799 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1876{ 1800{
1877 dequeue_task(p, src_array); 1801 dequeue_task(p, src_array);
1878 dec_nr_running(p, src_rq); 1802 src_rq->nr_running--;
1879 set_task_cpu(p, this_cpu); 1803 set_task_cpu(p, this_cpu);
1880 inc_nr_running(p, this_rq); 1804 this_rq->nr_running++;
1881 enqueue_task(p, this_array); 1805 enqueue_task(p, this_array);
1882 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1806 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1883 + this_rq->timestamp_last_tick; 1807 + this_rq->timestamp_last_tick;
@@ -2056,9 +1980,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2056 1980
2057 /* Bias balancing toward cpus of our domain */ 1981 /* Bias balancing toward cpus of our domain */
2058 if (local_group) 1982 if (local_group)
2059 load = __target_load(i, load_idx, idle); 1983 load = target_load(i, load_idx);
2060 else 1984 else
2061 load = __source_load(i, load_idx, idle); 1985 load = source_load(i, load_idx);
2062 1986
2063 avg_load += load; 1987 avg_load += load;
2064 } 1988 }
@@ -2171,7 +2095,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2171 int i; 2095 int i;
2172 2096
2173 for_each_cpu_mask(i, group->cpumask) { 2097 for_each_cpu_mask(i, group->cpumask) {
2174 load = __source_load(i, 0, idle); 2098 load = source_load(i, 0);
2175 2099
2176 if (load > max_load) { 2100 if (load > max_load) {
2177 max_load = load; 2101 max_load = load;
@@ -2959,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count);
2959 2883
2960#endif 2884#endif
2961 2885
2886static inline int interactive_sleep(enum sleep_type sleep_type)
2887{
2888 return (sleep_type == SLEEP_INTERACTIVE ||
2889 sleep_type == SLEEP_INTERRUPTED);
2890}
2891
2962/* 2892/*
2963 * schedule() is the main scheduler function. 2893 * schedule() is the main scheduler function.
2964 */ 2894 */
@@ -2978,13 +2908,11 @@ asmlinkage void __sched schedule(void)
2978 * schedule() atomically, we ignore that path for now. 2908 * schedule() atomically, we ignore that path for now.
2979 * Otherwise, whine if we are scheduling when we should not be. 2909 * Otherwise, whine if we are scheduling when we should not be.
2980 */ 2910 */
2981 if (likely(!current->exit_state)) { 2911 if (unlikely(in_atomic() && !current->exit_state)) {
2982 if (unlikely(in_atomic())) { 2912 printk(KERN_ERR "BUG: scheduling while atomic: "
2983 printk(KERN_ERR "scheduling while atomic: " 2913 "%s/0x%08x/%d\n",
2984 "%s/0x%08x/%d\n", 2914 current->comm, preempt_count(), current->pid);
2985 current->comm, preempt_count(), current->pid); 2915 dump_stack();
2986 dump_stack();
2987 }
2988 } 2916 }
2989 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2917 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2990 2918
@@ -3084,12 +3012,12 @@ go_idle:
3084 queue = array->queue + idx; 3012 queue = array->queue + idx;
3085 next = list_entry(queue->next, task_t, run_list); 3013 next = list_entry(queue->next, task_t, run_list);
3086 3014
3087 if (!rt_task(next) && next->activated > 0) { 3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
3088 unsigned long long delta = now - next->timestamp; 3016 unsigned long long delta = now - next->timestamp;
3089 if (unlikely((long long)(now - next->timestamp) < 0)) 3017 if (unlikely((long long)(now - next->timestamp) < 0))
3090 delta = 0; 3018 delta = 0;
3091 3019
3092 if (next->activated == 1) 3020 if (next->sleep_type == SLEEP_INTERACTIVE)
3093 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3021 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3094 3022
3095 array = next->array; 3023 array = next->array;
@@ -3099,10 +3027,9 @@ go_idle:
3099 dequeue_task(next, array); 3027 dequeue_task(next, array);
3100 next->prio = new_prio; 3028 next->prio = new_prio;
3101 enqueue_task(next, array); 3029 enqueue_task(next, array);
3102 } else 3030 }
3103 requeue_task(next, array);
3104 } 3031 }
3105 next->activated = 0; 3032 next->sleep_type = SLEEP_NORMAL;
3106switch_tasks: 3033switch_tasks:
3107 if (next == rq->idle) 3034 if (next == rq->idle)
3108 schedstat_inc(rq, sched_goidle); 3035 schedstat_inc(rq, sched_goidle);
@@ -3571,10 +3498,8 @@ void set_user_nice(task_t *p, long nice)
3571 goto out_unlock; 3498 goto out_unlock;
3572 } 3499 }
3573 array = p->array; 3500 array = p->array;
3574 if (array) { 3501 if (array)
3575 dequeue_task(p, array); 3502 dequeue_task(p, array);
3576 dec_prio_bias(rq, p->static_prio);
3577 }
3578 3503
3579 old_prio = p->prio; 3504 old_prio = p->prio;
3580 new_prio = NICE_TO_PRIO(nice); 3505 new_prio = NICE_TO_PRIO(nice);
@@ -3584,7 +3509,6 @@ void set_user_nice(task_t *p, long nice)
3584 3509
3585 if (array) { 3510 if (array) {
3586 enqueue_task(p, array); 3511 enqueue_task(p, array);
3587 inc_prio_bias(rq, p->static_prio);
3588 /* 3512 /*
3589 * If the task increased its priority or is running and 3513 * If the task increased its priority or is running and
3590 * lowered its priority, then reschedule its CPU: 3514 * lowered its priority, then reschedule its CPU:
@@ -4129,6 +4053,8 @@ static inline void __cond_resched(void)
4129 */ 4053 */
4130 if (unlikely(preempt_count())) 4054 if (unlikely(preempt_count()))
4131 return; 4055 return;
4056 if (unlikely(system_state != SYSTEM_RUNNING))
4057 return;
4132 do { 4058 do {
4133 add_preempt_count(PREEMPT_ACTIVE); 4059 add_preempt_count(PREEMPT_ACTIVE);
4134 schedule(); 4060 schedule();
@@ -4434,6 +4360,7 @@ void __devinit init_idle(task_t *idle, int cpu)
4434 runqueue_t *rq = cpu_rq(cpu); 4360 runqueue_t *rq = cpu_rq(cpu);
4435 unsigned long flags; 4361 unsigned long flags;
4436 4362
4363 idle->timestamp = sched_clock();
4437 idle->sleep_avg = 0; 4364 idle->sleep_avg = 0;
4438 idle->array = NULL; 4365 idle->array = NULL;
4439 idle->prio = MAX_PRIO; 4366 idle->prio = MAX_PRIO;
@@ -4861,7 +4788,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4861/* Register at highest priority so that task migration (migrate_all_tasks) 4788/* Register at highest priority so that task migration (migrate_all_tasks)
4862 * happens before everything else. 4789 * happens before everything else.
4863 */ 4790 */
4864static struct notifier_block __devinitdata migration_notifier = { 4791static struct notifier_block migration_notifier = {
4865 .notifier_call = migration_call, 4792 .notifier_call = migration_call,
4866 .priority = 10 4793 .priority = 10
4867}; 4794};
@@ -5159,7 +5086,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5159#define MAX_DOMAIN_DISTANCE 32 5086#define MAX_DOMAIN_DISTANCE 32
5160 5087
5161static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = 5088static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5162 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; 5089 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5090/*
5091 * Architectures may override the migration cost and thus avoid
5092 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5093 * virtualized hardware:
5094 */
5095#ifdef CONFIG_DEFAULT_MIGRATION_COST
5096 CONFIG_DEFAULT_MIGRATION_COST
5097#else
5098 -1LL
5099#endif
5100};
5163 5101
5164/* 5102/*
5165 * Allow override of migration cost - in units of microseconds. 5103 * Allow override of migration cost - in units of microseconds.
@@ -5664,11 +5602,31 @@ static int cpu_to_cpu_group(int cpu)
5664} 5602}
5665#endif 5603#endif
5666 5604
5605#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS];
5608#endif
5609
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5611static int cpu_to_core_group(int cpu)
5612{
5613 return first_cpu(cpu_sibling_map[cpu]);
5614}
5615#elif defined(CONFIG_SCHED_MC)
5616static int cpu_to_core_group(int cpu)
5617{
5618 return cpu;
5619}
5620#endif
5621
5667static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5622static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5668static struct sched_group sched_group_phys[NR_CPUS]; 5623static struct sched_group sched_group_phys[NR_CPUS];
5669static int cpu_to_phys_group(int cpu) 5624static int cpu_to_phys_group(int cpu)
5670{ 5625{
5671#ifdef CONFIG_SCHED_SMT 5626#if defined(CONFIG_SCHED_MC)
5627 cpumask_t mask = cpu_coregroup_map(cpu);
5628 return first_cpu(mask);
5629#elif defined(CONFIG_SCHED_SMT)
5672 return first_cpu(cpu_sibling_map[cpu]); 5630 return first_cpu(cpu_sibling_map[cpu]);
5673#else 5631#else
5674 return cpu; 5632 return cpu;
@@ -5691,6 +5649,32 @@ static int cpu_to_allnodes_group(int cpu)
5691{ 5649{
5692 return cpu_to_node(cpu); 5650 return cpu_to_node(cpu);
5693} 5651}
5652static void init_numa_sched_groups_power(struct sched_group *group_head)
5653{
5654 struct sched_group *sg = group_head;
5655 int j;
5656
5657 if (!sg)
5658 return;
5659next_sg:
5660 for_each_cpu_mask(j, sg->cpumask) {
5661 struct sched_domain *sd;
5662
5663 sd = &per_cpu(phys_domains, j);
5664 if (j != first_cpu(sd->groups->cpumask)) {
5665 /*
5666 * Only add "power" once for each
5667 * physical package.
5668 */
5669 continue;
5670 }
5671
5672 sg->cpu_power += sd->groups->cpu_power;
5673 }
5674 sg = sg->next;
5675 if (sg != group_head)
5676 goto next_sg;
5677}
5694#endif 5678#endif
5695 5679
5696/* 5680/*
@@ -5766,6 +5750,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
5766 sd->parent = p; 5750 sd->parent = p;
5767 sd->groups = &sched_group_phys[group]; 5751 sd->groups = &sched_group_phys[group];
5768 5752
5753#ifdef CONFIG_SCHED_MC
5754 p = sd;
5755 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i);
5757 *sd = SD_MC_INIT;
5758 sd->span = cpu_coregroup_map(i);
5759 cpus_and(sd->span, sd->span, *cpu_map);
5760 sd->parent = p;
5761 sd->groups = &sched_group_core[group];
5762#endif
5763
5769#ifdef CONFIG_SCHED_SMT 5764#ifdef CONFIG_SCHED_SMT
5770 p = sd; 5765 p = sd;
5771 sd = &per_cpu(cpu_domains, i); 5766 sd = &per_cpu(cpu_domains, i);
@@ -5791,6 +5786,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
5791 } 5786 }
5792#endif 5787#endif
5793 5788
5789#ifdef CONFIG_SCHED_MC
5790 /* Set up multi-core groups */
5791 for_each_cpu_mask(i, *cpu_map) {
5792 cpumask_t this_core_map = cpu_coregroup_map(i);
5793 cpus_and(this_core_map, this_core_map, *cpu_map);
5794 if (i != first_cpu(this_core_map))
5795 continue;
5796 init_sched_build_groups(sched_group_core, this_core_map,
5797 &cpu_to_core_group);
5798 }
5799#endif
5800
5801
5794 /* Set up physical groups */ 5802 /* Set up physical groups */
5795 for (i = 0; i < MAX_NUMNODES; i++) { 5803 for (i = 0; i < MAX_NUMNODES; i++) {
5796 cpumask_t nodemask = node_to_cpumask(i); 5804 cpumask_t nodemask = node_to_cpumask(i);
@@ -5887,51 +5895,38 @@ void build_sched_domains(const cpumask_t *cpu_map)
5887 power = SCHED_LOAD_SCALE; 5895 power = SCHED_LOAD_SCALE;
5888 sd->groups->cpu_power = power; 5896 sd->groups->cpu_power = power;
5889#endif 5897#endif
5898#ifdef CONFIG_SCHED_MC
5899 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power;
5890 5903
5891 sd = &per_cpu(phys_domains, i); 5904 sd = &per_cpu(phys_domains, i);
5905
5906 /*
5907 * This has to be < 2 * SCHED_LOAD_SCALE
5908 * Lets keep it SCHED_LOAD_SCALE, so that
5909 * while calculating NUMA group's cpu_power
5910 * we can simply do
5911 * numa_group->cpu_power += phys_group->cpu_power;
5912 *
5913 * See "only add power once for each physical pkg"
5914 * comment below
5915 */
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else
5918 sd = &per_cpu(phys_domains, i);
5892 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5893 (cpus_weight(sd->groups->cpumask)-1) / 10; 5920 (cpus_weight(sd->groups->cpumask)-1) / 10;
5894 sd->groups->cpu_power = power; 5921 sd->groups->cpu_power = power;
5895
5896#ifdef CONFIG_NUMA
5897 sd = &per_cpu(allnodes_domains, i);
5898 if (sd->groups) {
5899 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5900 (cpus_weight(sd->groups->cpumask)-1) / 10;
5901 sd->groups->cpu_power = power;
5902 }
5903#endif 5922#endif
5904 } 5923 }
5905 5924
5906#ifdef CONFIG_NUMA 5925#ifdef CONFIG_NUMA
5907 for (i = 0; i < MAX_NUMNODES; i++) { 5926 for (i = 0; i < MAX_NUMNODES; i++)
5908 struct sched_group *sg = sched_group_nodes[i]; 5927 init_numa_sched_groups_power(sched_group_nodes[i]);
5909 int j;
5910
5911 if (sg == NULL)
5912 continue;
5913next_sg:
5914 for_each_cpu_mask(j, sg->cpumask) {
5915 struct sched_domain *sd;
5916 int power;
5917
5918 sd = &per_cpu(phys_domains, j);
5919 if (j != first_cpu(sd->groups->cpumask)) {
5920 /*
5921 * Only add "power" once for each
5922 * physical package.
5923 */
5924 continue;
5925 }
5926 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5927 (cpus_weight(sd->groups->cpumask)-1) / 10;
5928 5928
5929 sg->cpu_power += power; 5929 init_numa_sched_groups_power(sched_group_allnodes);
5930 }
5931 sg = sg->next;
5932 if (sg != sched_group_nodes[i])
5933 goto next_sg;
5934 }
5935#endif 5930#endif
5936 5931
5937 /* Attach the domains */ 5932 /* Attach the domains */
@@ -5939,6 +5934,8 @@ next_sg:
5939 struct sched_domain *sd; 5934 struct sched_domain *sd;
5940#ifdef CONFIG_SCHED_SMT 5935#ifdef CONFIG_SCHED_SMT
5941 sd = &per_cpu(cpu_domains, i); 5936 sd = &per_cpu(cpu_domains, i);
5937#elif defined(CONFIG_SCHED_MC)
5938 sd = &per_cpu(core_domains, i);
5942#else 5939#else
5943 sd = &per_cpu(phys_domains, i); 5940 sd = &per_cpu(phys_domains, i);
5944#endif 5941#endif
@@ -6111,7 +6108,7 @@ void __init sched_init(void)
6111 runqueue_t *rq; 6108 runqueue_t *rq;
6112 int i, j, k; 6109 int i, j, k;
6113 6110
6114 for_each_cpu(i) { 6111 for_each_possible_cpu(i) {
6115 prio_array_t *array; 6112 prio_array_t *array;
6116 6113
6117 rq = cpu_rq(i); 6114 rq = cpu_rq(i);
@@ -6129,6 +6126,7 @@ void __init sched_init(void)
6129 rq->push_cpu = 0; 6126 rq->push_cpu = 0;
6130 rq->migration_thread = NULL; 6127 rq->migration_thread = NULL;
6131 INIT_LIST_HEAD(&rq->migration_queue); 6128 INIT_LIST_HEAD(&rq->migration_queue);
6129 rq->cpu = i;
6132#endif 6130#endif
6133 atomic_set(&rq->nr_iowait, 0); 6131 atomic_set(&rq->nr_iowait, 0);
6134 6132
@@ -6169,7 +6167,7 @@ void __might_sleep(char *file, int line)
6169 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6167 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6170 return; 6168 return;
6171 prev_jiffy = jiffies; 6169 prev_jiffy = jiffies;
6172 printk(KERN_ERR "Debug: sleeping function called from invalid" 6170 printk(KERN_ERR "BUG: sleeping function called from invalid"
6173 " context at %s:%d\n", file, line); 6171 " context at %s:%d\n", file, line);
6174 printk("in_atomic():%d, irqs_disabled():%d\n", 6172 printk("in_atomic():%d, irqs_disabled():%d\n",
6175 in_atomic(), irqs_disabled()); 6173 in_atomic(), irqs_disabled());
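
Editor's note: one more arithmetic check on the sched.c changes before the signal.c diff. With CONFIG_SCHED_MC, build_sched_domains() above gives the new core-level group a cpu_power of one full SCHED_LOAD_SCALE for the first CPU plus 10% of the scale per additional CPU in the core, and pins the physical level at exactly SCHED_LOAD_SCALE so the NUMA code can simply sum its children; without CONFIG_SCHED_MC the physical group keeps the same +10%-per-extra-CPU formula. The figures are easy to sanity-check in isolation — SCHED_LOAD_SCALE = 128 below is an assumption taken from this kernel's headers, not something visible in the diff.

#include <stdio.h>

/* Assumed from this kernel's sched.h; illustrative only. */
#define SCHED_LOAD_SCALE 128UL

/*
 * cpu_power formula the hunk applies to the multi-core group (and, without
 * CONFIG_SCHED_MC, to the physical group): full scale for the first CPU,
 * plus 10% of the scale for each extra CPU sharing the domain.
 */
static unsigned long group_cpu_power(unsigned int nr_cpus)
{
	return SCHED_LOAD_SCALE + (nr_cpus - 1) * SCHED_LOAD_SCALE / 10;
}

int main(void)
{
	/* 2-thread core: 128 + 12 = 140; 4-core package: 128 + 38 = 166,
	 * both safely below the 2 * SCHED_LOAD_SCALE ceiling the new
	 * comment in the hunk mentions. */
	printf("1 cpu : %lu\n", group_cpu_power(1));
	printf("2 cpus: %lu\n", group_cpu_power(2));
	printf("4 cpus: %lu\n", group_cpu_power(4));
	return 0;
}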
diff --git a/kernel/signal.c b/kernel/signal.c
index b373fc2420..e5f8aea78f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/posix-timers.h>
26#include <linux/signal.h> 25#include <linux/signal.h>
27#include <linux/audit.h> 26#include <linux/audit.h>
28#include <linux/capability.h> 27#include <linux/capability.h>
@@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep;
147#define sig_kernel_stop(sig) \ 146#define sig_kernel_stop(sig) \
148 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) 147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
149 148
149#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
150
150#define sig_user_defined(t, signr) \ 151#define sig_user_defined(t, signr) \
151 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ 152 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
152 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) 153 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
@@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q)
292 kmem_cache_free(sigqueue_cachep, q); 293 kmem_cache_free(sigqueue_cachep, q);
293} 294}
294 295
295static void flush_sigqueue(struct sigpending *queue) 296void flush_sigqueue(struct sigpending *queue)
296{ 297{
297 struct sigqueue *q; 298 struct sigqueue *q;
298 299
@@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue)
307/* 308/*
308 * Flush all pending signals for a task. 309 * Flush all pending signals for a task.
309 */ 310 */
310 311void flush_signals(struct task_struct *t)
311void
312flush_signals(struct task_struct *t)
313{ 312{
314 unsigned long flags; 313 unsigned long flags;
315 314
@@ -321,109 +320,6 @@ flush_signals(struct task_struct *t)
321} 320}
322 321
323/* 322/*
324 * This function expects the tasklist_lock write-locked.
325 */
326void __exit_sighand(struct task_struct *tsk)
327{
328 struct sighand_struct * sighand = tsk->sighand;
329
330 /* Ok, we're done with the signal handlers */
331 tsk->sighand = NULL;
332 if (atomic_dec_and_test(&sighand->count))
333 sighand_free(sighand);
334}
335
336void exit_sighand(struct task_struct *tsk)
337{
338 write_lock_irq(&tasklist_lock);
339 rcu_read_lock();
340 if (tsk->sighand != NULL) {
341 struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
342 spin_lock(&sighand->siglock);
343 __exit_sighand(tsk);
344 spin_unlock(&sighand->siglock);
345 }
346 rcu_read_unlock();
347 write_unlock_irq(&tasklist_lock);
348}
349
350/*
351 * This function expects the tasklist_lock write-locked.
352 */
353void __exit_signal(struct task_struct *tsk)
354{
355 struct signal_struct * sig = tsk->signal;
356 struct sighand_struct * sighand;
357
358 if (!sig)
359 BUG();
360 if (!atomic_read(&sig->count))
361 BUG();
362 rcu_read_lock();
363 sighand = rcu_dereference(tsk->sighand);
364 spin_lock(&sighand->siglock);
365 posix_cpu_timers_exit(tsk);
366 if (atomic_dec_and_test(&sig->count)) {
367 posix_cpu_timers_exit_group(tsk);
368 tsk->signal = NULL;
369 __exit_sighand(tsk);
370 spin_unlock(&sighand->siglock);
371 flush_sigqueue(&sig->shared_pending);
372 } else {
373 /*
374 * If there is any task waiting for the group exit
375 * then notify it:
376 */
377 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
378 wake_up_process(sig->group_exit_task);
379 sig->group_exit_task = NULL;
380 }
381 if (tsk == sig->curr_target)
382 sig->curr_target = next_thread(tsk);
383 tsk->signal = NULL;
384 /*
385 * Accumulate here the counters for all threads but the
386 * group leader as they die, so they can be added into
387 * the process-wide totals when those are taken.
388 * The group leader stays around as a zombie as long
389 * as there are other threads. When it gets reaped,
390 * the exit.c code will add its counts into these totals.
391 * We won't ever get here for the group leader, since it
392 * will have been the last reference on the signal_struct.
393 */
394 sig->utime = cputime_add(sig->utime, tsk->utime);
395 sig->stime = cputime_add(sig->stime, tsk->stime);
396 sig->min_flt += tsk->min_flt;
397 sig->maj_flt += tsk->maj_flt;
398 sig->nvcsw += tsk->nvcsw;
399 sig->nivcsw += tsk->nivcsw;
400 sig->sched_time += tsk->sched_time;
401 __exit_sighand(tsk);
402 spin_unlock(&sighand->siglock);
403 sig = NULL; /* Marker for below. */
404 }
405 rcu_read_unlock();
406 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
407 flush_sigqueue(&tsk->pending);
408 if (sig) {
409 /*
410 * We are cleaning up the signal_struct here.
411 */
412 exit_thread_group_keys(sig);
413 kmem_cache_free(signal_cachep, sig);
414 }
415}
416
417void exit_signal(struct task_struct *tsk)
418{
419 atomic_dec(&tsk->signal->live);
420
421 write_lock_irq(&tasklist_lock);
422 __exit_signal(tsk);
423 write_unlock_irq(&tasklist_lock);
424}
425
426/*
427 * Flush all handlers for a task. 323 * Flush all handlers for a task.
428 */ 324 */
429 325
@@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
695} 591}
696 592
697/* forward decl */ 593/* forward decl */
698static void do_notify_parent_cldstop(struct task_struct *tsk, 594static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
699 int to_self,
700 int why);
701 595
702/* 596/*
703 * Handle magic process-wide effects of stop/continue signals. 597 * Handle magic process-wide effects of stop/continue signals.
@@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
747 p->signal->group_stop_count = 0; 641 p->signal->group_stop_count = 0;
748 p->signal->flags = SIGNAL_STOP_CONTINUED; 642 p->signal->flags = SIGNAL_STOP_CONTINUED;
749 spin_unlock(&p->sighand->siglock); 643 spin_unlock(&p->sighand->siglock);
750 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); 644 do_notify_parent_cldstop(p, CLD_STOPPED);
751 spin_lock(&p->sighand->siglock); 645 spin_lock(&p->sighand->siglock);
752 } 646 }
753 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 647 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
788 p->signal->flags = SIGNAL_STOP_CONTINUED; 682 p->signal->flags = SIGNAL_STOP_CONTINUED;
789 p->signal->group_exit_code = 0; 683 p->signal->group_exit_code = 0;
790 spin_unlock(&p->sighand->siglock); 684 spin_unlock(&p->sighand->siglock);
791 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); 685 do_notify_parent_cldstop(p, CLD_CONTINUED);
792 spin_lock(&p->sighand->siglock); 686 spin_lock(&p->sighand->siglock);
793 } else { 687 } else {
794 /* 688 /*
@@ -875,8 +769,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
875{ 769{
876 int ret = 0; 770 int ret = 0;
877 771
878 if (!irqs_disabled()) 772 BUG_ON(!irqs_disabled());
879 BUG();
880 assert_spin_locked(&t->sighand->siglock); 773 assert_spin_locked(&t->sighand->siglock);
881 774
882 /* Short-circuit ignored signals. */ 775 /* Short-circuit ignored signals. */
@@ -975,7 +868,6 @@ __group_complete_signal(int sig, struct task_struct *p)
975 if (t == NULL) 868 if (t == NULL)
976 /* restart balancing at this thread */ 869 /* restart balancing at this thread */
977 t = p->signal->curr_target = p; 870 t = p->signal->curr_target = p;
978 BUG_ON(t->tgid != p->tgid);
979 871
980 while (!wants_signal(sig, t)) { 872 while (!wants_signal(sig, t)) {
981 t = next_thread(t); 873 t = next_thread(t);
@@ -1120,27 +1012,37 @@ void zap_other_threads(struct task_struct *p)
1120/* 1012/*
1121 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 1013 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
1122 */ 1014 */
1015struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1016{
1017 struct sighand_struct *sighand;
1018
1019 for (;;) {
1020 sighand = rcu_dereference(tsk->sighand);
1021 if (unlikely(sighand == NULL))
1022 break;
1023
1024 spin_lock_irqsave(&sighand->siglock, *flags);
1025 if (likely(sighand == tsk->sighand))
1026 break;
1027 spin_unlock_irqrestore(&sighand->siglock, *flags);
1028 }
1029
1030 return sighand;
1031}
1032
1123int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1033int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1124{ 1034{
1125 unsigned long flags; 1035 unsigned long flags;
1126 struct sighand_struct *sp;
1127 int ret; 1036 int ret;
1128 1037
1129retry:
1130 ret = check_kill_permission(sig, info, p); 1038 ret = check_kill_permission(sig, info, p);
1131 if (!ret && sig && (sp = rcu_dereference(p->sighand))) { 1039
1132 spin_lock_irqsave(&sp->siglock, flags); 1040 if (!ret && sig) {
1133 if (p->sighand != sp) { 1041 ret = -ESRCH;
1134 spin_unlock_irqrestore(&sp->siglock, flags); 1042 if (lock_task_sighand(p, &flags)) {
1135 goto retry; 1043 ret = __group_send_sig_info(sig, info, p);
1136 } 1044 unlock_task_sighand(p, &flags);
1137 if ((atomic_read(&sp->count) == 0) ||
1138 (atomic_read(&p->usage) == 0)) {
1139 spin_unlock_irqrestore(&sp->siglock, flags);
1140 return -ESRCH;
1141 } 1045 }
1142 ret = __group_send_sig_info(sig, info, p);
1143 spin_unlock_irqrestore(&sp->siglock, flags);
1144 } 1046 }
1145 1047
1146 return ret; 1048 return ret;
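
Editor's note: the lock_task_sighand() helper added above packages a pattern this patch then uses in several places: dereference tsk->sighand, take its lock, and re-check that the task still points at the same sighand_struct, retrying if an exec swapped it underneath us (the caller holds rcu_read_lock() or the tasklist lock so the old structure cannot vanish while it is being examined). The toy below models only that stabilise-by-recheck loop in userspace, with C11 atomics and a pthread mutex standing in for the real primitives; it never frees the objects, so it needs no RCU, and none of its names are kernel API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct toy_sighand {
	pthread_mutex_t lock;
};

struct toy_task {
	_Atomic(struct toy_sighand *) sighand;
};

/*
 * Lock whatever sighand the task points at, then confirm the pointer did
 * not change while the lock was being acquired; otherwise drop it and retry.
 */
static struct toy_sighand *toy_lock_task_sighand(struct toy_task *tsk)
{
	struct toy_sighand *sh;

	for (;;) {
		sh = atomic_load(&tsk->sighand);
		if (!sh)
			return NULL;               /* task already torn down */
		pthread_mutex_lock(&sh->lock);
		if (sh == atomic_load(&tsk->sighand))
			return sh;                 /* still current: stable */
		pthread_mutex_unlock(&sh->lock);   /* raced with a swap: retry */
	}
}

int main(void)
{
	struct toy_sighand a = { PTHREAD_MUTEX_INITIALIZER };
	struct toy_task task = { &a };
	struct toy_sighand *sh = toy_lock_task_sighand(&task);

	printf("locked %s sighand\n", sh == &a ? "the current" : "a stale");
	pthread_mutex_unlock(&sh->lock);
	return 0;
}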
@@ -1189,7 +1091,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1189 struct task_struct *p; 1091 struct task_struct *p;
1190 1092
1191 rcu_read_lock(); 1093 rcu_read_lock();
1192 if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { 1094 if (unlikely(sig_needs_tasklist(sig))) {
1193 read_lock(&tasklist_lock); 1095 read_lock(&tasklist_lock);
1194 acquired_tasklist_lock = 1; 1096 acquired_tasklist_lock = 1;
1195 } 1097 }
@@ -1405,12 +1307,10 @@ void sigqueue_free(struct sigqueue *q)
1405 __sigqueue_free(q); 1307 __sigqueue_free(q);
1406} 1308}
1407 1309
1408int 1310int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1409send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1410{ 1311{
1411 unsigned long flags; 1312 unsigned long flags;
1412 int ret = 0; 1313 int ret = 0;
1413 struct sighand_struct *sh;
1414 1314
1415 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1315 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1416 1316
@@ -1424,48 +1324,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1424 */ 1324 */
1425 rcu_read_lock(); 1325 rcu_read_lock();
1426 1326
1427 if (unlikely(p->flags & PF_EXITING)) { 1327 if (!likely(lock_task_sighand(p, &flags))) {
1428 ret = -1; 1328 ret = -1;
1429 goto out_err; 1329 goto out_err;
1430 } 1330 }
1431 1331
1432retry:
1433 sh = rcu_dereference(p->sighand);
1434
1435 spin_lock_irqsave(&sh->siglock, flags);
1436 if (p->sighand != sh) {
1437 /* We raced with exec() in a multithreaded process... */
1438 spin_unlock_irqrestore(&sh->siglock, flags);
1439 goto retry;
1440 }
1441
1442 /*
1443 * We do the check here again to handle the following scenario:
1444 *
1445 * CPU 0 CPU 1
1446 * send_sigqueue
1447 * check PF_EXITING
1448 * interrupt exit code running
1449 * __exit_signal
1450 * lock sighand->siglock
1451 * unlock sighand->siglock
1452 * lock sh->siglock
1453 * add(tsk->pending) flush_sigqueue(tsk->pending)
1454 *
1455 */
1456
1457 if (unlikely(p->flags & PF_EXITING)) {
1458 ret = -1;
1459 goto out;
1460 }
1461
1462 if (unlikely(!list_empty(&q->list))) { 1332 if (unlikely(!list_empty(&q->list))) {
1463 /* 1333 /*
1464 * If an SI_TIMER entry is already queued, just increment 1334
1465 * the overrun count. 1335 * the overrun count.
1466 */ 1336 */
1467 if (q->info.si_code != SI_TIMER) 1337 BUG_ON(q->info.si_code != SI_TIMER);
1468 BUG();
1469 q->info.si_overrun++; 1338 q->info.si_overrun++;
1470 goto out; 1339 goto out;
1471 } 1340 }
@@ -1481,7 +1350,7 @@ retry:
1481 signal_wake_up(p, sig == SIGKILL); 1350 signal_wake_up(p, sig == SIGKILL);
1482 1351
1483out: 1352out:
1484 spin_unlock_irqrestore(&sh->siglock, flags); 1353 unlock_task_sighand(p, &flags);
1485out_err: 1354out_err:
1486 rcu_read_unlock(); 1355 rcu_read_unlock();
1487 1356
@@ -1513,8 +1382,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1513 * the overrun count. Other uses should not try to 1382 * the overrun count. Other uses should not try to
1514 * send the signal multiple times. 1383 * send the signal multiple times.
1515 */ 1384 */
1516 if (q->info.si_code != SI_TIMER) 1385 BUG_ON(q->info.si_code != SI_TIMER);
1517 BUG();
1518 q->info.si_overrun++; 1386 q->info.si_overrun++;
1519 goto out; 1387 goto out;
1520 } 1388 }
@@ -1613,14 +1481,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1613 spin_unlock_irqrestore(&psig->siglock, flags); 1481 spin_unlock_irqrestore(&psig->siglock, flags);
1614} 1482}
1615 1483
1616static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) 1484static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1617{ 1485{
1618 struct siginfo info; 1486 struct siginfo info;
1619 unsigned long flags; 1487 unsigned long flags;
1620 struct task_struct *parent; 1488 struct task_struct *parent;
1621 struct sighand_struct *sighand; 1489 struct sighand_struct *sighand;
1622 1490
1623 if (to_self) 1491 if (tsk->ptrace & PT_PTRACED)
1624 parent = tsk->parent; 1492 parent = tsk->parent;
1625 else { 1493 else {
1626 tsk = tsk->group_leader; 1494 tsk = tsk->group_leader;
@@ -1689,13 +1557,14 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1689 /* Let the debugger run. */ 1557 /* Let the debugger run. */
1690 set_current_state(TASK_TRACED); 1558 set_current_state(TASK_TRACED);
1691 spin_unlock_irq(&current->sighand->siglock); 1559 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze();
1692 read_lock(&tasklist_lock); 1561 read_lock(&tasklist_lock);
1693 if (likely(current->ptrace & PT_PTRACED) && 1562 if (likely(current->ptrace & PT_PTRACED) &&
1694 likely(current->parent != current->real_parent || 1563 likely(current->parent != current->real_parent ||
1695 !(current->ptrace & PT_ATTACHED)) && 1564 !(current->ptrace & PT_ATTACHED)) &&
1696 (likely(current->parent->signal != current->signal) || 1565 (likely(current->parent->signal != current->signal) ||
1697 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1698 do_notify_parent_cldstop(current, 1, CLD_TRAPPED); 1567 do_notify_parent_cldstop(current, CLD_TRAPPED);
1699 read_unlock(&tasklist_lock); 1568 read_unlock(&tasklist_lock);
1700 schedule(); 1569 schedule();
1701 } else { 1570 } else {
@@ -1744,25 +1613,17 @@ void ptrace_notify(int exit_code)
1744static void 1613static void
1745finish_stop(int stop_count) 1614finish_stop(int stop_count)
1746{ 1615{
1747 int to_self;
1748
1749 /* 1616 /*
1750 * If there are no other threads in the group, or if there is 1617 * If there are no other threads in the group, or if there is
1751 * a group stop in progress and we are the last to stop, 1618 * a group stop in progress and we are the last to stop,
1752 * report to the parent. When ptraced, every thread reports itself. 1619 * report to the parent. When ptraced, every thread reports itself.
1753 */ 1620 */
1754 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) 1621 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
1755 to_self = 1; 1622 read_lock(&tasklist_lock);
1756 else if (stop_count == 0) 1623 do_notify_parent_cldstop(current, CLD_STOPPED);
1757 to_self = 0; 1624 read_unlock(&tasklist_lock);
1758 else 1625 }
1759 goto out;
1760
1761 read_lock(&tasklist_lock);
1762 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1763 read_unlock(&tasklist_lock);
1764 1626
1765out:
1766 schedule(); 1627 schedule();
1767 /* 1628 /*
1768 * Now we don't run again until continued. 1629 * Now we don't run again until continued.
@@ -1776,12 +1637,10 @@ out:
1776 * Returns nonzero if we've actually stopped and released the siglock. 1637 * Returns nonzero if we've actually stopped and released the siglock.
1777 * Returns zero if we didn't stop and still hold the siglock. 1638 * Returns zero if we didn't stop and still hold the siglock.
1778 */ 1639 */
1779static int 1640static int do_signal_stop(int signr)
1780do_signal_stop(int signr)
1781{ 1641{
1782 struct signal_struct *sig = current->signal; 1642 struct signal_struct *sig = current->signal;
1783 struct sighand_struct *sighand = current->sighand; 1643 int stop_count;
1784 int stop_count = -1;
1785 1644
1786 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) 1645 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1787 return 0; 1646 return 0;
@@ -1791,86 +1650,37 @@ do_signal_stop(int signr)
1791 * There is a group stop in progress. We don't need to 1650 * There is a group stop in progress. We don't need to
1792 * start another one. 1651 * start another one.
1793 */ 1652 */
1794 signr = sig->group_exit_code;
1795 stop_count = --sig->group_stop_count; 1653 stop_count = --sig->group_stop_count;
1796 current->exit_code = signr; 1654 } else {
1797 set_current_state(TASK_STOPPED);
1798 if (stop_count == 0)
1799 sig->flags = SIGNAL_STOP_STOPPED;
1800 spin_unlock_irq(&sighand->siglock);
1801 }
1802 else if (thread_group_empty(current)) {
1803 /*
1804 * Lock must be held through transition to stopped state.
1805 */
1806 current->exit_code = current->signal->group_exit_code = signr;
1807 set_current_state(TASK_STOPPED);
1808 sig->flags = SIGNAL_STOP_STOPPED;
1809 spin_unlock_irq(&sighand->siglock);
1810 }
1811 else {
1812 /* 1655 /*
1813 * There is no group stop already in progress. 1656 * There is no group stop already in progress.
1814 * We must initiate one now, but that requires 1657 * We must initiate one now.
1815 * dropping siglock to get both the tasklist lock
1816 * and siglock again in the proper order. Note that
1817 * this allows an intervening SIGCONT to be posted.
1818 * We need to check for that and bail out if necessary.
1819 */ 1658 */
1820 struct task_struct *t; 1659 struct task_struct *t;
1821 1660
1822 spin_unlock_irq(&sighand->siglock); 1661 sig->group_exit_code = signr;
1823
1824 /* signals can be posted during this window */
1825
1826 read_lock(&tasklist_lock);
1827 spin_lock_irq(&sighand->siglock);
1828 1662
1829 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) { 1663 stop_count = 0;
1664 for (t = next_thread(current); t != current; t = next_thread(t))
1830 /* 1665 /*
1831 * Another stop or continue happened while we 1666 * Setting state to TASK_STOPPED for a group
1832 * didn't have the lock. We can just swallow this 1667 * stop is always done with the siglock held,
1833 * signal now. If we raced with a SIGCONT, that 1668 * so this check has no races.
1834 * should have just cleared it now. If we raced
1835 * with another processor delivering a stop signal,
1836 * then the SIGCONT that wakes us up should clear it.
1837 */ 1669 */
1838 read_unlock(&tasklist_lock); 1670 if (!t->exit_state &&
1839 return 0; 1671 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1840 } 1672 stop_count++;
1841 1673 signal_wake_up(t, 0);
1842 if (sig->group_stop_count == 0) { 1674 }
1843 sig->group_exit_code = signr; 1675 sig->group_stop_count = stop_count;
1844 stop_count = 0;
1845 for (t = next_thread(current); t != current;
1846 t = next_thread(t))
1847 /*
1848 * Setting state to TASK_STOPPED for a group
1849 * stop is always done with the siglock held,
1850 * so this check has no races.
1851 */
1852 if (!t->exit_state &&
1853 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1854 stop_count++;
1855 signal_wake_up(t, 0);
1856 }
1857 sig->group_stop_count = stop_count;
1858 }
1859 else {
1860 /* A race with another thread while unlocked. */
1861 signr = sig->group_exit_code;
1862 stop_count = --sig->group_stop_count;
1863 }
1864
1865 current->exit_code = signr;
1866 set_current_state(TASK_STOPPED);
1867 if (stop_count == 0)
1868 sig->flags = SIGNAL_STOP_STOPPED;
1869
1870 spin_unlock_irq(&sighand->siglock);
1871 read_unlock(&tasklist_lock);
1872 } 1676 }
1873 1677
1678 if (stop_count == 0)
1679 sig->flags = SIGNAL_STOP_STOPPED;
1680 current->exit_code = sig->group_exit_code;
1681 __set_current_state(TASK_STOPPED);
1682
1683 spin_unlock_irq(&current->sighand->siglock);
1874 finish_stop(stop_count); 1684 finish_stop(stop_count);
1875 return 1; 1685 return 1;
1876} 1686}
@@ -1922,6 +1732,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1922 sigset_t *mask = &current->blocked; 1732 sigset_t *mask = &current->blocked;
1923 int signr = 0; 1733 int signr = 0;
1924 1734
1735 try_to_freeze();
1736
1925relock: 1737relock:
1926 spin_lock_irq(&current->sighand->siglock); 1738 spin_lock_irq(&current->sighand->siglock);
1927 for (;;) { 1739 for (;;) {
@@ -1942,9 +1754,9 @@ relock:
1942 /* Let the debugger run. */ 1754 /* Let the debugger run. */
1943 ptrace_stop(signr, signr, info); 1755 ptrace_stop(signr, signr, info);
1944 1756
1945 /* We're back. Did the debugger cancel the sig or group_exit? */ 1757 /* We're back. Did the debugger cancel the sig? */
1946 signr = current->exit_code; 1758 signr = current->exit_code;
1947 if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) 1759 if (signr == 0)
1948 continue; 1760 continue;
1949 1761
1950 current->exit_code = 0; 1762 current->exit_code = 0;
@@ -1988,7 +1800,7 @@ relock:
1988 continue; 1800 continue;
1989 1801
1990 /* Init gets no signals it doesn't want. */ 1802 /* Init gets no signals it doesn't want. */
1991 if (current->pid == 1) 1803 if (current == child_reaper)
1992 continue; 1804 continue;
1993 1805
1994 if (sig_kernel_stop(signr)) { 1806 if (sig_kernel_stop(signr)) {
@@ -2099,10 +1911,11 @@ long do_no_restart_syscall(struct restart_block *param)
2099int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 1911int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2100{ 1912{
2101 int error; 1913 int error;
2102 sigset_t old_block;
2103 1914
2104 spin_lock_irq(&current->sighand->siglock); 1915 spin_lock_irq(&current->sighand->siglock);
2105 old_block = current->blocked; 1916 if (oldset)
1917 *oldset = current->blocked;
1918
2106 error = 0; 1919 error = 0;
2107 switch (how) { 1920 switch (how) {
2108 case SIG_BLOCK: 1921 case SIG_BLOCK:
@@ -2119,8 +1932,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2119 } 1932 }
2120 recalc_sigpending(); 1933 recalc_sigpending();
2121 spin_unlock_irq(&current->sighand->siglock); 1934 spin_unlock_irq(&current->sighand->siglock);
2122 if (oldset) 1935
2123 *oldset = old_block;
2124 return error; 1936 return error;
2125} 1937}
2126 1938
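
The sigprocmask() rework above snapshots the caller's previous mask into *oldset under the siglock before applying the change, instead of copying a local old_block out after unlocking. The user-visible contract is unchanged; a minimal block/restore sketch against the standard POSIX interface (nothing below is specific to this patch):

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t block, old;

	sigemptyset(&block);
	sigaddset(&block, SIGINT);

	/* Block SIGINT around a critical region; the kernel hands back
	 * the previous mask through the third argument. */
	if (sigprocmask(SIG_BLOCK, &block, &old) == -1) {
		perror("sigprocmask");
		return 1;
	}

	/* ... critical region: SIGINT stays pending rather than firing ... */

	/* Restore exactly what was in force before. */
	if (sigprocmask(SIG_SETMASK, &old, NULL) == -1) {
		perror("sigprocmask");
		return 1;
	}
	return 0;
}
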
@@ -2307,7 +2119,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2307 2119
2308 timeout = schedule_timeout_interruptible(timeout); 2120 timeout = schedule_timeout_interruptible(timeout);
2309 2121
2310 try_to_freeze();
2311 spin_lock_irq(&current->sighand->siglock); 2122 spin_lock_irq(&current->sighand->siglock);
2312 sig = dequeue_signal(current, &these, &info); 2123 sig = dequeue_signal(current, &these, &info);
2313 current->blocked = current->real_blocked; 2124 current->blocked = current->real_blocked;
@@ -2429,8 +2240,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2429 return kill_proc_info(sig, &info, pid); 2240 return kill_proc_info(sig, &info, pid);
2430} 2241}
2431 2242
2432int 2243int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2433do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2434{ 2244{
2435 struct k_sigaction *k; 2245 struct k_sigaction *k;
2436 sigset_t mask; 2246 sigset_t mask;
@@ -2454,6 +2264,9 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2454 *oact = *k; 2264 *oact = *k;
2455 2265
2456 if (act) { 2266 if (act) {
2267 sigdelsetmask(&act->sa.sa_mask,
2268 sigmask(SIGKILL) | sigmask(SIGSTOP));
2269 *k = *act;
2457 /* 2270 /*
2458 * POSIX 3.3.1.3: 2271 * POSIX 3.3.1.3:
2459 * "Setting a signal action to SIG_IGN for a signal that is 2272 * "Setting a signal action to SIG_IGN for a signal that is
@@ -2466,21 +2279,8 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2466 * be discarded, whether or not it is blocked" 2279 * be discarded, whether or not it is blocked"
2467 */ 2280 */
2468 if (act->sa.sa_handler == SIG_IGN || 2281 if (act->sa.sa_handler == SIG_IGN ||
2469 (act->sa.sa_handler == SIG_DFL && 2282 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2470 sig_kernel_ignore(sig))) {
2471 /*
2472 * This is a fairly rare case, so we only take the
2473 * tasklist_lock once we're sure we'll need it.
2474 * Now we must do this little unlock and relock
2475 * dance to maintain the lock hierarchy.
2476 */
2477 struct task_struct *t = current; 2283 struct task_struct *t = current;
2478 spin_unlock_irq(&t->sighand->siglock);
2479 read_lock(&tasklist_lock);
2480 spin_lock_irq(&t->sighand->siglock);
2481 *k = *act;
2482 sigdelsetmask(&k->sa.sa_mask,
2483 sigmask(SIGKILL) | sigmask(SIGSTOP));
2484 sigemptyset(&mask); 2284 sigemptyset(&mask);
2485 sigaddset(&mask, sig); 2285 sigaddset(&mask, sig);
2486 rm_from_queue_full(&mask, &t->signal->shared_pending); 2286 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2489,14 +2289,7 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2489 recalc_sigpending_tsk(t); 2289 recalc_sigpending_tsk(t);
2490 t = next_thread(t); 2290 t = next_thread(t);
2491 } while (t != current); 2291 } while (t != current);
2492 spin_unlock_irq(&current->sighand->siglock);
2493 read_unlock(&tasklist_lock);
2494 return 0;
2495 } 2292 }
2496
2497 *k = *act;
2498 sigdelsetmask(&k->sa.sa_mask,
2499 sigmask(SIGKILL) | sigmask(SIGSTOP));
2500 } 2293 }
2501 2294
2502 spin_unlock_irq(&current->sighand->siglock); 2295 spin_unlock_irq(&current->sighand->siglock);
@@ -2702,6 +2495,7 @@ sys_signal(int sig, __sighandler_t handler)
2702 2495
2703 new_sa.sa.sa_handler = handler; 2496 new_sa.sa.sa_handler = handler;
2704 new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; 2497 new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
2498 sigemptyset(&new_sa.sa.sa_mask);
2705 2499
2706 ret = do_sigaction(sig, &new_sa, &old_sa); 2500 ret = do_sigaction(sig, &new_sa, &old_sa);
2707 2501
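
The last hunk adds sigemptyset(&new_sa.sa.sa_mask) because sys_signal() builds its k_sigaction on the stack and previously installed whatever garbage happened to be in sa_mask. The same rule applies to user code that calls sigaction() directly; a minimal sketch (the handler name and flag variable are illustrative):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t last_sig;	/* illustrative flag */

static void handler(int sig)
{
	last_sig = sig;
}

int main(void)
{
	struct sigaction sa;

	sa.sa_handler = handler;
	sa.sa_flags = SA_RESTART;
	sigemptyset(&sa.sa_mask);	/* never install an uninitialized mask */

	if (sigaction(SIGUSR1, &sa, NULL) == -1) {
		perror("sigaction");
		return 1;
	}

	kill(getpid(), SIGUSR1);	/* delivered on return to user space */
	printf("handled signal %d\n", (int)last_sig);
	return 0;
}
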
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ad3295cdde..336f92d64e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h>
19 20
20#include <asm/irq.h> 21#include <asm/irq.h>
21/* 22/*
@@ -445,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
445} 446}
446#endif /* CONFIG_HOTPLUG_CPU */ 447#endif /* CONFIG_HOTPLUG_CPU */
447 448
448static int __devinit cpu_callback(struct notifier_block *nfb, 449static int cpu_callback(struct notifier_block *nfb,
449 unsigned long action, 450 unsigned long action,
450 void *hcpu) 451 void *hcpu)
451{ 452{
@@ -483,7 +484,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
483 return NOTIFY_OK; 484 return NOTIFY_OK;
484} 485}
485 486
486static struct notifier_block __devinitdata cpu_nfb = { 487static struct notifier_block cpu_nfb = {
487 .notifier_call = cpu_callback 488 .notifier_call = cpu_callback
488}; 489};
489 490
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void)
495 register_cpu_notifier(&cpu_nfb); 496 register_cpu_notifier(&cpu_nfb);
496 return 0; 497 return 0;
497} 498}
499
500#ifdef CONFIG_SMP
501/*
502 * Call a function on all processors
503 */
504int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
505{
506 int ret = 0;
507
508 preempt_disable();
509 ret = smp_call_function(func, info, retry, wait);
510 local_irq_disable();
511 func(info);
512 local_irq_enable();
513 preempt_enable();
514 return ret;
515}
516EXPORT_SYMBOL(on_each_cpu);
517#endif
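
The new on_each_cpu() helper runs func on the local CPU with interrupts off and, via smp_call_function(), on every other online CPU, with preemption disabled for the duration. A sketch of a caller, assuming the signature shown in the hunk above; the per-CPU counter, the module wrapper and their names are illustrative, not part of this patch:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/atomic.h>

static DEFINE_PER_CPU(unsigned long, local_events);
static atomic_t total_events = ATOMIC_INIT(0);

/* Runs once on every CPU; must not sleep (interrupts are disabled locally). */
static void collect_events(void *unused)
{
	atomic_add(__get_cpu_var(local_events), &total_events);
}

static int __init collect_init(void)
{
	/* retry = 0, wait = 1: return only after every CPU has run the hook */
	on_each_cpu(collect_events, NULL, 0, 1);
	printk(KERN_INFO "events so far: %d\n", atomic_read(&total_events));
	return 0;
}
module_init(collect_init);
MODULE_LICENSE("GPL");
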
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c67189a25d..14c7faf029 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -1,12 +1,11 @@
1/* 1/*
2 * Detect Soft Lockups 2 * Detect Soft Lockups
3 * 3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat 4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 * 5 *
6 * this code detects soft lockups: incidents in where on a CPU 6 * this code detects soft lockups: incidents in where on a CPU
7 * the kernel does not reschedule for 10 seconds or more. 7 * the kernel does not reschedule for 10 seconds or more.
8 */ 8 */
9
10#include <linux/mm.h> 9#include <linux/mm.h>
11#include <linux/cpu.h> 10#include <linux/cpu.h>
12#include <linux/init.h> 11#include <linux/init.h>
@@ -17,13 +16,14 @@
17 16
18static DEFINE_SPINLOCK(print_lock); 17static DEFINE_SPINLOCK(print_lock);
19 18
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0; 19static DEFINE_PER_CPU(unsigned long, touch_timestamp);
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; 20static DEFINE_PER_CPU(unsigned long, print_timestamp);
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 21static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23 22
24static int did_panic = 0; 23static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event, 24
26 void *ptr) 25static int
26softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
27{ 27{
28 did_panic = 1; 28 did_panic = 1;
29 29
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies; 39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog);
44 * This callback runs from the timer interrupt, and checks 44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not: 45 * whether the watchdog thread has hung or not:
46 */ 46 */
47void softlockup_tick(struct pt_regs *regs) 47void softlockup_tick(void)
48{ 48{
49 int this_cpu = smp_processor_id(); 49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu); 50 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
51 51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp) 52 /* prevent double reports: */
53 if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
54 did_panic ||
55 !per_cpu(watchdog_task, this_cpu))
53 return; 56 return;
54 57
55 /* Do not cause a second panic when there already was one */ 58 /* do not print during early bootup: */
56 if (did_panic) 59 if (unlikely(system_state != SYSTEM_RUNNING)) {
60 touch_softlockup_watchdog();
57 return; 61 return;
62 }
58 63
59 if (time_after(jiffies, timestamp + 10*HZ)) { 64 /* Wake up the high-prio watchdog task every second: */
60 per_cpu(print_timestamp, this_cpu) = timestamp; 65 if (time_after(jiffies, touch_timestamp + HZ))
66 wake_up_process(per_cpu(watchdog_task, this_cpu));
67
68 /* Warn about unreasonable 10+ seconds delays: */
69 if (time_after(jiffies, touch_timestamp + 10*HZ)) {
70 per_cpu(print_timestamp, this_cpu) = touch_timestamp;
61 71
62 spin_lock(&print_lock); 72 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", 73 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu); 74 this_cpu);
65 show_regs(regs); 75 dump_stack();
66 spin_unlock(&print_lock); 76 spin_unlock(&print_lock);
67 } 77 }
68} 78}
@@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu)
77 sched_setscheduler(current, SCHED_FIFO, &param); 87 sched_setscheduler(current, SCHED_FIFO, &param);
78 current->flags |= PF_NOFREEZE; 88 current->flags |= PF_NOFREEZE;
79 89
80 set_current_state(TASK_INTERRUPTIBLE);
81
82 /* 90 /*
83 * Run briefly once per second - if this gets delayed for 91 * Run briefly once per second to reset the softlockup timestamp.
84 * more than 10 seconds then the debug-printout triggers 92 * If this gets delayed for more than 10 seconds then the
85 * in softlockup_tick(): 93 * debug-printout triggers in softlockup_tick().
86 */ 94 */
87 while (!kthread_should_stop()) { 95 while (!kthread_should_stop()) {
88 msleep_interruptible(1000); 96 set_current_state(TASK_INTERRUPTIBLE);
89 touch_softlockup_watchdog(); 97 touch_softlockup_watchdog();
98 schedule();
90 } 99 }
91 __set_current_state(TASK_RUNNING);
92 100
93 return 0; 101 return 0;
94} 102}
@@ -96,7 +104,7 @@ static int watchdog(void * __bind_cpu)
96/* 104/*
97 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
98 */ 106 */
99static int __devinit 107static int
100cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
101{ 109{
102 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -110,11 +118,11 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
110 printk("watchdog for %i failed\n", hotcpu); 118 printk("watchdog for %i failed\n", hotcpu);
111 return NOTIFY_BAD; 119 return NOTIFY_BAD;
112 } 120 }
121 per_cpu(touch_timestamp, hotcpu) = jiffies;
113 per_cpu(watchdog_task, hotcpu) = p; 122 per_cpu(watchdog_task, hotcpu) = p;
114 kthread_bind(p, hotcpu); 123 kthread_bind(p, hotcpu);
115 break; 124 break;
116 case CPU_ONLINE: 125 case CPU_ONLINE:
117
118 wake_up_process(per_cpu(watchdog_task, hotcpu)); 126 wake_up_process(per_cpu(watchdog_task, hotcpu));
119 break; 127 break;
120#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
@@ -132,7 +140,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
132 return NOTIFY_OK; 140 return NOTIFY_OK;
133} 141}
134 142
135static struct notifier_block __devinitdata cpu_nfb = { 143static struct notifier_block cpu_nfb = {
136 .notifier_call = cpu_callback 144 .notifier_call = cpu_callback
137}; 145};
138 146
@@ -144,6 +152,5 @@ __init void spawn_softlockup_task(void)
144 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
145 register_cpu_notifier(&cpu_nfb); 153 register_cpu_notifier(&cpu_nfb);
146 154
147 notifier_chain_register(&panic_notifier_list, &panic_block); 155 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
148} 156}
149
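
After this rewrite the per-CPU touch_timestamp only advances when the watchdog thread runs or when code calls touch_softlockup_watchdog() explicitly, so any kernel path that legitimately monopolizes a CPU for seconds has to touch the watchdog itself. A sketch of such a caller; the polling loop and its names are made up, only touch_softlockup_watchdog() comes from this file:

#include <linux/sched.h>	/* declares touch_softlockup_watchdog() in this tree */
#include <linux/delay.h>

/* Hypothetical busy-wait for slow hardware that cannot sleep. */
static void wait_for_device_ready(volatile int *ready)
{
	while (!*ready) {
		mdelay(10);
		/*
		 * We deliberately do not schedule for a long time, so reset
		 * the per-CPU touch_timestamp to avoid a bogus "BUG: soft
		 * lockup detected on CPU#N!" splat after ten seconds.
		 */
		touch_softlockup_watchdog();
	}
}
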
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0375fcd592..d1b810782b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock);
179#define BUILD_LOCK_OPS(op, locktype) \ 179#define BUILD_LOCK_OPS(op, locktype) \
180void __lockfunc _##op##_lock(locktype##_t *lock) \ 180void __lockfunc _##op##_lock(locktype##_t *lock) \
181{ \ 181{ \
182 preempt_disable(); \
183 for (;;) { \ 182 for (;;) { \
183 preempt_disable(); \
184 if (likely(_raw_##op##_trylock(lock))) \ 184 if (likely(_raw_##op##_trylock(lock))) \
185 break; \ 185 break; \
186 preempt_enable(); \ 186 preempt_enable(); \
187 \
187 if (!(lock)->break_lock) \ 188 if (!(lock)->break_lock) \
188 (lock)->break_lock = 1; \ 189 (lock)->break_lock = 1; \
189 while (!op##_can_lock(lock) && (lock)->break_lock) \ 190 while (!op##_can_lock(lock) && (lock)->break_lock) \
190 cpu_relax(); \ 191 cpu_relax(); \
191 preempt_disable(); \
192 } \ 192 } \
193 (lock)->break_lock = 0; \ 193 (lock)->break_lock = 0; \
194} \ 194} \
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
199{ \ 199{ \
200 unsigned long flags; \ 200 unsigned long flags; \
201 \ 201 \
202 preempt_disable(); \
203 for (;;) { \ 202 for (;;) { \
203 preempt_disable(); \
204 local_irq_save(flags); \ 204 local_irq_save(flags); \
205 if (likely(_raw_##op##_trylock(lock))) \ 205 if (likely(_raw_##op##_trylock(lock))) \
206 break; \ 206 break; \
207 local_irq_restore(flags); \ 207 local_irq_restore(flags); \
208 \
209 preempt_enable(); \ 208 preempt_enable(); \
209 \
210 if (!(lock)->break_lock) \ 210 if (!(lock)->break_lock) \
211 (lock)->break_lock = 1; \ 211 (lock)->break_lock = 1; \
212 while (!op##_can_lock(lock) && (lock)->break_lock) \ 212 while (!op##_can_lock(lock) && (lock)->break_lock) \
213 cpu_relax(); \ 213 cpu_relax(); \
214 preempt_disable(); \
215 } \ 214 } \
216 (lock)->break_lock = 0; \ 215 (lock)->break_lock = 0; \
217 return flags; \ 216 return flags; \
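
Hand-expanding the revised BUILD_LOCK_OPS() for the plain spinlock case makes the change easier to see: preempt_disable() now sits inside the retry loop, so preemption stays enabled across the whole break_lock busy-wait instead of only part of it. The expansion below is done by hand for illustration; it is not a separate function in the tree:

void __lockfunc _spin_lock(spinlock_t *lock)
{
	for (;;) {
		preempt_disable();
		if (likely(_raw_spin_trylock(lock)))
			break;
		preempt_enable();	/* spin preemptibly from here on */

		if (!(lock)->break_lock)
			(lock)->break_lock = 1;
		while (!spin_can_lock(lock) && (lock)->break_lock)
			cpu_relax();
	}
	(lock)->break_lock = 0;
}
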
diff --git a/kernel/sys.c b/kernel/sys.c
index f91218a546..0b6ec0e793 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
95 * and the like. 95 * and the like.
96 */ 96 */
97 97
98static struct notifier_block *reboot_notifier_list; 98static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
99static DEFINE_RWLOCK(notifier_lock); 99
100/*
101 * Notifier chain core routines. The exported routines below
102 * are layered on top of these, with appropriate locking added.
103 */
104
105static int notifier_chain_register(struct notifier_block **nl,
106 struct notifier_block *n)
107{
108 while ((*nl) != NULL) {
109 if (n->priority > (*nl)->priority)
110 break;
111 nl = &((*nl)->next);
112 }
113 n->next = *nl;
114 rcu_assign_pointer(*nl, n);
115 return 0;
116}
117
118static int notifier_chain_unregister(struct notifier_block **nl,
119 struct notifier_block *n)
120{
121 while ((*nl) != NULL) {
122 if ((*nl) == n) {
123 rcu_assign_pointer(*nl, n->next);
124 return 0;
125 }
126 nl = &((*nl)->next);
127 }
128 return -ENOENT;
129}
130
131static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v)
133{
134 int ret = NOTIFY_DONE;
135 struct notifier_block *nb;
136
137 nb = rcu_dereference(*nl);
138 while (nb) {
139 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break;
142 nb = rcu_dereference(nb->next);
143 }
144 return ret;
145}
146
147/*
148 * Atomic notifier chain routines. Registration and unregistration
149 * use a mutex, and call_chain is synchronized by RCU (no locks).
150 */
100 151
101/** 152/**
102 * notifier_chain_register - Add notifier to a notifier chain 153 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
103 * @list: Pointer to root list pointer 154 * @nh: Pointer to head of the atomic notifier chain
104 * @n: New entry in notifier chain 155 * @n: New entry in notifier chain
105 * 156 *
106 * Adds a notifier to a notifier chain. 157 * Adds a notifier to an atomic notifier chain.
107 * 158 *
108 * Currently always returns zero. 159 * Currently always returns zero.
109 */ 160 */
161
162int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
163 struct notifier_block *n)
164{
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&nh->lock, flags);
169 ret = notifier_chain_register(&nh->head, n);
170 spin_unlock_irqrestore(&nh->lock, flags);
171 return ret;
172}
173
174EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
175
176/**
177 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
178 * @nh: Pointer to head of the atomic notifier chain
179 * @n: Entry to remove from notifier chain
180 *
181 * Removes a notifier from an atomic notifier chain.
182 *
183 * Returns zero on success or %-ENOENT on failure.
184 */
185int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
186 struct notifier_block *n)
187{
188 unsigned long flags;
189 int ret;
190
191 spin_lock_irqsave(&nh->lock, flags);
192 ret = notifier_chain_unregister(&nh->head, n);
193 spin_unlock_irqrestore(&nh->lock, flags);
194 synchronize_rcu();
195 return ret;
196}
197
198EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
199
200/**
201 * atomic_notifier_call_chain - Call functions in an atomic notifier chain
202 * @nh: Pointer to head of the atomic notifier chain
203 * @val: Value passed unmodified to notifier function
204 * @v: Pointer passed unmodified to notifier function
205 *
206 * Calls each function in a notifier chain in turn. The functions
207 * run in an atomic context, so they must not block.
208 * This routine uses RCU to synchronize with changes to the chain.
209 *
210 * If the return value of the notifier can be and'ed
211 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
212 * will return immediately, with the return value of
213 * the notifier function which halted execution.
214 * Otherwise the return value is the return value
215 * of the last notifier function called.
216 */
110 217
111int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) 218int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
219 unsigned long val, void *v)
112{ 220{
113 write_lock(&notifier_lock); 221 int ret;
114 while(*list) 222
115 { 223 rcu_read_lock();
116 if(n->priority > (*list)->priority) 224 ret = notifier_call_chain(&nh->head, val, v);
117 break; 225 rcu_read_unlock();
118 list= &((*list)->next); 226 return ret;
119 }
120 n->next = *list;
121 *list=n;
122 write_unlock(&notifier_lock);
123 return 0;
124} 227}
125 228
126EXPORT_SYMBOL(notifier_chain_register); 229EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
230
231/*
232 * Blocking notifier chain routines. All access to the chain is
233 * synchronized by an rwsem.
234 */
127 235
128/** 236/**
129 * notifier_chain_unregister - Remove notifier from a notifier chain 237 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
130 * @nl: Pointer to root list pointer 238 * @nh: Pointer to head of the blocking notifier chain
131 * @n: New entry in notifier chain 239 * @n: New entry in notifier chain
132 * 240 *
133 * Removes a notifier from a notifier chain. 241 * Adds a notifier to a blocking notifier chain.
242 * Must be called in process context.
134 * 243 *
135 * Returns zero on success, or %-ENOENT on failure. 244 * Currently always returns zero.
136 */ 245 */
137 246
138int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) 247int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
248 struct notifier_block *n)
139{ 249{
140 write_lock(&notifier_lock); 250 int ret;
141 while((*nl)!=NULL) 251
142 { 252 /*
143 if((*nl)==n) 253 * This code gets used during boot-up, when task switching is
144 { 254 * not yet working and interrupts must remain disabled. At
145 *nl=n->next; 255 * such times we must not call down_write().
146 write_unlock(&notifier_lock); 256 */
147 return 0; 257 if (unlikely(system_state == SYSTEM_BOOTING))
148 } 258 return notifier_chain_register(&nh->head, n);
149 nl=&((*nl)->next); 259
150 } 260 down_write(&nh->rwsem);
151 write_unlock(&notifier_lock); 261 ret = notifier_chain_register(&nh->head, n);
152 return -ENOENT; 262 up_write(&nh->rwsem);
263 return ret;
264}
265
266EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
267
268/**
269 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
270 * @nh: Pointer to head of the blocking notifier chain
271 * @n: Entry to remove from notifier chain
272 *
273 * Removes a notifier from a blocking notifier chain.
274 * Must be called from process context.
275 *
276 * Returns zero on success or %-ENOENT on failure.
277 */
278int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
279 struct notifier_block *n)
280{
281 int ret;
282
283 /*
284 * This code gets used during boot-up, when task switching is
285 * not yet working and interrupts must remain disabled. At
286 * such times we must not call down_write().
287 */
288 if (unlikely(system_state == SYSTEM_BOOTING))
289 return notifier_chain_unregister(&nh->head, n);
290
291 down_write(&nh->rwsem);
292 ret = notifier_chain_unregister(&nh->head, n);
293 up_write(&nh->rwsem);
294 return ret;
153} 295}
154 296
155EXPORT_SYMBOL(notifier_chain_unregister); 297EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
156 298
157/** 299/**
158 * notifier_call_chain - Call functions in a notifier chain 300 * blocking_notifier_call_chain - Call functions in a blocking notifier chain
159 * @n: Pointer to root pointer of notifier chain 301 * @nh: Pointer to head of the blocking notifier chain
160 * @val: Value passed unmodified to notifier function 302 * @val: Value passed unmodified to notifier function
161 * @v: Pointer passed unmodified to notifier function 303 * @v: Pointer passed unmodified to notifier function
162 * 304 *
163 * Calls each function in a notifier chain in turn. 305 * Calls each function in a notifier chain in turn. The functions
306 * run in a process context, so they are allowed to block.
164 * 307 *
165 * If the return value of the notifier can be and'd 308 * If the return value of the notifier can be and'ed
166 * with %NOTIFY_STOP_MASK, then notifier_call_chain 309 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
167 * will return immediately, with the return value of 310 * will return immediately, with the return value of
168 * the notifier function which halted execution. 311 * the notifier function which halted execution.
169 * Otherwise, the return value is the return value 312 * Otherwise the return value is the return value
170 * of the last notifier function called. 313 * of the last notifier function called.
171 */ 314 */
172 315
173int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 316int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
317 unsigned long val, void *v)
174{ 318{
175 int ret=NOTIFY_DONE; 319 int ret;
176 struct notifier_block *nb = *n;
177 320
178 while(nb) 321 down_read(&nh->rwsem);
179 { 322 ret = notifier_call_chain(&nh->head, val, v);
180 ret=nb->notifier_call(nb,val,v); 323 up_read(&nh->rwsem);
181 if(ret&NOTIFY_STOP_MASK)
182 {
183 return ret;
184 }
185 nb=nb->next;
186 }
187 return ret; 324 return ret;
188} 325}
189 326
190EXPORT_SYMBOL(notifier_call_chain); 327EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
328
329/*
330 * Raw notifier chain routines. There is no protection;
331 * the caller must provide it. Use at your own risk!
332 */
333
334/**
335 * raw_notifier_chain_register - Add notifier to a raw notifier chain
336 * @nh: Pointer to head of the raw notifier chain
337 * @n: New entry in notifier chain
338 *
339 * Adds a notifier to a raw notifier chain.
340 * All locking must be provided by the caller.
341 *
342 * Currently always returns zero.
343 */
344
345int raw_notifier_chain_register(struct raw_notifier_head *nh,
346 struct notifier_block *n)
347{
348 return notifier_chain_register(&nh->head, n);
349}
350
351EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
352
353/**
354 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
355 * @nh: Pointer to head of the raw notifier chain
356 * @n: Entry to remove from notifier chain
357 *
358 * Removes a notifier from a raw notifier chain.
359 * All locking must be provided by the caller.
360 *
361 * Returns zero on success or %-ENOENT on failure.
362 */
363int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
364 struct notifier_block *n)
365{
366 return notifier_chain_unregister(&nh->head, n);
367}
368
369EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
370
371/**
372 * raw_notifier_call_chain - Call functions in a raw notifier chain
373 * @nh: Pointer to head of the raw notifier chain
374 * @val: Value passed unmodified to notifier function
375 * @v: Pointer passed unmodified to notifier function
376 *
377 * Calls each function in a notifier chain in turn. The functions
378 * run in an undefined context.
379 * All locking must be provided by the caller.
380 *
381 * If the return value of the notifier can be and'ed
382 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain
383 * will return immediately, with the return value of
384 * the notifier function which halted execution.
385 * Otherwise the return value is the return value
386 * of the last notifier function called.
387 */
388
389int raw_notifier_call_chain(struct raw_notifier_head *nh,
390 unsigned long val, void *v)
391{
392 return notifier_call_chain(&nh->head, val, v);
393}
394
395EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
191 396
192/** 397/**
193 * register_reboot_notifier - Register function to be called at reboot time 398 * register_reboot_notifier - Register function to be called at reboot time
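
A sketch of a client of the atomic-chain flavour added above. The register/unregister/call functions are the ones in the hunk; the ATOMIC_NOTIFIER_HEAD() initializer comes from <linux/notifier.h> in the same series, and the event code, callback and module wrapper are illustrative:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

#define MYDEV_EVENT_RESET 1	/* hypothetical event code */

static ATOMIC_NOTIFIER_HEAD(mydev_chain);

static int mydev_event(struct notifier_block *nb, unsigned long event, void *data)
{
	if (event == MYDEV_EVENT_RESET)
		printk(KERN_INFO "mydev: reset notification\n");
	return NOTIFY_OK;	/* NOTIFY_STOP would end the walk early */
}

static struct notifier_block mydev_nb = {
	.notifier_call = mydev_event,
};

static int __init mydev_init(void)
{
	atomic_notifier_chain_register(&mydev_chain, &mydev_nb);
	/* May be called from atomic context; callees must not sleep. */
	atomic_notifier_call_chain(&mydev_chain, MYDEV_EVENT_RESET, NULL);
	atomic_notifier_chain_unregister(&mydev_chain, &mydev_nb);
	return 0;
}
module_init(mydev_init);
MODULE_LICENSE("GPL");
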
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
196 * Registers a function with the list of functions 401 * Registers a function with the list of functions
197 * to be called at reboot time. 402 * to be called at reboot time.
198 * 403 *
199 * Currently always returns zero, as notifier_chain_register 404 * Currently always returns zero, as blocking_notifier_chain_register
200 * always returns zero. 405 * always returns zero.
201 */ 406 */
202 407
203int register_reboot_notifier(struct notifier_block * nb) 408int register_reboot_notifier(struct notifier_block * nb)
204{ 409{
205 return notifier_chain_register(&reboot_notifier_list, nb); 410 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
206} 411}
207 412
208EXPORT_SYMBOL(register_reboot_notifier); 413EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,23 +424,11 @@ EXPORT_SYMBOL(register_reboot_notifier);
219 424
220int unregister_reboot_notifier(struct notifier_block * nb) 425int unregister_reboot_notifier(struct notifier_block * nb)
221{ 426{
222 return notifier_chain_unregister(&reboot_notifier_list, nb); 427 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
223} 428}
224 429
225EXPORT_SYMBOL(unregister_reboot_notifier); 430EXPORT_SYMBOL(unregister_reboot_notifier);
226 431
227#ifndef CONFIG_SECURITY
228int capable(int cap)
229{
230 if (cap_raised(current->cap_effective, cap)) {
231 current->flags |= PF_SUPERPRIV;
232 return 1;
233 }
234 return 0;
235}
236EXPORT_SYMBOL(capable);
237#endif
238
239static int set_one_prio(struct task_struct *p, int niceval, int error) 432static int set_one_prio(struct task_struct *p, int niceval, int error)
240{ 433{
241 int no_nice; 434 int no_nice;
@@ -392,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
392 585
393void kernel_restart_prepare(char *cmd) 586void kernel_restart_prepare(char *cmd)
394{ 587{
395 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
396 system_state = SYSTEM_RESTART; 589 system_state = SYSTEM_RESTART;
397 device_shutdown(); 590 device_shutdown();
398} 591}
@@ -442,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
442 635
443void kernel_shutdown_prepare(enum system_states state) 636void kernel_shutdown_prepare(enum system_states state)
444{ 637{
445 notifier_call_chain(&reboot_notifier_list, 638 blocking_notifier_call_chain(&reboot_notifier_list,
446 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 639 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
447 system_state = state; 640 system_state = state;
448 device_shutdown(); 641 device_shutdown();
@@ -1009,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf)
1009 */ 1202 */
1010 if (tbuf) { 1203 if (tbuf) {
1011 struct tms tmp; 1204 struct tms tmp;
1205 struct task_struct *tsk = current;
1206 struct task_struct *t;
1012 cputime_t utime, stime, cutime, cstime; 1207 cputime_t utime, stime, cutime, cstime;
1013 1208
1014#ifdef CONFIG_SMP 1209 spin_lock_irq(&tsk->sighand->siglock);
1015 if (thread_group_empty(current)) { 1210 utime = tsk->signal->utime;
1016 /* 1211 stime = tsk->signal->stime;
1017 * Single thread case without the use of any locks. 1212 t = tsk;
1018 * 1213 do {
1019 * We may race with release_task if two threads are 1214 utime = cputime_add(utime, t->utime);
1020 * executing. However, release task first adds up the 1215 stime = cputime_add(stime, t->stime);
1021 * counters (__exit_signal) before removing the task 1216 t = next_thread(t);
1022 * from the process tasklist (__unhash_process). 1217 } while (t != tsk);
1023 * __exit_signal also acquires and releases the
1024 * siglock which results in the proper memory ordering
1025 * so that the list modifications are always visible
1026 * after the counters have been updated.
1027 *
1028 * If the counters have been updated by the second thread
1029 * but the thread has not yet been removed from the list
1030 * then the other branch will be executing which will
1031 * block on tasklist_lock until the exit handling of the
1032 * other task is finished.
1033 *
1034 * This also implies that the sighand->siglock cannot
1035 * be held by another processor. So we can also
1036 * skip acquiring that lock.
1037 */
1038 utime = cputime_add(current->signal->utime, current->utime);
1039 stime = cputime_add(current->signal->utime, current->stime);
1040 cutime = current->signal->cutime;
1041 cstime = current->signal->cstime;
1042 } else
1043#endif
1044 {
1045 1218
1046 /* Process with multiple threads */ 1219 cutime = tsk->signal->cutime;
1047 struct task_struct *tsk = current; 1220 cstime = tsk->signal->cstime;
1048 struct task_struct *t; 1221 spin_unlock_irq(&tsk->sighand->siglock);
1049 1222
1050 read_lock(&tasklist_lock);
1051 utime = tsk->signal->utime;
1052 stime = tsk->signal->stime;
1053 t = tsk;
1054 do {
1055 utime = cputime_add(utime, t->utime);
1056 stime = cputime_add(stime, t->stime);
1057 t = next_thread(t);
1058 } while (t != tsk);
1059
1060 /*
1061 * While we have tasklist_lock read-locked, no dying thread
1062 * can be updating current->signal->[us]time. Instead,
1063 * we got their counts included in the live thread loop.
1064 * However, another thread can come in right now and
1065 * do a wait call that updates current->signal->c[us]time.
1066 * To make sure we always see that pair updated atomically,
1067 * we take the siglock around fetching them.
1068 */
1069 spin_lock_irq(&tsk->sighand->siglock);
1070 cutime = tsk->signal->cutime;
1071 cstime = tsk->signal->cstime;
1072 spin_unlock_irq(&tsk->sighand->siglock);
1073 read_unlock(&tasklist_lock);
1074 }
1075 tmp.tms_utime = cputime_to_clock_t(utime); 1223 tmp.tms_utime = cputime_to_clock_t(utime);
1076 tmp.tms_stime = cputime_to_clock_t(stime); 1224 tmp.tms_stime = cputime_to_clock_t(stime);
1077 tmp.tms_cutime = cputime_to_clock_t(cutime); 1225 tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1224,24 +1372,35 @@ asmlinkage long sys_getsid(pid_t pid)
1224asmlinkage long sys_setsid(void) 1372asmlinkage long sys_setsid(void)
1225{ 1373{
1226 struct task_struct *group_leader = current->group_leader; 1374 struct task_struct *group_leader = current->group_leader;
1227 struct pid *pid; 1375 pid_t session;
1228 int err = -EPERM; 1376 int err = -EPERM;
1229 1377
1230 down(&tty_sem); 1378 mutex_lock(&tty_mutex);
1231 write_lock_irq(&tasklist_lock); 1379 write_lock_irq(&tasklist_lock);
1232 1380
1233 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1381 /* Fail if I am already a session leader */
1234 if (pid) 1382 if (group_leader->signal->leader)
1383 goto out;
1384
1385 session = group_leader->pid;
1386 /* Fail if a process group id already exists that equals the
1387 * proposed session id.
1388 *
1389 * Don't check if session id == 1 because kernel threads use this
1390 * session id and so the check will always fail and make it so
1391 * init cannot successfully call setsid.
1392 */
1393 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
1235 goto out; 1394 goto out;
1236 1395
1237 group_leader->signal->leader = 1; 1396 group_leader->signal->leader = 1;
1238 __set_special_pids(group_leader->pid, group_leader->pid); 1397 __set_special_pids(session, session);
1239 group_leader->signal->tty = NULL; 1398 group_leader->signal->tty = NULL;
1240 group_leader->signal->tty_old_pgrp = 0; 1399 group_leader->signal->tty_old_pgrp = 0;
1241 err = process_group(group_leader); 1400 err = process_group(group_leader);
1242out: 1401out:
1243 write_unlock_irq(&tasklist_lock); 1402 write_unlock_irq(&tasklist_lock);
1244 up(&tty_sem); 1403 mutex_unlock(&tty_mutex);
1245 return err; 1404 return err;
1246} 1405}
1247 1406
@@ -1375,7 +1534,7 @@ static void groups_sort(struct group_info *group_info)
1375/* a simple bsearch */ 1534/* a simple bsearch */
1376int groups_search(struct group_info *group_info, gid_t grp) 1535int groups_search(struct group_info *group_info, gid_t grp)
1377{ 1536{
1378 int left, right; 1537 unsigned int left, right;
1379 1538
1380 if (!group_info) 1539 if (!group_info)
1381 return 0; 1540 return 0;
@@ -1383,7 +1542,7 @@ int groups_search(struct group_info *group_info, gid_t grp)
1383 left = 0; 1542 left = 0;
1384 right = group_info->ngroups; 1543 right = group_info->ngroups;
1385 while (left < right) { 1544 while (left < right) {
1386 int mid = (left+right)/2; 1545 unsigned int mid = (left+right)/2;
1387 int cmp = grp - GROUP_AT(group_info, mid); 1546 int cmp = grp - GROUP_AT(group_info, mid);
1388 if (cmp > 0) 1547 if (cmp > 0)
1389 left = mid + 1; 1548 left = mid + 1;
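
The only change in groups_search() is switching the bsearch indices to unsigned int, so that (left+right)/2 and the ngroups bound are computed in the same unsigned domain as the group count. The same search, lifted into a stand-alone user-space program for illustration; explicit comparisons replace the gid subtraction to keep the example self-contained:

#include <stdio.h>

typedef unsigned int gid_ex;	/* stand-in for kernel gid_t */

static int grp_search(const gid_ex *groups, unsigned int ngroups, gid_ex grp)
{
	unsigned int left = 0, right = ngroups;

	while (left < right) {
		unsigned int mid = (left + right) / 2;

		if (grp > groups[mid])
			left = mid + 1;
		else if (grp < groups[mid])
			right = mid;
		else
			return 1;
	}
	return 0;
}

int main(void)
{
	gid_ex groups[] = { 4, 20, 24, 100, 1000 };	/* must be sorted */

	/* prints "1 0": 24 is a member, 25 is not */
	printf("%d %d\n", grp_search(groups, 5, 24), grp_search(groups, 5, 25));
	return 0;
}
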
@@ -1433,7 +1592,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1433 return -EINVAL; 1592 return -EINVAL;
1434 1593
1435 /* no need to grab task_lock here; it cannot change */ 1594 /* no need to grab task_lock here; it cannot change */
1436 get_group_info(current->group_info);
1437 i = current->group_info->ngroups; 1595 i = current->group_info->ngroups;
1438 if (gidsetsize) { 1596 if (gidsetsize) {
1439 if (i > gidsetsize) { 1597 if (i > gidsetsize) {
@@ -1446,7 +1604,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1446 } 1604 }
1447 } 1605 }
1448out: 1606out:
1449 put_group_info(current->group_info);
1450 return i; 1607 return i;
1451} 1608}
1452 1609
@@ -1487,9 +1644,7 @@ int in_group_p(gid_t grp)
1487{ 1644{
1488 int retval = 1; 1645 int retval = 1;
1489 if (grp != current->fsgid) { 1646 if (grp != current->fsgid) {
1490 get_group_info(current->group_info);
1491 retval = groups_search(current->group_info, grp); 1647 retval = groups_search(current->group_info, grp);
1492 put_group_info(current->group_info);
1493 } 1648 }
1494 return retval; 1649 return retval;
1495} 1650}
@@ -1500,9 +1655,7 @@ int in_egroup_p(gid_t grp)
1500{ 1655{
1501 int retval = 1; 1656 int retval = 1;
1502 if (grp != current->egid) { 1657 if (grp != current->egid) {
1503 get_group_info(current->group_info);
1504 retval = groups_search(current->group_info, grp); 1658 retval = groups_search(current->group_info, grp);
1505 put_group_info(current->group_info);
1506 } 1659 }
1507 return retval; 1660 return retval;
1508} 1661}
@@ -1630,20 +1783,21 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1630asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1783asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1631{ 1784{
1632 struct rlimit new_rlim, *old_rlim; 1785 struct rlimit new_rlim, *old_rlim;
1786 unsigned long it_prof_secs;
1633 int retval; 1787 int retval;
1634 1788
1635 if (resource >= RLIM_NLIMITS) 1789 if (resource >= RLIM_NLIMITS)
1636 return -EINVAL; 1790 return -EINVAL;
1637 if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1791 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1638 return -EFAULT; 1792 return -EFAULT;
1639 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1793 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1640 return -EINVAL; 1794 return -EINVAL;
1641 old_rlim = current->signal->rlim + resource; 1795 old_rlim = current->signal->rlim + resource;
1642 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1796 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1643 !capable(CAP_SYS_RESOURCE)) 1797 !capable(CAP_SYS_RESOURCE))
1644 return -EPERM; 1798 return -EPERM;
1645 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) 1799 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
1646 return -EPERM; 1800 return -EPERM;
1647 1801
1648 retval = security_task_setrlimit(resource, &new_rlim); 1802 retval = security_task_setrlimit(resource, &new_rlim);
1649 if (retval) 1803 if (retval)
@@ -1653,19 +1807,40 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1653 *old_rlim = new_rlim; 1807 *old_rlim = new_rlim;
1654 task_unlock(current->group_leader); 1808 task_unlock(current->group_leader);
1655 1809
1656 if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && 1810 if (resource != RLIMIT_CPU)
1657 (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 1811 goto out;
1658 new_rlim.rlim_cur <= cputime_to_secs( 1812
1659 current->signal->it_prof_expires))) { 1813 /*
1660 cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); 1814 * RLIMIT_CPU handling. Note that the kernel fails to return an error
1815 * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
1816 * very long-standing error, and fixing it now risks breakage of
1817 * applications, so we live with it
1818 */
1819 if (new_rlim.rlim_cur == RLIM_INFINITY)
1820 goto out;
1821
1822 it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
1823 if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
1824 unsigned long rlim_cur = new_rlim.rlim_cur;
1825 cputime_t cputime;
1826
1827 if (rlim_cur == 0) {
1828 /*
1829 * The caller is asking for an immediate RLIMIT_CPU
1830 * expiry. But we use the zero value to mean "it was
1831 * never set". So let's cheat and make it one second
1832 * instead
1833 */
1834 rlim_cur = 1;
1835 }
1836 cputime = secs_to_cputime(rlim_cur);
1661 read_lock(&tasklist_lock); 1837 read_lock(&tasklist_lock);
1662 spin_lock_irq(&current->sighand->siglock); 1838 spin_lock_irq(&current->sighand->siglock);
1663 set_process_cpu_timer(current, CPUCLOCK_PROF, 1839 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
1664 &cputime, NULL);
1665 spin_unlock_irq(&current->sighand->siglock); 1840 spin_unlock_irq(&current->sighand->siglock);
1666 read_unlock(&tasklist_lock); 1841 read_unlock(&tasklist_lock);
1667 } 1842 }
1668 1843out:
1669 return 0; 1844 return 0;
1670} 1845}
1671 1846
@@ -1677,9 +1852,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1677 * a lot simpler! (Which we're not doing right now because we're not 1852 * a lot simpler! (Which we're not doing right now because we're not
1678 * measuring them yet). 1853 * measuring them yet).
1679 * 1854 *
1680 * This expects to be called with tasklist_lock read-locked or better,
1681 * and the siglock not locked. It may momentarily take the siglock.
1682 *
1683 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1855 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1684 * races with threads incrementing their own counters. But since word 1856 * races with threads incrementing their own counters. But since word
1685 * reads are atomic, we either get new values or old values and we don't 1857 * reads are atomic, we either get new values or old values and we don't
@@ -1687,6 +1859,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1687 * the c* fields from p->signal from races with exit.c updating those 1859 * the c* fields from p->signal from races with exit.c updating those
1688 * fields when reaping, so a sample either gets all the additions of a 1860 * fields when reaping, so a sample either gets all the additions of a
1689 * given child after it's reaped, or none so this sample is before reaping. 1861 * given child after it's reaped, or none so this sample is before reaping.
1862 *
1863 * tasklist_lock locking optimisation:
1864 * If we are current and single threaded, we do not need to take the tasklist
1865 * lock or the siglock. No one else can take our signal_struct away,
1866 * no one else can reap the children to update signal->c* counters, and
1867 * no one else can race with the signal-> fields.
1868 * If we do not take the tasklist_lock, the signal-> fields could be read
1869 * out of order while another thread was just exiting. So we place a
1870 * read memory barrier when we avoid the lock. On the writer side,
1871 * write memory barrier is implied in __exit_signal as __exit_signal releases
1872 * the siglock spinlock after updating the signal-> fields.
1873 *
1874 * We don't really need the siglock when we access the non c* fields
1875 * of the signal_struct (for RUSAGE_SELF) even in multithreaded
1876 * case, since we take the tasklist lock for read and the non c* signal->
1877 * fields are updated only in __exit_signal, which is called with
1878 * tasklist_lock taken for write, hence these two threads cannot execute
1879 * concurrently.
1880 *
1690 */ 1881 */
1691 1882
1692static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1883static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
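
The locking comment added above leans on a classic publish/consume pairing: __exit_signal() updates the signal counters and then drops the siglock (the unlock orders the stores), while the lockless single-threaded reader issues a read barrier before looking at the fields. A generic sketch of that pairing with made-up data; the struct, the flag and the explicit smp_wmb() stand in for the real code, which gets its write ordering from spin_unlock():

#include <asm/system.h>	/* smp_wmb()/smp_rmb() in this tree */

struct sample_stats {
	unsigned long utime;
	unsigned long stime;
	int published;		/* hypothetical "fields are final" flag */
};

static void writer(struct sample_stats *s, unsigned long u, unsigned long st)
{
	s->utime = u;
	s->stime = st;
	smp_wmb();		/* order the field updates before the flag */
	s->published = 1;
}

static int reader(const struct sample_stats *s,
		  unsigned long *u, unsigned long *st)
{
	if (!s->published)
		return 0;
	smp_rmb();		/* order the flag check before the field reads */
	*u = s->utime;
	*st = s->stime;
	return 1;
}
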
@@ -1694,13 +1885,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1694 struct task_struct *t; 1885 struct task_struct *t;
1695 unsigned long flags; 1886 unsigned long flags;
1696 cputime_t utime, stime; 1887 cputime_t utime, stime;
1888 int need_lock = 0;
1697 1889
1698 memset((char *) r, 0, sizeof *r); 1890 memset((char *) r, 0, sizeof *r);
1891 utime = stime = cputime_zero;
1699 1892
1700 if (unlikely(!p->signal)) 1893 if (p != current || !thread_group_empty(p))
1701 return; 1894 need_lock = 1;
1702 1895
1703 utime = stime = cputime_zero; 1896 if (need_lock) {
1897 read_lock(&tasklist_lock);
1898 if (unlikely(!p->signal)) {
1899 read_unlock(&tasklist_lock);
1900 return;
1901 }
1902 } else
1903 /* See locking comments above */
1904 smp_rmb();
1704 1905
1705 switch (who) { 1906 switch (who) {
1706 case RUSAGE_BOTH: 1907 case RUSAGE_BOTH:
@@ -1740,6 +1941,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1740 BUG(); 1941 BUG();
1741 } 1942 }
1742 1943
1944 if (need_lock)
1945 read_unlock(&tasklist_lock);
1743 cputime_to_timeval(utime, &r->ru_utime); 1946 cputime_to_timeval(utime, &r->ru_utime);
1744 cputime_to_timeval(stime, &r->ru_stime); 1947 cputime_to_timeval(stime, &r->ru_stime);
1745} 1948}
@@ -1747,9 +1950,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1747int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1950int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1748{ 1951{
1749 struct rusage r; 1952 struct rusage r;
1750 read_lock(&tasklist_lock);
1751 k_getrusage(p, who, &r); 1953 k_getrusage(p, who, &r);
1752 read_unlock(&tasklist_lock);
1753 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1954 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1754} 1955}
1755 1956
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 17313b99e5..5433195040 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
42cond_syscall(sys_socketcall); 42cond_syscall(sys_socketcall);
43cond_syscall(sys_futex); 43cond_syscall(sys_futex);
44cond_syscall(compat_sys_futex); 44cond_syscall(compat_sys_futex);
45cond_syscall(sys_set_robust_list);
46cond_syscall(compat_sys_set_robust_list);
47cond_syscall(sys_get_robust_list);
48cond_syscall(compat_sys_get_robust_list);
45cond_syscall(sys_epoll_create); 49cond_syscall(sys_epoll_create);
46cond_syscall(sys_epoll_ctl); 50cond_syscall(sys_epoll_ctl);
47cond_syscall(sys_epoll_wait); 51cond_syscall(sys_epoll_wait);
@@ -104,6 +108,8 @@ cond_syscall(sys_setreuid16);
104cond_syscall(sys_setuid16); 108cond_syscall(sys_setuid16);
105cond_syscall(sys_vm86old); 109cond_syscall(sys_vm86old);
106cond_syscall(sys_vm86); 110cond_syscall(sys_vm86);
111cond_syscall(compat_sys_ipc);
112cond_syscall(compat_sys_sysctl);
107 113
108/* arch-specific weak syscall entries */ 114/* arch-specific weak syscall entries */
109cond_syscall(sys_pciconfig_read); 115cond_syscall(sys_pciconfig_read);
@@ -114,3 +120,15 @@ cond_syscall(sys32_sysctl);
114cond_syscall(ppc_rtas); 120cond_syscall(ppc_rtas);
115cond_syscall(sys_spu_run); 121cond_syscall(sys_spu_run);
116cond_syscall(sys_spu_create); 122cond_syscall(sys_spu_create);
123
124/* mmu depending weak syscall entries */
125cond_syscall(sys_mprotect);
126cond_syscall(sys_msync);
127cond_syscall(sys_mlock);
128cond_syscall(sys_munlock);
129cond_syscall(sys_mlockall);
130cond_syscall(sys_munlockall);
131cond_syscall(sys_mincore);
132cond_syscall(sys_madvise);
133cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages);
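
Each cond_syscall() entry, including the new robust-futex and nommu ones, binds the syscall symbol to a weak alias of sys_ni_syscall() so the syscall table still links when the real implementation is configured out; callers then just get -ENOSYS. The exact macro is arch-specific, so what follows shows only the underlying linker trick as a user-space program; every name ending in _demo is made up:

#include <stdio.h>
#include <errno.h>

long sys_ni_syscall_demo(void)
{
	return -ENOSYS;
}

/* Stands in for a syscall whose real implementation was configured out. */
long sys_set_robust_list_demo(void)
	__attribute__((weak, alias("sys_ni_syscall_demo")));

int main(void)
{
	/* prints -ENOSYS (-38 on Linux) */
	printf("%ld\n", sys_set_robust_list_demo());
	return 0;
}
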
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 71dd6f62ef..e82726faee 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,13 +44,14 @@
44#include <linux/limits.h> 44#include <linux/limits.h>
45#include <linux/dcache.h> 45#include <linux/dcache.h>
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/nfs_fs.h>
48#include <linux/acpi.h>
47 49
48#include <asm/uaccess.h> 50#include <asm/uaccess.h>
49#include <asm/processor.h> 51#include <asm/processor.h>
50 52
51#ifdef CONFIG_ROOT_NFS 53extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
52#include <linux/nfs_fs.h> 54 void __user *buffer, size_t *lenp, loff_t *ppos);
53#endif
54 55
55#if defined(CONFIG_SYSCTL) 56#if defined(CONFIG_SYSCTL)
56 57
@@ -126,7 +127,9 @@ extern int sysctl_hz_timer;
126extern int acct_parm[]; 127extern int acct_parm[];
127#endif 128#endif
128 129
129int randomize_va_space = 1; 130#ifdef CONFIG_IA64
131extern int no_unaligned_warning;
132#endif
130 133
131static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 134static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
132 ctl_table *, void **); 135 ctl_table *, void **);
@@ -640,6 +643,7 @@ static ctl_table kern_table[] = {
640 .proc_handler = &proc_dointvec, 643 .proc_handler = &proc_dointvec,
641 }, 644 },
642#endif 645#endif
646#if defined(CONFIG_MMU)
643 { 647 {
644 .ctl_name = KERN_RANDOMIZE, 648 .ctl_name = KERN_RANDOMIZE,
645 .procname = "randomize_va_space", 649 .procname = "randomize_va_space",
@@ -648,6 +652,7 @@ static ctl_table kern_table[] = {
648 .mode = 0644, 652 .mode = 0644,
649 .proc_handler = &proc_dointvec, 653 .proc_handler = &proc_dointvec,
650 }, 654 },
655#endif
651#if defined(CONFIG_S390) && defined(CONFIG_SMP) 656#if defined(CONFIG_S390) && defined(CONFIG_SMP)
652 { 657 {
653 .ctl_name = KERN_SPIN_RETRY, 658 .ctl_name = KERN_SPIN_RETRY,
@@ -658,6 +663,26 @@ static ctl_table kern_table[] = {
658 .proc_handler = &proc_dointvec, 663 .proc_handler = &proc_dointvec,
659 }, 664 },
660#endif 665#endif
666#ifdef CONFIG_ACPI_SLEEP
667 {
668 .ctl_name = KERN_ACPI_VIDEO_FLAGS,
669 .procname = "acpi_video_flags",
670 .data = &acpi_video_flags,
671 .maxlen = sizeof (unsigned long),
672 .mode = 0644,
673 .proc_handler = &proc_doulongvec_minmax,
674 },
675#endif
676#ifdef CONFIG_IA64
677 {
678 .ctl_name = KERN_IA64_UNALIGNED,
679 .procname = "ignore-unaligned-usertrap",
680 .data = &no_unaligned_warning,
681 .maxlen = sizeof (int),
682 .mode = 0644,
683 .proc_handler = &proc_dointvec,
684 },
685#endif
661 { .ctl_name = 0 } 686 { .ctl_name = 0 }
662}; 687};
663 688
@@ -717,18 +742,18 @@ static ctl_table vm_table[] = {
717 { 742 {
718 .ctl_name = VM_DIRTY_WB_CS, 743 .ctl_name = VM_DIRTY_WB_CS,
719 .procname = "dirty_writeback_centisecs", 744 .procname = "dirty_writeback_centisecs",
720 .data = &dirty_writeback_centisecs, 745 .data = &dirty_writeback_interval,
721 .maxlen = sizeof(dirty_writeback_centisecs), 746 .maxlen = sizeof(dirty_writeback_interval),
722 .mode = 0644, 747 .mode = 0644,
723 .proc_handler = &dirty_writeback_centisecs_handler, 748 .proc_handler = &dirty_writeback_centisecs_handler,
724 }, 749 },
725 { 750 {
726 .ctl_name = VM_DIRTY_EXPIRE_CS, 751 .ctl_name = VM_DIRTY_EXPIRE_CS,
727 .procname = "dirty_expire_centisecs", 752 .procname = "dirty_expire_centisecs",
728 .data = &dirty_expire_centisecs, 753 .data = &dirty_expire_interval,
729 .maxlen = sizeof(dirty_expire_centisecs), 754 .maxlen = sizeof(dirty_expire_interval),
730 .mode = 0644, 755 .mode = 0644,
731 .proc_handler = &proc_dointvec, 756 .proc_handler = &proc_dointvec_userhz_jiffies,
732 }, 757 },
733 { 758 {
734 .ctl_name = VM_NR_PDFLUSH_THREADS, 759 .ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -823,9 +848,8 @@ static ctl_table vm_table[] = {
823 .data = &laptop_mode, 848 .data = &laptop_mode,
824 .maxlen = sizeof(laptop_mode), 849 .maxlen = sizeof(laptop_mode),
825 .mode = 0644, 850 .mode = 0644,
826 .proc_handler = &proc_dointvec, 851 .proc_handler = &proc_dointvec_jiffies,
827 .strategy = &sysctl_intvec, 852 .strategy = &sysctl_jiffies,
828 .extra1 = &zero,
829 }, 853 },
830 { 854 {
831 .ctl_name = VM_BLOCK_DUMP, 855 .ctl_name = VM_BLOCK_DUMP,
@@ -921,7 +945,7 @@ static ctl_table fs_table[] = {
921 .data = &files_stat, 945 .data = &files_stat,
922 .maxlen = 3*sizeof(int), 946 .maxlen = 3*sizeof(int),
923 .mode = 0444, 947 .mode = 0444,
924 .proc_handler = &proc_dointvec, 948 .proc_handler = &proc_nr_files,
925 }, 949 },
926 { 950 {
927 .ctl_name = FS_MAXFILE, 951 .ctl_name = FS_MAXFILE,
@@ -2029,6 +2053,8 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2029 int write, void *data) 2053 int write, void *data)
2030{ 2054{
2031 if (write) { 2055 if (write) {
2056 if (*lvalp > LONG_MAX / HZ)
2057 return 1;
2032 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); 2058 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
2033 } else { 2059 } else {
2034 int val = *valp; 2060 int val = *valp;
@@ -2050,6 +2076,8 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2050 int write, void *data) 2076 int write, void *data)
2051{ 2077{
2052 if (write) { 2078 if (write) {
2079 if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
2080 return 1;
2053 *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); 2081 *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
2054 } else { 2082 } else {
2055 int val = *valp; 2083 int val = *valp;
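
Both conversion helpers now refuse a write whose seconds value would overflow once multiplied by HZ (or scaled by USER_HZ), returning 1 so the sysctl write fails instead of storing a wrapped jiffies count. The guard in isolation, with made-up values and 250 standing in for HZ:

#include <stdio.h>
#include <limits.h>

static int secs_to_ticks(unsigned long secs, unsigned long hz, long *out)
{
	if (secs > LONG_MAX / hz)
		return 1;		/* would overflow, reject the write */
	*out = (long)(secs * hz);
	return 0;
}

int main(void)
{
	long ticks;

	printf("%d\n", secs_to_ticks(60, 250, &ticks));		/* 0: accepted */
	printf("%d\n", secs_to_ticks(LONG_MAX, 250, &ticks));	/* 1: rejected */
	return 0;
}
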
diff --git a/kernel/time.c b/kernel/time.c
index 804539165d..b00ddc71ce 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
202 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 202 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
203} 203}
204 204
205long pps_offset; /* pps time offset (us) */
206long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */
207
208long pps_freq; /* frequency offset (scaled ppm) */
209long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
210
211long pps_valid = PPS_VALID; /* pps signal watchdog counter */
212
213int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
214
215long pps_jitcnt; /* jitter limit exceeded */
216long pps_calcnt; /* calibration intervals */
217long pps_errcnt; /* calibration errors */
218long pps_stbcnt; /* stability limit exceeded */
219
220/* hook for a loadable hardpps kernel module */
221void (*hardpps_ptr)(struct timeval *);
222
223/* we call this to notify the arch when the clock is being 205/* we call this to notify the arch when the clock is being
224 * controlled. If no such arch routine, do nothing. 206 * controlled. If no such arch routine, do nothing.
225 */ 207 */
@@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc)
279 result = -EINVAL; 261 result = -EINVAL;
280 goto leave; 262 goto leave;
281 } 263 }
282 time_freq = txc->freq - pps_freq; 264 time_freq = txc->freq;
283 } 265 }
284 266
285 if (txc->modes & ADJ_MAXERROR) { 267 if (txc->modes & ADJ_MAXERROR) {
@@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc)
312 if ((time_next_adjust = txc->offset) == 0) 294 if ((time_next_adjust = txc->offset) == 0)
313 time_adjust = 0; 295 time_adjust = 0;
314 } 296 }
315 else if ( time_status & (STA_PLL | STA_PPSTIME) ) { 297 else if (time_status & STA_PLL) {
316 ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == 298 ltemp = txc->offset;
317 (STA_PPSTIME | STA_PPSSIGNAL) ?
318 pps_offset : txc->offset;
319 299
320 /* 300 /*
321 * Scale the phase adjustment and 301 * Scale the phase adjustment and
@@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc)
356 } 336 }
357 time_freq = min(time_freq, time_tolerance); 337 time_freq = min(time_freq, time_tolerance);
358 time_freq = max(time_freq, -time_tolerance); 338 time_freq = max(time_freq, -time_tolerance);
359 } /* STA_PLL || STA_PPSTIME */ 339 } /* STA_PLL */
360 } /* txc->modes & ADJ_OFFSET */ 340 } /* txc->modes & ADJ_OFFSET */
361 if (txc->modes & ADJ_TICK) { 341 if (txc->modes & ADJ_TICK) {
362 tick_usec = txc->tick; 342 tick_usec = txc->tick;
363 tick_nsec = TICK_USEC_TO_NSEC(tick_usec); 343 tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
364 } 344 }
365 } /* txc->modes */ 345 } /* txc->modes */
366leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 346leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
367 || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
368 && (time_status & STA_PPSSIGNAL) == 0)
369 /* p. 24, (b) */
370 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
371 == (STA_PPSTIME|STA_PPSJITTER))
372 /* p. 24, (c) */
373 || ((time_status & STA_PPSFREQ) != 0
374 && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
375 /* p. 24, (d) */
376 result = TIME_ERROR; 347 result = TIME_ERROR;
377 348
378 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 349 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
@@ -380,7 +351,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
380 else { 351 else {
381 txc->offset = shift_right(time_offset, SHIFT_UPDATE); 352 txc->offset = shift_right(time_offset, SHIFT_UPDATE);
382 } 353 }
383 txc->freq = time_freq + pps_freq; 354 txc->freq = time_freq;
384 txc->maxerror = time_maxerror; 355 txc->maxerror = time_maxerror;
385 txc->esterror = time_esterror; 356 txc->esterror = time_esterror;
386 txc->status = time_status; 357 txc->status = time_status;
@@ -388,14 +359,16 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
388 txc->precision = time_precision; 359 txc->precision = time_precision;
389 txc->tolerance = time_tolerance; 360 txc->tolerance = time_tolerance;
390 txc->tick = tick_usec; 361 txc->tick = tick_usec;
391 txc->ppsfreq = pps_freq; 362
392 txc->jitter = pps_jitter >> PPS_AVG; 363 /* PPS is not implemented, so these are zero */
393 txc->shift = pps_shift; 364 txc->ppsfreq = 0;
394 txc->stabil = pps_stabil; 365 txc->jitter = 0;
395 txc->jitcnt = pps_jitcnt; 366 txc->shift = 0;
396 txc->calcnt = pps_calcnt; 367 txc->stabil = 0;
397 txc->errcnt = pps_errcnt; 368 txc->jitcnt = 0;
398 txc->stbcnt = pps_stbcnt; 369 txc->calcnt = 0;
370 txc->errcnt = 0;
371 txc->stbcnt = 0;
399 write_sequnlock_irq(&xtime_lock); 372 write_sequnlock_irq(&xtime_lock);
400 do_gettimeofday(&txc->time); 373 do_gettimeofday(&txc->time);
401 notify_arch_cmos_timer(); 374 notify_arch_cmos_timer();
@@ -437,7 +410,7 @@ EXPORT_SYMBOL(current_kernel_time);
437 * current_fs_time - Return FS time 410 * current_fs_time - Return FS time
438 * @sb: Superblock. 411 * @sb: Superblock.
439 * 412 *
440 * Return the current time truncated to the time granuality supported by 413 * Return the current time truncated to the time granularity supported by
441 * the fs. 414 * the fs.
442 */ 415 */
443struct timespec current_fs_time(struct super_block *sb) 416struct timespec current_fs_time(struct super_block *sb)
@@ -448,11 +421,11 @@ struct timespec current_fs_time(struct super_block *sb)
448EXPORT_SYMBOL(current_fs_time); 421EXPORT_SYMBOL(current_fs_time);
449 422
450/** 423/**
451 * timespec_trunc - Truncate timespec to a granuality 424 * timespec_trunc - Truncate timespec to a granularity
452 * @t: Timespec 425 * @t: Timespec
453 * @gran: Granuality in ns. 426 * @gran: Granularity in ns.
454 * 427 *
455 * Truncate a timespec to a granuality. gran must be smaller than a second. 428 * Truncate a timespec to a granularity. gran must be smaller than a second.
456 * Always rounds down. 429 * Always rounds down.
457 * 430 *
458 * This function should be only used for timestamps returned by 431 * This function should be only used for timestamps returned by
@@ -637,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
637 * 610 *
638 * Returns the timespec representation of the nsec parameter. 611 * Returns the timespec representation of the nsec parameter.
639 */ 612 */
640struct timespec ns_to_timespec(const nsec_t nsec) 613struct timespec ns_to_timespec(const s64 nsec)
641{ 614{
642 struct timespec ts; 615 struct timespec ts;
643 616
@@ -657,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec)
657 * 630 *
658 * Returns the timeval representation of the nsec parameter. 631 * Returns the timeval representation of the nsec parameter.
659 */ 632 */
660struct timeval ns_to_timeval(const nsec_t nsec) 633struct timeval ns_to_timeval(const s64 nsec)
661{ 634{
662 struct timespec ts = ns_to_timespec(nsec); 635 struct timespec ts = ns_to_timespec(nsec);
663 struct timeval tv; 636 struct timeval tv;
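
The kernel/time.c hunks above strip the old PPS discipline out of do_adjtimex(): freq no longer carries a pps_freq contribution, the STA_PPS* branches are gone, and the PPS statistics returned to userspace are hard-coded to zero. A minimal userspace sketch (not part of the patch) that queries the NTP state through adjtimex(2) and prints the fields affected here; after this change the second line should show all zeros:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex txc = { .modes = 0 };      /* read-only query */
        int state = adjtimex(&txc);

        if (state == -1) {
                perror("adjtimex");
                return 1;
        }
        printf("state=%d freq=%ld status=0x%x\n", state, txc.freq, txc.status);
        printf("ppsfreq=%ld jitter=%ld shift=%d stabil=%ld jitcnt=%ld\n",
               txc.ppsfreq, txc.jitter, txc.shift, txc.stabil, txc.jitcnt);
        return 0;
}
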
diff --git a/kernel/timer.c b/kernel/timer.c
index b9dad39946..9e49deed46 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
54/* 54/*
55 * per-CPU timer vector definitions: 55 * per-CPU timer vector definitions:
56 */ 56 */
57
58#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 57#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
59#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 58#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
60#define TVN_SIZE (1 << TVN_BITS) 59#define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
62#define TVN_MASK (TVN_SIZE - 1) 61#define TVN_MASK (TVN_SIZE - 1)
63#define TVR_MASK (TVR_SIZE - 1) 62#define TVR_MASK (TVR_SIZE - 1)
64 63
65struct timer_base_s {
66 spinlock_t lock;
67 struct timer_list *running_timer;
68};
69
70typedef struct tvec_s { 64typedef struct tvec_s {
71 struct list_head vec[TVN_SIZE]; 65 struct list_head vec[TVN_SIZE];
72} tvec_t; 66} tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
76} tvec_root_t; 70} tvec_root_t;
77 71
78struct tvec_t_base_s { 72struct tvec_t_base_s {
79 struct timer_base_s t_base; 73 spinlock_t lock;
74 struct timer_list *running_timer;
80 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
81 tvec_root_t tv1; 76 tvec_root_t tv1;
82 tvec_t tv2; 77 tvec_t tv2;
@@ -86,13 +81,16 @@ struct tvec_t_base_s {
86} ____cacheline_aligned_in_smp; 81} ____cacheline_aligned_in_smp;
87 82
88typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
89static DEFINE_PER_CPU(tvec_base_t, tvec_bases); 84
85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
90 88
91static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
92 struct timer_list *timer) 90 struct timer_list *timer)
93{ 91{
94#ifdef CONFIG_SMP 92#ifdef CONFIG_SMP
95 base->t_base.running_timer = timer; 93 base->running_timer = timer;
96#endif 94#endif
97} 95}
98 96
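
The hunks above replace the per-CPU tvec_base_t structure with a per-CPU pointer that is statically initialised to point at boot_tvec_bases, so the boot CPU has a usable timer base before the per-CPU areas and the allocators exist (secondary CPUs get theirs allocated later in init_timers_cpu()). A hedged, self-contained sketch of that pattern; struct my_base and its users are hypothetical, only the DEFINE_PER_CPU/__get_cpu_var idiom mirrors the patch:

#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct my_base {
        spinlock_t lock;
        unsigned long stamp;
};

/* static instance, usable before any allocator is initialised */
static struct my_base boot_base = { .lock = SPIN_LOCK_UNLOCKED };
static DEFINE_PER_CPU(struct my_base *, my_bases) = { &boot_base };

static void touch_local_base(void)
{
        /* note: dereference the per-CPU pointer, not &per_cpu(...) */
        struct my_base *base = __get_cpu_var(my_bases);

        spin_lock(&base->lock);
        base->stamp = jiffies;
        spin_unlock(&base->lock);
}
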
@@ -138,15 +136,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
138 list_add_tail(&timer->entry, vec); 136 list_add_tail(&timer->entry, vec);
139} 137}
140 138
141typedef struct timer_base_s timer_base_t;
142/*
143 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
144 * at compile time, and we need timer->base to lock the timer.
145 */
146timer_base_t __init_timer_base
147 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
148EXPORT_SYMBOL(__init_timer_base);
149
150/*** 139/***
151 * init_timer - initialize a timer. 140 * init_timer - initialize a timer.
152 * @timer: the timer to be initialized 141 * @timer: the timer to be initialized
@@ -157,7 +146,7 @@ EXPORT_SYMBOL(__init_timer_base);
157void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
158{ 147{
159 timer->entry.next = NULL; 148 timer->entry.next = NULL;
160 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; 149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
161} 150}
162EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
163 152
@@ -173,7 +162,7 @@ static inline void detach_timer(struct timer_list *timer,
173} 162}
174 163
175/* 164/*
176 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 165 * We are using hashed locking: holding per_cpu(tvec_bases).lock
177 * means that all timers which are tied to this base via timer->base are 166 * means that all timers which are tied to this base via timer->base are
178 * locked, and the base itself is locked too. 167 * locked, and the base itself is locked too.
179 * 168 *
@@ -184,10 +173,10 @@ static inline void detach_timer(struct timer_list *timer,
184 * possible to set timer->base = NULL and drop the lock: the timer remains 173 * possible to set timer->base = NULL and drop the lock: the timer remains
185 * locked. 174 * locked.
186 */ 175 */
187static timer_base_t *lock_timer_base(struct timer_list *timer, 176static tvec_base_t *lock_timer_base(struct timer_list *timer,
188 unsigned long *flags) 177 unsigned long *flags)
189{ 178{
190 timer_base_t *base; 179 tvec_base_t *base;
191 180
192 for (;;) { 181 for (;;) {
193 base = timer->base; 182 base = timer->base;
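
The hunk above only shows the first lines of lock_timer_base(); the comment it touches describes the full retry loop. A sketch of that loop as it presumably looks after the conversion to tvec_base_t, reconstructed from the 2.6.16-era source, so treat the details as an approximation rather than the exact patched code:

static tvec_base_t *lock_timer_base(struct timer_list *timer,
                                    unsigned long *flags)
{
        tvec_base_t *base;

        for (;;) {
                base = timer->base;
                if (likely(base != NULL)) {
                        spin_lock_irqsave(&base->lock, *flags);
                        if (likely(base == timer->base))
                                return base;    /* still ours, return locked */
                        /* the timer migrated while we were taking the lock */
                        spin_unlock_irqrestore(&base->lock, *flags);
                }
                cpu_relax();
        }
}
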
@@ -204,8 +193,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
204 193
205int __mod_timer(struct timer_list *timer, unsigned long expires) 194int __mod_timer(struct timer_list *timer, unsigned long expires)
206{ 195{
207 timer_base_t *base; 196 tvec_base_t *base, *new_base;
208 tvec_base_t *new_base;
209 unsigned long flags; 197 unsigned long flags;
210 int ret = 0; 198 int ret = 0;
211 199
@@ -218,9 +206,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
218 ret = 1; 206 ret = 1;
219 } 207 }
220 208
221 new_base = &__get_cpu_var(tvec_bases); 209 new_base = __get_cpu_var(tvec_bases);
222 210
223 if (base != &new_base->t_base) { 211 if (base != new_base) {
224 /* 212 /*
225 * We are trying to schedule the timer on the local CPU. 213 * We are trying to schedule the timer on the local CPU.
226 * However we can't change timer's base while it is running, 214 * However we can't change timer's base while it is running,
@@ -228,21 +216,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
228 * handler yet has not finished. This also guarantees that 216 * handler yet has not finished. This also guarantees that
229 * the timer is serialized wrt itself. 217 * the timer is serialized wrt itself.
230 */ 218 */
231 if (unlikely(base->running_timer == timer)) { 219 if (likely(base->running_timer != timer)) {
232 /* The timer remains on a former base */
233 new_base = container_of(base, tvec_base_t, t_base);
234 } else {
235 /* See the comment in lock_timer_base() */ 220 /* See the comment in lock_timer_base() */
236 timer->base = NULL; 221 timer->base = NULL;
237 spin_unlock(&base->lock); 222 spin_unlock(&base->lock);
238 spin_lock(&new_base->t_base.lock); 223 base = new_base;
239 timer->base = &new_base->t_base; 224 spin_lock(&base->lock);
225 timer->base = base;
240 } 226 }
241 } 227 }
242 228
243 timer->expires = expires; 229 timer->expires = expires;
244 internal_add_timer(new_base, timer); 230 internal_add_timer(base, timer);
245 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 231 spin_unlock_irqrestore(&base->lock, flags);
246 232
247 return ret; 233 return ret;
248} 234}
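
These __mod_timer() hunks only change the internal base handling; the external timer API is untouched. For context, a minimal driver-style usage sketch of that API as it stood in this era (the names here are illustrative only):

#include <linux/jiffies.h>
#include <linux/timer.h>

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
        /* runs in softirq context once the timeout expires */
}

static void my_start(void)
{
        init_timer(&my_timer);
        my_timer.function = my_timer_fn;
        my_timer.data = 0;
        mod_timer(&my_timer, jiffies + HZ);     /* fire in about one second */
}

static void my_stop(void)
{
        del_timer_sync(&my_timer);              /* waits for a running handler */
}
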
@@ -258,14 +244,14 @@ EXPORT_SYMBOL(__mod_timer);
258 */ 244 */
259void add_timer_on(struct timer_list *timer, int cpu) 245void add_timer_on(struct timer_list *timer, int cpu)
260{ 246{
261 tvec_base_t *base = &per_cpu(tvec_bases, cpu); 247 tvec_base_t *base = per_cpu(tvec_bases, cpu);
262 unsigned long flags; 248 unsigned long flags;
263 249
264 BUG_ON(timer_pending(timer) || !timer->function); 250 BUG_ON(timer_pending(timer) || !timer->function);
265 spin_lock_irqsave(&base->t_base.lock, flags); 251 spin_lock_irqsave(&base->lock, flags);
266 timer->base = &base->t_base; 252 timer->base = base;
267 internal_add_timer(base, timer); 253 internal_add_timer(base, timer);
268 spin_unlock_irqrestore(&base->t_base.lock, flags); 254 spin_unlock_irqrestore(&base->lock, flags);
269} 255}
270 256
271 257
@@ -318,7 +304,7 @@ EXPORT_SYMBOL(mod_timer);
318 */ 304 */
319int del_timer(struct timer_list *timer) 305int del_timer(struct timer_list *timer)
320{ 306{
321 timer_base_t *base; 307 tvec_base_t *base;
322 unsigned long flags; 308 unsigned long flags;
323 int ret = 0; 309 int ret = 0;
324 310
@@ -345,7 +331,7 @@ EXPORT_SYMBOL(del_timer);
345 */ 331 */
346int try_to_del_timer_sync(struct timer_list *timer) 332int try_to_del_timer_sync(struct timer_list *timer)
347{ 333{
348 timer_base_t *base; 334 tvec_base_t *base;
349 unsigned long flags; 335 unsigned long flags;
350 int ret = -1; 336 int ret = -1;
351 337
@@ -409,7 +395,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
409 struct timer_list *tmp; 395 struct timer_list *tmp;
410 396
411 tmp = list_entry(curr, struct timer_list, entry); 397 tmp = list_entry(curr, struct timer_list, entry);
412 BUG_ON(tmp->base != &base->t_base); 398 BUG_ON(tmp->base != base);
413 curr = curr->next; 399 curr = curr->next;
414 internal_add_timer(base, tmp); 400 internal_add_timer(base, tmp);
415 } 401 }
@@ -431,7 +417,7 @@ static inline void __run_timers(tvec_base_t *base)
431{ 417{
432 struct timer_list *timer; 418 struct timer_list *timer;
433 419
434 spin_lock_irq(&base->t_base.lock); 420 spin_lock_irq(&base->lock);
435 while (time_after_eq(jiffies, base->timer_jiffies)) { 421 while (time_after_eq(jiffies, base->timer_jiffies)) {
436 struct list_head work_list = LIST_HEAD_INIT(work_list); 422 struct list_head work_list = LIST_HEAD_INIT(work_list);
437 struct list_head *head = &work_list; 423 struct list_head *head = &work_list;
@@ -457,7 +443,7 @@ static inline void __run_timers(tvec_base_t *base)
457 443
458 set_running_timer(base, timer); 444 set_running_timer(base, timer);
459 detach_timer(timer, 1); 445 detach_timer(timer, 1);
460 spin_unlock_irq(&base->t_base.lock); 446 spin_unlock_irq(&base->lock);
461 { 447 {
462 int preempt_count = preempt_count(); 448 int preempt_count = preempt_count();
463 fn(data); 449 fn(data);
@@ -470,11 +456,11 @@ static inline void __run_timers(tvec_base_t *base)
470 BUG(); 456 BUG();
471 } 457 }
472 } 458 }
473 spin_lock_irq(&base->t_base.lock); 459 spin_lock_irq(&base->lock);
474 } 460 }
475 } 461 }
476 set_running_timer(base, NULL); 462 set_running_timer(base, NULL);
477 spin_unlock_irq(&base->t_base.lock); 463 spin_unlock_irq(&base->lock);
478} 464}
479 465
480#ifdef CONFIG_NO_IDLE_HZ 466#ifdef CONFIG_NO_IDLE_HZ
@@ -489,11 +475,23 @@ unsigned long next_timer_interrupt(void)
489 struct list_head *list; 475 struct list_head *list;
490 struct timer_list *nte; 476 struct timer_list *nte;
491 unsigned long expires; 477 unsigned long expires;
478 unsigned long hr_expires = MAX_JIFFY_OFFSET;
479 ktime_t hr_delta;
492 tvec_t *varray[4]; 480 tvec_t *varray[4];
493 int i, j; 481 int i, j;
494 482
495 base = &__get_cpu_var(tvec_bases); 483 hr_delta = hrtimer_get_next_event();
496 spin_lock(&base->t_base.lock); 484 if (hr_delta.tv64 != KTIME_MAX) {
485 struct timespec tsdelta;
486 tsdelta = ktime_to_timespec(hr_delta);
487 hr_expires = timespec_to_jiffies(&tsdelta);
488 if (hr_expires < 3)
489 return hr_expires + jiffies;
490 }
491 hr_expires += jiffies;
492
493 base = __get_cpu_var(tvec_bases);
494 spin_lock(&base->lock);
497 expires = base->timer_jiffies + (LONG_MAX >> 1); 495 expires = base->timer_jiffies + (LONG_MAX >> 1);
498 list = NULL; 496 list = NULL;
499 497
@@ -541,7 +539,27 @@ found:
541 expires = nte->expires; 539 expires = nte->expires;
542 } 540 }
543 } 541 }
544 spin_unlock(&base->t_base.lock); 542 spin_unlock(&base->lock);
543
544 /*
545 * It can happen that other CPUs service timer IRQs and increment
546 * jiffies, but we have not yet got a local timer tick to process
547 * the timer wheels. In that case, the expiry time can be before
548 * jiffies, but since the high-resolution timer here is relative to
549 * jiffies, the default expression when high-resolution timers are
550 * not active,
551 *
552 * time_before(MAX_JIFFY_OFFSET + jiffies, expires)
553 *
554 * would falsely evaluate to true. If that is the case, just
555 * return jiffies so that we can immediately fire the local timer
556 */
557 if (time_before(expires, jiffies))
558 return jiffies;
559
560 if (time_before(hr_expires, expires))
561 return hr_expires;
562
545 return expires; 563 return expires;
546} 564}
547#endif 565#endif
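
next_timer_interrupt() still returns an absolute jiffies value, now clamped so it is never in the past and never later than the next pending hrtimer. A hedged sketch of how a CONFIG_NO_IDLE_HZ arch idle path of that era might consume it; the surrounding function is hypothetical:

static unsigned long my_idle_sleep_ticks(void)
{
        unsigned long next = next_timer_interrupt();

        /*
         * 'next' is absolute and, thanks to the clamp added above, never
         * before the current jiffies value, so this cannot underflow.
         */
        return next - jiffies;
}
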
@@ -680,18 +698,9 @@ static void second_overflow(void)
680 698
681 /* 699 /*
682 * Compute the frequency estimate and additional phase adjustment due 700 * Compute the frequency estimate and additional phase adjustment due
683 * to frequency error for the next second. When the PPS signal is 701 * to frequency error for the next second.
684 * engaged, gnaw on the watchdog counter and update the frequency
685 * computed by the pll and the PPS signal.
686 */ 702 */
687 pps_valid++; 703 ltemp = time_freq;
688 if (pps_valid == PPS_VALID) { /* PPS signal lost */
689 pps_jitter = MAXTIME;
690 pps_stabil = MAXFREQ;
691 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
692 STA_PPSWANDER | STA_PPSERROR);
693 }
694 ltemp = time_freq + pps_freq;
695 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); 704 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
696 705
697#if HZ == 100 706#if HZ == 100
@@ -717,12 +726,16 @@ static void second_overflow(void)
717#endif 726#endif
718} 727}
719 728
720/* in the NTP reference this is called "hardclock()" */ 729/*
721static void update_wall_time_one_tick(void) 730 * Returns how many microseconds we need to add to xtime this tick
731 * in doing an adjustment requested with adjtime.
732 */
733static long adjtime_adjustment(void)
722{ 734{
723 long time_adjust_step, delta_nsec; 735 long time_adjust_step;
724 736
725 if ((time_adjust_step = time_adjust) != 0 ) { 737 time_adjust_step = time_adjust;
738 if (time_adjust_step) {
726 /* 739 /*
727 * We are doing an adjtime thing. Prepare time_adjust_step to 740 * We are doing an adjtime thing. Prepare time_adjust_step to
728 * be within bounds. Note that a positive time_adjust means we 741 * be within bounds. Note that a positive time_adjust means we
@@ -733,10 +746,19 @@ static void update_wall_time_one_tick(void)
733 */ 746 */
734 time_adjust_step = min(time_adjust_step, (long)tickadj); 747 time_adjust_step = min(time_adjust_step, (long)tickadj);
735 time_adjust_step = max(time_adjust_step, (long)-tickadj); 748 time_adjust_step = max(time_adjust_step, (long)-tickadj);
749 }
750 return time_adjust_step;
751}
736 752
753/* in the NTP reference this is called "hardclock()" */
754static void update_wall_time_one_tick(void)
755{
756 long time_adjust_step, delta_nsec;
757
758 time_adjust_step = adjtime_adjustment();
759 if (time_adjust_step)
737 /* Reduce by this step the amount of time left */ 760 /* Reduce by this step the amount of time left */
738 time_adjust -= time_adjust_step; 761 time_adjust -= time_adjust_step;
739 }
740 delta_nsec = tick_nsec + time_adjust_step * 1000; 762 delta_nsec = tick_nsec + time_adjust_step * 1000;
741 /* 763 /*
742 * Advance the phase, once it gets to one microsecond, then 764 * Advance the phase, once it gets to one microsecond, then
@@ -759,6 +781,22 @@ static void update_wall_time_one_tick(void)
759} 781}
760 782
761/* 783/*
784 * Return how long ticks are at the moment, that is, how much time
785 * update_wall_time_one_tick will add to xtime next time we call it
786 * (assuming no calls to do_adjtimex in the meantime).
787 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10
788 * bits to the right of the binary point.
789 * This function has no side-effects.
790 */
791u64 current_tick_length(void)
792{
793 long delta_nsec;
794
795 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
796 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
797}
798
799/*
762 * Using a loop looks inefficient, but "ticks" is 800 * Using a loop looks inefficient, but "ticks" is
763 * usually just one (we shouldn't be losing ticks, 801 * usually just one (we shouldn't be losing ticks,
764 * we're doing this this way mainly for interrupt 802 * we're doing this this way mainly for interrupt
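
current_tick_length() reports the current tick in fixed-point nanoseconds with SHIFT_SCALE - 10 fractional bits (SHIFT_SCALE comes from <linux/timex.h>). A hypothetical consumer, not part of the patch, would convert back to whole nanoseconds like this; with no adjtime() in progress the result is roughly NSEC_PER_SEC / HZ:

static inline u64 my_tick_length_ns(void)
{
        /* drop the SHIFT_SCALE - 10 fractional bits */
        return current_tick_length() >> (SHIFT_SCALE - 10);
}
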
@@ -804,7 +842,7 @@ void update_process_times(int user_tick)
804 */ 842 */
805static unsigned long count_active_tasks(void) 843static unsigned long count_active_tasks(void)
806{ 844{
807 return (nr_running() + nr_uninterruptible()) * FIXED_1; 845 return nr_active() * FIXED_1;
808} 846}
809 847
810/* 848/*
@@ -856,7 +894,7 @@ EXPORT_SYMBOL(xtime_lock);
856 */ 894 */
857static void run_timer_softirq(struct softirq_action *h) 895static void run_timer_softirq(struct softirq_action *h)
858{ 896{
859 tvec_base_t *base = &__get_cpu_var(tvec_bases); 897 tvec_base_t *base = __get_cpu_var(tvec_bases);
860 898
861 hrtimer_run_queues(); 899 hrtimer_run_queues();
862 if (time_after_eq(jiffies, base->timer_jiffies)) 900 if (time_after_eq(jiffies, base->timer_jiffies))
@@ -869,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h)
869void run_local_timers(void) 907void run_local_timers(void)
870{ 908{
871 raise_softirq(TIMER_SOFTIRQ); 909 raise_softirq(TIMER_SOFTIRQ);
910 softlockup_tick();
872} 911}
873 912
874/* 913/*
@@ -896,8 +935,9 @@ static inline void update_times(void)
896void do_timer(struct pt_regs *regs) 935void do_timer(struct pt_regs *regs)
897{ 936{
898 jiffies_64++; 937 jiffies_64++;
938 /* prevent loading jiffies before storing new jiffies_64 value. */
939 barrier();
899 update_times(); 940 update_times();
900 softlockup_tick(regs);
901} 941}
902 942
903#ifdef __ARCH_WANT_SYS_ALARM 943#ifdef __ARCH_WANT_SYS_ALARM
@@ -908,19 +948,7 @@ void do_timer(struct pt_regs *regs)
908 */ 948 */
909asmlinkage unsigned long sys_alarm(unsigned int seconds) 949asmlinkage unsigned long sys_alarm(unsigned int seconds)
910{ 950{
911 struct itimerval it_new, it_old; 951 return alarm_setitimer(seconds);
912 unsigned int oldalarm;
913
914 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
915 it_new.it_value.tv_sec = seconds;
916 it_new.it_value.tv_usec = 0;
917 do_setitimer(ITIMER_REAL, &it_new, &it_old);
918 oldalarm = it_old.it_value.tv_sec;
919 /* ehhh.. We can't return 0 if we have an alarm pending.. */
920 /* And we'd better return too much than too little anyway */
921 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
922 oldalarm++;
923 return oldalarm;
924} 952}
925 953
926#endif 954#endif
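
sys_alarm() now delegates to alarm_setitimer(), which presumably keeps the behaviour the deleted block implemented by hand: program ITIMER_REAL and report the previous alarm rounded up, so a pending alarm never reads back as zero. The userspace-visible semantics are unchanged, as in this small illustrative program:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        unsigned int left;

        alarm(10);              /* schedule SIGALRM in 10 seconds */
        sleep(3);
        left = alarm(0);        /* cancel it and read the time remaining */
        printf("about %u seconds were left\n", left);   /* ~7, rounded up */
        return 0;
}
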
@@ -1209,13 +1237,41 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1209 return 0; 1237 return 0;
1210} 1238}
1211 1239
1212static void __devinit init_timers_cpu(int cpu) 1240static int __devinit init_timers_cpu(int cpu)
1213{ 1241{
1214 int j; 1242 int j;
1215 tvec_base_t *base; 1243 tvec_base_t *base;
1244 static char __devinitdata tvec_base_done[NR_CPUS];
1245
1246 if (!tvec_base_done[cpu]) {
1247 static char boot_done;
1216 1248
1217 base = &per_cpu(tvec_bases, cpu); 1249 if (boot_done) {
1218 spin_lock_init(&base->t_base.lock); 1250 /*
1251 * The APs use this path later in boot
1252 */
1253 base = kmalloc_node(sizeof(*base), GFP_KERNEL,
1254 cpu_to_node(cpu));
1255 if (!base)
1256 return -ENOMEM;
1257 memset(base, 0, sizeof(*base));
1258 per_cpu(tvec_bases, cpu) = base;
1259 } else {
1260 /*
1261 * This is for the boot CPU - we use compile-time
1262 * static initialisation because per-cpu memory isn't
1263 * ready yet and because the memory allocators are not
1264 * initialised either.
1265 */
1266 boot_done = 1;
1267 base = &boot_tvec_bases;
1268 }
1269 tvec_base_done[cpu] = 1;
1270 } else {
1271 base = per_cpu(tvec_bases, cpu);
1272 }
1273
1274 spin_lock_init(&base->lock);
1219 for (j = 0; j < TVN_SIZE; j++) { 1275 for (j = 0; j < TVN_SIZE; j++) {
1220 INIT_LIST_HEAD(base->tv5.vec + j); 1276 INIT_LIST_HEAD(base->tv5.vec + j);
1221 INIT_LIST_HEAD(base->tv4.vec + j); 1277 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1226,6 +1282,7 @@ static void __devinit init_timers_cpu(int cpu)
1226 INIT_LIST_HEAD(base->tv1.vec + j); 1282 INIT_LIST_HEAD(base->tv1.vec + j);
1227 1283
1228 base->timer_jiffies = jiffies; 1284 base->timer_jiffies = jiffies;
1285 return 0;
1229} 1286}
1230 1287
1231#ifdef CONFIG_HOTPLUG_CPU 1288#ifdef CONFIG_HOTPLUG_CPU
@@ -1236,7 +1293,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1236 while (!list_empty(head)) { 1293 while (!list_empty(head)) {
1237 timer = list_entry(head->next, struct timer_list, entry); 1294 timer = list_entry(head->next, struct timer_list, entry);
1238 detach_timer(timer, 0); 1295 detach_timer(timer, 0);
1239 timer->base = &new_base->t_base; 1296 timer->base = new_base;
1240 internal_add_timer(new_base, timer); 1297 internal_add_timer(new_base, timer);
1241 } 1298 }
1242} 1299}
@@ -1248,15 +1305,15 @@ static void __devinit migrate_timers(int cpu)
1248 int i; 1305 int i;
1249 1306
1250 BUG_ON(cpu_online(cpu)); 1307 BUG_ON(cpu_online(cpu));
1251 old_base = &per_cpu(tvec_bases, cpu); 1308 old_base = per_cpu(tvec_bases, cpu);
1252 new_base = &get_cpu_var(tvec_bases); 1309 new_base = get_cpu_var(tvec_bases);
1253 1310
1254 local_irq_disable(); 1311 local_irq_disable();
1255 spin_lock(&new_base->t_base.lock); 1312 spin_lock(&new_base->lock);
1256 spin_lock(&old_base->t_base.lock); 1313 spin_lock(&old_base->lock);
1314
1315 BUG_ON(old_base->running_timer);
1257 1316
1258 if (old_base->t_base.running_timer)
1259 BUG();
1260 for (i = 0; i < TVR_SIZE; i++) 1317 for (i = 0; i < TVR_SIZE; i++)
1261 migrate_timer_list(new_base, old_base->tv1.vec + i); 1318 migrate_timer_list(new_base, old_base->tv1.vec + i);
1262 for (i = 0; i < TVN_SIZE; i++) { 1319 for (i = 0; i < TVN_SIZE; i++) {
@@ -1266,20 +1323,21 @@ static void __devinit migrate_timers(int cpu)
1266 migrate_timer_list(new_base, old_base->tv5.vec + i); 1323 migrate_timer_list(new_base, old_base->tv5.vec + i);
1267 } 1324 }
1268 1325
1269 spin_unlock(&old_base->t_base.lock); 1326 spin_unlock(&old_base->lock);
1270 spin_unlock(&new_base->t_base.lock); 1327 spin_unlock(&new_base->lock);
1271 local_irq_enable(); 1328 local_irq_enable();
1272 put_cpu_var(tvec_bases); 1329 put_cpu_var(tvec_bases);
1273} 1330}
1274#endif /* CONFIG_HOTPLUG_CPU */ 1331#endif /* CONFIG_HOTPLUG_CPU */
1275 1332
1276static int __devinit timer_cpu_notify(struct notifier_block *self, 1333static int timer_cpu_notify(struct notifier_block *self,
1277 unsigned long action, void *hcpu) 1334 unsigned long action, void *hcpu)
1278{ 1335{
1279 long cpu = (long)hcpu; 1336 long cpu = (long)hcpu;
1280 switch(action) { 1337 switch(action) {
1281 case CPU_UP_PREPARE: 1338 case CPU_UP_PREPARE:
1282 init_timers_cpu(cpu); 1339 if (init_timers_cpu(cpu) < 0)
1340 return NOTIFY_BAD;
1283 break; 1341 break;
1284#ifdef CONFIG_HOTPLUG_CPU 1342#ifdef CONFIG_HOTPLUG_CPU
1285 case CPU_DEAD: 1343 case CPU_DEAD:
@@ -1292,7 +1350,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
1292 return NOTIFY_OK; 1350 return NOTIFY_OK;
1293} 1351}
1294 1352
1295static struct notifier_block __devinitdata timers_nb = { 1353static struct notifier_block timers_nb = {
1296 .notifier_call = timer_cpu_notify, 1354 .notifier_call = timer_cpu_notify,
1297}; 1355};
1298 1356
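
Because init_timers_cpu() can now fail (kmalloc_node() for a secondary CPU may return NULL), the notifier propagates that as NOTIFY_BAD, which vetoes the CPU before it is brought online. A hedged sketch of the same CPU-hotplug callback pattern with hypothetical helpers:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_prepare_cpu(long cpu) { return 0; }       /* stub: allocate per-CPU state */
static void my_release_cpu(long cpu) { }                /* stub: free it again */

static int my_cpu_notify(struct notifier_block *self,
                         unsigned long action, void *hcpu)
{
        long cpu = (long)hcpu;

        switch (action) {
        case CPU_UP_PREPARE:
                if (my_prepare_cpu(cpu) < 0)
                        return NOTIFY_BAD;      /* abort the bring-up */
                break;
#ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
                my_release_cpu(cpu);
                break;
#endif
        }
        return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
        .notifier_call = my_cpu_notify,
};
/* registered during boot with register_cpu_notifier(&my_cpu_nb) */
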
@@ -1307,8 +1365,8 @@ void __init init_timers(void)
1307 1365
1308#ifdef CONFIG_TIME_INTERPOLATION 1366#ifdef CONFIG_TIME_INTERPOLATION
1309 1367
1310struct time_interpolator *time_interpolator; 1368struct time_interpolator *time_interpolator __read_mostly;
1311static struct time_interpolator *time_interpolator_list; 1369static struct time_interpolator *time_interpolator_list __read_mostly;
1312static DEFINE_SPINLOCK(time_interpolator_lock); 1370static DEFINE_SPINLOCK(time_interpolator_lock);
1313 1371
1314static inline u64 time_interpolator_get_cycles(unsigned int src) 1372static inline u64 time_interpolator_get_cycles(unsigned int src)
@@ -1322,10 +1380,10 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
1322 return x(); 1380 return x();
1323 1381
1324 case TIME_SOURCE_MMIO64 : 1382 case TIME_SOURCE_MMIO64 :
1325 return readq((void __iomem *) time_interpolator->addr); 1383 return readq_relaxed((void __iomem *)time_interpolator->addr);
1326 1384
1327 case TIME_SOURCE_MMIO32 : 1385 case TIME_SOURCE_MMIO32 :
1328 return readl((void __iomem *) time_interpolator->addr); 1386 return readl_relaxed((void __iomem *)time_interpolator->addr);
1329 1387
1330 default: return get_cycles(); 1388 default: return get_cycles();
1331 } 1389 }
@@ -1422,7 +1480,7 @@ static void time_interpolator_update(long delta_nsec)
1422 */ 1480 */
1423 if (jiffies % INTERPOLATOR_ADJUST == 0) 1481 if (jiffies % INTERPOLATOR_ADJUST == 0)
1424 { 1482 {
1425 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) 1483 if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
1426 time_interpolator->nsec_per_cyc--; 1484 time_interpolator->nsec_per_cyc--;
1427 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) 1485 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1428 time_interpolator->nsec_per_cyc++; 1486 time_interpolator->nsec_per_cyc++;
@@ -1446,8 +1504,7 @@ register_time_interpolator(struct time_interpolator *ti)
1446 unsigned long flags; 1504 unsigned long flags;
1447 1505
1448 /* Sanity check */ 1506 /* Sanity check */
1449 if (ti->frequency == 0 || ti->mask == 0) 1507 BUG_ON(ti->frequency == 0 || ti->mask == 0);
1450 BUG();
1451 1508
1452 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; 1509 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1453 spin_lock(&time_interpolator_lock); 1510 spin_lock(&time_interpolator_lock);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index aa25605027..187e2a4238 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -20,43 +20,67 @@
20 20
21asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) 21asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
22{ 22{
23 return sys_chown(filename, low2highuid(user), low2highgid(group)); 23 long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
24 /* avoid REGPARM breakage on x86: */
25 prevent_tail_call(ret);
26 return ret;
24} 27}
25 28
26asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) 29asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
27{ 30{
28 return sys_lchown(filename, low2highuid(user), low2highgid(group)); 31 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
32 /* avoid REGPARM breakage on x86: */
33 prevent_tail_call(ret);
34 return ret;
29} 35}
30 36
31asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) 37asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
32{ 38{
33 return sys_fchown(fd, low2highuid(user), low2highgid(group)); 39 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
40 /* avoid REGPARM breakage on x86: */
41 prevent_tail_call(ret);
42 return ret;
34} 43}
35 44
36asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) 45asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
37{ 46{
38 return sys_setregid(low2highgid(rgid), low2highgid(egid)); 47 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
48 /* avoid REGPARM breakage on x86: */
49 prevent_tail_call(ret);
50 return ret;
39} 51}
40 52
41asmlinkage long sys_setgid16(old_gid_t gid) 53asmlinkage long sys_setgid16(old_gid_t gid)
42{ 54{
43 return sys_setgid(low2highgid(gid)); 55 long ret = sys_setgid(low2highgid(gid));
56 /* avoid REGPARM breakage on x86: */
57 prevent_tail_call(ret);
58 return ret;
44} 59}
45 60
46asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) 61asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
47{ 62{
48 return sys_setreuid(low2highuid(ruid), low2highuid(euid)); 63 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
64 /* avoid REGPARM breakage on x86: */
65 prevent_tail_call(ret);
66 return ret;
49} 67}
50 68
51asmlinkage long sys_setuid16(old_uid_t uid) 69asmlinkage long sys_setuid16(old_uid_t uid)
52{ 70{
53 return sys_setuid(low2highuid(uid)); 71 long ret = sys_setuid(low2highuid(uid));
72 /* avoid REGPARM breakage on x86: */
73 prevent_tail_call(ret);
74 return ret;
54} 75}
55 76
56asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) 77asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
57{ 78{
58 return sys_setresuid(low2highuid(ruid), low2highuid(euid), 79 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
59 low2highuid(suid)); 80 low2highuid(suid));
81 /* avoid REGPARM breakage on x86: */
82 prevent_tail_call(ret);
83 return ret;
60} 84}
61 85
62asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 86asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
@@ -72,8 +96,11 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
72 96
73asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) 97asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
74{ 98{
75 return sys_setresgid(low2highgid(rgid), low2highgid(egid), 99 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
76 low2highgid(sgid)); 100 low2highgid(sgid));
101 /* avoid REGPARM breakage on x86: */
102 prevent_tail_call(ret);
103 return ret;
77} 104}
78 105
79asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
@@ -89,12 +116,18 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
89 116
90asmlinkage long sys_setfsuid16(old_uid_t uid) 117asmlinkage long sys_setfsuid16(old_uid_t uid)
91{ 118{
92 return sys_setfsuid(low2highuid(uid)); 119 long ret = sys_setfsuid(low2highuid(uid));
120 /* avoid REGPARM breakage on x86: */
121 prevent_tail_call(ret);
122 return ret;
93} 123}
94 124
95asmlinkage long sys_setfsgid16(old_gid_t gid) 125asmlinkage long sys_setfsgid16(old_gid_t gid)
96{ 126{
97 return sys_setfsgid(low2highgid(gid)); 127 long ret = sys_setfsgid(low2highgid(gid));
128 /* avoid REGPARM breakage on x86: */
129 prevent_tail_call(ret);
130 return ret;
98} 131}
99 132
100static int groups16_to_user(old_gid_t __user *grouplist, 133static int groups16_to_user(old_gid_t __user *grouplist,
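
Every 16-bit wrapper above is rewritten from a tail call into call-then-return with prevent_tail_call(ret) in between. On i386 built with -mregparm=3 the asmlinkage callee takes its arguments from the caller's stack frame, and a tail call would let it scribble over the syscall registers saved there; forcing the wrapper to keep its own frame avoids that. The macro is arch-defined; its rough shape is an empty asm that makes the return value look live after the inner call (this is a hedged reconstruction, not the exact header definition):

/* approximate shape of the i386 definition; other arches make it a no-op */
#define prevent_tail_call(ret)  __asm__ ("" : "=r" (ret) : "0" (ret))
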
diff --git a/kernel/user.c b/kernel/user.c
index d9deae43a9..2116642f42 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -105,15 +105,19 @@ void free_uid(struct user_struct *up)
105{ 105{
106 unsigned long flags; 106 unsigned long flags;
107 107
108 if (!up)
109 return;
110
108 local_irq_save(flags); 111 local_irq_save(flags);
109 if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { 112 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
110 uid_hash_remove(up); 113 uid_hash_remove(up);
114 spin_unlock_irqrestore(&uidhash_lock, flags);
111 key_put(up->uid_keyring); 115 key_put(up->uid_keyring);
112 key_put(up->session_keyring); 116 key_put(up->session_keyring);
113 kmem_cache_free(uid_cachep, up); 117 kmem_cache_free(uid_cachep, up);
114 spin_unlock(&uidhash_lock); 118 } else {
119 local_irq_restore(flags);
115 } 120 }
116 local_irq_restore(flags);
117} 121}
118 122
119struct user_struct * alloc_uid(uid_t uid) 123struct user_struct * alloc_uid(uid_t uid)
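
free_uid() now bails out early on NULL and uses the atomic_dec_and_lock() idiom, so the hash lock is taken only when the reference count really hits zero and the key/slab frees happen after the lock is dropped (interrupt state is handled by hand here because free_uid() may be called with interrupts disabled). A generic, self-contained sketch of that idiom with hypothetical types:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

struct my_obj {
        atomic_t count;
        struct list_head node;
};

static DEFINE_SPINLOCK(my_lock);

static void my_put(struct my_obj *obj)
{
        if (!obj)
                return;

        /* takes my_lock only if the count actually dropped to zero */
        if (atomic_dec_and_lock(&obj->count, &my_lock)) {
                list_del(&obj->node);           /* unhash while still locked */
                spin_unlock(&my_lock);
                kfree(obj);                     /* free outside the lock */
        }
}
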
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b052e2c4c7..880fb415a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -27,6 +27,7 @@
27#include <linux/cpu.h> 27#include <linux/cpu.h>
28#include <linux/notifier.h> 28#include <linux/notifier.h>
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h>
30 31
31/* 32/*
32 * The per-CPU workqueue (if single thread, we always use the first 33 * The per-CPU workqueue (if single thread, we always use the first
@@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work)
476} 477}
477EXPORT_SYMBOL(cancel_rearming_delayed_work); 478EXPORT_SYMBOL(cancel_rearming_delayed_work);
478 479
480/**
481 * execute_in_process_context - reliably execute the routine with user context
482 * @fn: the function to execute
483 * @data: data to pass to the function
484 * @ew: guaranteed storage for the execute work structure (must
485 * be available when the work executes)
486 *
487 * Executes the function immediately if process context is available,
488 * otherwise schedules the function for delayed execution.
489 *
490 * Returns: 0 - function was executed
491 * 1 - function was scheduled for execution
492 */
493int execute_in_process_context(void (*fn)(void *data), void *data,
494 struct execute_work *ew)
495{
496 if (!in_interrupt()) {
497 fn(data);
498 return 0;
499 }
500
501 INIT_WORK(&ew->work, fn, data);
502 schedule_work(&ew->work);
503
504 return 1;
505}
506EXPORT_SYMBOL_GPL(execute_in_process_context);
507
479int keventd_up(void) 508int keventd_up(void)
480{ 509{
481 return keventd_wq != NULL; 510 return keventd_wq != NULL;
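
execute_in_process_context() gives callers a one-liner for "run this now if I may sleep, otherwise punt it to keventd". A hedged usage sketch; the device structure and its teardown are hypothetical, only the helper's signature comes from the hunk above:

#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_dev {
        struct execute_work ew;         /* must stay valid until the work runs */
        /* ... */
};

static void my_dev_release(void *data)
{
        struct my_dev *dev = data;

        /* final teardown that may sleep would go here */
        kfree(dev);
}

static void my_dev_put_final(struct my_dev *dev)
{
        /* runs the release immediately in process context,
         * or schedules it through dev->ew from interrupt context */
        execute_in_process_context(my_dev_release, dev, &dev->ew);
}

Note the storage requirement spelled out in the kerneldoc: since the work may run later, the execute_work must outlive the call, which is why it is embedded in the object being released here.
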
@@ -518,7 +547,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
518} 547}
519 548
520/* We're holding the cpucontrol mutex here */ 549/* We're holding the cpucontrol mutex here */
521static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 550static int workqueue_cpu_callback(struct notifier_block *nfb,
522 unsigned long action, 551 unsigned long action,
523 void *hcpu) 552 void *hcpu)
524{ 553{