path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile           |    3
-rw-r--r--  kernel/acct.c             |  117
-rw-r--r--  kernel/audit.c            |  205
-rw-r--r--  kernel/audit.h            |   61
-rw-r--r--  kernel/auditfilter.c      |  899
-rw-r--r--  kernel/auditsc.c          |  649
-rw-r--r--  kernel/compat.c           |   30
-rw-r--r--  kernel/cpu.c              |   10
-rw-r--r--  kernel/cpuset.c           |   42
-rw-r--r--  kernel/exit.c             |   29
-rw-r--r--  kernel/fork.c             |   21
-rw-r--r--  kernel/futex.c            |    8
-rw-r--r--  kernel/hrtimer.c          |   19
-rw-r--r--  kernel/intermodule.c      |  184
-rw-r--r--  kernel/irq/handle.c       |    5
-rw-r--r--  kernel/irq/migration.c    |    4
-rw-r--r--  kernel/irq/proc.c         |    3
-rw-r--r--  kernel/irq/spurious.c     |   12
-rw-r--r--  kernel/kexec.c            |    6
-rw-r--r--  kernel/kprobes.c          |   58
-rw-r--r--  kernel/ksysfs.c           |   19
-rw-r--r--  kernel/kthread.c          |   61
-rw-r--r--  kernel/module.c           |   18
-rw-r--r--  kernel/mutex-debug.c      |   12
-rw-r--r--  kernel/mutex-debug.h      |   25
-rw-r--r--  kernel/mutex.c            |   21
-rw-r--r--  kernel/mutex.h            |    6
-rw-r--r--  kernel/posix-cpu-timers.c |   48
-rw-r--r--  kernel/power/Kconfig      |    9
-rw-r--r--  kernel/power/disk.c       |    2
-rw-r--r--  kernel/power/main.c       |    8
-rw-r--r--  kernel/power/power.h      |    2
-rw-r--r--  kernel/power/snapshot.c   |  148
-rw-r--r--  kernel/power/swsusp.c     |   20
-rw-r--r--  kernel/printk.c           |   80
-rw-r--r--  kernel/ptrace.c           |   23
-rw-r--r--  kernel/rcupdate.c         |   13
-rw-r--r--  kernel/sched.c            |   29
-rw-r--r--  kernel/signal.c           |   37
-rw-r--r--  kernel/softirq.c          |    2
-rw-r--r--  kernel/softlockup.c       |    4
-rw-r--r--  kernel/stop_machine.c     |   17
-rw-r--r--  kernel/sys.c              |   80
-rw-r--r--  kernel/sys_ni.c           |    2
-rw-r--r--  kernel/sysctl.c           |   37
-rw-r--r--  kernel/time.c             |    2
-rw-r--r--  kernel/time/Makefile      |    1
-rw-r--r--  kernel/time/clocksource.c |  349
-rw-r--r--  kernel/time/jiffies.c     |   73
-rw-r--r--  kernel/timer.c            |  428
-rw-r--r--  kernel/unwind.c           |  918
-rw-r--r--  kernel/user.c             |    4
-rw-r--r--  kernel/workqueue.c        |   34
53 files changed, 3997 insertions(+), 900 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..752bd7d383af 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,6 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o
 
+obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
@@ -20,8 +21,8 @@ obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..368c4f03fe0e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(long, struct file *);
+static void do_acct_process(struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+	if (vfs_statfs(file->f_dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
@@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file)
 	if (old_acct) {
 		mnt_unpin(old_acct->f_vfsmnt);
 		spin_unlock(&acct_globals.lock);
-		do_acct_process(0, old_acct);
+		do_acct_process(old_acct);
 		filp_close(old_acct, NULL);
 		spin_lock(&acct_globals.lock);
 	}
@@ -419,16 +419,15 @@ static u32 encode_float(u64 value)
 /*
  * do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(long exitcode, struct file *file)
+static void do_acct_process(struct file *file)
 {
+	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
 	mm_segment_t fs;
-	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
-	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -469,12 +468,6 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->utime,
-						 current->signal->utime));
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->stime,
-						 current->signal->stime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -496,37 +489,18 @@ static void do_acct_process(long exitcode, struct file *file)
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
 
-	ac.ac_flag = 0;
-	if (current->flags & PF_FORKNOEXEC)
-		ac.ac_flag |= AFORK;
-	if (current->flags & PF_SUPERPRIV)
-		ac.ac_flag |= ASU;
-	if (current->flags & PF_DUMPCORE)
-		ac.ac_flag |= ACORE;
-	if (current->flags & PF_SIGNALED)
-		ac.ac_flag |= AXSIG;
-
-	vsize = 0;
-	if (current->mm) {
-		struct vm_area_struct *vma;
-		down_read(&current->mm->mmap_sem);
-		vma = current->mm->mmap;
-		while (vma) {
-			vsize += vma->vm_end - vma->vm_start;
-			vma = vma->vm_next;
-		}
-		up_read(&current->mm->mmap_sem);
-	}
-	vsize = vsize / 1024;
-	ac.ac_mem = encode_comp_t(vsize);
+	spin_lock(&current->sighand->siglock);
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+	ac.ac_flag = pacct->ac_flag;
+	ac.ac_mem = encode_comp_t(pacct->ac_mem);
+	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
+	ac.ac_exitcode = pacct->ac_exitcode;
+	spin_unlock(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->min_flt);
-	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
-	ac.ac_exitcode = exitcode;
 
 	/*
 	 * Kernel segment override to datasegment and write it
@@ -546,12 +520,63 @@ static void do_acct_process(long exitcode, struct file *file)
 }
 
 /**
+ * acct_init_pacct - initialize a new pacct_struct
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+	memset(pacct, 0, sizeof(struct pacct_struct));
+	pacct->ac_utime = pacct->ac_stime = cputime_zero;
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+	struct pacct_struct *pacct = &current->signal->pacct;
+	unsigned long vsize = 0;
+
+	if (group_dead && current->mm) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+		vma = current->mm->mmap;
+		while (vma) {
+			vsize += vma->vm_end - vma->vm_start;
+			vma = vma->vm_next;
+		}
+		up_read(&current->mm->mmap_sem);
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (group_dead)
+		pacct->ac_mem = vsize / 1024;
+	if (thread_group_leader(current)) {
+		pacct->ac_exitcode = exitcode;
+		if (current->flags & PF_FORKNOEXEC)
+			pacct->ac_flag |= AFORK;
+	}
+	if (current->flags & PF_SUPERPRIV)
+		pacct->ac_flag |= ASU;
+	if (current->flags & PF_DUMPCORE)
+		pacct->ac_flag |= ACORE;
+	if (current->flags & PF_SIGNALED)
+		pacct->ac_flag |= AXSIG;
+	pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+	pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+	pacct->ac_minflt += current->min_flt;
+	pacct->ac_majflt += current->maj_flt;
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/**
  * acct_process - now just a wrapper around do_acct_process
  * @exitcode: task exit code
  *
  * handles process accounting for an exiting task
  */
-void acct_process(long exitcode)
+void acct_process()
 {
 	struct file *file = NULL;
 
@@ -570,7 +595,7 @@ void acct_process(long exitcode)
 	get_file(file);
 	spin_unlock(&acct_globals.lock);
 
-	do_acct_process(exitcode, file);
+	do_acct_process(file);
 	fput(file);
 }
 
@@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	if (tsk) {
-		tsk->acct_stimexpd = 0;
-		tsk->acct_rss_mem1 = 0;
-		tsk->acct_vm_mem1 = 0;
-	}
+	tsk->acct_stimexpd = 0;
+	tsk->acct_rss_mem1 = 0;
+	tsk->acct_vm_mem1 = 0;
 }
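The acct.c changes above split BSD process accounting in two: acct_collect() folds each exiting thread's times and fault counts into the shared signal->pacct under siglock, and do_acct_process() later writes one record from that aggregate, which is why it no longer needs an exitcode argument. A minimal sketch of the resulting call pattern, assuming a caller shaped like do_exit() (hypothetical function; the real call sites live in kernel/exit.c, outside this diff):

/* Hypothetical caller illustrating the new two-step accounting API. */
static void example_exit_accounting(long code, int group_dead)
{
	/* every exiting thread contributes its utime/stime/fault counts */
	acct_collect(code, group_dead);

	/* one record per process, written once the whole group is dead */
	if (group_dead)
		acct_process();
}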
diff --git a/kernel/audit.c b/kernel/audit.c
index df57b493e1cb..7dfac7031bd7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/selinux.h>
+#include <linux/inotify.h>
 
 #include "audit.h"
 
@@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0;
 /* The identity of the user shutting down the audit system. */
 uid_t audit_sig_uid = -1;
 pid_t audit_sig_pid = -1;
+u32 audit_sig_sid = 0;
 
 /* Records can be lost in several ways:
    0) [suppressed in audit_alloc]
@@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
 /* The audit_freelist is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
@@ -114,10 +122,8 @@ static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
-/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneously in
- * auditsc.c */
-DEFINE_MUTEX(audit_netlink_mutex);
+/* Serialize requests from userspace. */
+static DEFINE_MUTEX(audit_cmd_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 			"audit_rate_limit=%d old=%d by auid=%u",
 			limit, old, loginuid);
 	audit_rate_limit = limit;
-	return old;
+	return 0;
 }
 
 static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
@@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 			"audit_backlog_limit=%d old=%d by auid=%u",
 			limit, old, loginuid);
 	audit_backlog_limit = limit;
-	return old;
+	return 0;
 }
 
 static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
@@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 			"audit_enabled=%d old=%d by auid=%u",
 			state, old, loginuid);
 	audit_enabled = state;
-	return old;
+	return 0;
 }
 
 static int audit_set_failure(int state, uid_t loginuid, u32 sid)
@@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 			"audit_failure=%d old=%d by auid=%u",
 			state, old, loginuid);
 	audit_failure = state;
-	return old;
+	return 0;
 }
 
 static int kauditd_thread(void *dummy)
@@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy)
 			remove_wait_queue(&kauditd_wait, &wait);
 		}
 	}
+}
+
+int audit_send_list(void *_dest)
+{
+	struct audit_netlink_list *dest = _dest;
+	int pid = dest->pid;
+	struct sk_buff *skb;
+
+	/* wait for parent to finish and send an ACK */
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	while ((skb = __skb_dequeue(&dest->q)) != NULL)
+		netlink_unicast(audit_sock, skb, pid, 0);
+
+	kfree(dest);
+
 	return 0;
 }
 
+struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+				 int multi, void *payload, int size)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	int len = NLMSG_SPACE(size);
+	void *data;
+	int flags = multi ? NLM_F_MULTI : 0;
+	int t = done ? NLMSG_DONE : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	nlh = NLMSG_PUT(skb, pid, seq, t, size);
+	nlh->nlmsg_flags = flags;
+	data = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	return skb;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+	return NULL;
+}
+
 /**
  * audit_send_reply - send an audit reply message via netlink
  * @pid: process id to send reply to
@@ -383,29 +432,13 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
 			 void *payload, int size)
 {
 	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	int len = NLMSG_SPACE(size);
-	void *data;
-	int flags = multi ? NLM_F_MULTI : 0;
-	int t = done ? NLMSG_DONE : type;
-
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
 		return;
-
-	nlh = NLMSG_PUT(skb, pid, seq, t, size);
-	nlh->nlmsg_flags = flags;
-	data = NLMSG_DATA(nlh);
-	memcpy(data, payload, size);
-
 	/* Ignore failure. It'll only happen if the sender goes away,
 	   because our timeout is set to infinite. */
 	netlink_unicast(audit_sock, skb, pid, 0);
 	return;
-
-nlmsg_failure:			/* Used by NLMSG_PUT */
-	if (skb)
-		kfree_skb(skb);
 }
 
 /*
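audit_send_reply() is now a thin wrapper; audit_make_reply() was split out so replies can also be built up front and queued, then streamed to userspace by a helper thread running audit_send_list(), which deliberately takes and drops audit_cmd_mutex so the ACK for the triggering request goes out first. A hedged sketch of that producer side (example_list_rules is hypothetical; the real consumer is the rule-listing code in auditfilter.c, and error handling is elided):

/* Sketch: queue replies on an audit_netlink_list and hand the queue
 * to a kthread that runs audit_send_list(). */
static int example_list_rules(int pid, int seq)
{
	struct audit_netlink_list *dest;
	struct sk_buff *skb;

	dest = kmalloc(sizeof(*dest), GFP_KERNEL);
	if (!dest)
		return -ENOMEM;
	dest->pid = pid;
	skb_queue_head_init(&dest->q);

	/* normally one skb per rule; here just the terminating message */
	skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
	if (skb)
		skb_queue_tail(&dest->q, skb);

	kthread_run(audit_send_list, dest, "audit_send_list");
	return 0;
}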
@@ -451,7 +484,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
-	struct audit_sig_info	sig_data;
+	struct audit_sig_info	*sig_data;
+	char			*ctx;
+	u32			len;
 
 	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
 	if (err)
@@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old = audit_pid;
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
-				int rc;
-				if ((rc = selinux_ctxid_to_string(
+				if ((err = selinux_ctxid_to_string(
 						sid, &ctx, &len)))
-					return rc;
+					return err;
 				else
 					audit_log(NULL, GFP_KERNEL,
 						AUDIT_CONFIG_CHANGE,
@@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_pid = status_get->pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
-			audit_set_rate_limit(status_get->rate_limit,
+			err = audit_set_rate_limit(status_get->rate_limit,
 					     loginuid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
-			audit_set_backlog_limit(status_get->backlog_limit,
+			err = audit_set_backlog_limit(status_get->backlog_limit,
 					loginuid, sid);
 		break;
 	case AUDIT_USER:
@@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				 "user pid=%d uid=%u auid=%u",
 				 pid, uid, loginuid);
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
 				if (selinux_ctxid_to_string(
 						sid, &ctx, &len)) {
 					audit_log_format(ab,
@@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		sig_data.uid = audit_sig_uid;
-		sig_data.pid = audit_sig_pid;
+		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		if (err)
+			return err;
+		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+		if (!sig_data) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+		sig_data->uid = audit_sig_uid;
+		sig_data->pid = audit_sig_pid;
+		memcpy(sig_data->ctx, ctx, len);
+		kfree(ctx);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
-				0, 0, &sig_data, sizeof(sig_data));
+				0, 0, sig_data, sizeof(*sig_data) + len);
+		kfree(sig_data);
 		break;
 	default:
 		err = -EINVAL;
@@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length)
 	struct sk_buff  *skb;
 	unsigned int qlen;
 
-	mutex_lock(&audit_netlink_mutex);
+	mutex_lock(&audit_cmd_mutex);
 
 	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		audit_receive_skb(skb);
 		kfree_skb(skb);
 	}
-	mutex_unlock(&audit_netlink_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 }
 
+#ifdef CONFIG_AUDITSYSCALL
+static const struct inotify_operations audit_inotify_ops = {
+	.handle_event	= audit_handle_ievent,
+	.destroy_watch	= audit_free_parent,
+};
+#endif
 
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
+#ifdef CONFIG_AUDITSYSCALL
+	int i;
+#endif
+
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
 	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
@@ -661,6 +712,16 @@ static int __init audit_init(void)
 	selinux_audit_set_callback(&selinux_audit_rule_update);
 
 	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+#ifdef CONFIG_AUDITSYSCALL
+	audit_ih = inotify_init(&audit_inotify_ops);
+	if (IS_ERR(audit_ih))
+		audit_panic("cannot initialize inotify handle");
+
+	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+		INIT_LIST_HEAD(&audit_inode_hash[i]);
+#endif
+
 	return 0;
 }
 __initcall(audit_init);
@@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab)
 		kfree_skb(ab->skb);
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (++audit_freelist_count > AUDIT_MAXFREE)
+	if (audit_freelist_count > AUDIT_MAXFREE)
 		kfree(ab);
-	else
+	else {
+		audit_freelist_count++;
 		list_add(&ab->list, &audit_freelist);
+	}
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 }
 
@@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 	skb_put(skb, len << 1);		/* new string is twice the old string */
 }
 
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
+			       const char *string)
+{
+	int avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = slen + 3;	/* enclosing quotes + null terminator */
+	if (new_len > avail) {
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+	ptr = skb->tail;
+	*ptr++ = '"';
+	memcpy(ptr, string, slen);
+	ptr += slen;
+	*ptr++ = '"';
+	*ptr = 0;
+	skb_put(skb, slen + 2);	/* don't include null terminator */
+}
+
 /**
- * audit_log_unstrustedstring - log a string that may contain random characters
+ * audit_log_n_unstrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
+ * @len: lenth of string (not including trailing null)
  * @string: string to be logged
 *
 * This code will escape a string that is passed to it if the string
 * contains a control character, unprintable character, double quote mark,
 * or a space. Unescaped strings will start and end with a double quote mark.
 * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
 */
-void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
+					const char *string)
 {
 	const unsigned char *p = string;
 
 	while (*p) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
-			audit_log_hex(ab, string, strlen(string));
-			return;
+			audit_log_hex(ab, string, len);
+			return string + len + 1;
 		}
 		p++;
 	}
-	audit_log_format(ab, "\"%s\"", string);
+	audit_log_n_string(ab, len, string);
+	return p + 1;
+}
+
+/**
+ * audit_log_unstrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_unstrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+	return audit_log_n_untrustedstring(ab, strlen(string), string);
 }
 
 /* This is a helper-function to print the escaped d_path */
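The return value of the new audit_log_n_untrustedstring() points just past the logged string and its NUL terminator, which lets callers walk a buffer of consecutive NUL-terminated strings without re-scanning. A hedged sketch of that idiom (hypothetical helper; the escaping behavior is exactly the one described in the docstring above):

/* Log 'count' consecutive NUL-terminated strings packed in buf. */
static void example_log_strings(struct audit_buffer *ab, const char *buf,
				size_t buflen, int count)
{
	const char *p = buf;
	int i;

	for (i = 0; i < count && p < buf + buflen; i++)
		/* returns a pointer just past this string's terminator */
		p = audit_log_n_untrustedstring(ab, strlen(p), p);
}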
diff --git a/kernel/audit.h b/kernel/audit.h
index 6f733920fd32..8323e4132a33 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,9 +19,9 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
-#include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/audit.h>
+#include <linux/skbuff.h>
 
 /* 0 = no checking
    1 = put_count checking
@@ -53,6 +53,18 @@ enum audit_state {
 };
 
 /* Rule lists */
+struct audit_parent;
+
+struct audit_watch {
+	atomic_t		count;	/* reference count */
+	char			*path;	/* insertion path */
+	dev_t			dev;	/* associated superblock device */
+	unsigned long		ino;	/* associated inode number */
+	struct audit_parent	*parent; /* associated parent */
+	struct list_head	wlist;	/* entry in parent->watches list */
+	struct list_head	rules;	/* associated rules */
+};
+
 struct audit_field {
 	u32			type;
 	u32			val;
@@ -70,6 +82,9 @@ struct audit_krule {
 	u32			buflen;	/* for data alloc on list rules */
 	u32			field_count;
 	struct audit_field	*fields;
+	struct audit_field	*inode_f; /* quick access to an inode field */
+	struct audit_watch	*watch;	/* associated watch */
+	struct list_head	rlist;	/* entry in audit_watch.rules list */
 };
 
 struct audit_entry {
@@ -78,15 +93,53 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
-
 extern int audit_pid;
-extern int audit_comparator(const u32 left, const u32 op, const u32 right);
 
+#define AUDIT_INODE_BUCKETS	32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+	return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_compare_dname_path(const char *dname, const char *path,
+				    int *dirlen);
+extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
+					     int done, int multi,
+					     void *payload, int size);
 extern void		    audit_send_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
 extern void		    audit_panic(const char *message);
-extern struct mutex audit_netlink_mutex;
 
+struct audit_netlink_list {
+	int pid;
+	struct sk_buff_head q;
+};
+
+int audit_send_list(void *);
+
+struct inotify_watch;
+extern void audit_free_parent(struct inotify_watch *);
+extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
+				const char *, struct inode *);
 extern int selinux_audit_rule_update(void);
+
+#ifdef CONFIG_AUDITSYSCALL
+extern void __audit_signal_info(int sig, struct task_struct *t);
+static inline void audit_signal_info(int sig, struct task_struct *t)
+{
+	if (unlikely(audit_pid && t->tgid == audit_pid))
+		__audit_signal_info(sig, t);
+}
+extern enum audit_state audit_filter_inodes(struct task_struct *,
+					    struct audit_context *);
+extern void audit_set_auditable(struct audit_context *);
+#else
+#define audit_signal_info(s,t)
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#define audit_set_auditable(c)
#endif
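audit_hash_ino() works because AUDIT_INODE_BUCKETS is a power of two, so the mask is a cheap modulo; inode-keyed rules land in audit_inode_hash[] buckets instead of one long list. A hedged sketch of the read-side lookup this enables (illustrative only; the real walker is audit_filter_inodes() in auditsc.c, and rule-field matching is elided):

/* Check whether any rule hashes to this inode's bucket, under RCU. */
static int example_inode_has_rules(unsigned long ino)
{
	struct audit_entry *e;
	int h = audit_hash_ino((u32)ino);
	int found = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(e, &audit_inode_hash[h], list) {
		found = 1;	/* a real caller would match e->rule here */
		break;
	}
	rcu_read_unlock();
	return found;
}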
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c134906d689..4c99d2c586ed 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
 #include <linux/selinux.h>
 #include "audit.h"
 
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ * 	Synchronizes writes and blocking reads of audit's filterlist
+ * 	data.  Rcu is used to traverse the filterlist and access
+ * 	contents of structs audit_entry, audit_watch and opaque
+ * 	selinux rules during filtering.  If modified, these structures
+ * 	must be copied and replace their counterparts in the filterlist.
+ * 	An audit_parent struct is not accessed during filtering, so may
+ * 	be written directly provided audit_filter_mutex is held.
+ */
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * 	event.  Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * 	audit_remove_watch().  Additionally, an audit_watch may exist
+ * 	temporarily to assist in searching existing filter data.  Each
+ * 	audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_parent {
+	struct list_head	ilist;	/* entry in inotify registration list */
+	struct list_head	watches; /* associated watches */
+	struct inotify_watch	wdata;	/* inotify watch data */
+	unsigned		flags;	/* status flags */
+};
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID	0x001
+
+/* Audit filter lists, defined in <linux/audit.h> */
 struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[0]),
 	LIST_HEAD_INIT(audit_filter_list[1]),
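The locking-model comment above is the heart of this rewrite: filtering traverses the lists lock-free under RCU, while writers copy an entry, swap the copy in under audit_filter_mutex, and free the original only after a grace period. In sketch form (hypothetical helpers; audit_update_watch() later in this file is the real instance of the writer pattern):

/* Reader: traverse a filter list without taking audit_filter_mutex. */
static void example_reader(struct list_head *head)
{
	struct audit_entry *e;

	rcu_read_lock();
	list_for_each_entry_rcu(e, head, list) {
		/* inspect e->rule; never modify it here */
	}
	rcu_read_unlock();
}

/* Writer: copy-and-replace, then defer the free past all readers. */
static void example_writer(struct list_head *head, struct audit_entry *old,
			   struct audit_entry *new)
{
	mutex_lock(&audit_filter_mutex);
	list_del_rcu(&old->list);
	list_add_rcu(&new->list, head);
	mutex_unlock(&audit_filter_mutex);
	call_rcu(&old->rcu, audit_free_rule_rcu);
}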
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #endif
 };
 
+static DEFINE_MUTEX(audit_filter_mutex);
+
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+void audit_free_parent(struct inotify_watch *i_watch)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+static inline void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+static void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	put_inotify_watch(&watch->parent->wdata);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
+
+	/* some rules don't have associated watches */
+	if (e->rule.watch)
+		audit_put_watch(e->rule.watch);
 	if (e->rule.fields)
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
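The helpers above pin down the reference-counting rules stated earlier: a watch is created with one reference (the list's), each krule takes another via audit_get_watch(), and audit_remove_watch() drops the creation reference. As an annotated walkthrough, inferred from the code above for one watch attached to one rule:

/*
 * audit_init_watch(path)      count = 1   creation/list reference
 * audit_get_watch(watch)      count = 2   krule->watch reference
 *    ... watch in service ...
 * audit_put_watch(watch)      count = 1   rule torn down (audit_free_rule)
 * audit_remove_watch(watch)   count = 0   list drops it; kfree()
 *
 * audit_put_watch() WARNs if parent or rules are still live at zero,
 * catching leaked references early.
 */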
@@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
60 audit_free_rule(e); 150 audit_free_rule(e);
61} 151}
62 152
153/* Initialize a parent watch entry. */
154static struct audit_parent *audit_init_parent(struct nameidata *ndp)
155{
156 struct audit_parent *parent;
157 s32 wd;
158
159 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
160 if (unlikely(!parent))
161 return ERR_PTR(-ENOMEM);
162
163 INIT_LIST_HEAD(&parent->watches);
164 parent->flags = 0;
165
166 inotify_init_watch(&parent->wdata);
167 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
168 get_inotify_watch(&parent->wdata);
169 wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
170 AUDIT_IN_WATCH);
171 if (wd < 0) {
172 audit_free_parent(&parent->wdata);
173 return ERR_PTR(wd);
174 }
175
176 return parent;
177}
178
179/* Initialize a watch entry. */
180static struct audit_watch *audit_init_watch(char *path)
181{
182 struct audit_watch *watch;
183
184 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
185 if (unlikely(!watch))
186 return ERR_PTR(-ENOMEM);
187
188 INIT_LIST_HEAD(&watch->rules);
189 atomic_set(&watch->count, 1);
190 watch->path = path;
191 watch->dev = (dev_t)-1;
192 watch->ino = (unsigned long)-1;
193
194 return watch;
195}
196
63/* Initialize an audit filterlist entry. */ 197/* Initialize an audit filterlist entry. */
64static inline struct audit_entry *audit_init_entry(u32 field_count) 198static inline struct audit_entry *audit_init_entry(u32 field_count)
65{ 199{
@@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
107 return str; 241 return str;
108} 242}
109 243
244/* Translate an inode field to kernel respresentation. */
245static inline int audit_to_inode(struct audit_krule *krule,
246 struct audit_field *f)
247{
248 if (krule->listnr != AUDIT_FILTER_EXIT ||
249 krule->watch || krule->inode_f)
250 return -EINVAL;
251
252 krule->inode_f = f;
253 return 0;
254}
255
256/* Translate a watch string to kernel respresentation. */
257static int audit_to_watch(struct audit_krule *krule, char *path, int len,
258 u32 op)
259{
260 struct audit_watch *watch;
261
262 if (!audit_ih)
263 return -EOPNOTSUPP;
264
265 if (path[0] != '/' || path[len-1] == '/' ||
266 krule->listnr != AUDIT_FILTER_EXIT ||
267 op & ~AUDIT_EQUAL ||
268 krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */
269 return -EINVAL;
270
271 watch = audit_init_watch(path);
272 if (unlikely(IS_ERR(watch)))
273 return PTR_ERR(watch);
274
275 audit_get_watch(watch);
276 krule->watch = watch;
277
278 return 0;
279}
280
110/* Common user-space to kernel rule translation. */ 281/* Common user-space to kernel rule translation. */
111static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) 282static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
112{ 283{
@@ -128,8 +299,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
128#endif 299#endif
129 ; 300 ;
130 } 301 }
131 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && 302 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
132 rule->action != AUDIT_ALWAYS) 303 printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
304 goto exit_err;
305 }
306 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
133 goto exit_err; 307 goto exit_err;
134 if (rule->field_count > AUDIT_MAX_FIELDS) 308 if (rule->field_count > AUDIT_MAX_FIELDS)
135 goto exit_err; 309 goto exit_err;
@@ -158,6 +332,7 @@ exit_err:
158static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 332static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
159{ 333{
160 struct audit_entry *entry; 334 struct audit_entry *entry;
335 struct audit_field *f;
161 int err = 0; 336 int err = 0;
162 int i; 337 int i;
163 338
@@ -172,14 +347,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 347 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
173 f->val = rule->values[i]; 348 f->val = rule->values[i];
174 349
175 if (f->type & AUDIT_UNUSED_BITS || 350 err = -EINVAL;
176 f->type == AUDIT_SE_USER || 351 switch(f->type) {
177 f->type == AUDIT_SE_ROLE || 352 default:
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free; 353 goto exit_free;
354 case AUDIT_PID:
355 case AUDIT_UID:
356 case AUDIT_EUID:
357 case AUDIT_SUID:
358 case AUDIT_FSUID:
359 case AUDIT_GID:
360 case AUDIT_EGID:
361 case AUDIT_SGID:
362 case AUDIT_FSGID:
363 case AUDIT_LOGINUID:
364 case AUDIT_PERS:
365 case AUDIT_ARCH:
366 case AUDIT_MSGTYPE:
367 case AUDIT_DEVMAJOR:
368 case AUDIT_DEVMINOR:
369 case AUDIT_EXIT:
370 case AUDIT_SUCCESS:
371 case AUDIT_ARG0:
372 case AUDIT_ARG1:
373 case AUDIT_ARG2:
374 case AUDIT_ARG3:
375 break;
376 case AUDIT_INODE:
377 err = audit_to_inode(&entry->rule, f);
378 if (err)
379 goto exit_free;
380 break;
183 } 381 }
184 382
185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; 383 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
@@ -196,6 +394,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
196 } 394 }
197 } 395 }
198 396
397 f = entry->rule.inode_f;
398 if (f) {
399 switch(f->op) {
400 case AUDIT_NOT_EQUAL:
401 entry->rule.inode_f = NULL;
402 case AUDIT_EQUAL:
403 break;
404 default:
405 goto exit_free;
406 }
407 }
408
199exit_nofree: 409exit_nofree:
200 return entry; 410 return entry;
201 411
@@ -210,6 +420,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
210{ 420{
211 int err = 0; 421 int err = 0;
212 struct audit_entry *entry; 422 struct audit_entry *entry;
423 struct audit_field *f;
213 void *bufp; 424 void *bufp;
214 size_t remain = datasz - sizeof(struct audit_rule_data); 425 size_t remain = datasz - sizeof(struct audit_rule_data);
215 int i; 426 int i;
@@ -235,6 +446,29 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
235 f->se_str = NULL; 446 f->se_str = NULL;
236 f->se_rule = NULL; 447 f->se_rule = NULL;
237 switch(f->type) { 448 switch(f->type) {
449 case AUDIT_PID:
450 case AUDIT_UID:
451 case AUDIT_EUID:
452 case AUDIT_SUID:
453 case AUDIT_FSUID:
454 case AUDIT_GID:
455 case AUDIT_EGID:
456 case AUDIT_SGID:
457 case AUDIT_FSGID:
458 case AUDIT_LOGINUID:
459 case AUDIT_PERS:
460 case AUDIT_ARCH:
461 case AUDIT_MSGTYPE:
462 case AUDIT_PPID:
463 case AUDIT_DEVMAJOR:
464 case AUDIT_DEVMINOR:
465 case AUDIT_EXIT:
466 case AUDIT_SUCCESS:
467 case AUDIT_ARG0:
468 case AUDIT_ARG1:
469 case AUDIT_ARG2:
470 case AUDIT_ARG3:
471 break;
238 case AUDIT_SE_USER: 472 case AUDIT_SE_USER:
239 case AUDIT_SE_ROLE: 473 case AUDIT_SE_ROLE:
240 case AUDIT_SE_TYPE: 474 case AUDIT_SE_TYPE:
@@ -260,6 +494,37 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
260 } else 494 } else
261 f->se_str = str; 495 f->se_str = str;
262 break; 496 break;
497 case AUDIT_WATCH:
498 str = audit_unpack_string(&bufp, &remain, f->val);
499 if (IS_ERR(str))
500 goto exit_free;
501 entry->rule.buflen += f->val;
502
503 err = audit_to_watch(&entry->rule, str, f->val, f->op);
504 if (err) {
505 kfree(str);
506 goto exit_free;
507 }
508 break;
509 case AUDIT_INODE:
510 err = audit_to_inode(&entry->rule, f);
511 if (err)
512 goto exit_free;
513 break;
514 default:
515 goto exit_free;
516 }
517 }
518
519 f = entry->rule.inode_f;
520 if (f) {
521 switch(f->op) {
522 case AUDIT_NOT_EQUAL:
523 entry->rule.inode_f = NULL;
524 case AUDIT_EQUAL:
525 break;
526 default:
527 goto exit_free;
263 } 528 }
264 } 529 }
265 530
@@ -291,7 +556,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
291 556
292 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 557 rule = kmalloc(sizeof(*rule), GFP_KERNEL);
293 if (unlikely(!rule)) 558 if (unlikely(!rule))
294 return ERR_PTR(-ENOMEM); 559 return NULL;
295 memset(rule, 0, sizeof(*rule)); 560 memset(rule, 0, sizeof(*rule));
296 561
297 rule->flags = krule->flags | krule->listnr; 562 rule->flags = krule->flags | krule->listnr;
@@ -322,7 +587,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
322 587
323 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); 588 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
324 if (unlikely(!data)) 589 if (unlikely(!data))
325 return ERR_PTR(-ENOMEM); 590 return NULL;
326 memset(data, 0, sizeof(*data)); 591 memset(data, 0, sizeof(*data));
327 592
328 data->flags = krule->flags | krule->listnr; 593 data->flags = krule->flags | krule->listnr;
@@ -343,6 +608,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
343 data->buflen += data->values[i] = 608 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str); 609 audit_pack_string(&bufp, f->se_str);
345 break; 610 break;
611 case AUDIT_WATCH:
612 data->buflen += data->values[i] =
613 audit_pack_string(&bufp, krule->watch->path);
614 break;
346 default: 615 default:
347 data->values[i] = f->val; 616 data->values[i] = f->val;
348 } 617 }
@@ -378,6 +647,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
378 if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) 647 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
379 return 1; 648 return 1;
380 break; 649 break;
650 case AUDIT_WATCH:
651 if (strcmp(a->watch->path, b->watch->path))
652 return 1;
653 break;
381 default: 654 default:
382 if (a->fields[i].val != b->fields[i].val) 655 if (a->fields[i].val != b->fields[i].val)
383 return 1; 656 return 1;
@@ -391,6 +664,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
391 return 0; 664 return 0;
392} 665}
393 666
667/* Duplicate the given audit watch. The new watch's rules list is initialized
668 * to an empty list and wlist is undefined. */
669static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
670{
671 char *path;
672 struct audit_watch *new;
673
674 path = kstrdup(old->path, GFP_KERNEL);
675 if (unlikely(!path))
676 return ERR_PTR(-ENOMEM);
677
678 new = audit_init_watch(path);
679 if (unlikely(IS_ERR(new))) {
680 kfree(path);
681 goto out;
682 }
683
684 new->dev = old->dev;
685 new->ino = old->ino;
686 get_inotify_watch(&old->parent->wdata);
687 new->parent = old->parent;
688
689out:
690 return new;
691}
692
394/* Duplicate selinux field information. The se_rule is opaque, so must be 693/* Duplicate selinux field information. The se_rule is opaque, so must be
395 * re-initialized. */ 694 * re-initialized. */
396static inline int audit_dupe_selinux_field(struct audit_field *df, 695static inline int audit_dupe_selinux_field(struct audit_field *df,
@@ -422,8 +721,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
422/* Duplicate an audit rule. This will be a deep copy with the exception 721/* Duplicate an audit rule. This will be a deep copy with the exception
423 * of the watch - that pointer is carried over. The selinux specific fields 722 * of the watch - that pointer is carried over. The selinux specific fields
424 * will be updated in the copy. The point is to be able to replace the old 723 * will be updated in the copy. The point is to be able to replace the old
425 * rule with the new rule in the filterlist, then free the old rule. */ 724 * rule with the new rule in the filterlist, then free the old rule.
426static struct audit_entry *audit_dupe_rule(struct audit_krule *old) 725 * The rlist element is undefined; list manipulations are handled apart from
726 * the initial copy. */
727static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
728 struct audit_watch *watch)
427{ 729{
428 u32 fcount = old->field_count; 730 u32 fcount = old->field_count;
429 struct audit_entry *entry; 731 struct audit_entry *entry;
@@ -442,6 +744,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
442 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 744 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
443 new->mask[i] = old->mask[i]; 745 new->mask[i] = old->mask[i];
444 new->buflen = old->buflen; 746 new->buflen = old->buflen;
747 new->inode_f = old->inode_f;
748 new->watch = NULL;
445 new->field_count = old->field_count; 749 new->field_count = old->field_count;
446 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); 750 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
447 751
@@ -463,68 +767,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
463 } 767 }
464 } 768 }
465 769
770 if (watch) {
771 audit_get_watch(watch);
772 new->watch = watch;
773 }
774
466 return entry; 775 return entry;
467} 776}
468 777
469/* Add rule to given filterlist if not a duplicate. Protected by 778/* Update inode info in audit rules based on filesystem event. */
470 * audit_netlink_mutex. */ 779static void audit_update_watch(struct audit_parent *parent,
780 const char *dname, dev_t dev,
781 unsigned long ino, unsigned invalidating)
782{
783 struct audit_watch *owatch, *nwatch, *nextw;
784 struct audit_krule *r, *nextr;
785 struct audit_entry *oentry, *nentry;
786 struct audit_buffer *ab;
787
788 mutex_lock(&audit_filter_mutex);
789 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
790 if (audit_compare_dname_path(dname, owatch->path, NULL))
791 continue;
792
793 /* If the update involves invalidating rules, do the inode-based
794 * filtering now, so we don't omit records. */
795 if (invalidating &&
796 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
797 audit_set_auditable(current->audit_context);
798
799 nwatch = audit_dupe_watch(owatch);
800 if (unlikely(IS_ERR(nwatch))) {
801 mutex_unlock(&audit_filter_mutex);
802 audit_panic("error updating watch, skipping");
803 return;
804 }
805 nwatch->dev = dev;
806 nwatch->ino = ino;
807
808 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
809
810 oentry = container_of(r, struct audit_entry, rule);
811 list_del(&oentry->rule.rlist);
812 list_del_rcu(&oentry->list);
813
814 nentry = audit_dupe_rule(&oentry->rule, nwatch);
815 if (unlikely(IS_ERR(nentry)))
816 audit_panic("error updating watch, removing");
817 else {
818 int h = audit_hash_ino((u32)ino);
819 list_add(&nentry->rule.rlist, &nwatch->rules);
820 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
821 }
822
823 call_rcu(&oentry->rcu, audit_free_rule_rcu);
824 }
825
826 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
827 audit_log_format(ab, "audit updated rules specifying watch=");
828 audit_log_untrustedstring(ab, owatch->path);
829 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
830 audit_log_end(ab);
831
832 audit_remove_watch(owatch);
833 goto add_watch_to_parent; /* event applies to a single watch */
834 }
835 mutex_unlock(&audit_filter_mutex);
836 return;
837
838add_watch_to_parent:
839 list_add(&nwatch->wlist, &parent->watches);
840 mutex_unlock(&audit_filter_mutex);
841 return;
842}
843
844/* Remove all watches & rules associated with a parent that is going away. */
845static void audit_remove_parent_watches(struct audit_parent *parent)
846{
847 struct audit_watch *w, *nextw;
848 struct audit_krule *r, *nextr;
849 struct audit_entry *e;
850
851 mutex_lock(&audit_filter_mutex);
852 parent->flags |= AUDIT_PARENT_INVALID;
853 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
854 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
855 e = container_of(r, struct audit_entry, rule);
856 list_del(&r->rlist);
857 list_del_rcu(&e->list);
858 call_rcu(&e->rcu, audit_free_rule_rcu);
859
860 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
861 "audit implicitly removed rule from list=%d\n",
862 AUDIT_FILTER_EXIT);
863 }
864 audit_remove_watch(w);
865 }
866 mutex_unlock(&audit_filter_mutex);
867}
868
869/* Unregister inotify watches for parents on in_list.
870 * Generates an IN_IGNORED event. */
871static void audit_inotify_unregister(struct list_head *in_list)
872{
873 struct audit_parent *p, *n;
874
875 list_for_each_entry_safe(p, n, in_list, ilist) {
876 list_del(&p->ilist);
877 inotify_rm_watch(audit_ih, &p->wdata);
878 /* the put matching the get in audit_do_del_rule() */
879 put_inotify_watch(&p->wdata);
880 }
881}
882
883/* Find an existing audit rule.
884 * Caller must hold audit_filter_mutex to prevent stale rule data. */
885static struct audit_entry *audit_find_rule(struct audit_entry *entry,
886 struct list_head *list)
887{
888 struct audit_entry *e, *found = NULL;
889 int h;
890
891 if (entry->rule.watch) {
892 /* we don't know the inode number, so must walk entire hash */
893 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
894 list = &audit_inode_hash[h];
895 list_for_each_entry(e, list, list)
896 if (!audit_compare_rule(&entry->rule, &e->rule)) {
897 found = e;
898 goto out;
899 }
900 }
901 goto out;
902 }
903
904 list_for_each_entry(e, list, list)
905 if (!audit_compare_rule(&entry->rule, &e->rule)) {
906 found = e;
907 goto out;
908 }
909
910out:
911 return found;
912}
913
914/* Get path information necessary for adding watches. */
915static int audit_get_nd(char *path, struct nameidata **ndp,
916 struct nameidata **ndw)
917{
918 struct nameidata *ndparent, *ndwatch;
919 int err;
920
921 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
922 if (unlikely(!ndparent))
923 return -ENOMEM;
924
925 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
926 if (unlikely(!ndwatch)) {
927 kfree(ndparent);
928 return -ENOMEM;
929 }
930
931 err = path_lookup(path, LOOKUP_PARENT, ndparent);
932 if (err) {
933 kfree(ndparent);
934 kfree(ndwatch);
935 return err;
936 }
937
938 err = path_lookup(path, 0, ndwatch);
939 if (err) {
940 kfree(ndwatch);
941 ndwatch = NULL;
942 }
943
944 *ndp = ndparent;
945 *ndw = ndwatch;
946
947 return 0;
948}
949
950/* Release resources used for watch path information. */
951static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
952{
953 if (ndp) {
954 path_release(ndp);
955 kfree(ndp);
956 }
957 if (ndw) {
958 path_release(ndw);
959 kfree(ndw);
960 }
961}
962
963/* Associate the given rule with an existing parent inotify_watch.
964 * Caller must hold audit_filter_mutex. */
965static void audit_add_to_parent(struct audit_krule *krule,
966 struct audit_parent *parent)
967{
968 struct audit_watch *w, *watch = krule->watch;
969 int watch_found = 0;
970
971 list_for_each_entry(w, &parent->watches, wlist) {
972 if (strcmp(watch->path, w->path))
973 continue;
974
975 watch_found = 1;
976
977 /* put krule's and initial refs to temporary watch */
978 audit_put_watch(watch);
979 audit_put_watch(watch);
980
981 audit_get_watch(w);
982 krule->watch = watch = w;
983 break;
984 }
985
986 if (!watch_found) {
987 get_inotify_watch(&parent->wdata);
988 watch->parent = parent;
989
990 list_add(&watch->wlist, &parent->watches);
991 }
992 list_add(&krule->rlist, &watch->rules);
993}
994
995/* Find a matching watch entry, or add this one.
996 * Caller must hold audit_filter_mutex. */
997static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
998 struct nameidata *ndw)
999{
1000 struct audit_watch *watch = krule->watch;
1001 struct inotify_watch *i_watch;
1002 struct audit_parent *parent;
1003 int ret = 0;
1004
1005 /* update watch filter fields */
1006 if (ndw) {
1007 watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
1008 watch->ino = ndw->dentry->d_inode->i_ino;
1009 }
1010
1011 /* The audit_filter_mutex must not be held during inotify calls because
1012 * we hold it during inotify event callback processing. If an existing
1013 * inotify watch is found, inotify_find_watch() grabs a reference before
1014 * returning.
1015 */
1016 mutex_unlock(&audit_filter_mutex);
1017
1018 if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
1019 parent = audit_init_parent(ndp);
1020 if (IS_ERR(parent)) {
1021 /* caller expects mutex locked */
1022 mutex_lock(&audit_filter_mutex);
1023 return PTR_ERR(parent);
1024 }
1025 } else
1026 parent = container_of(i_watch, struct audit_parent, wdata);
1027
1028 mutex_lock(&audit_filter_mutex);
1029
1030 /* parent was moved before we took audit_filter_mutex */
1031 if (parent->flags & AUDIT_PARENT_INVALID)
1032 ret = -ENOENT;
1033 else
1034 audit_add_to_parent(krule, parent);
1035
1036 /* match get in audit_init_parent or inotify_find_watch */
1037 put_inotify_watch(&parent->wdata);
1038 return ret;
1039}
1040
1041/* Add rule to given filterlist if not a duplicate. */
471static inline int audit_add_rule(struct audit_entry *entry, 1042static inline int audit_add_rule(struct audit_entry *entry,
472 struct list_head *list) 1043 struct list_head *list)
473{ 1044{
474 struct audit_entry *e; 1045 struct audit_entry *e;
1046 struct audit_field *inode_f = entry->rule.inode_f;
1047 struct audit_watch *watch = entry->rule.watch;
1048 struct nameidata *ndp, *ndw;
1049 int h, err, putnd_needed = 0;
1050
1051 if (inode_f) {
1052 h = audit_hash_ino(inode_f->val);
1053 list = &audit_inode_hash[h];
1054 }
475 1055
476 /* Do not use the _rcu iterator here, since this is the only 1056 mutex_lock(&audit_filter_mutex);
477 * addition routine. */ 1057 e = audit_find_rule(entry, list);
478 list_for_each_entry(e, list, list) { 1058 mutex_unlock(&audit_filter_mutex);
479 if (!audit_compare_rule(&entry->rule, &e->rule)) 1059 if (e) {
480 return -EEXIST; 1060 err = -EEXIST;
1061 goto error;
1062 }
1063
1064 /* Avoid calling path_lookup under audit_filter_mutex. */
1065 if (watch) {
1066 err = audit_get_nd(watch->path, &ndp, &ndw);
1067 if (err)
1068 goto error;
1069 putnd_needed = 1;
1070 }
1071
1072 mutex_lock(&audit_filter_mutex);
1073 if (watch) {
1074 /* audit_filter_mutex is dropped and re-taken during this call */
1075 err = audit_add_watch(&entry->rule, ndp, ndw);
1076 if (err) {
1077 mutex_unlock(&audit_filter_mutex);
1078 goto error;
1079 }
1080 h = audit_hash_ino((u32)watch->ino);
1081 list = &audit_inode_hash[h];
481 } 1082 }
482 1083
483 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1084 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
484 list_add_rcu(&entry->list, list); 1085 list_add_rcu(&entry->list, list);
1086 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
485 } else { 1087 } else {
486 list_add_tail_rcu(&entry->list, list); 1088 list_add_tail_rcu(&entry->list, list);
487 } 1089 }
1090 mutex_unlock(&audit_filter_mutex);
488 1091
489 return 0; 1092 if (putnd_needed)
1093 audit_put_nd(ndp, ndw);
1094
1095 return 0;
1096
1097error:
1098 if (putnd_needed)
1099 audit_put_nd(ndp, ndw);
1100 if (watch)
1101 audit_put_watch(watch); /* tmp watch, matches initial get */
1102 return err;
490} 1103}
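
Rules carrying an inode filter or a watch are filed in audit_inode_hash buckets rather than the flat filter lists, so exit-time filtering only scans rules that can possibly match a collected inode. audit_hash_ino() itself is defined in audit.h and not shown in this hunk; the sketch below assumes the usual power-of-two mask over AUDIT_INODE_BUCKETS (32 in this tree), which should be treated as an assumption:

#include <stdio.h>

#define INODE_BUCKETS 32 /* assumed value of AUDIT_INODE_BUCKETS */

static unsigned hash_ino(unsigned ino)
{
	/* power-of-two mask, assumed to match audit_hash_ino() */
	return ino & (INODE_BUCKETS - 1);
}

int main(void)
{
	unsigned inos[] = { 2, 33, 4096, 4097 };

	for (unsigned i = 0; i < sizeof(inos) / sizeof(inos[0]); i++)
		printf("ino %u -> bucket %u\n", inos[i], hash_ino(inos[i]));
	return 0;
}
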
491 1104
492/* Remove an existing rule from filterlist. Protected by 1105/* Remove an existing rule from filterlist. */
493 * audit_netlink_mutex. */
494static inline int audit_del_rule(struct audit_entry *entry, 1106static inline int audit_del_rule(struct audit_entry *entry,
495 struct list_head *list) 1107 struct list_head *list)
496{ 1108{
497 struct audit_entry *e; 1109 struct audit_entry *e;
1110 struct audit_field *inode_f = entry->rule.inode_f;
1111 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1112 LIST_HEAD(inotify_list);
1113 int h, ret = 0;
1114
1115 if (inode_f) {
1116 h = audit_hash_ino(inode_f->val);
1117 list = &audit_inode_hash[h];
1118 }
498 1119
499 /* Do not use the _rcu iterator here, since this is the only 1120 mutex_lock(&audit_filter_mutex);
500 * deletion routine. */ 1121 e = audit_find_rule(entry, list);
501 list_for_each_entry(e, list, list) { 1122 if (!e) {
502 if (!audit_compare_rule(&entry->rule, &e->rule)) { 1123 mutex_unlock(&audit_filter_mutex);
503 list_del_rcu(&e->list); 1124 ret = -ENOENT;
504 call_rcu(&e->rcu, audit_free_rule_rcu); 1125 goto out;
505 return 0; 1126 }
1127
1128 watch = e->rule.watch;
1129 if (watch) {
1130 struct audit_parent *parent = watch->parent;
1131
1132 list_del(&e->rule.rlist);
1133
1134 if (list_empty(&watch->rules)) {
1135 audit_remove_watch(watch);
1136
1137 if (list_empty(&parent->watches)) {
1138 /* Put parent on the inotify un-registration
1139 * list. Grab a reference before releasing
1140 * audit_filter_mutex, to be released in
1141 * audit_inotify_unregister(). */
1142 list_add(&parent->ilist, &inotify_list);
1143 get_inotify_watch(&parent->wdata);
1144 }
506 } 1145 }
507 } 1146 }
508 return -ENOENT; /* No matching rule */ 1147
1148 list_del_rcu(&e->list);
1149 call_rcu(&e->rcu, audit_free_rule_rcu);
1150
1151 mutex_unlock(&audit_filter_mutex);
1152
1153 if (!list_empty(&inotify_list))
1154 audit_inotify_unregister(&inotify_list);
1155
1156out:
1157 if (tmp_watch)
1158 audit_put_watch(tmp_watch); /* match initial get */
1159
1160 return ret;
509} 1161}
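
Parents that lose their last watch are only collected onto a stack-local list while audit_filter_mutex is held; audit_inotify_unregister() runs after the unlock because unregistration can block. The collect-then-process idiom in plain C with pthreads:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *registered;

static void del_all(void)
{
	struct node *defer = NULL;

	pthread_mutex_lock(&lock);
	while (registered) {               /* unhook under the lock... */
		struct node *n = registered;
		registered = n->next;
		n->next = defer;
		defer = n;
	}
	pthread_mutex_unlock(&lock);

	while (defer) {                    /* ...but do the slow teardown outside it */
		struct node *n = defer;
		defer = n->next;
		printf("unregistering %d\n", n->id);
		free(n);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		n->next = registered;
		registered = n;
	}
	del_all();
	return 0;
}
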
510 1162
511/* List rules using struct audit_rule. Exists for backward 1163/* List rules using struct audit_rule. Exists for backward
512 * compatibility with userspace. */ 1164 * compatibility with userspace. */
513static int audit_list(void *_dest) 1165static void audit_list(int pid, int seq, struct sk_buff_head *q)
514{ 1166{
515 int pid, seq; 1167 struct sk_buff *skb;
516 int *dest = _dest;
517 struct audit_entry *entry; 1168 struct audit_entry *entry;
518 int i; 1169 int i;
519 1170
520 pid = dest[0]; 1171 /* This is a blocking read, so use audit_filter_mutex instead of an rcu
521 seq = dest[1]; 1172 * iterator to sync with list writers. */
522 kfree(dest);
523
524 mutex_lock(&audit_netlink_mutex);
525
526 /* The *_rcu iterators not needed here because we are
527 always called with audit_netlink_mutex held. */
528 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1173 for (i=0; i<AUDIT_NR_FILTERS; i++) {
529 list_for_each_entry(entry, &audit_filter_list[i], list) { 1174 list_for_each_entry(entry, &audit_filter_list[i], list) {
530 struct audit_rule *rule; 1175 struct audit_rule *rule;
@@ -532,33 +1177,41 @@ static int audit_list(void *_dest)
532 rule = audit_krule_to_rule(&entry->rule); 1177 rule = audit_krule_to_rule(&entry->rule);
533 if (unlikely(!rule)) 1178 if (unlikely(!rule))
534 break; 1179 break;
535 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, 1180 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
536 rule, sizeof(*rule)); 1181 rule, sizeof(*rule));
1182 if (skb)
1183 skb_queue_tail(q, skb);
537 kfree(rule); 1184 kfree(rule);
538 } 1185 }
539 } 1186 }
540 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); 1187 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
541 1188 list_for_each_entry(entry, &audit_inode_hash[i], list) {
542 mutex_unlock(&audit_netlink_mutex); 1189 struct audit_rule *rule;
543 return 0; 1190
1191 rule = audit_krule_to_rule(&entry->rule);
1192 if (unlikely(!rule))
1193 break;
1194 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1195 rule, sizeof(*rule));
1196 if (skb)
1197 skb_queue_tail(q, skb);
1198 kfree(rule);
1199 }
1200 }
1201 skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
1202 if (skb)
1203 skb_queue_tail(q, skb);
544} 1204}
545 1205
546/* List rules using struct audit_rule_data. */ 1206/* List rules using struct audit_rule_data. */
547static int audit_list_rules(void *_dest) 1207static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
548{ 1208{
549 int pid, seq; 1209 struct sk_buff *skb;
550 int *dest = _dest;
551 struct audit_entry *e; 1210 struct audit_entry *e;
552 int i; 1211 int i;
553 1212
554 pid = dest[0]; 1213 /* This is a blocking read, so use audit_filter_mutex instead of an rcu
555 seq = dest[1]; 1214 * iterator to sync with list writers. */
556 kfree(dest);
557
558 mutex_lock(&audit_netlink_mutex);
559
560 /* The *_rcu iterators not needed here because we are
561 always called with audit_netlink_mutex held. */
562 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1215 for (i=0; i<AUDIT_NR_FILTERS; i++) {
563 list_for_each_entry(e, &audit_filter_list[i], list) { 1216 list_for_each_entry(e, &audit_filter_list[i], list) {
564 struct audit_rule_data *data; 1217 struct audit_rule_data *data;
@@ -566,15 +1219,30 @@ static int audit_list_rules(void *_dest)
566 data = audit_krule_to_data(&e->rule); 1219 data = audit_krule_to_data(&e->rule);
567 if (unlikely(!data)) 1220 if (unlikely(!data))
568 break; 1221 break;
569 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1222 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
570 data, sizeof(*data)); 1223 data, sizeof(*data) + data->buflen);
1224 if (skb)
1225 skb_queue_tail(q, skb);
571 kfree(data); 1226 kfree(data);
572 } 1227 }
573 } 1228 }
574 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); 1229 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
1230 list_for_each_entry(e, &audit_inode_hash[i], list) {
1231 struct audit_rule_data *data;
575 1232
576 mutex_unlock(&audit_netlink_mutex); 1233 data = audit_krule_to_data(&e->rule);
577 return 0; 1234 if (unlikely(!data))
1235 break;
1236 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1237 data, sizeof(*data) + data->buflen);
1238 if (skb)
1239 skb_queue_tail(q, skb);
1240 kfree(data);
1241 }
1242 }
1243 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
1244 if (skb)
1245 skb_queue_tail(q, skb);
578} 1246}
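
Both listing paths now assemble the complete reply queue under audit_filter_mutex and hand it off to a separate sender (kthread_run(audit_send_list, ...) in the caller), instead of letting a kthread re-take locks while transmitting. A userspace analogue of queue-then-hand-off; the message type and names are illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct msg { struct msg *next; int seq; };

static void *send_list(void *arg)
{
	struct msg *q = arg;

	while (q) {                       /* drain with no locks held */
		struct msg *m = q;
		q = m->next;
		printf("sending reply seq=%d\n", m->seq);
		free(m);
	}
	return NULL;
}

int main(void)
{
	struct msg *head = NULL, **tail = &head;
	pthread_t tid;

	/* in the kernel this assembly happens under audit_filter_mutex */
	for (int i = 0; i < 3; i++) {
		struct msg *m = malloc(sizeof(*m));
		m->seq = i;
		m->next = NULL;
		*tail = m;
		tail = &m->next;
	}

	pthread_create(&tid, NULL, send_list, head); /* cf. kthread_run(audit_send_list, ...) */
	pthread_join(tid, NULL);
	return 0;
}
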
579 1247
580/** 1248/**
@@ -592,7 +1260,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
592 size_t datasz, uid_t loginuid, u32 sid) 1260 size_t datasz, uid_t loginuid, u32 sid)
593{ 1261{
594 struct task_struct *tsk; 1262 struct task_struct *tsk;
595 int *dest; 1263 struct audit_netlink_list *dest;
596 int err = 0; 1264 int err = 0;
597 struct audit_entry *entry; 1265 struct audit_entry *entry;
598 1266
@@ -605,18 +1273,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
605 * happen if we're actually running in the context of auditctl 1273 * happen if we're actually running in the context of auditctl
606 * trying to _send_ the stuff */ 1274 * trying to _send_ the stuff */
607 1275
608 dest = kmalloc(2 * sizeof(int), GFP_KERNEL); 1276 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
609 if (!dest) 1277 if (!dest)
610 return -ENOMEM; 1278 return -ENOMEM;
611 dest[0] = pid; 1279 dest->pid = pid;
612 dest[1] = seq; 1280 skb_queue_head_init(&dest->q);
613 1281
1282 mutex_lock(&audit_filter_mutex);
614 if (type == AUDIT_LIST) 1283 if (type == AUDIT_LIST)
615 tsk = kthread_run(audit_list, dest, "audit_list"); 1284 audit_list(pid, seq, &dest->q);
616 else 1285 else
617 tsk = kthread_run(audit_list_rules, dest, 1286 audit_list_rules(pid, seq, &dest->q);
618 "audit_list_rules"); 1287 mutex_unlock(&audit_filter_mutex);
1288
1289 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
619 if (IS_ERR(tsk)) { 1290 if (IS_ERR(tsk)) {
1291 skb_queue_purge(&dest->q);
620 kfree(dest); 1292 kfree(dest);
621 err = PTR_ERR(tsk); 1293 err = PTR_ERR(tsk);
622 } 1294 }
@@ -632,6 +1304,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
632 1304
633 err = audit_add_rule(entry, 1305 err = audit_add_rule(entry,
634 &audit_filter_list[entry->rule.listnr]); 1306 &audit_filter_list[entry->rule.listnr]);
1307
635 if (sid) { 1308 if (sid) {
636 char *ctx = NULL; 1309 char *ctx = NULL;
637 u32 len; 1310 u32 len;
@@ -712,7 +1385,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
712 return 0; 1385 return 0;
713} 1386}
714 1387
1388/* Compare given dentry name with last component in given path,
1389 * a return of 0 indicates a match. */
1390int audit_compare_dname_path(const char *dname, const char *path,
1391 int *dirlen)
1392{
1393 int dlen, plen;
1394 const char *p;
715 1395
1396 if (!dname || !path)
1397 return 1;
1398
1399 dlen = strlen(dname);
1400 plen = strlen(path);
1401 if (plen < dlen)
1402 return 1;
1403
1404 /* disregard trailing slashes */
1405 p = path + plen - 1;
1406 while ((*p == '/') && (p > path))
1407 p--;
1408
1409 /* find last path component */
1410 p = p - dlen + 1;
1411 if (p < path)
1412 return 1;
1413 else if (p > path) {
1414 if (*--p != '/')
1415 return 1;
1416 else
1417 p++;
1418 }
1419
1420 /* return length of path's directory component */
1421 if (dirlen)
1422 *dirlen = p - path;
1423 return strncmp(p, dname, dlen);
1424}
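
The helper above disregards trailing slashes, locates the last path component, and reports the directory prefix length through *dirlen. Below is a direct userspace port with a few test vectors; a return of 0 means the component matches dname:

#include <stdio.h>
#include <string.h>

static int compare_dname_path(const char *dname, const char *path, int *dirlen)
{
	int dlen, plen;
	const char *p;

	if (!dname || !path)
		return 1;

	dlen = strlen(dname);
	plen = strlen(path);
	if (plen < dlen)
		return 1;

	/* disregard trailing slashes */
	p = path + plen - 1;
	while ((*p == '/') && (p > path))
		p--;

	/* find last path component */
	p = p - dlen + 1;
	if (p < path)
		return 1;
	else if (p > path) {
		if (*--p != '/')
			return 1;
		else
			p++;
	}

	/* report length of path's directory component */
	if (dirlen)
		*dirlen = p - path;
	return strncmp(p, dname, dlen);
}

int main(void)
{
	int dirlen = 0;

	printf("%d\n", compare_dname_path("passwd", "/etc/passwd", &dirlen));   /* 0 */
	printf("dirlen=%d\n", dirlen);                                          /* 5 */
	printf("%d\n", compare_dname_path("passwd", "/etc/passwd//", &dirlen)); /* 0 */
	printf("%d\n", compare_dname_path("sswd", "/etc/passwd", &dirlen));     /* nonzero */
	return 0;
}
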
716 1425
717static int audit_filter_user_rules(struct netlink_skb_parms *cb, 1426static int audit_filter_user_rules(struct netlink_skb_parms *cb,
718 struct audit_krule *rule, 1427 struct audit_krule *rule,
@@ -744,7 +1453,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
744 } 1453 }
745 switch (rule->action) { 1454 switch (rule->action) {
746 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 1455 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
747 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
748 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 1456 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
749 } 1457 }
750 return 1; 1458 return 1;
@@ -826,32 +1534,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
826int selinux_audit_rule_update(void) 1534int selinux_audit_rule_update(void)
827{ 1535{
828 struct audit_entry *entry, *n, *nentry; 1536 struct audit_entry *entry, *n, *nentry;
1537 struct audit_watch *watch;
829 int i, err = 0; 1538 int i, err = 0;
830 1539
831 /* audit_netlink_mutex synchronizes the writers */ 1540 /* audit_filter_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex); 1541 mutex_lock(&audit_filter_mutex);
833 1542
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1543 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1544 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule)) 1545 if (!audit_rule_has_selinux(&entry->rule))
837 continue; 1546 continue;
838 1547
839 nentry = audit_dupe_rule(&entry->rule); 1548 watch = entry->rule.watch;
1549 nentry = audit_dupe_rule(&entry->rule, watch);
840 if (unlikely(IS_ERR(nentry))) { 1550 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the 1551 /* save the first error encountered for the
842 * return value */ 1552 * return value */
843 if (!err) 1553 if (!err)
844 err = PTR_ERR(nentry); 1554 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters"); 1555 audit_panic("error updating selinux filters");
1556 if (watch)
1557 list_del(&entry->rule.rlist);
846 list_del_rcu(&entry->list); 1558 list_del_rcu(&entry->list);
847 } else { 1559 } else {
1560 if (watch) {
1561 list_add(&nentry->rule.rlist,
1562 &watch->rules);
1563 list_del(&entry->rule.rlist);
1564 }
848 list_replace_rcu(&entry->list, &nentry->list); 1565 list_replace_rcu(&entry->list, &nentry->list);
849 } 1566 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu); 1567 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 } 1568 }
852 } 1569 }
853 1570
854 mutex_unlock(&audit_netlink_mutex); 1571 mutex_unlock(&audit_filter_mutex);
855 1572
856 return err; 1573 return err;
857} 1574}
1575
1576/* Update watch data in audit rules based on inotify events. */
1577void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1578 u32 cookie, const char *dname, struct inode *inode)
1579{
1580 struct audit_parent *parent;
1581
1582 parent = container_of(i_watch, struct audit_parent, wdata);
1583
1584 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1585 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1586 inode->i_ino, 0);
1587 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1588 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1589 /* inotify automatically removes the watch and sends IN_IGNORED */
1590 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1591 audit_remove_parent_watches(parent);
1592 /* inotify does not remove the watch, so remove it manually */
1593 else if (mask & IN_MOVE_SELF) {
1594 audit_remove_parent_watches(parent);
1595 inotify_remove_watch_locked(audit_ih, i_watch);
1596 } else if (mask & IN_IGNORED)
1597 put_inotify_watch(i_watch);
1598}
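
The dispatch above mirrors what a userspace inotify consumer sees: IN_CREATE/IN_MOVED_TO mean a child appeared under the parent, IN_DELETE/IN_MOVED_FROM that one vanished, IN_DELETE_SELF/IN_UNMOUNT that the parent itself is gone (inotify drops the watch and queues IN_IGNORED), and IN_MOVE_SELF that the watch must be removed by hand. A runnable demonstration using the same masks; run it against /tmp or a directory given on the command line, then create or delete a file there:

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(int argc, char **argv)
{
	const char *dir = argc > 1 ? argv[1] : "/tmp";
	char buf[4096];
	int fd = inotify_init();

	if (fd < 0 || inotify_add_watch(fd, dir,
			IN_CREATE | IN_MOVED_TO | IN_DELETE | IN_MOVED_FROM |
			IN_DELETE_SELF | IN_UNMOUNT | IN_MOVE_SELF) < 0) {
		perror("inotify");
		return 1;
	}

	ssize_t len = read(fd, buf, sizeof(buf));   /* blocks until an event fires */
	if (len <= 0)
		return 1;

	for (char *p = buf; p < buf + len; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		if (ev->mask & (IN_CREATE | IN_MOVED_TO))
			printf("child appeared: %s\n", ev->len ? ev->name : "?");
		else if (ev->mask & (IN_DELETE | IN_MOVED_FROM))
			printf("child vanished: %s\n", ev->len ? ev->name : "?");
		else if (ev->mask & (IN_DELETE_SELF | IN_UNMOUNT | IN_MOVE_SELF))
			printf("parent itself went away or moved\n");

		p += sizeof(*ev) + ev->len;
	}
	close(fd);
	return 0;
}
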
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c03a4ed1b27..9ebd96fda295 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
6 * Copyright (C) 2005 IBM Corporation 6 * Copyright (C) 2005, 2006 IBM Corporation
7 * All Rights Reserved. 7 * All Rights Reserved.
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,9 @@
29 * this file -- see entry.S) is based on a GPL'd patch written by 29 * this file -- see entry.S) is based on a GPL'd patch written by
30 * okir@suse.de and Copyright 2003 SuSE Linux AG. 30 * okir@suse.de and Copyright 2003 SuSE Linux AG.
31 * 31 *
32 * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
33 * 2006.
34 *
32 * The support of additional filter rules compares (>, <, >=, <=) was 35 * The support of additional filter rules compares (>, <, >=, <=) was
33 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. 36 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
34 * 37 *
@@ -49,6 +52,7 @@
49#include <linux/module.h> 52#include <linux/module.h>
50#include <linux/mount.h> 53#include <linux/mount.h>
51#include <linux/socket.h> 54#include <linux/socket.h>
55#include <linux/mqueue.h>
52#include <linux/audit.h> 56#include <linux/audit.h>
53#include <linux/personality.h> 57#include <linux/personality.h>
54#include <linux/time.h> 58#include <linux/time.h>
@@ -59,6 +63,8 @@
59#include <linux/list.h> 63#include <linux/list.h>
60#include <linux/tty.h> 64#include <linux/tty.h>
61#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h>
67#include <linux/syscalls.h>
62 68
63#include "audit.h" 69#include "audit.h"
64 70
@@ -76,6 +82,9 @@ extern int audit_enabled;
76 * path_lookup. */ 82 * path_lookup. */
77#define AUDIT_NAMES_RESERVED 7 83#define AUDIT_NAMES_RESERVED 7
78 84
85/* Indicates that audit should log the full pathname. */
86#define AUDIT_NAME_FULL -1
87
79/* When fs/namei.c:getname() is called, we store the pointer in name and 88/* When fs/namei.c:getname() is called, we store the pointer in name and
80 * we don't let putname() free it (instead we free all of the saved 89 * we don't let putname() free it (instead we free all of the saved
81 * pointers at syscall exit time). 90 * pointers at syscall exit time).
@@ -83,8 +92,9 @@ extern int audit_enabled;
83 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 92 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
84struct audit_names { 93struct audit_names {
85 const char *name; 94 const char *name;
95 int name_len; /* number of name's characters to log */
96 unsigned name_put; /* call __putname() for this name */
86 unsigned long ino; 97 unsigned long ino;
87 unsigned long pino;
88 dev_t dev; 98 dev_t dev;
89 umode_t mode; 99 umode_t mode;
90 uid_t uid; 100 uid_t uid;
@@ -100,6 +110,33 @@ struct audit_aux_data {
100 110
101#define AUDIT_AUX_IPCPERM 0 111#define AUDIT_AUX_IPCPERM 0
102 112
113struct audit_aux_data_mq_open {
114 struct audit_aux_data d;
115 int oflag;
116 mode_t mode;
117 struct mq_attr attr;
118};
119
120struct audit_aux_data_mq_sendrecv {
121 struct audit_aux_data d;
122 mqd_t mqdes;
123 size_t msg_len;
124 unsigned int msg_prio;
125 struct timespec abs_timeout;
126};
127
128struct audit_aux_data_mq_notify {
129 struct audit_aux_data d;
130 mqd_t mqdes;
131 struct sigevent notification;
132};
133
134struct audit_aux_data_mq_getsetattr {
135 struct audit_aux_data d;
136 mqd_t mqdes;
137 struct mq_attr mqstat;
138};
139
103struct audit_aux_data_ipcctl { 140struct audit_aux_data_ipcctl {
104 struct audit_aux_data d; 141 struct audit_aux_data d;
105 struct ipc_perm p; 142 struct ipc_perm p;
@@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl {
110 u32 osid; 147 u32 osid;
111}; 148};
112 149
150struct audit_aux_data_execve {
151 struct audit_aux_data d;
152 int argc;
153 int envc;
154 char mem[0];
155};
156
113struct audit_aux_data_socketcall { 157struct audit_aux_data_socketcall {
114 struct audit_aux_data d; 158 struct audit_aux_data d;
115 int nargs; 159 int nargs;
@@ -148,7 +192,7 @@ struct audit_context {
148 struct audit_aux_data *aux; 192 struct audit_aux_data *aux;
149 193
150 /* Save things to print about task_struct */ 194 /* Save things to print about task_struct */
151 pid_t pid; 195 pid_t pid, ppid;
152 uid_t uid, euid, suid, fsuid; 196 uid_t uid, euid, suid, fsuid;
153 gid_t gid, egid, sgid, fsgid; 197 gid_t gid, egid, sgid, fsgid;
154 unsigned long personality; 198 unsigned long personality;
@@ -160,12 +204,13 @@ struct audit_context {
160#endif 204#endif
161}; 205};
162 206
163 207/* Determine if any context name data matches a rule's watch data */
164/* Compare a task_struct with an audit_rule. Return 1 on match, 0 208/* Compare a task_struct with an audit_rule. Return 1 on match, 0
165 * otherwise. */ 209 * otherwise. */
166static int audit_filter_rules(struct task_struct *tsk, 210static int audit_filter_rules(struct task_struct *tsk,
167 struct audit_krule *rule, 211 struct audit_krule *rule,
168 struct audit_context *ctx, 212 struct audit_context *ctx,
213 struct audit_names *name,
169 enum audit_state *state) 214 enum audit_state *state)
170{ 215{
171 int i, j, need_sid = 1; 216 int i, j, need_sid = 1;
@@ -179,6 +224,10 @@ static int audit_filter_rules(struct task_struct *tsk,
179 case AUDIT_PID: 224 case AUDIT_PID:
180 result = audit_comparator(tsk->pid, f->op, f->val); 225 result = audit_comparator(tsk->pid, f->op, f->val);
181 break; 226 break;
227 case AUDIT_PPID:
228 if (ctx)
229 result = audit_comparator(ctx->ppid, f->op, f->val);
230 break;
182 case AUDIT_UID: 231 case AUDIT_UID:
183 result = audit_comparator(tsk->uid, f->op, f->val); 232 result = audit_comparator(tsk->uid, f->op, f->val);
184 break; 233 break;
@@ -224,7 +273,10 @@ static int audit_filter_rules(struct task_struct *tsk,
224 } 273 }
225 break; 274 break;
226 case AUDIT_DEVMAJOR: 275 case AUDIT_DEVMAJOR:
227 if (ctx) { 276 if (name)
277 result = audit_comparator(MAJOR(name->dev),
278 f->op, f->val);
279 else if (ctx) {
228 for (j = 0; j < ctx->name_count; j++) { 280 for (j = 0; j < ctx->name_count; j++) {
229 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 281 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
230 ++result; 282 ++result;
@@ -234,7 +286,10 @@ static int audit_filter_rules(struct task_struct *tsk,
234 } 286 }
235 break; 287 break;
236 case AUDIT_DEVMINOR: 288 case AUDIT_DEVMINOR:
237 if (ctx) { 289 if (name)
290 result = audit_comparator(MINOR(name->dev),
291 f->op, f->val);
292 else if (ctx) {
238 for (j = 0; j < ctx->name_count; j++) { 293 for (j = 0; j < ctx->name_count; j++) {
239 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 294 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
240 ++result; 295 ++result;
@@ -244,16 +299,22 @@ static int audit_filter_rules(struct task_struct *tsk,
244 } 299 }
245 break; 300 break;
246 case AUDIT_INODE: 301 case AUDIT_INODE:
247 if (ctx) { 302 if (name)
303 result = (name->ino == f->val);
304 else if (ctx) {
248 for (j = 0; j < ctx->name_count; j++) { 305 for (j = 0; j < ctx->name_count; j++) {
249 if (audit_comparator(ctx->names[j].ino, f->op, f->val) || 306 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
250 audit_comparator(ctx->names[j].pino, f->op, f->val)) {
251 ++result; 307 ++result;
252 break; 308 break;
253 } 309 }
254 } 310 }
255 } 311 }
256 break; 312 break;
313 case AUDIT_WATCH:
314 if (name && rule->watch->ino != (unsigned long)-1)
315 result = (name->dev == rule->watch->dev &&
316 name->ino == rule->watch->ino);
317 break;
257 case AUDIT_LOGINUID: 318 case AUDIT_LOGINUID:
258 result = 0; 319 result = 0;
259 if (ctx) 320 if (ctx)
@@ -294,7 +355,6 @@ static int audit_filter_rules(struct task_struct *tsk,
294 } 355 }
295 switch (rule->action) { 356 switch (rule->action) {
296 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 357 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
297 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
298 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 358 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
299 } 359 }
300 return 1; 360 return 1;
@@ -311,7 +371,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
311 371
312 rcu_read_lock(); 372 rcu_read_lock();
313 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 373 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
314 if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { 374 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
315 rcu_read_unlock(); 375 rcu_read_unlock();
316 return state; 376 return state;
317 } 377 }
@@ -341,8 +401,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
341 int bit = AUDIT_BIT(ctx->major); 401 int bit = AUDIT_BIT(ctx->major);
342 402
343 list_for_each_entry_rcu(e, list, list) { 403 list_for_each_entry_rcu(e, list, list) {
344 if ((e->rule.mask[word] & bit) == bit 404 if ((e->rule.mask[word] & bit) == bit &&
345 && audit_filter_rules(tsk, &e->rule, ctx, &state)) { 405 audit_filter_rules(tsk, &e->rule, ctx, NULL,
406 &state)) {
407 rcu_read_unlock();
408 return state;
409 }
410 }
411 }
412 rcu_read_unlock();
413 return AUDIT_BUILD_CONTEXT;
414}
415
416/* At syscall exit time, this filter is called if any audit_names[] have been
417 * collected during syscall processing. We only check rules in sublists at hash
418 * buckets applicable to the inode numbers in audit_names[].
419 * Regarding audit_state, the same rules apply as for audit_filter_syscall().
420 */
421enum audit_state audit_filter_inodes(struct task_struct *tsk,
422 struct audit_context *ctx)
423{
424 int i;
425 struct audit_entry *e;
426 enum audit_state state;
427
428 if (audit_pid && tsk->tgid == audit_pid)
429 return AUDIT_DISABLED;
430
431 rcu_read_lock();
432 for (i = 0; i < ctx->name_count; i++) {
433 int word = AUDIT_WORD(ctx->major);
434 int bit = AUDIT_BIT(ctx->major);
435 struct audit_names *n = &ctx->names[i];
436 int h = audit_hash_ino((u32)n->ino);
437 struct list_head *list = &audit_inode_hash[h];
438
439 if (list_empty(list))
440 continue;
441
442 list_for_each_entry_rcu(e, list, list) {
443 if ((e->rule.mask[word] & bit) == bit &&
444 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
346 rcu_read_unlock(); 445 rcu_read_unlock();
347 return state; 446 return state;
348 } 447 }
@@ -352,6 +451,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
352 return AUDIT_BUILD_CONTEXT; 451 return AUDIT_BUILD_CONTEXT;
353} 452}
354 453
454void audit_set_auditable(struct audit_context *ctx)
455{
456 ctx->auditable = 1;
457}
458
355static inline struct audit_context *audit_get_context(struct task_struct *tsk, 459static inline struct audit_context *audit_get_context(struct task_struct *tsk,
356 int return_valid, 460 int return_valid,
357 int return_code) 461 int return_code)
@@ -365,12 +469,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
365 469
366 if (context->in_syscall && !context->auditable) { 470 if (context->in_syscall && !context->auditable) {
367 enum audit_state state; 471 enum audit_state state;
472
368 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); 473 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
474 if (state == AUDIT_RECORD_CONTEXT) {
475 context->auditable = 1;
476 goto get_context;
477 }
478
479 state = audit_filter_inodes(tsk, context);
369 if (state == AUDIT_RECORD_CONTEXT) 480 if (state == AUDIT_RECORD_CONTEXT)
370 context->auditable = 1; 481 context->auditable = 1;
482
371 } 483 }
372 484
485get_context:
373 context->pid = tsk->pid; 486 context->pid = tsk->pid;
487 context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
374 context->uid = tsk->uid; 488 context->uid = tsk->uid;
375 context->gid = tsk->gid; 489 context->gid = tsk->gid;
376 context->euid = tsk->euid; 490 context->euid = tsk->euid;
@@ -413,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context)
413#endif 527#endif
414 528
415 for (i = 0; i < context->name_count; i++) { 529 for (i = 0; i < context->name_count; i++) {
416 if (context->names[i].name) 530 if (context->names[i].name && context->names[i].name_put)
417 __putname(context->names[i].name); 531 __putname(context->names[i].name);
418 } 532 }
419 context->name_count = 0; 533 context->name_count = 0;
@@ -606,7 +720,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
606 tty = "(none)"; 720 tty = "(none)";
607 audit_log_format(ab, 721 audit_log_format(ab,
608 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 722 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
609 " pid=%d auid=%u uid=%u gid=%u" 723 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
610 " euid=%u suid=%u fsuid=%u" 724 " euid=%u suid=%u fsuid=%u"
611 " egid=%u sgid=%u fsgid=%u tty=%s", 725 " egid=%u sgid=%u fsgid=%u tty=%s",
612 context->argv[0], 726 context->argv[0],
@@ -614,6 +728,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
614 context->argv[2], 728 context->argv[2],
615 context->argv[3], 729 context->argv[3],
616 context->name_count, 730 context->name_count,
731 context->ppid,
617 context->pid, 732 context->pid,
618 context->loginuid, 733 context->loginuid,
619 context->uid, 734 context->uid,
@@ -630,11 +745,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
630 continue; /* audit_panic has been called */ 745 continue; /* audit_panic has been called */
631 746
632 switch (aux->type) { 747 switch (aux->type) {
748 case AUDIT_MQ_OPEN: {
749 struct audit_aux_data_mq_open *axi = (void *)aux;
750 audit_log_format(ab,
751 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
752 "mq_msgsize=%ld mq_curmsgs=%ld",
753 axi->oflag, axi->mode, axi->attr.mq_flags,
754 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
755 axi->attr.mq_curmsgs);
756 break; }
757
758 case AUDIT_MQ_SENDRECV: {
759 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
760 audit_log_format(ab,
761 "mqdes=%d msg_len=%zd msg_prio=%u "
762 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
763 axi->mqdes, axi->msg_len, axi->msg_prio,
764 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
765 break; }
766
767 case AUDIT_MQ_NOTIFY: {
768 struct audit_aux_data_mq_notify *axi = (void *)aux;
769 audit_log_format(ab,
770 "mqdes=%d sigev_signo=%d",
771 axi->mqdes,
772 axi->notification.sigev_signo);
773 break; }
774
775 case AUDIT_MQ_GETSETATTR: {
776 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
777 audit_log_format(ab,
778 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
779 "mq_curmsgs=%ld ",
780 axi->mqdes,
781 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
782 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
783 break; }
784
633 case AUDIT_IPC: { 785 case AUDIT_IPC: {
634 struct audit_aux_data_ipcctl *axi = (void *)aux; 786 struct audit_aux_data_ipcctl *axi = (void *)aux;
635 audit_log_format(ab, 787 audit_log_format(ab,
636 " qbytes=%lx iuid=%u igid=%u mode=%x", 788 "ouid=%u ogid=%u mode=%x",
637 axi->qbytes, axi->uid, axi->gid, axi->mode); 789 axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) { 790 if (axi->osid != 0) {
639 char *ctx = NULL; 791 char *ctx = NULL;
640 u32 len; 792 u32 len;
@@ -652,19 +804,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
652 case AUDIT_IPC_SET_PERM: { 804 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux; 805 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab, 806 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", 807 "qbytes=%lx ouid=%u ogid=%u mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode); 808 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) { 809 break; }
658 char *ctx = NULL; 810
659 u32 len; 811 case AUDIT_EXECVE: {
660 if (selinux_ctxid_to_string( 812 struct audit_aux_data_execve *axi = (void *)aux;
661 axi->osid, &ctx, &len)) { 813 int i;
662 audit_log_format(ab, " osid=%u", 814 const char *p;
663 axi->osid); 815 for (i = 0, p = axi->mem; i < axi->argc; i++) {
664 call_panic = 1; 816 audit_log_format(ab, "a%d=", i);
665 } else 817 p = audit_log_untrustedstring(ab, p);
666 audit_log_format(ab, " obj=%s", ctx); 818 audit_log_format(ab, "\n");
667 kfree(ctx);
668 } 819 }
669 break; } 820 break; }
670 821
@@ -700,8 +851,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
700 } 851 }
701 } 852 }
702 for (i = 0; i < context->name_count; i++) { 853 for (i = 0; i < context->name_count; i++) {
703 unsigned long ino = context->names[i].ino; 854 struct audit_names *n = &context->names[i];
704 unsigned long pino = context->names[i].pino;
705 855
706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 856 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
707 if (!ab) 857 if (!ab)
@@ -709,33 +859,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
709 859
710 audit_log_format(ab, "item=%d", i); 860 audit_log_format(ab, "item=%d", i);
711 861
712 audit_log_format(ab, " name="); 862 if (n->name) {
713 if (context->names[i].name) 863 switch(n->name_len) {
714 audit_log_untrustedstring(ab, context->names[i].name); 864 case AUDIT_NAME_FULL:
715 else 865 /* log the full path */
716 audit_log_format(ab, "(null)"); 866 audit_log_format(ab, " name=");
717 867 audit_log_untrustedstring(ab, n->name);
718 if (pino != (unsigned long)-1) 868 break;
719 audit_log_format(ab, " parent=%lu", pino); 869 case 0:
720 if (ino != (unsigned long)-1) 870 /* name was specified as a relative path and the
721 audit_log_format(ab, " inode=%lu", ino); 871 * directory component is the cwd */
722 if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) 872 audit_log_d_path(ab, " name=", context->pwd,
723 audit_log_format(ab, " dev=%02x:%02x mode=%#o" 873 context->pwdmnt);
724 " ouid=%u ogid=%u rdev=%02x:%02x", 874 break;
725 MAJOR(context->names[i].dev), 875 default:
726 MINOR(context->names[i].dev), 876 /* log the name's directory component */
727 context->names[i].mode, 877 audit_log_format(ab, " name=");
728 context->names[i].uid, 878 audit_log_n_untrustedstring(ab, n->name_len,
729 context->names[i].gid, 879 n->name);
730 MAJOR(context->names[i].rdev), 880 }
731 MINOR(context->names[i].rdev)); 881 } else
732 if (context->names[i].osid != 0) { 882 audit_log_format(ab, " name=(null)");
883
884 if (n->ino != (unsigned long)-1) {
885 audit_log_format(ab, " inode=%lu"
886 " dev=%02x:%02x mode=%#o"
887 " ouid=%u ogid=%u rdev=%02x:%02x",
888 n->ino,
889 MAJOR(n->dev),
890 MINOR(n->dev),
891 n->mode,
892 n->uid,
893 n->gid,
894 MAJOR(n->rdev),
895 MINOR(n->rdev));
896 }
897 if (n->osid != 0) {
733 char *ctx = NULL; 898 char *ctx = NULL;
734 u32 len; 899 u32 len;
735 if (selinux_ctxid_to_string( 900 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) { 901 n->osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u", 902 audit_log_format(ab, " osid=%u", n->osid);
738 context->names[i].osid);
739 call_panic = 2; 903 call_panic = 2;
740 } else 904 } else
741 audit_log_format(ab, " obj=%s", ctx); 905 audit_log_format(ab, " obj=%s", ctx);
@@ -908,11 +1072,11 @@ void audit_syscall_exit(int valid, long return_code)
908 * Add a name to the list of audit names for this context. 1072 * Add a name to the list of audit names for this context.
909 * Called from fs/namei.c:getname(). 1073 * Called from fs/namei.c:getname().
910 */ 1074 */
911void audit_getname(const char *name) 1075void __audit_getname(const char *name)
912{ 1076{
913 struct audit_context *context = current->audit_context; 1077 struct audit_context *context = current->audit_context;
914 1078
915 if (!context || IS_ERR(name) || !name) 1079 if (IS_ERR(name) || !name)
916 return; 1080 return;
917 1081
918 if (!context->in_syscall) { 1082 if (!context->in_syscall) {
@@ -925,6 +1089,8 @@ void audit_getname(const char *name)
925 } 1089 }
926 BUG_ON(context->name_count >= AUDIT_NAMES); 1090 BUG_ON(context->name_count >= AUDIT_NAMES);
927 context->names[context->name_count].name = name; 1091 context->names[context->name_count].name = name;
1092 context->names[context->name_count].name_len = AUDIT_NAME_FULL;
1093 context->names[context->name_count].name_put = 1;
928 context->names[context->name_count].ino = (unsigned long)-1; 1094 context->names[context->name_count].ino = (unsigned long)-1;
929 ++context->name_count; 1095 ++context->name_count;
930 if (!context->pwd) { 1096 if (!context->pwd) {
@@ -991,11 +1157,10 @@ static void audit_inode_context(int idx, const struct inode *inode)
991 * audit_inode - store the inode and device from a lookup 1157 * audit_inode - store the inode and device from a lookup
992 * @name: name being audited 1158 * @name: name being audited
993 * @inode: inode being audited 1159 * @inode: inode being audited
994 * @flags: lookup flags (as used in path_lookup())
995 * 1160 *
996 * Called from fs/namei.c:path_lookup(). 1161 * Called from fs/namei.c:path_lookup().
997 */ 1162 */
998void __audit_inode(const char *name, const struct inode *inode, unsigned flags) 1163void __audit_inode(const char *name, const struct inode *inode)
999{ 1164{
1000 int idx; 1165 int idx;
1001 struct audit_context *context = current->audit_context; 1166 struct audit_context *context = current->audit_context;
@@ -1021,20 +1186,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
1021 ++context->ino_count; 1186 ++context->ino_count;
1022#endif 1187#endif
1023 } 1188 }
1189 context->names[idx].ino = inode->i_ino;
1024 context->names[idx].dev = inode->i_sb->s_dev; 1190 context->names[idx].dev = inode->i_sb->s_dev;
1025 context->names[idx].mode = inode->i_mode; 1191 context->names[idx].mode = inode->i_mode;
1026 context->names[idx].uid = inode->i_uid; 1192 context->names[idx].uid = inode->i_uid;
1027 context->names[idx].gid = inode->i_gid; 1193 context->names[idx].gid = inode->i_gid;
1028 context->names[idx].rdev = inode->i_rdev; 1194 context->names[idx].rdev = inode->i_rdev;
1029 audit_inode_context(idx, inode); 1195 audit_inode_context(idx, inode);
1030 if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
1031 (strcmp(name, ".") != 0)) {
1032 context->names[idx].ino = (unsigned long)-1;
1033 context->names[idx].pino = inode->i_ino;
1034 } else {
1035 context->names[idx].ino = inode->i_ino;
1036 context->names[idx].pino = (unsigned long)-1;
1037 }
1038} 1196}
1039 1197
1040/** 1198/**
@@ -1056,51 +1214,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
1056{ 1214{
1057 int idx; 1215 int idx;
1058 struct audit_context *context = current->audit_context; 1216 struct audit_context *context = current->audit_context;
1217 const char *found_name = NULL;
1218 int dirlen = 0;
1059 1219
1060 if (!context->in_syscall) 1220 if (!context->in_syscall)
1061 return; 1221 return;
1062 1222
1063 /* determine matching parent */ 1223 /* determine matching parent */
1064 if (dname) 1224 if (!dname)
1065 for (idx = 0; idx < context->name_count; idx++) 1225 goto update_context;
1066 if (context->names[idx].pino == pino) { 1226 for (idx = 0; idx < context->name_count; idx++)
1067 const char *n; 1227 if (context->names[idx].ino == pino) {
1068 const char *name = context->names[idx].name; 1228 const char *name = context->names[idx].name;
1069 int dlen = strlen(dname); 1229
1070 int nlen = name ? strlen(name) : 0; 1230 if (!name)
1071 1231 continue;
1072 if (nlen < dlen) 1232
1073 continue; 1233 if (audit_compare_dname_path(dname, name, &dirlen) == 0) {
1074 1234 context->names[idx].name_len = dirlen;
1075 /* disregard trailing slashes */ 1235 found_name = name;
1076 n = name + nlen - 1; 1236 break;
1077 while ((*n == '/') && (n > name))
1078 n--;
1079
1080 /* find last path component */
1081 n = n - dlen + 1;
1082 if (n < name)
1083 continue;
1084 else if (n > name) {
1085 if (*--n != '/')
1086 continue;
1087 else
1088 n++;
1089 }
1090
1091 if (strncmp(n, dname, dlen) == 0)
1092 goto update_context;
1093 } 1237 }
1238 }
1094 1239
1095 /* catch-all in case match not found */ 1240update_context:
1096 idx = context->name_count++; 1241 idx = context->name_count++;
1097 context->names[idx].name = NULL;
1098 context->names[idx].pino = pino;
1099#if AUDIT_DEBUG 1242#if AUDIT_DEBUG
1100 context->ino_count++; 1243 context->ino_count++;
1101#endif 1244#endif
1245 /* Re-use the name belonging to the slot for a matching parent directory.
1246 * All names for this context are relinquished in audit_free_names() */
1247 context->names[idx].name = found_name;
1248 context->names[idx].name_len = AUDIT_NAME_FULL;
1249 context->names[idx].name_put = 0; /* don't call __putname() */
1102 1250
1103update_context:
1104 if (inode) { 1251 if (inode) {
1105 context->names[idx].ino = inode->i_ino; 1252 context->names[idx].ino = inode->i_ino;
1106 context->names[idx].dev = inode->i_sb->s_dev; 1253 context->names[idx].dev = inode->i_sb->s_dev;
@@ -1109,7 +1256,8 @@ update_context:
1109 context->names[idx].gid = inode->i_gid; 1256 context->names[idx].gid = inode->i_gid;
1110 context->names[idx].rdev = inode->i_rdev; 1257 context->names[idx].rdev = inode->i_rdev;
1111 audit_inode_context(idx, inode); 1258 audit_inode_context(idx, inode);
1112 } 1259 } else
1260 context->names[idx].ino = (unsigned long)-1;
1113} 1261}
1114 1262
1115/** 1263/**
@@ -1142,18 +1290,23 @@ void auditsc_get_stamp(struct audit_context *ctx,
1142 */ 1290 */
1143int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1291int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1144{ 1292{
1145 if (task->audit_context) { 1293 struct audit_context *context = task->audit_context;
1146 struct audit_buffer *ab; 1294
1147 1295 if (context) {
1148 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1296 /* Only log if audit is enabled */
1149 if (ab) { 1297 if (context->in_syscall) {
1150 audit_log_format(ab, "login pid=%d uid=%u " 1298 struct audit_buffer *ab;
1151 "old auid=%u new auid=%u", 1299
1152 task->pid, task->uid, 1300 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1153 task->audit_context->loginuid, loginuid); 1301 if (ab) {
1154 audit_log_end(ab); 1302 audit_log_format(ab, "login pid=%d uid=%u "
1303 "old auid=%u new auid=%u",
1304 task->pid, task->uid,
1305 context->loginuid, loginuid);
1306 audit_log_end(ab);
1307 }
1155 } 1308 }
1156 task->audit_context->loginuid = loginuid; 1309 context->loginuid = loginuid;
1157 } 1310 }
1158 return 0; 1311 return 0;
1159} 1312}
@@ -1170,16 +1323,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1170} 1323}
1171 1324
1172/** 1325/**
1173 * audit_ipc_obj - record audit data for ipc object 1326 * __audit_mq_open - record audit data for a POSIX MQ open
1174 * @ipcp: ipc permissions 1327 * @oflag: open flag
1328 * @mode: mode bits
1329 * @u_attr: queue attributes
1175 * 1330 *
1176 * Returns 0 for success or NULL context or < 0 on error. 1331 * Returns 0 for success or NULL context or < 0 on error.
1177 */ 1332 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp) 1333int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
1179{ 1334{
1180 struct audit_aux_data_ipcctl *ax; 1335 struct audit_aux_data_mq_open *ax;
1336 struct audit_context *context = current->audit_context;
1337
1338 if (!audit_enabled)
1339 return 0;
1340
1341 if (likely(!context))
1342 return 0;
1343
1344 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1345 if (!ax)
1346 return -ENOMEM;
1347
1348 if (u_attr != NULL) {
1349 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
1350 kfree(ax);
1351 return -EFAULT;
1352 }
1353 } else
1354 memset(&ax->attr, 0, sizeof(ax->attr));
1355
1356 ax->oflag = oflag;
1357 ax->mode = mode;
1358
1359 ax->d.type = AUDIT_MQ_OPEN;
1360 ax->d.next = context->aux;
1361 context->aux = (void *)ax;
1362 return 0;
1363}
1364
1365/**
1366 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
1367 * @mqdes: MQ descriptor
1368 * @msg_len: Message length
1369 * @msg_prio: Message priority
1370 * @abs_timeout: Message timeout in absolute time
1371 *
1372 * Returns 0 for success or NULL context or < 0 on error.
1373 */
1374int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
1375 const struct timespec __user *u_abs_timeout)
1376{
1377 struct audit_aux_data_mq_sendrecv *ax;
1378 struct audit_context *context = current->audit_context;
1379
1380 if (!audit_enabled)
1381 return 0;
1382
1383 if (likely(!context))
1384 return 0;
1385
1386 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1387 if (!ax)
1388 return -ENOMEM;
1389
1390 if (u_abs_timeout != NULL) {
1391 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1392 kfree(ax);
1393 return -EFAULT;
1394 }
1395 } else
1396 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1397
1398 ax->mqdes = mqdes;
1399 ax->msg_len = msg_len;
1400 ax->msg_prio = msg_prio;
1401
1402 ax->d.type = AUDIT_MQ_SENDRECV;
1403 ax->d.next = context->aux;
1404 context->aux = (void *)ax;
1405 return 0;
1406}
1407
1408/**
1409 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
1410 * @mqdes: MQ descriptor
1411 * @msg_len: Message length
1412 * @msg_prio: Message priority
1413 * @abs_timeout: Message timeout in absolute time
1414 *
1415 * Returns 0 for success or NULL context or < 0 on error.
1416 */
1417int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
1418 unsigned int __user *u_msg_prio,
1419 const struct timespec __user *u_abs_timeout)
1420{
1421 struct audit_aux_data_mq_sendrecv *ax;
1422 struct audit_context *context = current->audit_context;
1423
1424 if (!audit_enabled)
1425 return 0;
1426
1427 if (likely(!context))
1428 return 0;
1429
1430 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1431 if (!ax)
1432 return -ENOMEM;
1433
1434 if (u_msg_prio != NULL) {
1435 if (get_user(ax->msg_prio, u_msg_prio)) {
1436 kfree(ax);
1437 return -EFAULT;
1438 }
1439 } else
1440 ax->msg_prio = 0;
1441
1442 if (u_abs_timeout != NULL) {
1443 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1444 kfree(ax);
1445 return -EFAULT;
1446 }
1447 } else
1448 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1449
1450 ax->mqdes = mqdes;
1451 ax->msg_len = msg_len;
1452
1453 ax->d.type = AUDIT_MQ_SENDRECV;
1454 ax->d.next = context->aux;
1455 context->aux = (void *)ax;
1456 return 0;
1457}
1458
1459/**
1460 * __audit_mq_notify - record audit data for a POSIX MQ notify
1461 * @mqdes: MQ descriptor
1462 * @u_notification: Notification event
1463 *
1464 * Returns 0 for success or NULL context or < 0 on error.
1465 */
1466
1467int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
1468{
1469 struct audit_aux_data_mq_notify *ax;
1470 struct audit_context *context = current->audit_context;
1471
1472 if (!audit_enabled)
1473 return 0;
1474
1475 if (likely(!context))
1476 return 0;
1477
1478 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1479 if (!ax)
1480 return -ENOMEM;
1481
1482 if (u_notification != NULL) {
1483 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
1484 kfree(ax);
1485 return -EFAULT;
1486 }
1487 } else
1488 memset(&ax->notification, 0, sizeof(ax->notification));
1489
1490 ax->mqdes = mqdes;
1491
1492 ax->d.type = AUDIT_MQ_NOTIFY;
1493 ax->d.next = context->aux;
1494 context->aux = (void *)ax;
1495 return 0;
1496}
1497
1498/**
1499 * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
1500 * @mqdes: MQ descriptor
1501 * @mqstat: MQ flags
1502 *
1503 * Returns 0 for success or NULL context or < 0 on error.
1504 */
1505int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
1506{
1507 struct audit_aux_data_mq_getsetattr *ax;
1181 struct audit_context *context = current->audit_context; 1508 struct audit_context *context = current->audit_context;
1182 1509
1510 if (!audit_enabled)
1511 return 0;
1512
1183 if (likely(!context)) 1513 if (likely(!context))
1184 return 0; 1514 return 0;
1185 1515
@@ -1187,6 +1517,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1187 if (!ax) 1517 if (!ax)
1188 return -ENOMEM; 1518 return -ENOMEM;
1189 1519
1520 ax->mqdes = mqdes;
1521 ax->mqstat = *mqstat;
1522
1523 ax->d.type = AUDIT_MQ_GETSETATTR;
1524 ax->d.next = context->aux;
1525 context->aux = (void *)ax;
1526 return 0;
1527}
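
These hooks fire from the POSIX message queue syscalls, so every field logged above can be exercised from userspace. A short program driving mq_timedsend/mq_timedreceive/mq_getattr (link with -lrt; the queue name is arbitrary):

#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct mq_attr attr = { .mq_maxmsg = 4, .mq_msgsize = 64 };
	struct timespec to;
	char buf[64];
	unsigned prio;

	mqd_t q = mq_open("/audit_demo", O_CREAT | O_RDWR, 0600, &attr);
	if (q == (mqd_t)-1) { perror("mq_open"); return 1; }

	clock_gettime(CLOCK_REALTIME, &to);
	to.tv_sec += 5;                               /* absolute timeout, as audited */

	mq_timedsend(q, "hello", 5, 7, &to);          /* -> AUDIT_MQ_SENDRECV record */
	mq_timedreceive(q, buf, sizeof(buf), &prio, &to);
	mq_getattr(q, &attr);                         /* -> AUDIT_MQ_GETSETATTR record */
	printf("prio=%u curmsgs=%ld\n", prio, attr.mq_curmsgs);

	mq_close(q);
	mq_unlink("/audit_demo");
	return 0;
}
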
1528
1529/**
1530 * audit_ipc_obj - record audit data for ipc object
1531 * @ipcp: ipc permissions
1532 *
1533 * Returns 0 for success or NULL context or < 0 on error.
1534 */
1535int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
1536{
1537 struct audit_aux_data_ipcctl *ax;
1538 struct audit_context *context = current->audit_context;
1539
1540 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1541 if (!ax)
1542 return -ENOMEM;
1543
1190 ax->uid = ipcp->uid; 1544 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid; 1545 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode; 1546 ax->mode = ipcp->mode;
@@ -1204,17 +1558,15 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1204 * @uid: msgq user id 1558 * @uid: msgq user id
1205 * @gid: msgq group id 1559 * @gid: msgq group id
1206 * @mode: msgq mode (permissions) 1560 * @mode: msgq mode (permissions)
1561 * @ipcp: in-kernel IPC permissions
1207 * 1562 *
1208 * Returns 0 for success or NULL context or < 0 on error. 1563 * Returns 0 for success or NULL context or < 0 on error.
1209 */ 1564 */
1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) 1565int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1211{ 1566{
1212 struct audit_aux_data_ipcctl *ax; 1567 struct audit_aux_data_ipcctl *ax;
1213 struct audit_context *context = current->audit_context; 1568 struct audit_context *context = current->audit_context;
1214 1569
1215 if (likely(!context))
1216 return 0;
1217
1218 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 1570 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1219 if (!ax) 1571 if (!ax)
1220 return -ENOMEM; 1572 return -ENOMEM;
@@ -1223,7 +1575,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1223 ax->uid = uid; 1575 ax->uid = uid;
1224 ax->gid = gid; 1576 ax->gid = gid;
1225 ax->mode = mode; 1577 ax->mode = mode;
1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1227 1578
1228 ax->d.type = AUDIT_IPC_SET_PERM; 1579 ax->d.type = AUDIT_IPC_SET_PERM;
1229 ax->d.next = context->aux; 1580 ax->d.next = context->aux;
@@ -1231,6 +1582,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1231 return 0; 1582 return 0;
1232} 1583}
1233 1584
1585int audit_bprm(struct linux_binprm *bprm)
1586{
1587 struct audit_aux_data_execve *ax;
1588 struct audit_context *context = current->audit_context;
1589 unsigned long p, next;
1590 void *to;
1591
1592 if (likely(!audit_enabled || !context))
1593 return 0;
1594
1595 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
1596 GFP_KERNEL);
1597 if (!ax)
1598 return -ENOMEM;
1599
1600 ax->argc = bprm->argc;
1601 ax->envc = bprm->envc;
1602 for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
1603 struct page *page = bprm->page[p / PAGE_SIZE];
1604 void *kaddr = kmap(page);
1605 next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1606 memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
1607 to += next - p;
1608 kunmap(page);
1609 }
1610
1611 ax->d.type = AUDIT_EXECVE;
1612 ax->d.next = context->aux;
1613 context->aux = (void *)ax;
1614 return 0;
1615}
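
audit_bprm() snapshots the argument block that execve copied into the bprm pages, from bprm->p up to the top of MAX_ARG_PAGES, so the AUDIT_EXECVE record can later log each a<N>= value. The same NUL-packed argv layout is visible from userspace in /proc/self/cmdline:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/cmdline", "r");
	char buf[4096];
	size_t n;
	int argn = 0;

	if (!f)
		return 1;
	n = fread(buf, 1, sizeof(buf), f);

	/* arguments are packed back to back, each terminated by '\0' */
	for (size_t i = 0; i < n; argn++) {
		printf("a%d=%s\n", argn, &buf[i]);
		while (i < n && buf[i] != '\0')
			i++;
		i++; /* skip the NUL */
	}
	fclose(f);
	return 0;
}
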
1616
1617
1234/** 1618/**
1235 * audit_socketcall - record audit data for sys_socketcall 1619 * audit_socketcall - record audit data for sys_socketcall
1236 * @nargs: number of args 1620 * @nargs: number of args
@@ -1325,19 +1709,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1325 * If the audit subsystem is being terminated, record the task (pid) 1709 * If the audit subsystem is being terminated, record the task (pid)
1326 * and uid that is doing that. 1710 * and uid that is doing that.
1327 */ 1711 */
1328void audit_signal_info(int sig, struct task_struct *t) 1712void __audit_signal_info(int sig, struct task_struct *t)
1329{ 1713{
1330 extern pid_t audit_sig_pid; 1714 extern pid_t audit_sig_pid;
1331 extern uid_t audit_sig_uid; 1715 extern uid_t audit_sig_uid;
1332 1716 extern u32 audit_sig_sid;
1333 if (unlikely(audit_pid && t->tgid == audit_pid)) { 1717
1334 if (sig == SIGTERM || sig == SIGHUP) { 1718 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
1335 struct audit_context *ctx = current->audit_context; 1719 struct task_struct *tsk = current;
1336 audit_sig_pid = current->pid; 1720 struct audit_context *ctx = tsk->audit_context;
1337 if (ctx) 1721 audit_sig_pid = tsk->pid;
1338 audit_sig_uid = ctx->loginuid; 1722 if (ctx)
1339 else 1723 audit_sig_uid = ctx->loginuid;
1340 audit_sig_uid = current->uid; 1724 else
1341 } 1725 audit_sig_uid = tsk->uid;
1726 selinux_get_task_sid(tsk, &audit_sig_sid);
1342 } 1727 }
1343} 1728}
diff --git a/kernel/compat.c b/kernel/compat.c
index c1601a84f8d8..126dee9530aa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
@@ -729,17 +730,10 @@ void
729sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 730sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
730{ 731{
731 switch (_NSIG_WORDS) { 732 switch (_NSIG_WORDS) {
732#if defined (__COMPAT_ENDIAN_SWAP__)
733 case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
734 case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
735 case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
736 case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
737#else
738 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 733 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
739 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); 734 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
740 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); 735 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
741 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 736 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
742#endif
743 } 737 }
744} 738}
745 739
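
With the __COMPAT_ENDIAN_SWAP__ branch gone, every 64-bit signal word is composed the same way: compat word 2k supplies the low half and word 2k+1 the high half. A worked example (bit 8 is signal 9, bit 32 is signal 33):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t compat_sig[2] = { 0x00000100, 0x00000001 }; /* bit 8 low, bit 32 high */
	uint64_t word = compat_sig[0] | ((uint64_t)compat_sig[1] << 32);

	printf("sig[0] = %#018llx\n", (unsigned long long)word);
	return 0;
}
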
@@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
934 928
935 return ret; 929 return ret;
936} 930}
931
932#ifdef CONFIG_NUMA
933asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
934 compat_uptr_t __user *pages32,
935 const int __user *nodes,
936 int __user *status,
937 int flags)
938{
939 const void __user * __user *pages;
940 int i;
941
942 pages = compat_alloc_user_space(nr_pages * sizeof(void *));
943 for (i = 0; i < nr_pages; i++) {
944 compat_uptr_t p;
945
946 if (get_user(p, pages32 + i) ||
947 put_user(compat_ptr(p), pages + i))
948 return -EFAULT;
949 }
950 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
951}
952#endif
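
compat_sys_move_pages() widens each 32-bit user pointer into a native-width array on compat user space (compat_alloc_user_space) before chaining to sys_move_pages(), with compat_ptr() doing the per-entry conversion. The widening step itself, as a plain C illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t pages32[3] = { 0x1000, 0x2000, 0x3000 }; /* 32-bit user addresses */
	void *pages[3];

	for (int i = 0; i < 3; i++)
		pages[i] = (void *)(uintptr_t)pages32[i];  /* cf. compat_ptr(p) */

	for (int i = 0; i < 3; i++)
		printf("pages[%d] = %p\n", i, pages[i]);
	return 0;
}
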
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe2b8d0bfe4c..03dcd981846a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,10 +13,10 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <asm/semaphore.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpucontrol);
20 20
21static BLOCKING_NOTIFIER_HEAD(cpu_chain); 21static BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible)
30 30
31 if (lock_cpu_hotplug_owner != current) { 31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible) 32 if (interruptible)
33 ret = down_interruptible(&cpucontrol); 33 ret = mutex_lock_interruptible(&cpucontrol);
34 else 34 else
35 down(&cpucontrol); 35 mutex_lock(&cpucontrol);
36 } 36 }
37 37
38 /* 38 /*
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void)
56{ 56{
57 if (--lock_cpu_hotplug_depth == 0) { 57 if (--lock_cpu_hotplug_depth == 0) {
58 lock_cpu_hotplug_owner = NULL; 58 lock_cpu_hotplug_owner = NULL;
59 up(&cpucontrol); 59 mutex_unlock(&cpucontrol);
60 } 60 }
61} 61}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
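
The cpucontrol conversion from a semaphore to a mutex keeps the surrounding owner/depth bookkeeping (only partly shown in these hunks), which lets the owning task re-enter lock_cpu_hotplug() without deadlocking; with a bare mutex that logic has to live outside the lock primitive. A pthread sketch of the idiom, unlocked owner reads and all, mirroring the kernel's own pattern:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpucontrol = PTHREAD_MUTEX_INITIALIZER;
static pthread_t owner;
static int depth;

static void lock_hotplug(void)
{
	/* only take the mutex if we are not already the owner */
	if (!depth || !pthread_equal(owner, pthread_self()))
		pthread_mutex_lock(&cpucontrol);
	owner = pthread_self();
	depth++;
}

static void unlock_hotplug(void)
{
	if (--depth == 0)
		pthread_mutex_unlock(&cpucontrol);
}

int main(void)
{
	lock_hotplug();
	lock_hotplug();    /* recursive re-entry by the owner is tolerated */
	unlock_hotplug();
	unlock_hotplug();
	printf("depth=%d\n", depth);
	return 0;
}
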
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ab81fdd4572b..1535af3a912d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -41,6 +41,7 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/security.h>
44#include <linux/slab.h> 45#include <linux/slab.h>
45#include <linux/smp_lock.h> 46#include <linux/smp_lock.h>
46#include <linux/spinlock.h> 47#include <linux/spinlock.h>
@@ -392,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
392 return 0; 393 return 0;
393} 394}
394 395
395static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, 396static int cpuset_get_sb(struct file_system_type *fs_type,
396 int flags, const char *unused_dev_name, 397 int flags, const char *unused_dev_name,
397 void *data) 398 void *data, struct vfsmount *mnt)
398{ 399{
399 return get_sb_single(fs_type, flags, data, cpuset_fill_super); 400 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
400} 401}
401 402
402static struct file_system_type cpuset_fs_type = { 403static struct file_system_type cpuset_fs_type = {
@@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1177 cpumask_t cpus; 1178 cpumask_t cpus;
1178 nodemask_t from, to; 1179 nodemask_t from, to;
1179 struct mm_struct *mm; 1180 struct mm_struct *mm;
1181 int retval;
1180 1182
1181 if (sscanf(pidbuf, "%d", &pid) != 1) 1183 if (sscanf(pidbuf, "%d", &pid) != 1)
1182 return -EIO; 1184 return -EIO;
@@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1205 get_task_struct(tsk); 1207 get_task_struct(tsk);
1206 } 1208 }
1207 1209
1210 retval = security_task_setscheduler(tsk, 0, NULL);
1211 if (retval) {
1212 put_task_struct(tsk);
1213 return retval;
1214 }
1215
1208 mutex_lock(&callback_mutex); 1216 mutex_lock(&callback_mutex);
1209 1217
1210 task_lock(tsk); 1218 task_lock(tsk);
@@ -2434,31 +2442,43 @@ void __cpuset_memory_pressure_bump(void)
2434 */ 2442 */
2435static int proc_cpuset_show(struct seq_file *m, void *v) 2443static int proc_cpuset_show(struct seq_file *m, void *v)
2436{ 2444{
2445 struct pid *pid;
2437 struct task_struct *tsk; 2446 struct task_struct *tsk;
2438 char *buf; 2447 char *buf;
2439 int retval = 0; 2448 int retval;
2440 2449
2450 retval = -ENOMEM;
2441 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2451 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2442 if (!buf) 2452 if (!buf)
2443 return -ENOMEM; 2453 goto out;
2444 2454
2445 tsk = m->private; 2455 retval = -ESRCH;
2456 pid = m->private;
2457 tsk = get_pid_task(pid, PIDTYPE_PID);
2458 if (!tsk)
2459 goto out_free;
2460
2461 retval = -EINVAL;
2446 mutex_lock(&manage_mutex); 2462 mutex_lock(&manage_mutex);
2463
2447 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2464 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2448 if (retval < 0) 2465 if (retval < 0)
2449 goto out; 2466 goto out_unlock;
2450 seq_puts(m, buf); 2467 seq_puts(m, buf);
2451 seq_putc(m, '\n'); 2468 seq_putc(m, '\n');
2452out: 2469out_unlock:
2453 mutex_unlock(&manage_mutex); 2470 mutex_unlock(&manage_mutex);
2471 put_task_struct(tsk);
2472out_free:
2454 kfree(buf); 2473 kfree(buf);
2474out:
2455 return retval; 2475 return retval;
2456} 2476}
2457 2477
2458static int cpuset_open(struct inode *inode, struct file *file) 2478static int cpuset_open(struct inode *inode, struct file *file)
2459{ 2479{
2460 struct task_struct *tsk = PROC_I(inode)->task; 2480 struct pid *pid = PROC_I(inode)->pid;
2461 return single_open(file, proc_cpuset_show, tsk); 2481 return single_open(file, proc_cpuset_show, pid);
2462} 2482}
2463 2483
2464struct file_operations proc_cpuset_operations = { 2484struct file_operations proc_cpuset_operations = {
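The proc_cpuset_show() rework swaps the cached task_struct for a struct pid: the pid is resolved to a task only while the file is actually being read, and the temporary task reference is dropped on the way out. A sketch of that pattern for a hypothetical single_open()-based /proc file:

static int example_show(struct seq_file *m, void *v)
{
	struct pid *pid = m->private;
	struct task_struct *tsk;

	tsk = get_pid_task(pid, PIDTYPE_PID);	/* fails if the task exited */
	if (!tsk)
		return -ESRCH;

	seq_printf(m, "%d\n", tsk->pid);
	put_task_struct(tsk);			/* drop the temporary reference */
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, PROC_I(inode)->pid);
}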
diff --git a/kernel/exit.c b/kernel/exit.c
index e95b93282210..304ef637be6c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -36,6 +36,7 @@
36#include <linux/compat.h> 36#include <linux/compat.h>
37#include <linux/pipe_fs_i.h> 37#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */ 38#include <linux/audit.h> /* for audit_free() */
39#include <linux/resource.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/unistd.h> 42#include <asm/unistd.h>
@@ -45,8 +46,6 @@
45extern void sem_exit (void); 46extern void sem_exit (void);
46extern struct task_struct *child_reaper; 47extern struct task_struct *child_reaper;
47 48
48int getrusage(struct task_struct *, int, struct rusage __user *);
49
50static void exit_mm(struct task_struct * tsk); 49static void exit_mm(struct task_struct * tsk);
51 50
52static void __unhash_process(struct task_struct *p) 51static void __unhash_process(struct task_struct *p)
@@ -138,12 +137,8 @@ void release_task(struct task_struct * p)
138{ 137{
139 int zap_leader; 138 int zap_leader;
140 task_t *leader; 139 task_t *leader;
141 struct dentry *proc_dentry;
142
143repeat: 140repeat:
144 atomic_dec(&p->user->processes); 141 atomic_dec(&p->user->processes);
145 spin_lock(&p->proc_lock);
146 proc_dentry = proc_pid_unhash(p);
147 write_lock_irq(&tasklist_lock); 142 write_lock_irq(&tasklist_lock);
148 ptrace_unlink(p); 143 ptrace_unlink(p);
149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 144 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -172,8 +167,7 @@ repeat:
172 167
173 sched_exit(p); 168 sched_exit(p);
174 write_unlock_irq(&tasklist_lock); 169 write_unlock_irq(&tasklist_lock);
175 spin_unlock(&p->proc_lock); 170 proc_flush_task(p);
176 proc_pid_flush(proc_dentry);
177 release_thread(p); 171 release_thread(p);
178 call_rcu(&p->rcu, delayed_put_task_struct); 172 call_rcu(&p->rcu, delayed_put_task_struct);
179 173
@@ -579,7 +573,7 @@ static void exit_mm(struct task_struct * tsk)
579 down_read(&mm->mmap_sem); 573 down_read(&mm->mmap_sem);
580 } 574 }
581 atomic_inc(&mm->mm_count); 575 atomic_inc(&mm->mm_count);
582 if (mm != tsk->active_mm) BUG(); 576 BUG_ON(mm != tsk->active_mm);
583 /* more a memory barrier than a real lock */ 577 /* more a memory barrier than a real lock */
584 task_lock(tsk); 578 task_lock(tsk);
585 tsk->mm = NULL; 579 tsk->mm = NULL;
@@ -881,14 +875,6 @@ fastcall NORET_TYPE void do_exit(long code)
881 875
882 tsk->flags |= PF_EXITING; 876 tsk->flags |= PF_EXITING;
883 877
884 /*
885 * Make sure we don't try to process any timer firings
886 * while we are already exiting.
887 */
888 tsk->it_virt_expires = cputime_zero;
889 tsk->it_prof_expires = cputime_zero;
890 tsk->it_sched_expires = 0;
891
892 if (unlikely(in_atomic())) 878 if (unlikely(in_atomic()))
893 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 879 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
894 current->comm, current->pid, 880 current->comm, current->pid,
@@ -903,11 +889,11 @@ fastcall NORET_TYPE void do_exit(long code)
903 if (group_dead) { 889 if (group_dead) {
904 hrtimer_cancel(&tsk->signal->real_timer); 890 hrtimer_cancel(&tsk->signal->real_timer);
905 exit_itimers(tsk->signal); 891 exit_itimers(tsk->signal);
906 acct_process(code);
907 } 892 }
893 acct_collect(code, group_dead);
908 if (unlikely(tsk->robust_list)) 894 if (unlikely(tsk->robust_list))
909 exit_robust_list(tsk); 895 exit_robust_list(tsk);
910#ifdef CONFIG_COMPAT 896#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
911 if (unlikely(tsk->compat_robust_list)) 897 if (unlikely(tsk->compat_robust_list))
912 compat_exit_robust_list(tsk); 898 compat_exit_robust_list(tsk);
913#endif 899#endif
@@ -915,6 +901,8 @@ fastcall NORET_TYPE void do_exit(long code)
915 audit_free(tsk); 901 audit_free(tsk);
916 exit_mm(tsk); 902 exit_mm(tsk);
917 903
904 if (group_dead)
905 acct_process();
918 exit_sem(tsk); 906 exit_sem(tsk);
919 __exit_files(tsk); 907 __exit_files(tsk);
920 __exit_fs(tsk); 908 __exit_fs(tsk);
@@ -1538,8 +1526,7 @@ check_continued:
1538 if (options & __WNOTHREAD) 1526 if (options & __WNOTHREAD)
1539 break; 1527 break;
1540 tsk = next_thread(tsk); 1528 tsk = next_thread(tsk);
1541 if (tsk->signal != current->signal) 1529 BUG_ON(tsk->signal != current->signal);
1542 BUG();
1543 } while (tsk != current); 1530 } while (tsk != current);
1544 1531
1545 read_unlock(&tasklist_lock); 1532 read_unlock(&tasklist_lock);
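Two of the exit.c hunks replace open-coded 'if (cond) BUG();' with BUG_ON(cond), which reads as an assertion and keeps the invariant on one line:

	/* before */
	if (mm != tsk->active_mm)
		BUG();

	/* after */
	BUG_ON(mm != tsk->active_mm);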
diff --git a/kernel/fork.c b/kernel/fork.c
index ac8100e3088a..9b4e54ef0225 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm)
368 */ 368 */
369void mmput(struct mm_struct *mm) 369void mmput(struct mm_struct *mm)
370{ 370{
371 might_sleep();
372
371 if (atomic_dec_and_test(&mm->mm_users)) { 373 if (atomic_dec_and_test(&mm->mm_users)) {
372 exit_aio(mm); 374 exit_aio(mm);
373 exit_mmap(mm); 375 exit_mmap(mm);
@@ -623,6 +625,7 @@ out:
623/* 625/*
624 * Allocate a new files structure and copy contents from the 626 * Allocate a new files structure and copy contents from the
625 * passed in files structure. 627 * passed in files structure.
628 * errorp will be valid only when the returned files_struct is NULL.
626 */ 629 */
627static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) 630static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
628{ 631{
@@ -631,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
631 int open_files, size, i, expand; 634 int open_files, size, i, expand;
632 struct fdtable *old_fdt, *new_fdt; 635 struct fdtable *old_fdt, *new_fdt;
633 636
637 *errorp = -ENOMEM;
634 newf = alloc_files(); 638 newf = alloc_files();
635 if (!newf) 639 if (!newf)
636 goto out; 640 goto out;
@@ -744,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
744 * break this. 748 * break this.
745 */ 749 */
746 tsk->files = NULL; 750 tsk->files = NULL;
747 error = -ENOMEM;
748 newf = dup_fd(oldf, &error); 751 newf = dup_fd(oldf, &error);
749 if (!newf) 752 if (!newf)
750 goto out; 753 goto out;
@@ -871,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
871 tsk->it_prof_expires = 874 tsk->it_prof_expires =
872 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 875 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
873 } 876 }
877 acct_init_pacct(&sig->pacct);
874 878
875 return 0; 879 return 0;
876} 880}
@@ -989,13 +993,10 @@ static task_t *copy_process(unsigned long clone_flags,
989 if (put_user(p->pid, parent_tidptr)) 993 if (put_user(p->pid, parent_tidptr))
990 goto bad_fork_cleanup; 994 goto bad_fork_cleanup;
991 995
992 p->proc_dentry = NULL;
993
994 INIT_LIST_HEAD(&p->children); 996 INIT_LIST_HEAD(&p->children);
995 INIT_LIST_HEAD(&p->sibling); 997 INIT_LIST_HEAD(&p->sibling);
996 p->vfork_done = NULL; 998 p->vfork_done = NULL;
997 spin_lock_init(&p->alloc_lock); 999 spin_lock_init(&p->alloc_lock);
998 spin_lock_init(&p->proc_lock);
999 1000
1000 clear_tsk_thread_flag(p, TIF_SIGPENDING); 1001 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1001 init_sigpending(&p->pending); 1002 init_sigpending(&p->pending);
@@ -1155,18 +1156,6 @@ static task_t *copy_process(unsigned long clone_flags,
1155 } 1156 }
1156 1157
1157 if (clone_flags & CLONE_THREAD) { 1158 if (clone_flags & CLONE_THREAD) {
1158 /*
1159 * Important: if an exit-all has been started then
1160 * do not create this new thread - the whole thread
1161 * group is supposed to exit anyway.
1162 */
1163 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1164 spin_unlock(&current->sighand->siglock);
1165 write_unlock_irq(&tasklist_lock);
1166 retval = -EAGAIN;
1167 goto bad_fork_cleanup_namespace;
1168 }
1169
1170 p->group_leader = current->group_leader; 1159 p->group_leader = current->group_leader;
1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1160 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1172 1161
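With the dup_fd() change, the callee now initializes *errorp to -ENOMEM itself, matching the new comment that errorp is only meaningful when NULL comes back. The call site in copy_files() then reduces to:

	struct files_struct *newf;
	int error;

	newf = dup_fd(oldf, &error);	/* dup_fd() fills in error on failure */
	if (!newf)
		goto out;		/* 'error' already holds the errno */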
diff --git a/kernel/futex.c b/kernel/futex.c
index 5699c512057b..e1a380c77a5a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
1056 (unsigned long)uaddr2, val2, val3); 1056 (unsigned long)uaddr2, val2, val3);
1057} 1057}
1058 1058
1059static struct super_block * 1059static int futexfs_get_sb(struct file_system_type *fs_type,
1060futexfs_get_sb(struct file_system_type *fs_type, 1060 int flags, const char *dev_name, void *data,
1061 int flags, const char *dev_name, void *data) 1061 struct vfsmount *mnt)
1062{ 1062{
1063 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); 1063 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
1064} 1064}
1065 1065
1066static struct file_system_type futex_fs_type = { 1066static struct file_system_type futex_fs_type = {
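The cpuset and futex hunks both adapt to the reworked get_sb() interface: the callback now returns an int and attaches the superblock to a caller-supplied vfsmount instead of returning a super_block pointer. A minimal sketch (the examplefs name and magic number are hypothetical):

static int examplefs_get_sb(struct file_system_type *fs_type,
			    int flags, const char *dev_name, void *data,
			    struct vfsmount *mnt)
{
	/* the helper sets up the sb, attaches it to *mnt, returns 0 or -errno */
	return get_sb_pseudo(fs_type, "examplefs:", NULL, 0x12345678, mnt);
}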
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 01fa2ae98a85..55601b3ce60e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
98 98
99/** 99/**
100 * ktime_get_ts - get the monotonic clock in timespec format 100 * ktime_get_ts - get the monotonic clock in timespec format
101 *
102 * @ts: pointer to timespec variable 101 * @ts: pointer to timespec variable
103 * 102 *
104 * The function calculates the monotonic clock from the realtime 103 * The function calculates the monotonic clock from the realtime
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
238# ifndef CONFIG_KTIME_SCALAR 237# ifndef CONFIG_KTIME_SCALAR
239/** 238/**
240 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable 239 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
241 *
242 * @kt: addend 240 * @kt: addend
243 * @nsec: the scalar nsec value to add 241 * @nsec: the scalar nsec value to add
244 * 242 *
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
299 297
300/** 298/**
301 * hrtimer_forward - forward the timer expiry 299 * hrtimer_forward - forward the timer expiry
302 *
303 * @timer: hrtimer to forward 300 * @timer: hrtimer to forward
304 * @now: forward past this time 301 * @now: forward past this time
305 * @interval: the interval to forward 302 * @interval: the interval to forward
@@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
393 if (base->first == &timer->node) 390 if (base->first == &timer->node)
394 base->first = rb_next(&timer->node); 391 base->first = rb_next(&timer->node);
395 rb_erase(&timer->node, &base->active); 392 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE; 393 rb_set_parent(&timer->node, &timer->node);
397} 394}
398 395
399/* 396/*
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
411 408
412/** 409/**
413 * hrtimer_start - (re)start a relative timer on the current CPU 410 * hrtimer_start - (re)start a relative timer on the current CPU
414 *
415 * @timer: the timer to be added 411 * @timer: the timer to be added
416 * @tim: expiry time 412 * @tim: expiry time
417 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 413 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
460 456
461/** 457/**
462 * hrtimer_try_to_cancel - try to deactivate a timer 458 * hrtimer_try_to_cancel - try to deactivate a timer
463 *
464 * @timer: hrtimer to stop 459 * @timer: hrtimer to stop
465 * 460 *
466 * Returns: 461 * Returns:
467 * 0 when the timer was not active 462 * 0 when the timer was not active
468 * 1 when the timer was active 463 * 1 when the timer was active
469 * -1 when the timer is currently executing the callback function and 464 * -1 when the timer is currently executing the callback function and
470 * can not be stopped 465 * cannot be stopped
471 */ 466 */
472int hrtimer_try_to_cancel(struct hrtimer *timer) 467int hrtimer_try_to_cancel(struct hrtimer *timer)
473{ 468{
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
489 484
490/** 485/**
491 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 486 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
492 *
493 * @timer: the timer to be cancelled 487 * @timer: the timer to be cancelled
494 * 488 *
495 * Returns: 489 * Returns:
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
510 504
511/** 505/**
512 * hrtimer_get_remaining - get remaining time for the timer 506 * hrtimer_get_remaining - get remaining time for the timer
513 *
514 * @timer: the timer to read 507 * @timer: the timer to read
515 */ 508 */
516ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 509ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
564 557
565/** 558/**
566 * hrtimer_init - initialize a timer to the given clock 559 * hrtimer_init - initialize a timer to the given clock
567 *
568 * @timer: the timer to be initialized 560 * @timer: the timer to be initialized
569 * @clock_id: the clock to be used 561 * @clock_id: the clock to be used
570 * @mode: timer mode abs/rel 562 * @mode: timer mode abs/rel
@@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
576 568
577 memset(timer, 0, sizeof(struct hrtimer)); 569 memset(timer, 0, sizeof(struct hrtimer));
578 570
579 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 571 bases = __raw_get_cpu_var(hrtimer_bases);
580 572
581 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
582 clock_id = CLOCK_MONOTONIC; 574 clock_id = CLOCK_MONOTONIC;
583 575
584 timer->base = &bases[clock_id]; 576 timer->base = &bases[clock_id];
585 timer->node.rb_parent = HRTIMER_INACTIVE; 577 rb_set_parent(&timer->node, &timer->node);
586} 578}
587EXPORT_SYMBOL_GPL(hrtimer_init); 579EXPORT_SYMBOL_GPL(hrtimer_init);
588 580
589/** 581/**
590 * hrtimer_get_res - get the timer resolution for a clock 582 * hrtimer_get_res - get the timer resolution for a clock
591 *
592 * @which_clock: which clock to query 583 * @which_clock: which clock to query
593 * @tp: pointer to timespec variable to store the resolution 584 * @tp: pointer to timespec variable to store the resolution
594 * 585 *
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
599{ 590{
600 struct hrtimer_base *bases; 591 struct hrtimer_base *bases;
601 592
602 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 593 bases = __raw_get_cpu_var(hrtimer_bases);
603 *tp = ktime_to_timespec(bases[which_clock].resolution); 594 *tp = ktime_to_timespec(bases[which_clock].resolution);
604 595
605 return 0; 596 return 0;
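Instead of the private HRTIMER_INACTIVE marker, the hrtimer hunks adopt the rbtree convention that a detached node is its own parent (rb_set_parent(&timer->node, &timer->node)). A sketch of how 'not queued' can then be tested without extra state (the helper name is hypothetical):

/* a node whose parent pointer is itself is not linked into any rbtree */
static inline int example_timer_is_queued(const struct hrtimer *timer)
{
	return rb_parent(&timer->node) != &timer->node;
}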
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
deleted file mode 100644
index 55b1e5b85db9..000000000000
--- a/kernel/intermodule.c
+++ /dev/null
@@ -1,184 +0,0 @@
1/* Deprecated, do not use. Moved from module.c to here. --RR */
2
3/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
4#include <linux/module.h>
5#include <linux/kmod.h>
6#include <linux/spinlock.h>
7#include <linux/list.h>
8#include <linux/slab.h>
9
10/* inter_module functions are always available, even when the kernel is
11 * compiled without modules. Consumers of inter_module_xxx routines
12 * will always work, even when both are built into the kernel, this
13 * approach removes lots of #ifdefs in mainline code.
14 */
15
16static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
17static DEFINE_SPINLOCK(ime_lock);
18static int kmalloc_failed;
19
20struct inter_module_entry {
21 struct list_head list;
22 const char *im_name;
23 struct module *owner;
24 const void *userdata;
25};
26
27/**
28 * inter_module_register - register a new set of inter module data.
29 * @im_name: an arbitrary string to identify the data, must be unique
30 * @owner: module that is registering the data, always use THIS_MODULE
31 * @userdata: pointer to arbitrary userdata to be registered
32 *
33 * Description: Check that the im_name has not already been registered,
34 * complain if it has. For new data, add it to the inter_module_entry
35 * list.
36 */
37void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
38{
39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new;
41
42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
46 im_name);
47 kmalloc_failed = 1;
48 return;
49 }
50 ime_new->im_name = im_name;
51 ime_new->owner = owner;
52 ime_new->userdata = userdata;
53
54 spin_lock(&ime_lock);
55 list_for_each(tmp, &ime_list) {
56 ime = list_entry(tmp, struct inter_module_entry, list);
57 if (strcmp(ime->im_name, im_name) == 0) {
58 spin_unlock(&ime_lock);
59 kfree(ime_new);
60 /* Program logic error, fatal */
61 printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
62 BUG();
63 }
64 }
65 list_add(&(ime_new->list), &ime_list);
66 spin_unlock(&ime_lock);
67}
68
69/**
70 * inter_module_unregister - unregister a set of inter module data.
71 * @im_name: an arbitrary string to identify the data, must be unique
72 *
73 * Description: Check that the im_name has been registered, complain if
74 * it has not. For existing data, remove it from the
75 * inter_module_entry list.
76 */
77void inter_module_unregister(const char *im_name)
78{
79 struct list_head *tmp;
80 struct inter_module_entry *ime;
81
82 spin_lock(&ime_lock);
83 list_for_each(tmp, &ime_list) {
84 ime = list_entry(tmp, struct inter_module_entry, list);
85 if (strcmp(ime->im_name, im_name) == 0) {
86 list_del(&(ime->list));
87 spin_unlock(&ime_lock);
88 kfree(ime);
89 return;
90 }
91 }
92 spin_unlock(&ime_lock);
93 if (kmalloc_failed) {
94 printk(KERN_ERR
95 "inter_module_unregister: no entry for '%s', "
96 "probably caused by previous kmalloc failure\n",
97 im_name);
98 return;
99 }
100 else {
101 /* Program logic error, fatal */
102 printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
103 BUG();
104 }
105}
106
107/**
108 * inter_module_get - return arbitrary userdata from another module.
109 * @im_name: an arbitrary string to identify the data, must be unique
110 *
111 * Description: If the im_name has not been registered, return NULL.
112 * Try to increment the use count on the owning module, if that fails
113 * then return NULL. Otherwise return the userdata.
114 */
115static const void *inter_module_get(const char *im_name)
116{
117 struct list_head *tmp;
118 struct inter_module_entry *ime;
119 const void *result = NULL;
120
121 spin_lock(&ime_lock);
122 list_for_each(tmp, &ime_list) {
123 ime = list_entry(tmp, struct inter_module_entry, list);
124 if (strcmp(ime->im_name, im_name) == 0) {
125 if (try_module_get(ime->owner))
126 result = ime->userdata;
127 break;
128 }
129 }
130 spin_unlock(&ime_lock);
131 return(result);
132}
133
134/**
135 * inter_module_get_request - im get with automatic request_module.
136 * @im_name: an arbitrary string to identify the data, must be unique
137 * @modname: module that is expected to register im_name
138 *
139 * Description: If inter_module_get fails, do request_module then retry.
140 */
141const void *inter_module_get_request(const char *im_name, const char *modname)
142{
143 const void *result = inter_module_get(im_name);
144 if (!result) {
145 request_module("%s", modname);
146 result = inter_module_get(im_name);
147 }
148 return(result);
149}
150
151/**
152 * inter_module_put - release use of data from another module.
153 * @im_name: an arbitrary string to identify the data, must be unique
154 *
155 * Description: If the im_name has not been registered, complain,
156 * otherwise decrement the use count on the owning module.
157 */
158void inter_module_put(const char *im_name)
159{
160 struct list_head *tmp;
161 struct inter_module_entry *ime;
162
163 spin_lock(&ime_lock);
164 list_for_each(tmp, &ime_list) {
165 ime = list_entry(tmp, struct inter_module_entry, list);
166 if (strcmp(ime->im_name, im_name) == 0) {
167 if (ime->owner)
168 module_put(ime->owner);
169 spin_unlock(&ime_lock);
170 return;
171 }
172 }
173 spin_unlock(&ime_lock);
174 printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
175 BUG();
176}
177
178EXPORT_SYMBOL(inter_module_register);
179EXPORT_SYMBOL(inter_module_unregister);
180EXPORT_SYMBOL(inter_module_get_request);
181EXPORT_SYMBOL(inter_module_put);
182
183MODULE_LICENSE("GPL");
184
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 51df337b37db..0f6530117105 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
76/* 76/*
77 * Have got an event to handle: 77 * Have got an event to handle:
78 */ 78 */
79fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, 79fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
80 struct irqaction *action) 80 struct irqaction *action)
81{ 81{
82 int ret, retval = 0, status = 0; 82 irqreturn_t ret, retval = IRQ_NONE;
83 unsigned int status = 0;
83 84
84 if (!(action->flags & SA_INTERRUPT)) 85 if (!(action->flags & SA_INTERRUPT))
85 local_irq_enable(); 86 local_irq_enable();
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 134f9f2e0e39..a12d00eb5e7c 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -30,7 +30,7 @@ void move_native_irq(int irq)
30 30
31 desc->move_irq = 0; 31 desc->move_irq = 0;
32 32
33 if (likely(cpus_empty(pending_irq_cpumask[irq]))) 33 if (unlikely(cpus_empty(pending_irq_cpumask[irq])))
34 return; 34 return;
35 35
36 if (!desc->handler->set_affinity) 36 if (!desc->handler->set_affinity)
@@ -49,7 +49,7 @@ void move_native_irq(int irq)
49 * cause some ioapics to malfunction. 49 * cause some ioapics to malfunction.
50 * Being paranoid I guess! 50 * Being paranoid I guess!
51 */ 51 */
52 if (unlikely(!cpus_empty(tmp))) { 52 if (likely(!cpus_empty(tmp))) {
53 if (likely(!(desc->status & IRQ_DISABLED))) 53 if (likely(!(desc->status & IRQ_DISABLED)))
54 desc->handler->disable(irq); 54 desc->handler->disable(irq);
55 55
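The migration hunk flips the likely()/unlikely() hints, presumably because the original annotations were backwards for the common path through move_native_irq(). The hints only steer branch prediction and block layout; behaviour is unchanged:

	/* hint: the pending mask is rarely empty by the time we get here */
	if (unlikely(cpus_empty(pending_irq_cpumask[irq])))
		return;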
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5eef8ce0..afacd6f585fa 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
24#ifdef CONFIG_GENERIC_PENDING_IRQ 24#ifdef CONFIG_GENERIC_PENDING_IRQ
25void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 25void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
26{ 26{
27 set_balance_irq_affinity(irq, mask_val);
28
27 /* 29 /*
28 * Save these away for later use. Re-program when the 30 * Save these away for later use. Re-program when the
29 * interrupt is pending 31 * interrupt is pending
@@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
33#else 35#else
34void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 36void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
35{ 37{
38 set_balance_irq_affinity(irq, mask_val);
36 irq_affinity[irq] = mask_val; 39 irq_affinity[irq] = mask_val;
37 irq_desc[irq].handler->set_affinity(irq, mask_val); 40 irq_desc[irq].handler->set_affinity(irq, mask_val);
38} 41}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7df9abd5ec86..b2fb3c18d06b 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,7 +11,7 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13 13
14static int irqfixup; 14static int irqfixup __read_mostly;
15 15
16/* 16/*
17 * Recovery handler for misrouted interrupts. 17 * Recovery handler for misrouted interrupts.
@@ -136,9 +136,9 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, 136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
137 struct pt_regs *regs) 137 struct pt_regs *regs)
138{ 138{
139 if (action_ret != IRQ_HANDLED) { 139 if (unlikely(action_ret != IRQ_HANDLED)) {
140 desc->irqs_unhandled++; 140 desc->irqs_unhandled++;
141 if (action_ret != IRQ_NONE) 141 if (unlikely(action_ret != IRQ_NONE))
142 report_bad_irq(irq, desc, action_ret); 142 report_bad_irq(irq, desc, action_ret);
143 } 143 }
144 144
@@ -152,11 +152,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
152 } 152 }
153 153
154 desc->irq_count++; 154 desc->irq_count++;
155 if (desc->irq_count < 100000) 155 if (likely(desc->irq_count < 100000))
156 return; 156 return;
157 157
158 desc->irq_count = 0; 158 desc->irq_count = 0;
159 if (desc->irqs_unhandled > 99900) { 159 if (unlikely(desc->irqs_unhandled > 99900)) {
160 /* 160 /*
161 * The interrupt is stuck 161 * The interrupt is stuck
162 */ 162 */
@@ -171,7 +171,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
171 desc->irqs_unhandled = 0; 171 desc->irqs_unhandled = 0;
172} 172}
173 173
174int noirqdebug; 174int noirqdebug __read_mostly;
175 175
176int __init noirqdebug_setup(char *str) 176int __init noirqdebug_setup(char *str)
177{ 177{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bf39d28e4c0e..58f0f382597c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image,
902 * kexec does not sync, or unmount filesystems so if you need 902 * kexec does not sync, or unmount filesystems so if you need
903 * that to happen you need to do that yourself. 903 * that to happen you need to do that yourself.
904 */ 904 */
905struct kimage *kexec_image = NULL; 905struct kimage *kexec_image;
906static struct kimage *kexec_crash_image = NULL; 906struct kimage *kexec_crash_image;
907/* 907/*
908 * A home grown binary mutex. 908 * A home grown binary mutex.
909 * Nothing can wait so this mutex is safe to use 909 * Nothing can wait so this mutex is safe to use
910 * in interrupt context :) 910 * in interrupt context :)
911 */ 911 */
912static int kexec_lock = 0; 912static int kexec_lock;
913 913
914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
915 struct kexec_segment __user *segments, 915 struct kexec_segment __user *segments,
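The kexec hunk drops redundant '= NULL' / '= 0' initializers (static storage is zero-initialized by the C standard, and omitting the initializer keeps the objects in .bss instead of .data) and un-statics kexec_crash_image so the new ksysfs attributes further down can report on it:

struct kimage *kexec_image;		/* implicitly NULL */
struct kimage *kexec_crash_image;	/* now visible to kernel/ksysfs.c */
static int kexec_lock;			/* implicitly 0 */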
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fbf466a29aa..64aab081153b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,11 +47,17 @@
47 47
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50static atomic_t kprobe_count;
50 51
51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 52DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 53DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 54static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 55
56static struct notifier_block kprobe_page_fault_nb = {
57 .notifier_call = kprobe_exceptions_notify,
 58	.priority = 0x7fffffff /* we need to be notified first */
59};
60
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 61#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56/* 62/*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 63 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
368*/ 374*/
369static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 375static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370{ 376{
371 struct kprobe *kp;
372
373 if (p->break_handler) { 377 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 378 if (old_p->break_handler)
375 if (kp->break_handler) 379 return -EEXIST;
376 return -EEXIST;
377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 380 list_add_tail_rcu(&p->list, &old_p->list);
381 old_p->break_handler = aggr_break_handler;
379 } else 382 } else
380 list_add_rcu(&p->list, &old_p->list); 383 list_add_rcu(&p->list, &old_p->list);
384 if (p->post_handler && !old_p->post_handler)
385 old_p->post_handler = aggr_post_handler;
381 return 0; 386 return 0;
382} 387}
383 388
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
390 copy_kprobe(p, ap); 395 copy_kprobe(p, ap);
391 ap->addr = p->addr; 396 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 397 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 398 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 399 if (p->post_handler)
400 ap->post_handler = aggr_post_handler;
401 if (p->break_handler)
402 ap->break_handler = aggr_break_handler;
396 403
397 INIT_LIST_HEAD(&ap->list); 404 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 405 list_add_rcu(&p->list, &ap->list);
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
464 old_p = get_kprobe(p->addr); 471 old_p = get_kprobe(p->addr);
465 if (old_p) { 472 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 473 ret = register_aggr_kprobe(old_p, p);
474 if (!ret)
475 atomic_inc(&kprobe_count);
467 goto out; 476 goto out;
468 } 477 }
469 478
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
474 hlist_add_head_rcu(&p->hlist, 483 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 484 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 485
486 if (atomic_add_return(1, &kprobe_count) == \
487 (ARCH_INACTIVE_KPROBE_COUNT + 1))
488 register_page_fault_notifier(&kprobe_page_fault_nb);
489
477 arch_arm_kprobe(p); 490 arch_arm_kprobe(p);
478 491
479out: 492out:
@@ -536,14 +549,40 @@ valid_p:
536 kfree(old_p); 549 kfree(old_p);
537 } 550 }
538 arch_remove_kprobe(p); 551 arch_remove_kprobe(p);
552 } else {
553 mutex_lock(&kprobe_mutex);
554 if (p->break_handler)
555 old_p->break_handler = NULL;
556 if (p->post_handler){
557 list_for_each_entry_rcu(list_p, &old_p->list, list){
558 if (list_p->post_handler){
559 cleanup_p = 2;
560 break;
561 }
562 }
563 if (cleanup_p == 0)
564 old_p->post_handler = NULL;
565 }
566 mutex_unlock(&kprobe_mutex);
539 } 567 }
568
569 /* Call unregister_page_fault_notifier()
570 * if no probes are active
571 */
572 mutex_lock(&kprobe_mutex);
573 if (atomic_add_return(-1, &kprobe_count) == \
574 ARCH_INACTIVE_KPROBE_COUNT)
575 unregister_page_fault_notifier(&kprobe_page_fault_nb);
576 mutex_unlock(&kprobe_mutex);
577 return;
540} 578}
541 579
542static struct notifier_block kprobe_exceptions_nb = { 580static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 581 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to notified first */ 582 .priority = 0x7fffffff /* we need to be notified first */
545}; 583};
546 584
585
547int __kprobes register_jprobe(struct jprobe *jp) 586int __kprobes register_jprobe(struct jprobe *jp)
548{ 587{
549 /* Todo: Verify probepoint is a function entry point */ 588 /* Todo: Verify probepoint is a function entry point */
@@ -652,6 +691,7 @@ static int __init init_kprobes(void)
652 INIT_HLIST_HEAD(&kprobe_table[i]); 691 INIT_HLIST_HEAD(&kprobe_table[i]);
653 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 692 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
654 } 693 }
694 atomic_set(&kprobe_count, 0);
655 695
656 err = arch_init_kprobes(); 696 err = arch_init_kprobes();
657 if (!err) 697 if (!err)
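The kprobe_count plumbing arms the page-fault notifier lazily: it is registered when the number of live probes first exceeds the architecture's baseline (ARCH_INACTIVE_KPROBE_COUNT) and unregistered when the last one goes away, so page faults pay nothing while no probes are active. Reduced to a hedged sketch with hypothetical names and a zero baseline:

static atomic_t example_users = ATOMIC_INIT(0);

static void example_first_user(void)
{
	/* transition 0 -> 1 brings the hook online */
	if (atomic_add_return(1, &example_users) == 1)
		register_page_fault_notifier(&kprobe_page_fault_nb);
}

static void example_last_user(void)
{
	/* transition 1 -> 0 takes it back down */
	if (atomic_add_return(-1, &example_users) == 0)
		unregister_page_fault_notifier(&kprobe_page_fault_nb);
}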
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f119e098e67b..9e28478a17a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/sysfs.h> 14#include <linux/sysfs.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/kexec.h>
17 18
18#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s
48KERNEL_ATTR_RW(uevent_helper); 49KERNEL_ATTR_RW(uevent_helper);
49#endif 50#endif
50 51
52#ifdef CONFIG_KEXEC
53static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
54{
55 return sprintf(page, "%d\n", !!kexec_image);
56}
57KERNEL_ATTR_RO(kexec_loaded);
58
59static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
60{
61 return sprintf(page, "%d\n", !!kexec_crash_image);
62}
63KERNEL_ATTR_RO(kexec_crash_loaded);
64#endif /* CONFIG_KEXEC */
65
51decl_subsys(kernel, NULL, NULL); 66decl_subsys(kernel, NULL, NULL);
52EXPORT_SYMBOL_GPL(kernel_subsys); 67EXPORT_SYMBOL_GPL(kernel_subsys);
53 68
@@ -56,6 +71,10 @@ static struct attribute * kernel_attrs[] = {
56 &uevent_seqnum_attr.attr, 71 &uevent_seqnum_attr.attr,
57 &uevent_helper_attr.attr, 72 &uevent_helper_attr.attr,
58#endif 73#endif
74#ifdef CONFIG_KEXEC
75 &kexec_loaded_attr.attr,
76 &kexec_crash_loaded_attr.attr,
77#endif
59 NULL 78 NULL
60}; 79};
61 80
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..24be714b04c7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
45static DEFINE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
46static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
47 47
48/**
49 * kthread_should_stop - should this kthread return now?
50 *
51 * When someone calls kthread_stop on your kthread, it will be woken
52 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop().
54 */
48int kthread_should_stop(void) 55int kthread_should_stop(void)
49{ 56{
50 return (kthread_stop_info.k == current); 57 return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
122 complete(&create->done); 129 complete(&create->done);
123} 130}
124 131
132/**
133 * kthread_create - create a kthread.
134 * @threadfn: the function to run until signal_pending(current).
135 * @data: data ptr for @threadfn.
136 * @namefmt: printf-style name for the thread.
137 *
138 * Description: This helper function creates and names a kernel
139 * thread. The thread will be stopped: use wake_up_process() to start
140 * it. See also kthread_run(), kthread_create_on_cpu().
141 *
142 * When woken, the thread will run @threadfn() with @data as its
143 * argument. @threadfn can either call do_exit() directly if it is a
144 * standalone thread for which no one will call kthread_stop(), or
145 * return when 'kthread_should_stop()' is true (which means
146 * kthread_stop() has been called). The return value should be zero
147 * or a negative error number; it will be passed to kthread_stop().
148 *
149 * Returns a task_struct or ERR_PTR(-ENOMEM).
150 */
125struct task_struct *kthread_create(int (*threadfn)(void *data), 151struct task_struct *kthread_create(int (*threadfn)(void *data),
126 void *data, 152 void *data,
127 const char namefmt[], 153 const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
156} 182}
157EXPORT_SYMBOL(kthread_create); 183EXPORT_SYMBOL(kthread_create);
158 184
185/**
186 * kthread_bind - bind a just-created kthread to a cpu.
187 * @k: thread created by kthread_create().
188 * @cpu: cpu (might not be online, must be possible) for @k to run on.
189 *
190 * Description: This function is equivalent to set_cpus_allowed(),
191 * except that @cpu doesn't need to be online, and the thread must be
192 * stopped (i.e., just returned from kthread_create()).
193 */
159void kthread_bind(struct task_struct *k, unsigned int cpu) 194void kthread_bind(struct task_struct *k, unsigned int cpu)
160{ 195{
161 BUG_ON(k->state != TASK_INTERRUPTIBLE); 196 BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
166} 201}
167EXPORT_SYMBOL(kthread_bind); 202EXPORT_SYMBOL(kthread_bind);
168 203
204/**
205 * kthread_stop - stop a thread created by kthread_create().
206 * @k: thread created by kthread_create().
207 *
208 * Sets kthread_should_stop() for @k to return true, wakes it, and
209 * waits for it to exit. Your threadfn() must not call do_exit()
210 * itself if you use this function! This can also be called after
211 * kthread_create() instead of calling wake_up_process(): the thread
212 * will exit without calling threadfn().
213 *
214 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
215 * was never called.
216 */
169int kthread_stop(struct task_struct *k) 217int kthread_stop(struct task_struct *k)
170{ 218{
171 return kthread_stop_sem(k, NULL); 219 return kthread_stop_sem(k, NULL);
172} 220}
173EXPORT_SYMBOL(kthread_stop); 221EXPORT_SYMBOL(kthread_stop);
174 222
223/**
224 * kthread_stop_sem - stop a thread created by kthread_create().
225 * @k: thread created by kthread_create().
226 * @s: semaphore that @k waits on while idle.
227 *
228 * Does essentially the same thing as kthread_stop() above, but wakes
229 * @k by calling up(@s).
230 *
231 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
232 * was never called.
233 */
175int kthread_stop_sem(struct task_struct *k, struct semaphore *s) 234int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
176{ 235{
177 int ret; 236 int ret;
@@ -210,5 +269,5 @@ static __init int helper_init(void)
210 269
211 return 0; 270 return 0;
212} 271}
213core_initcall(helper_init);
214 272
273core_initcall(helper_init);
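The new kernel-doc above covers the whole kthread lifecycle; a minimal hedged usage sketch tying the pieces together (all example_* names are hypothetical):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *example_task;

static int example_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* one unit of work */
	return 0;	/* handed back through kthread_stop() */
}

static int example_start(void)
{
	example_task = kthread_create(example_fn, NULL, "example");
	if (IS_ERR(example_task))
		return PTR_ERR(example_task);
	wake_up_process(example_task);	/* kthread_create() leaves it stopped */
	return 0;
}

static void example_stop(void)
{
	kthread_stop(example_task);	/* wakes the thread, waits for its exit */
}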
diff --git a/kernel/module.c b/kernel/module.c
index 690381508d09..10e5b872adf6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h> 42#include <linux/mutex.h>
43#include <linux/unwind.h>
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/semaphore.h> 45#include <asm/semaphore.h>
45#include <asm/cacheflush.h> 46#include <asm/cacheflush.h>
@@ -1052,6 +1053,8 @@ static void free_module(struct module *mod)
1052 remove_sect_attrs(mod); 1053 remove_sect_attrs(mod);
1053 mod_kobject_remove(mod); 1054 mod_kobject_remove(mod);
1054 1055
1056 unwind_remove_table(mod->unwind_info, 0);
1057
1055 /* Arch-specific cleanup. */ 1058 /* Arch-specific cleanup. */
1056 module_arch_cleanup(mod); 1059 module_arch_cleanup(mod);
1057 1060
@@ -1317,7 +1320,7 @@ int is_exported(const char *name, const struct module *mod)
1317 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1320 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1318 return 1; 1321 return 1;
1319 else 1322 else
1320 if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1323 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
1321 return 1; 1324 return 1;
1322 else 1325 else
1323 return 0; 1326 return 0;
@@ -1403,7 +1406,7 @@ static struct module *load_module(void __user *umod,
1403 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1406 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
1404 exportindex, modindex, obsparmindex, infoindex, gplindex, 1407 exportindex, modindex, obsparmindex, infoindex, gplindex,
1405 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, 1408 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
1406 gplfuturecrcindex; 1409 gplfuturecrcindex, unwindex = 0;
1407 struct module *mod; 1410 struct module *mod;
1408 long err = 0; 1411 long err = 0;
1409 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1412 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1493,6 +1496,9 @@ static struct module *load_module(void __user *umod,
1493 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1496 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1494 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1497 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1495 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1498 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1499#ifdef ARCH_UNWIND_SECTION_NAME
1500 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1501#endif
1496 1502
1497 /* Don't keep modinfo section */ 1503 /* Don't keep modinfo section */
1498 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1504 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1501,6 +1507,8 @@ static struct module *load_module(void __user *umod,
1501 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1507 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1502 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1508 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1503#endif 1509#endif
1510 if (unwindex)
1511 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1504 1512
1505 /* Check module struct version now, before we try to use module. */ 1513 /* Check module struct version now, before we try to use module. */
1506 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1514 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1729,6 +1737,11 @@ static struct module *load_module(void __user *umod,
1729 goto arch_cleanup; 1737 goto arch_cleanup;
1730 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1738 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1731 1739
1740 /* Size of section 0 is 0, so this works well if no unwind info. */
1741 mod->unwind_info = unwind_add_table(mod,
1742 (void *)sechdrs[unwindex].sh_addr,
1743 sechdrs[unwindex].sh_size);
1744
1732 /* Get rid of temporary copy */ 1745 /* Get rid of temporary copy */
1733 vfree(hdr); 1746 vfree(hdr);
1734 1747
@@ -1827,6 +1840,7 @@ sys_init_module(void __user *umod,
1827 mod->state = MODULE_STATE_LIVE; 1840 mod->state = MODULE_STATE_LIVE;
1828 /* Drop initial reference. */ 1841 /* Drop initial reference. */
1829 module_put(mod); 1842 module_put(mod);
1843 unwind_remove_table(mod->unwind_info, 1);
1830 module_free(mod, mod->module_init); 1844 module_free(mod, mod->module_init);
1831 mod->module_init = NULL; 1845 mod->module_init = NULL;
1832 mod->init_size = 0; 1846 mod->init_size = 0;
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index f4913c376950..036b6285b15c 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -153,13 +153,13 @@ next:
153 continue; 153 continue;
154 count++; 154 count++;
155 cursor = curr->next; 155 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags); 156 debug_spin_unlock_restore(&debug_mutex_lock, flags);
157 157
158 printk("\n#%03d: ", count); 158 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1); 159 printk_lock(lock, filter ? 0 : 1);
160 goto next; 160 goto next;
161 } 161 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags); 162 debug_spin_unlock_restore(&debug_mutex_lock, flags);
163 printk("\n"); 163 printk("\n");
164} 164}
165 165
@@ -316,7 +316,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
316 continue; 316 continue;
317 list_del_init(curr); 317 list_del_init(curr);
318 DEBUG_OFF(); 318 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags); 319 debug_spin_unlock_restore(&debug_mutex_lock, flags);
320 320
321 printk("BUG: %s/%d, lock held at task exit time!\n", 321 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid); 322 task->comm, task->pid);
@@ -325,7 +325,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
325 printk("exiting task is not even the owner??\n"); 325 printk("exiting task is not even the owner??\n");
326 return; 326 return;
327 } 327 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags); 328 debug_spin_unlock_restore(&debug_mutex_lock, flags);
329} 329}
330 330
331/* 331/*
@@ -352,7 +352,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
352 continue; 352 continue;
353 list_del_init(curr); 353 list_del_init(curr);
354 DEBUG_OFF(); 354 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags); 355 debug_spin_unlock_restore(&debug_mutex_lock, flags);
356 356
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", 357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to); 358 current->comm, current->pid, lock, from, to);
@@ -362,7 +362,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
362 printk("freeing task is not even the owner??\n"); 362 printk("freeing task is not even the owner??\n");
363 return; 363 return;
364 } 364 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags); 365 debug_spin_unlock_restore(&debug_mutex_lock, flags);
366} 366}
367 367
368/* 368/*
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index fd384050acb1..a5196c36a5fd 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
46extern void debug_mutex_unlock(struct mutex *lock); 46extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name); 47extern void debug_mutex_init(struct mutex *lock, const char *name);
48 48
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55
56#define debug_spin_unlock(lock) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \ 49#define debug_spin_lock_save(lock, flags) \
65 do { \ 50 do { \
66 local_irq_save(flags); \ 51 local_irq_save(flags); \
@@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
68 spin_lock(lock); \ 53 spin_lock(lock); \
69 } while (0) 54 } while (0)
70 55
71#define debug_spin_lock_restore(lock, flags) \ 56#define debug_spin_unlock_restore(lock, flags) \
72 do { \ 57 do { \
73 if (debug_mutex_on) \ 58 if (debug_mutex_on) \
74 spin_unlock(lock); \ 59 spin_unlock(lock); \
@@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
76 preempt_check_resched(); \ 61 preempt_check_resched(); \
77 } while (0) 62 } while (0)
78 63
79#define spin_lock_mutex(lock) \ 64#define spin_lock_mutex(lock, flags) \
80 do { \ 65 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \ 66 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \ 67 \
83 DEBUG_WARN_ON(in_interrupt()); \ 68 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \ 69 debug_spin_lock_save(&debug_mutex_lock, flags); \
85 spin_lock(lock); \ 70 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \ 71 DEBUG_WARN_ON(l->magic != l); \
87 } while (0) 72 } while (0)
88 73
89#define spin_unlock_mutex(lock) \ 74#define spin_unlock_mutex(lock, flags) \
90 do { \ 75 do { \
91 spin_unlock(lock); \ 76 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \ 77 debug_spin_unlock_restore(&debug_mutex_lock, flags); \
93 } while (0) 78 } while (0)
94 79
95#define DEBUG_OFF() \ 80#define DEBUG_OFF() \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5449b210d9ed..7043db21bbce 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
125 struct task_struct *task = current; 125 struct task_struct *task = current;
126 struct mutex_waiter waiter; 126 struct mutex_waiter waiter;
127 unsigned int old_val; 127 unsigned int old_val;
128 unsigned long flags;
128 129
129 debug_mutex_init_waiter(&waiter); 130 debug_mutex_init_waiter(&waiter);
130 131
131 spin_lock_mutex(&lock->wait_lock); 132 spin_lock_mutex(&lock->wait_lock, flags);
132 133
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); 134 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
134 135
@@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
157 if (unlikely(state == TASK_INTERRUPTIBLE && 158 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) { 159 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info); 160 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock); 161 spin_unlock_mutex(&lock->wait_lock, flags);
161 162
162 debug_mutex_free_waiter(&waiter); 163 debug_mutex_free_waiter(&waiter);
163 return -EINTR; 164 return -EINTR;
@@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
165 __set_task_state(task, state); 166 __set_task_state(task, state);
166 167
167 /* didnt get the lock, go to sleep: */ 168 /* didnt get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock); 169 spin_unlock_mutex(&lock->wait_lock, flags);
169 schedule(); 170 schedule();
170 spin_lock_mutex(&lock->wait_lock); 171 spin_lock_mutex(&lock->wait_lock, flags);
171 } 172 }
172 173
173 /* got the lock - rejoice! */ 174 /* got the lock - rejoice! */
@@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
178 if (likely(list_empty(&lock->wait_list))) 179 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0); 180 atomic_set(&lock->count, 0);
180 181
181 spin_unlock_mutex(&lock->wait_lock); 182 spin_unlock_mutex(&lock->wait_lock, flags);
182 183
183 debug_mutex_free_waiter(&waiter); 184 debug_mutex_free_waiter(&waiter);
184 185
@@ -203,10 +204,11 @@ static fastcall noinline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) 204__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
204{ 205{
205 struct mutex *lock = container_of(lock_count, struct mutex, count); 206 struct mutex *lock = container_of(lock_count, struct mutex, count);
207 unsigned long flags;
206 208
207 DEBUG_WARN_ON(lock->owner != current_thread_info()); 209 DEBUG_WARN_ON(lock->owner != current_thread_info());
208 210
209 spin_lock_mutex(&lock->wait_lock); 211 spin_lock_mutex(&lock->wait_lock, flags);
210 212
211 /* 213 /*
212 * some architectures leave the lock unlocked in the fastpath failure 214 * some architectures leave the lock unlocked in the fastpath failure
@@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
231 233
232 debug_mutex_clear_owner(lock); 234 debug_mutex_clear_owner(lock);
233 235
234 spin_unlock_mutex(&lock->wait_lock); 236 spin_unlock_mutex(&lock->wait_lock, flags);
235} 237}
236 238
237/* 239/*
@@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count) 278static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{ 279{
278 struct mutex *lock = container_of(lock_count, struct mutex, count); 280 struct mutex *lock = container_of(lock_count, struct mutex, count);
281 unsigned long flags;
279 int prev; 282 int prev;
280 283
281 spin_lock_mutex(&lock->wait_lock); 284 spin_lock_mutex(&lock->wait_lock, flags);
282 285
283 prev = atomic_xchg(&lock->count, -1); 286 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1)) 287 if (likely(prev == 1))
@@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
287 if (likely(list_empty(&lock->wait_list))) 290 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0); 291 atomic_set(&lock->count, 0);
289 292
290 spin_unlock_mutex(&lock->wait_lock); 293 spin_unlock_mutex(&lock->wait_lock, flags);
291 294
292 return prev == 1; 295 return prev == 1;
293} 296}
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 00fe84e7b672..069189947257 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -9,8 +9,10 @@
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: 9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */ 10 */
11 11
12#define spin_lock_mutex(lock) spin_lock(lock) 12#define spin_lock_mutex(lock, flags) \
13#define spin_unlock_mutex(lock) spin_unlock(lock) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0)
14#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
16 18
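The mutex series threads an on-stack IRQ-flags word through the lock helpers so the debug build can pair local_irq_save()/local_irq_restore() correctly instead of unconditionally re-enabling interrupts, while the !CONFIG_DEBUG_MUTEXES variant simply casts the flags to void. The resulting call pattern:

	unsigned long flags;

	spin_lock_mutex(&lock->wait_lock, flags);
	/* ... wait-list manipulation under the lock ... */
	spin_unlock_mutex(&lock->wait_lock, flags);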
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 520f6c59948d..d38d9ec3276c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -555,9 +555,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
555 struct cpu_timer_list *next; 555 struct cpu_timer_list *next;
556 unsigned long i; 556 unsigned long i;
557 557
558 if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING))
559 return;
560
561 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 558 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
562 p->cpu_timers : p->signal->cpu_timers); 559 p->cpu_timers : p->signal->cpu_timers);
563 head += CPUCLOCK_WHICH(timer->it_clock); 560 head += CPUCLOCK_WHICH(timer->it_clock);
@@ -1173,6 +1170,9 @@ static void check_process_timers(struct task_struct *tsk,
1173 } 1170 }
1174 t = tsk; 1171 t = tsk;
1175 do { 1172 do {
1173 if (unlikely(t->flags & PF_EXITING))
1174 continue;
1175
1176 ticks = cputime_add(cputime_add(t->utime, t->stime), 1176 ticks = cputime_add(cputime_add(t->utime, t->stime),
1177 prof_left); 1177 prof_left);
1178 if (!cputime_eq(prof_expires, cputime_zero) && 1178 if (!cputime_eq(prof_expires, cputime_zero) &&
@@ -1193,11 +1193,7 @@ static void check_process_timers(struct task_struct *tsk,
1193 t->it_sched_expires > sched)) { 1193 t->it_sched_expires > sched)) {
1194 t->it_sched_expires = sched; 1194 t->it_sched_expires = sched;
1195 } 1195 }
1196 1196 } while ((t = next_thread(t)) != tsk);
1197 do {
1198 t = next_thread(t);
1199 } while (unlikely(t->flags & PF_EXITING));
1200 } while (t != tsk);
1201 } 1197 }
1202} 1198}
1203 1199
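
One C subtlety makes the rewritten loop correct: `continue` inside a do/while transfers control to the controlling expression, so `(t = next_thread(t)) != tsk` still executes and an exiting thread is skipped without stalling the traversal. A stand-alone illustration of that rule:

	#include <stdio.h>

	int main(void)
	{
		int i = 0;

		do {
			if (i == 2)
				continue;	/* jumps to the ++i test, not past it */
			printf("%d\n", i);	/* prints 0 1 3 4 */
		} while (++i < 5);

		return 0;
	}
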
@@ -1289,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1289 1285
1290#undef UNEXPIRED 1286#undef UNEXPIRED
1291 1287
1292 BUG_ON(tsk->exit_state);
1293
1294 /* 1288 /*
1295 * Double-check with locks held. 1289 * Double-check with locks held.
1296 */ 1290 */
1297 read_lock(&tasklist_lock); 1291 read_lock(&tasklist_lock);
1298 spin_lock(&tsk->sighand->siglock); 1292 if (likely(tsk->signal != NULL)) {
1293 spin_lock(&tsk->sighand->siglock);
1299 1294
1300 /* 1295 /*
1301 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] 1296 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
1302 * all the timers that are firing, and put them on the firing list. 1297 * all the timers that are firing, and put them on the firing list.
1303 */ 1298 */
1304 check_thread_timers(tsk, &firing); 1299 check_thread_timers(tsk, &firing);
1305 check_process_timers(tsk, &firing); 1300 check_process_timers(tsk, &firing);
1306 1301
1307 /* 1302 /*
1308 * We must release these locks before taking any timer's lock. 1303 * We must release these locks before taking any timer's lock.
1309 * There is a potential race with timer deletion here, as the 1304 * There is a potential race with timer deletion here, as the
1310 * siglock now protects our private firing list. We have set 1305 * siglock now protects our private firing list. We have set
1311 * the firing flag in each timer, so that a deletion attempt 1306 * the firing flag in each timer, so that a deletion attempt
1312 * that gets the timer lock before we do will give it up and 1307 * that gets the timer lock before we do will give it up and
1313 * spin until we've taken care of that timer below. 1308 * spin until we've taken care of that timer below.
1314 */ 1309 */
1315 spin_unlock(&tsk->sighand->siglock); 1310 spin_unlock(&tsk->sighand->siglock);
1311 }
1316 read_unlock(&tasklist_lock); 1312 read_unlock(&tasklist_lock);
1317 1313
1318 /* 1314 /*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..fc311a4673a2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,15 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config PM_TRACE
40 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32
42 default y
43 ---help---
44 This enables some cheesy code to save the last PM event point in the
45 RTC across reboots, so that you can debug a machine that just hangs
46 during suspend (or more commonly, during resume).
47
39config SOFTWARE_SUSPEND 48config SOFTWARE_SUSPEND
40 bool "Software Suspend" 49 bool "Software Suspend"
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 50 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 81d4d982f3f0..e13e74067845 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -231,7 +231,7 @@ static int software_resume(void)
231late_initcall(software_resume); 231late_initcall(software_resume);
232 232
233 233
234static char * pm_disk_modes[] = { 234static const char * const pm_disk_modes[] = {
235 [PM_DISK_FIRMWARE] = "firmware", 235 [PM_DISK_FIRMWARE] = "firmware",
236 [PM_DISK_PLATFORM] = "platform", 236 [PM_DISK_PLATFORM] = "platform",
237 [PM_DISK_SHUTDOWN] = "shutdown", 237 [PM_DISK_SHUTDOWN] = "shutdown",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a6d9ef46009e..6d295c776794 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18 18#include <linux/console.h>
19 19
20#include "power.h" 20#include "power.h"
21 21
@@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state)
86 goto Thaw; 86 goto Thaw;
87 } 87 }
88 88
89 suspend_console();
89 if ((error = device_suspend(PMSG_SUSPEND))) { 90 if ((error = device_suspend(PMSG_SUSPEND))) {
90 printk(KERN_ERR "Some devices failed to suspend\n"); 91 printk(KERN_ERR "Some devices failed to suspend\n");
91 goto Finish; 92 goto Finish;
@@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state)
133static void suspend_finish(suspend_state_t state) 134static void suspend_finish(suspend_state_t state)
134{ 135{
135 device_resume(); 136 device_resume();
137 resume_console();
136 thaw_processes(); 138 thaw_processes();
137 enable_nonboot_cpus(); 139 enable_nonboot_cpus();
138 if (pm_ops && pm_ops->finish) 140 if (pm_ops && pm_ops->finish)
@@ -143,7 +145,7 @@ static void suspend_finish(suspend_state_t state)
143 145
144 146
145 147
146static char *pm_states[PM_SUSPEND_MAX] = { 148static const char * const pm_states[PM_SUSPEND_MAX] = {
147 [PM_SUSPEND_STANDBY] = "standby", 149 [PM_SUSPEND_STANDBY] = "standby",
148 [PM_SUSPEND_MEM] = "mem", 150 [PM_SUSPEND_MEM] = "mem",
149#ifdef CONFIG_SOFTWARE_SUSPEND 151#ifdef CONFIG_SOFTWARE_SUSPEND
@@ -260,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
260static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) 262static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
261{ 263{
262 suspend_state_t state = PM_SUSPEND_STANDBY; 264 suspend_state_t state = PM_SUSPEND_STANDBY;
263 char ** s; 265 const char * const *s;
264 char *p; 266 char *p;
265 int error; 267 int error;
266 int len; 268 int len;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f06f12f21767..57a792982fb9 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -55,7 +55,7 @@ struct snapshot_handle {
55 unsigned int page; 55 unsigned int page;
56 unsigned int page_offset; 56 unsigned int page_offset;
57 unsigned int prev; 57 unsigned int prev;
58 struct pbe *pbe; 58 struct pbe *pbe, *last_pbe;
59 void *buffer; 59 void *buffer;
60 unsigned int buf_offset; 60 unsigned int buf_offset;
61}; 61};
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3eeedbb13b78..24c96f354231 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -150,6 +150,10 @@ int restore_highmem(void)
150 } 150 }
151 return 0; 151 return 0;
152} 152}
153#else
154static inline unsigned int count_highmem_pages(void) {return 0;}
155static inline int save_highmem(void) {return 0;}
156static inline int restore_highmem(void) {return 0;}
153#endif 157#endif
154 158
155static int pfn_is_nosave(unsigned long pfn) 159static int pfn_is_nosave(unsigned long pfn)
@@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
293 } 297 }
294} 298}
295 299
296/** 300static unsigned int unsafe_pages;
297 * On resume it is necessary to trace and eventually free the unsafe
298 * pages that have been allocated, because they are needed for I/O
299 * (on x86-64 we likely will "eat" these pages once again while
300 * creating the temporary page translation tables)
301 */
302
303struct eaten_page {
304 struct eaten_page *next;
305 char padding[PAGE_SIZE - sizeof(void *)];
306};
307
308static struct eaten_page *eaten_pages = NULL;
309
310static void release_eaten_pages(void)
311{
312 struct eaten_page *p, *q;
313
314 p = eaten_pages;
315 while (p) {
316 q = p->next;
317 /* We don't want swsusp_free() to free this page again */
318 ClearPageNosave(virt_to_page(p));
319 free_page((unsigned long)p);
320 p = q;
321 }
322 eaten_pages = NULL;
323}
324 301
325/** 302/**
326 * @safe_needed - on resume, for storing the PBE list and the image, 303 * @safe_needed - on resume, for storing the PBE list and the image,
327 * we can only use memory pages that do not conflict with the pages 304 * we can only use memory pages that do not conflict with the pages
328 * which had been used before suspend. 305 * used before suspend.
329 * 306 *
330 * The unsafe pages are marked with the PG_nosave_free flag 307 * The unsafe pages are marked with the PG_nosave_free flag
331 * 308 * and we count them using unsafe_pages
332 * Allocated but unusable (ie eaten) memory pages should be marked
333 * so that swsusp_free() can release them
334 */ 309 */
335 310
336static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 311static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
337{ 312{
338 void *res; 313 void *res;
339 314
315 res = (void *)get_zeroed_page(gfp_mask);
340 if (safe_needed) 316 if (safe_needed)
341 do { 317 while (res && PageNosaveFree(virt_to_page(res))) {
318 /* The page is unsafe, mark it for swsusp_free() */
319 SetPageNosave(virt_to_page(res));
320 unsafe_pages++;
342 res = (void *)get_zeroed_page(gfp_mask); 321 res = (void *)get_zeroed_page(gfp_mask);
343 if (res && PageNosaveFree(virt_to_page(res))) { 322 }
344 /* This is for swsusp_free() */
345 SetPageNosave(virt_to_page(res));
346 ((struct eaten_page *)res)->next = eaten_pages;
347 eaten_pages = res;
348 }
349 } while (res && PageNosaveFree(virt_to_page(res)));
350 else
351 res = (void *)get_zeroed_page(gfp_mask);
352 if (res) { 323 if (res) {
353 SetPageNosave(virt_to_page(res)); 324 SetPageNosave(virt_to_page(res));
354 SetPageNosaveFree(virt_to_page(res)); 325 SetPageNosaveFree(virt_to_page(res));
@@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask)
374 * On each page we set up a list of struct_pbe elements. 345 * On each page we set up a list of struct_pbe elements.
375 */ 346 */
376 347
377struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) 348static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
349 int safe_needed)
378{ 350{
379 unsigned int num; 351 unsigned int num;
380 struct pbe *pblist, *pbe; 352 struct pbe *pblist, *pbe;
@@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist)
642 return -EFAULT; 614 return -EFAULT;
643 } 615 }
644 616
617 unsafe_pages = 0;
618
645 return 0; 619 return 0;
646} 620}
647 621
@@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
719} 693}
720 694
721/** 695/**
722 * create_image - use metadata contained in the PBE list 696 * prepare_image - use metadata contained in the PBE list
723 * pointed to by pagedir_nosave to mark the pages that will 697 * pointed to by pagedir_nosave to mark the pages that will
724 * be overwritten in the process of restoring the system 698 * be overwritten in the process of restoring the system
725 * memory state from the image and allocate memory for 699 * memory state from the image ("unsafe" pages) and allocate
726 * the image avoiding these pages 700 * memory for the image
701 *
702 * The idea is to allocate the PBE list first and then
 703 * allocate as many pages as are needed for the image data,
704 * but not to assign these pages to the PBEs initially.
705 * Instead, we just mark them as allocated and create a list
 706 * of "safe" pages which will be used later
727 */ 707 */
728 708
729static int create_image(struct snapshot_handle *handle) 709struct safe_page {
710 struct safe_page *next;
711 char padding[PAGE_SIZE - sizeof(void *)];
712};
713
714static struct safe_page *safe_pages;
715
716static int prepare_image(struct snapshot_handle *handle)
730{ 717{
731 int error = 0; 718 int error = 0;
732 struct pbe *p, *pblist; 719 unsigned int nr_pages = nr_copy_pages;
720 struct pbe *p, *pblist = NULL;
733 721
734 p = pagedir_nosave; 722 p = pagedir_nosave;
735 error = mark_unsafe_pages(p); 723 error = mark_unsafe_pages(p);
736 if (!error) { 724 if (!error) {
737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 725 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
738 if (pblist) 726 if (pblist)
739 copy_page_backup_list(pblist, p); 727 copy_page_backup_list(pblist, p);
740 free_pagedir(p, 0); 728 free_pagedir(p, 0);
741 if (!pblist) 729 if (!pblist)
742 error = -ENOMEM; 730 error = -ENOMEM;
743 } 731 }
744 if (!error) 732 safe_pages = NULL;
745 error = alloc_data_pages(pblist, GFP_ATOMIC, 1); 733 if (!error && nr_pages > unsafe_pages) {
734 nr_pages -= unsafe_pages;
735 while (nr_pages--) {
736 struct safe_page *ptr;
737
738 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
739 if (!ptr) {
740 error = -ENOMEM;
741 break;
742 }
743 if (!PageNosaveFree(virt_to_page(ptr))) {
744 /* The page is "safe", add it to the list */
745 ptr->next = safe_pages;
746 safe_pages = ptr;
747 }
748 /* Mark the page as allocated */
749 SetPageNosave(virt_to_page(ptr));
750 SetPageNosaveFree(virt_to_page(ptr));
751 }
752 }
746 if (!error) { 753 if (!error) {
747 release_eaten_pages();
748 pagedir_nosave = pblist; 754 pagedir_nosave = pblist;
749 } else { 755 } else {
750 pagedir_nosave = NULL;
751 handle->pbe = NULL; 756 handle->pbe = NULL;
752 nr_copy_pages = 0; 757 swsusp_free();
753 nr_meta_pages = 0;
754 } 758 }
755 return error; 759 return error;
756} 760}
757 761
762static void *get_buffer(struct snapshot_handle *handle)
763{
764 struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
765 struct page *page = virt_to_page(pbe->orig_address);
766
767 if (PageNosave(page) && PageNosaveFree(page)) {
768 /*
769 * We have allocated the "original" page frame and we can
770 * use it directly to store the read page
771 */
772 pbe->address = 0;
773 if (last && last->next)
774 last->next = NULL;
775 return (void *)pbe->orig_address;
776 }
777 /*
778 * The "original" page frame has not been allocated and we have to
779 * use a "safe" page frame to store the read page
780 */
781 pbe->address = (unsigned long)safe_pages;
782 safe_pages = safe_pages->next;
783 if (last)
784 last->next = pbe;
785 handle->last_pbe = pbe;
786 return (void *)pbe->address;
787}
788
758/** 789/**
759 * snapshot_write_next - used for writing the system memory snapshot. 790 * snapshot_write_next - used for writing the system memory snapshot.
760 * 791 *
@@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
799 } else if (handle->prev <= nr_meta_pages) { 830 } else if (handle->prev <= nr_meta_pages) {
800 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 831 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
801 if (!handle->pbe) { 832 if (!handle->pbe) {
802 error = create_image(handle); 833 error = prepare_image(handle);
803 if (error) 834 if (error)
804 return error; 835 return error;
805 handle->pbe = pagedir_nosave; 836 handle->pbe = pagedir_nosave;
806 handle->buffer = (void *)handle->pbe->address; 837 handle->last_pbe = NULL;
838 handle->buffer = get_buffer(handle);
807 } 839 }
808 } else { 840 } else {
809 handle->pbe = handle->pbe->next; 841 handle->pbe = handle->pbe->next;
810 handle->buffer = (void *)handle->pbe->address; 842 handle->buffer = get_buffer(handle);
811 } 843 }
812 handle->prev = handle->page; 844 handle->prev = handle->page;
813 } 845 }
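
The struct safe_page trick above stores the free-list link inside the tracked page itself: the padding member pads each node out to exactly one page, so the list costs no extra bookkeeping memory, and get_buffer() either reuses the original page frame directly (pbe->address = 0) or pops one node off safe_pages. A hedged sketch of a compile-time guard for that layout invariant (not part of the patch):

	/* States the layout assumption explicitly: each safe_page node
	 * must occupy exactly one page, since the link lives in the
	 * page being tracked and the padding fills the remainder. */
	static inline void safe_page_layout_check(void)
	{
		BUILD_BUG_ON(sizeof(struct safe_page) != PAGE_SIZE);
	}
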
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e0..17f669c83012 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void);
67int save_highmem(void); 67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static int save_highmem(void) { return 0; } 70static inline int save_highmem(void) { return 0; }
71static int restore_highmem(void) { return 0; } 71static inline int restore_highmem(void) { return 0; }
72static unsigned int count_highmem_pages(void) { return 0; } 72static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 73#endif
74 74
75/** 75/**
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
175 */ 175 */
176 176
177#define SHRINK_BITE 10000 177#define SHRINK_BITE 10000
178static inline unsigned long __shrink_memory(long tmp)
179{
180 if (tmp > SHRINK_BITE)
181 tmp = SHRINK_BITE;
182 return shrink_all_memory(tmp);
183}
178 184
179int swsusp_shrink_memory(void) 185int swsusp_shrink_memory(void)
180{ 186{
@@ -192,15 +198,17 @@ int swsusp_shrink_memory(void)
192 PAGES_FOR_IO; 198 PAGES_FOR_IO;
193 tmp = size; 199 tmp = size;
194 for_each_zone (zone) 200 for_each_zone (zone)
195 if (!is_highmem(zone)) 201 if (!is_highmem(zone) && populated_zone(zone)) {
196 tmp -= zone->free_pages; 202 tmp -= zone->free_pages;
203 tmp += zone->lowmem_reserve[ZONE_NORMAL];
204 }
197 if (tmp > 0) { 205 if (tmp > 0) {
198 tmp = shrink_all_memory(SHRINK_BITE); 206 tmp = __shrink_memory(tmp);
199 if (!tmp) 207 if (!tmp)
200 return -ENOMEM; 208 return -ENOMEM;
201 pages += tmp; 209 pages += tmp;
202 } else if (size > image_size / PAGE_SIZE) { 210 } else if (size > image_size / PAGE_SIZE) {
203 tmp = shrink_all_memory(SHRINK_BITE); 211 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
204 pages += tmp; 212 pages += tmp;
205 } 213 }
206 printk("\b%c", p[i++%4]); 214 printk("\b%c", p[i++%4]);
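
__shrink_memory() keeps the SHRINK_BITE cap but now requests the actual deficit, so a 25000-page shortfall is reclaimed as 10000 + 10000 + 5000 rather than blind fixed-size bites. A user-space toy run of the same clamping, where reclaim() stands in for shrink_all_memory() and pretends every request succeeds:

	#include <stdio.h>

	#define SHRINK_BITE 10000

	static long reclaim(long nr)
	{
		return nr > SHRINK_BITE ? SHRINK_BITE : nr;
	}

	int main(void)
	{
		long deficit = 25000;

		while (deficit > 0) {
			long got = reclaim(deficit);

			printf("reclaimed %ld pages\n", got);	/* 10000, 10000, 5000 */
			deficit -= got;
		}
		return 0;
	}
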
diff --git a/kernel/printk.c b/kernel/printk.c
index c056f3324432..95b7fe17f124 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -24,6 +24,7 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
27#include <linux/interrupt.h> /* For in_interrupt() */ 28#include <linux/interrupt.h> /* For in_interrupt() */
28#include <linux/config.h> 29#include <linux/config.h>
29#include <linux/delay.h> 30#include <linux/delay.h>
@@ -67,6 +68,7 @@ EXPORT_SYMBOL(oops_in_progress);
67 * driver system. 68 * driver system.
68 */ 69 */
69static DECLARE_MUTEX(console_sem); 70static DECLARE_MUTEX(console_sem);
71static DECLARE_MUTEX(secondary_console_sem);
70struct console *console_drivers; 72struct console *console_drivers;
71/* 73/*
72 * This is used for debugging the mess that is the VT code by 74 * This is used for debugging the mess that is the VT code by
@@ -76,7 +78,7 @@ struct console *console_drivers;
76 * path in the console code where we end up in places I want 78 * path in the console code where we end up in places I want
 77 * locked without the console semaphore held 79 * locked without the console semaphore held
78 */ 80 */
79static int console_locked; 81static int console_locked, console_suspended;
80 82
81/* 83/*
82 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 84 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
@@ -326,7 +328,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
326 struct console *con; 328 struct console *con;
327 329
328 for (con = console_drivers; con; con = con->next) { 330 for (con = console_drivers; con; con = con->next) {
329 if ((con->flags & CON_ENABLED) && con->write) 331 if ((con->flags & CON_ENABLED) && con->write &&
332 (cpu_online(smp_processor_id()) ||
333 (con->flags & CON_ANYTIME)))
330 con->write(con, &LOG_BUF(start), end - start); 334 con->write(con, &LOG_BUF(start), end - start);
331 } 335 }
332} 336}
@@ -436,6 +440,7 @@ static int printk_time = 1;
436#else 440#else
437static int printk_time = 0; 441static int printk_time = 0;
438#endif 442#endif
443module_param(printk_time, int, S_IRUGO | S_IWUSR);
439 444
440static int __init printk_time_setup(char *str) 445static int __init printk_time_setup(char *str)
441{ 446{
@@ -452,6 +457,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
452 return sched_clock(); 457 return sched_clock();
453} 458}
454 459
460/* Check if we have any console registered that can be called early in boot. */
461static int have_callable_console(void)
462{
463 struct console *con;
464
465 for (con = console_drivers; con; con = con->next)
466 if (con->flags & CON_ANYTIME)
467 return 1;
468
469 return 0;
470}
471
455/** 472/**
456 * printk - print a kernel message 473 * printk - print a kernel message
457 * @fmt: format string 474 * @fmt: format string
@@ -565,27 +582,29 @@ asmlinkage int vprintk(const char *fmt, va_list args)
565 log_level_unknown = 1; 582 log_level_unknown = 1;
566 } 583 }
567 584
568 if (!cpu_online(smp_processor_id())) { 585 if (!down_trylock(&console_sem)) {
569 /* 586 /*
570 * Some console drivers may assume that per-cpu resources have 587 * We own the drivers. We can drop the spinlock and
571 * been allocated. So don't allow them to be called by this 588 * let release_console_sem() print the text, maybe ...
572 * CPU until it is officially up. We shouldn't be calling into
573 * random console drivers on a CPU which doesn't exist yet..
574 */ 589 */
590 console_locked = 1;
575 printk_cpu = UINT_MAX; 591 printk_cpu = UINT_MAX;
576 spin_unlock_irqrestore(&logbuf_lock, flags); 592 spin_unlock_irqrestore(&logbuf_lock, flags);
577 goto out; 593
578 }
579 if (!down_trylock(&console_sem)) {
580 console_locked = 1;
581 /* 594 /*
582 * We own the drivers. We can drop the spinlock and let 595 * Console drivers may assume that per-cpu resources have
583 * release_console_sem() print the text 596 * been allocated. So unless they're explicitly marked as
597 * being able to cope (CON_ANYTIME) don't call them until
598 * this CPU is officially up.
584 */ 599 */
585 printk_cpu = UINT_MAX; 600 if (cpu_online(smp_processor_id()) || have_callable_console()) {
586 spin_unlock_irqrestore(&logbuf_lock, flags); 601 console_may_schedule = 0;
587 console_may_schedule = 0; 602 release_console_sem();
588 release_console_sem(); 603 } else {
604 /* Release by hand to avoid flushing the buffer. */
605 console_locked = 0;
606 up(&console_sem);
607 }
589 } else { 608 } else {
590 /* 609 /*
591 * Someone else owns the drivers. We drop the spinlock, which 610 * Someone else owns the drivers. We drop the spinlock, which
@@ -595,7 +614,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
595 printk_cpu = UINT_MAX; 614 printk_cpu = UINT_MAX;
596 spin_unlock_irqrestore(&logbuf_lock, flags); 615 spin_unlock_irqrestore(&logbuf_lock, flags);
597 } 616 }
598out: 617
599 preempt_enable(); 618 preempt_enable();
600 return printed_len; 619 return printed_len;
601} 620}
@@ -698,6 +717,23 @@ int __init add_preferred_console(char *name, int idx, char *options)
698} 717}
699 718
700/** 719/**
720 * suspend_console - suspend the console subsystem
721 *
722 * This disables printk() while we go into suspend states
723 */
724void suspend_console(void)
725{
726 acquire_console_sem();
727 console_suspended = 1;
728}
729
730void resume_console(void)
731{
732 console_suspended = 0;
733 release_console_sem();
734}
735
736/**
701 * acquire_console_sem - lock the console system for exclusive use. 737 * acquire_console_sem - lock the console system for exclusive use.
702 * 738 *
703 * Acquires a semaphore which guarantees that the caller has 739 * Acquires a semaphore which guarantees that the caller has
@@ -708,6 +744,10 @@ int __init add_preferred_console(char *name, int idx, char *options)
708void acquire_console_sem(void) 744void acquire_console_sem(void)
709{ 745{
710 BUG_ON(in_interrupt()); 746 BUG_ON(in_interrupt());
747 if (console_suspended) {
748 down(&secondary_console_sem);
749 return;
750 }
711 down(&console_sem); 751 down(&console_sem);
712 console_locked = 1; 752 console_locked = 1;
713 console_may_schedule = 1; 753 console_may_schedule = 1;
@@ -750,6 +790,10 @@ void release_console_sem(void)
750 unsigned long _con_start, _log_end; 790 unsigned long _con_start, _log_end;
751 unsigned long wake_klogd = 0; 791 unsigned long wake_klogd = 0;
752 792
793 if (console_suspended) {
794 up(&secondary_console_sem);
795 return;
796 }
753 for ( ; ; ) { 797 for ( ; ; ) {
754 spin_lock_irqsave(&logbuf_lock, flags); 798 spin_lock_irqsave(&logbuf_lock, flags);
755 wake_klogd |= log_start - log_end; 799 wake_klogd |= log_start - log_end;
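
The new pair is consumed in kernel/power/main.c above: suspend_console() takes console_sem and sets console_suspended, so any later printk() flusher calling acquire_console_sem() parks on secondary_console_sem instead of touching half-suspended console hardware, and resume_console() releases console_sem, which also flushes whatever accumulated in the log buffer. A condensed, hedged sketch of the caller-side bracket (enter_low_power_state() is hypothetical and error unwinding is simplified):

	static int example_suspend(void)
	{
		int error;

		suspend_console();	/* printk() output now only buffers */
		error = device_suspend(PMSG_SUSPEND);
		if (!error)
			error = enter_low_power_state();	/* hypothetical */
		device_resume();
		resume_console();	/* flushes the buffered messages */
		return error;
	}
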
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 921c22ad16e4..335c5b932e14 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 120
121static int may_attach(struct task_struct *task) 121static int may_attach(struct task_struct *task)
122{ 122{
123 if (!task->mm) 123 /* May we inspect the given task?
124 return -EPERM; 124 * This check is used both for attaching with ptrace
125 * and for allowing access to sensitive information in /proc.
126 *
127 * ptrace_attach denies several cases that /proc allows
128 * because setting up the necessary parent/child relationship
129 * or halting the specified task is impossible.
130 */
131 int dumpable = 0;
132 /* Don't let security modules deny introspection */
133 if (task == current)
134 return 0;
125 if (((current->uid != task->euid) || 135 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) || 136 (current->uid != task->suid) ||
127 (current->uid != task->uid) || 137 (current->uid != task->uid) ||
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task)
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 140 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM; 141 return -EPERM;
132 smp_rmb(); 142 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) 143 if (task->mm)
144 dumpable = task->mm->dumpable;
145 if (!dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM; 146 return -EPERM;
135 147
136 return security_ptrace(current, task); 148 return security_ptrace(current, task);
@@ -176,6 +188,8 @@ repeat:
176 goto repeat; 188 goto repeat;
177 } 189 }
178 190
191 if (!task->mm)
192 goto bad;
179 /* the same process cannot be attached many times */ 193 /* the same process cannot be attached many times */
180 if (task->ptrace & PT_PTRACED) 194 if (task->ptrace & PT_PTRACED)
181 goto bad; 195 goto bad;
@@ -200,7 +214,7 @@ out:
200 return retval; 214 return retval;
201} 215}
202 216
203void __ptrace_detach(struct task_struct *child, unsigned int data) 217static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
204{ 218{
205 child->exit_code = data; 219 child->exit_code = data;
206 /* .. re-parent .. */ 220 /* .. re-parent .. */
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
219 ptrace_disable(child); 233 ptrace_disable(child);
220 234
221 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
236 /* protect against de_thread()->release_task() */
222 if (child->ptrace) 237 if (child->ptrace)
223 __ptrace_detach(child, data); 238 __ptrace_detach(child, data);
224 write_unlock_irq(&tasklist_lock); 239 write_unlock_irq(&tasklist_lock);
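
may_attach() now serves both ptrace_attach() and the /proc permission checks, which is why the !task->mm test moved out of it: /proc-style callers are no longer rejected outright for mm-less tasks. External callers reach the check through a small wrapper; a sketch of its likely shape (treat the exact name and locking as assumptions here):

	/* Nonzero if current may inspect `task`; task_lock() stabilizes
	 * task->mm across the dumpable test inside may_attach(). */
	int ptrace_may_attach(struct task_struct *task)
	{
		int err;

		task_lock(task);
		err = may_attach(task);
		task_unlock(task);
		return !err;
	}
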
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2058f88c7bbb..20e9710fc21c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -612,14 +612,6 @@ void synchronize_rcu(void)
612 wait_for_completion(&rcu.completion); 612 wait_for_completion(&rcu.completion);
613} 613}
614 614
615/*
616 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
617 */
618void synchronize_kernel(void)
619{
620 synchronize_rcu();
621}
622
623module_param(blimit, int, 0); 615module_param(blimit, int, 0);
624module_param(qhimark, int, 0); 616module_param(qhimark, int, 0);
625module_param(qlowmark, int, 0); 617module_param(qlowmark, int, 0);
@@ -627,7 +619,6 @@ module_param(qlowmark, int, 0);
627module_param(rsinterval, int, 0); 619module_param(rsinterval, int, 0);
628#endif 620#endif
629EXPORT_SYMBOL_GPL(rcu_batches_completed); 621EXPORT_SYMBOL_GPL(rcu_batches_completed);
630EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ 622EXPORT_SYMBOL_GPL(call_rcu);
631EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 623EXPORT_SYMBOL_GPL(call_rcu_bh);
632EXPORT_SYMBOL_GPL(synchronize_rcu); 624EXPORT_SYMBOL_GPL(synchronize_rcu);
633EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
diff --git a/kernel/sched.c b/kernel/sched.c
index c13f1bd2df7d..a856040c200a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -818,6 +818,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
818 * the target CPU. 818 * the target CPU.
819 */ 819 */
820#ifdef CONFIG_SMP 820#ifdef CONFIG_SMP
821
822#ifndef tsk_is_polling
823#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
824#endif
825
821static void resched_task(task_t *p) 826static void resched_task(task_t *p)
822{ 827{
823 int cpu; 828 int cpu;
@@ -833,9 +838,9 @@ static void resched_task(task_t *p)
833 if (cpu == smp_processor_id()) 838 if (cpu == smp_processor_id())
834 return; 839 return;
835 840
836 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 841 /* NEED_RESCHED must be visible before we test polling */
837 smp_mb(); 842 smp_mb();
838 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 843 if (!tsk_is_polling(p))
839 smp_send_reschedule(cpu); 844 smp_send_reschedule(cpu);
840} 845}
841#else 846#else
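
The #ifndef default preserves the old TIF_POLLING_NRFLAG test while letting an architecture substitute its own notion of "the idle loop is polling NEED_RESCHED". A hypothetical override in an arch header, for a CPU whose idle loop never polls and therefore always needs the IPI:

	/* Hypothetical <asm/thread_info.h> override: never polling, so
	 * resched_task() always falls through to smp_send_reschedule(). */
	#define tsk_is_polling(t)	0
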
@@ -3886,6 +3891,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3886 !capable(CAP_SYS_NICE)) 3891 !capable(CAP_SYS_NICE))
3887 goto out_unlock; 3892 goto out_unlock;
3888 3893
3894 retval = security_task_setscheduler(p, 0, NULL);
3895 if (retval)
3896 goto out_unlock;
3897
3889 cpus_allowed = cpuset_cpus_allowed(p); 3898 cpus_allowed = cpuset_cpus_allowed(p);
3890 cpus_and(new_mask, new_mask, cpus_allowed); 3899 cpus_and(new_mask, new_mask, cpus_allowed);
3891 retval = set_cpus_allowed(p, new_mask); 3900 retval = set_cpus_allowed(p, new_mask);
@@ -3954,7 +3963,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
3954 if (!p) 3963 if (!p)
3955 goto out_unlock; 3964 goto out_unlock;
3956 3965
3957 retval = 0; 3966 retval = security_task_getscheduler(p);
3967 if (retval)
3968 goto out_unlock;
3969
3958 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 3970 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
3959 3971
3960out_unlock: 3972out_unlock:
@@ -4046,6 +4058,9 @@ asmlinkage long sys_sched_yield(void)
4046 4058
4047static inline void __cond_resched(void) 4059static inline void __cond_resched(void)
4048{ 4060{
4061#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4062 __might_sleep(__FILE__, __LINE__);
4063#endif
4049 /* 4064 /*
4050 * The BKS might be reacquired before we have dropped 4065 * The BKS might be reacquired before we have dropped
4051 * PREEMPT_ACTIVE, which could trigger a second 4066 * PREEMPT_ACTIVE, which could trigger a second
@@ -4142,7 +4157,7 @@ EXPORT_SYMBOL(yield);
4142 */ 4157 */
4143void __sched io_schedule(void) 4158void __sched io_schedule(void)
4144{ 4159{
4145 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4160 struct runqueue *rq = &__raw_get_cpu_var(runqueues);
4146 4161
4147 atomic_inc(&rq->nr_iowait); 4162 atomic_inc(&rq->nr_iowait);
4148 schedule(); 4163 schedule();
@@ -4153,7 +4168,7 @@ EXPORT_SYMBOL(io_schedule);
4153 4168
4154long __sched io_schedule_timeout(long timeout) 4169long __sched io_schedule_timeout(long timeout)
4155{ 4170{
4156 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4171 struct runqueue *rq = &__raw_get_cpu_var(runqueues);
4157 long ret; 4172 long ret;
4158 4173
4159 atomic_inc(&rq->nr_iowait); 4174 atomic_inc(&rq->nr_iowait);
@@ -4237,7 +4252,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4237 if (retval) 4252 if (retval)
4238 goto out_unlock; 4253 goto out_unlock;
4239 4254
4240 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4255 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4241 0 : task_timeslice(p), &t); 4256 0 : task_timeslice(p), &t);
4242 read_unlock(&tasklist_lock); 4257 read_unlock(&tasklist_lock);
4243 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4258 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4746,6 +4761,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4746 break; 4761 break;
4747#ifdef CONFIG_HOTPLUG_CPU 4762#ifdef CONFIG_HOTPLUG_CPU
4748 case CPU_UP_CANCELED: 4763 case CPU_UP_CANCELED:
4764 if (!cpu_rq(cpu)->migration_thread)
4765 break;
4749 /* Unbind it from offline cpu so it can run. Fall thru. */ 4766 /* Unbind it from offline cpu so it can run. Fall thru. */
4750 kthread_bind(cpu_rq(cpu)->migration_thread, 4767 kthread_bind(cpu_rq(cpu)->migration_thread,
4751 any_online_cpu(cpu_online_map)); 4768 any_online_cpu(cpu_online_map));

diff --git a/kernel/signal.c b/kernel/signal.c
index e5f8aea78ffe..52adf53929f6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,12 +23,12 @@
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/signal.h> 25#include <linux/signal.h>
26#include <linux/audit.h>
27#include <linux/capability.h> 26#include <linux/capability.h>
28#include <asm/param.h> 27#include <asm/param.h>
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
30#include <asm/unistd.h> 29#include <asm/unistd.h>
31#include <asm/siginfo.h> 30#include <asm/siginfo.h>
31#include "audit.h" /* audit_signal_info() */
32 32
33/* 33/*
34 * SLAB caches for signal bits. 34 * SLAB caches for signal bits.
@@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1531 spin_unlock_irqrestore(&sighand->siglock, flags); 1531 spin_unlock_irqrestore(&sighand->siglock, flags);
1532} 1532}
1533 1533
1534static inline int may_ptrace_stop(void)
1535{
1536 if (!likely(current->ptrace & PT_PTRACED))
1537 return 0;
1538
1539 if (unlikely(current->parent == current->real_parent &&
1540 (current->ptrace & PT_ATTACHED)))
1541 return 0;
1542
1543 if (unlikely(current->signal == current->parent->signal) &&
1544 unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
1545 return 0;
1546
1547 /*
1548 * Are we in the middle of do_coredump?
1549 * If so, and our tracer is also part of the coredump, stopping
1550 * is a deadlock situation, and pointless because our tracer
1551 * is dead, so don't allow us to stop.
1552 * If SIGKILL was already sent before the caller unlocked
1553 * ->siglock we must see ->core_waiters != 0. Otherwise it
1554 * is safe to enter schedule().
1555 */
1556 if (unlikely(current->mm->core_waiters) &&
1557 unlikely(current->mm == current->parent->mm))
1558 return 0;
1559
1560 return 1;
1561}
1562
1534/* 1563/*
1535 * This must be called with current->sighand->siglock held. 1564 * This must be called with current->sighand->siglock held.
1536 * 1565 *
@@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1559 spin_unlock_irq(&current->sighand->siglock); 1588 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze(); 1589 try_to_freeze();
1561 read_lock(&tasklist_lock); 1590 read_lock(&tasklist_lock);
1562 if (likely(current->ptrace & PT_PTRACED) && 1591 if (may_ptrace_stop()) {
1563 likely(current->parent != current->real_parent ||
1564 !(current->ptrace & PT_ATTACHED)) &&
1565 (likely(current->parent->signal != current->signal) ||
1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1567 do_notify_parent_cldstop(current, CLD_TRAPPED); 1592 do_notify_parent_cldstop(current, CLD_TRAPPED);
1568 read_unlock(&tasklist_lock); 1593 read_unlock(&tasklist_lock);
1569 schedule(); 1594 schedule();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 336f92d64e2e..9e2f1c6e73d7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb,
470 break; 470 break;
471#ifdef CONFIG_HOTPLUG_CPU 471#ifdef CONFIG_HOTPLUG_CPU
472 case CPU_UP_CANCELED: 472 case CPU_UP_CANCELED:
473 if (!per_cpu(ksoftirqd, hotcpu))
474 break;
473 /* Unbind so it can run. Fall thru. */ 475 /* Unbind so it can run. Fall thru. */
474 kthread_bind(per_cpu(ksoftirqd, hotcpu), 476 kthread_bind(per_cpu(ksoftirqd, hotcpu),
475 any_online_cpu(cpu_online_map)); 477 any_online_cpu(cpu_online_map));
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 14c7faf02909..b5c3b94e01ce 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; 39 __raw_get_cpu_var(touch_timestamp) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
127 break; 127 break;
128#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
129 case CPU_UP_CANCELED: 129 case CPU_UP_CANCELED:
130 if (!per_cpu(watchdog_task, hotcpu))
131 break;
130 /* Unbind so it can run. Fall thru. */ 132 /* Unbind so it can run. Fall thru. */
131 kthread_bind(per_cpu(watchdog_task, hotcpu), 133 kthread_bind(per_cpu(watchdog_task, hotcpu),
132 any_online_cpu(cpu_online_map)); 134 any_online_cpu(cpu_online_map));
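
The same two-line guard lands in sched.c, softirq.c and softlockup.c above, because CPU_UP_CANCELED can be delivered even when the earlier CPU_UP_PREPARE callback failed before creating the per-cpu thread. The pattern in isolation, with a hypothetical per-cpu task pointer example_task:

	static DEFINE_PER_CPU(struct task_struct *, example_task);

	static int example_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
	{
		int hotcpu = (unsigned long)hcpu;

		switch (action) {
		case CPU_UP_CANCELED:
			if (!per_cpu(example_task, hotcpu))
				break;		/* UP_PREPARE never created it */
			/* Unbind from the dead CPU so it can run; fall thru. */
			kthread_bind(per_cpu(example_task, hotcpu),
				     any_online_cpu(cpu_online_map));
		case CPU_DEAD:
			kthread_stop(per_cpu(example_task, hotcpu));
			per_cpu(example_task, hotcpu) = NULL;
			break;
		}
		return NOTIFY_OK;
	}
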
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..2c0aacc37c55 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/syscalls.h> 6#include <linux/syscalls.h>
7#include <linux/kthread.h>
7#include <asm/atomic.h> 8#include <asm/atomic.h>
8#include <asm/semaphore.h> 9#include <asm/semaphore.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
@@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads;
25static atomic_t stopmachine_thread_ack; 26static atomic_t stopmachine_thread_ack;
26static DECLARE_MUTEX(stopmachine_mutex); 27static DECLARE_MUTEX(stopmachine_mutex);
27 28
28static int stopmachine(void *cpu) 29static int stopmachine(void *unused)
29{ 30{
30 int irqs_disabled = 0; 31 int irqs_disabled = 0;
31 int prepared = 0; 32 int prepared = 0;
32 33
33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
34
35 /* Ack: we are alive */ 34 /* Ack: we are alive */
36 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 35 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
37 atomic_inc(&stopmachine_thread_ack); 36 atomic_inc(&stopmachine_thread_ack);
@@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state)
85 84
86static int stop_machine(void) 85static int stop_machine(void)
87{ 86{
88 int i, ret = 0; 87 int ret = 0;
88 unsigned int i;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 90
91 /* One high-prio thread per cpu. We'll do this one. */ 91 /* One high-prio thread per cpu. We'll do this one. */
@@ -96,11 +96,16 @@ static int stop_machine(void)
96 stopmachine_state = STOPMACHINE_WAIT; 96 stopmachine_state = STOPMACHINE_WAIT;
97 97
98 for_each_online_cpu(i) { 98 for_each_online_cpu(i) {
99 struct task_struct *tsk;
99 if (i == raw_smp_processor_id()) 100 if (i == raw_smp_processor_id())
100 continue; 101 continue;
101 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 102 tsk = kthread_create(stopmachine, NULL, "stopmachine");
102 if (ret < 0) 103 if (IS_ERR(tsk)) {
104 ret = PTR_ERR(tsk);
103 break; 105 break;
106 }
107 kthread_bind(tsk, i);
108 wake_up_process(tsk);
104 stopmachine_num_threads++; 109 stopmachine_num_threads++;
105 } 110 }
106 111
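
The conversion follows the canonical kthread bring-up idiom: create the thread dormant, pin it with kthread_bind() before it has ever run, then wake it; this removes the racy set_cpus_allowed() the thread previously performed on its own behalf. The idiom in isolation (worker_fn is a hypothetical `int worker_fn(void *)` thread function):

	static int start_pinned_worker(unsigned int cpu)
	{
		struct task_struct *tsk;

		tsk = kthread_create(worker_fn, NULL, "worker/%u", cpu);
		if (IS_ERR(tsk))
			return PTR_ERR(tsk);
		kthread_bind(tsk, cpu);		/* legal only while dormant */
		wake_up_process(tsk);		/* worker_fn() now runs on cpu */
		return 0;
	}
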
diff --git a/kernel/sys.c b/kernel/sys.c
index 0b6ec0e7936f..2d5179c67cec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -13,7 +13,6 @@
13#include <linux/notifier.h> 13#include <linux/notifier.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/prctl.h> 15#include <linux/prctl.h>
16#include <linux/init.h>
17#include <linux/highuid.h> 16#include <linux/highuid.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -57,6 +56,12 @@
57#ifndef GET_FPEXC_CTL 56#ifndef GET_FPEXC_CTL
58# define GET_FPEXC_CTL(a,b) (-EINVAL) 57# define GET_FPEXC_CTL(a,b) (-EINVAL)
59#endif 58#endif
59#ifndef GET_ENDIAN
60# define GET_ENDIAN(a,b) (-EINVAL)
61#endif
62#ifndef SET_ENDIAN
63# define SET_ENDIAN(a,b) (-EINVAL)
64#endif
60 65
61/* 66/*
62 * this is where the system-wide overflow UID and GID are defined, for 67 * this is where the system-wide overflow UID and GID are defined, for
@@ -132,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v) 137 unsigned long val, void *v)
133{ 138{
134 int ret = NOTIFY_DONE; 139 int ret = NOTIFY_DONE;
135 struct notifier_block *nb; 140 struct notifier_block *nb, *next_nb;
136 141
137 nb = rcu_dereference(*nl); 142 nb = rcu_dereference(*nl);
138 while (nb) { 143 while (nb) {
144 next_nb = rcu_dereference(nb->next);
139 ret = nb->notifier_call(nb, val, v); 145 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 146 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break; 147 break;
142 nb = rcu_dereference(nb->next); 148 nb = next_nb;
143 } 149 }
144 return ret; 150 return ret;
145} 151}
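
Fetching next_nb before invoking the callback matters because a notifier may unregister itself from within its own handler; once notifier_call() returns, nb may already be unlinked from the chain, so nb->next can no longer be trusted. The two orderings side by side:

	/* Unsafe: nb may have unregistered itself during the call. */
	ret = nb->notifier_call(nb, val, v);
	nb = rcu_dereference(nb->next);		/* use-after-unlink hazard */

	/* Safe: capture the successor first, then invoke the callback. */
	next_nb = rcu_dereference(nb->next);
	ret = nb->notifier_call(nb, val, v);
	nb = next_nb;
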
@@ -583,7 +589,7 @@ void emergency_restart(void)
583} 589}
584EXPORT_SYMBOL_GPL(emergency_restart); 590EXPORT_SYMBOL_GPL(emergency_restart);
585 591
586void kernel_restart_prepare(char *cmd) 592static void kernel_restart_prepare(char *cmd)
587{ 593{
588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 594 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
589 system_state = SYSTEM_RESTART; 595 system_state = SYSTEM_RESTART;
@@ -617,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
617 * Move into place and start executing a preloaded standalone 623 * Move into place and start executing a preloaded standalone
618 * executable. If nothing was preloaded return an error. 624 * executable. If nothing was preloaded return an error.
619 */ 625 */
620void kernel_kexec(void) 626static void kernel_kexec(void)
621{ 627{
622#ifdef CONFIG_KEXEC 628#ifdef CONFIG_KEXEC
623 struct kimage *image; 629 struct kimage *image;
@@ -631,7 +637,6 @@ void kernel_kexec(void)
631 machine_kexec(image); 637 machine_kexec(image);
632#endif 638#endif
633} 639}
634EXPORT_SYMBOL_GPL(kernel_kexec);
635 640
636void kernel_shutdown_prepare(enum system_states state) 641void kernel_shutdown_prepare(enum system_states state)
637{ 642{
@@ -1860,23 +1865,20 @@ out:
1860 * fields when reaping, so a sample either gets all the additions of a 1865 * fields when reaping, so a sample either gets all the additions of a
1861 * given child after it's reaped, or none so this sample is before reaping. 1866 * given child after it's reaped, or none so this sample is before reaping.
1862 * 1867 *
1863 * tasklist_lock locking optimisation: 1868 * Locking:
1864 * If we are current and single threaded, we do not need to take the tasklist 1869 * We need to take the siglock for CHILDREN, SELF and BOTH
1865 * lock or the siglock. No one else can take our signal_struct away, 1870 * for the cases current multithreaded, non-current single threaded,
1866 * no one else can reap the children to update signal->c* counters, and 1871 * non-current multithreaded. Thread traversal is now safe with
1867 * no one else can race with the signal-> fields. 1872 * the siglock held.
1868 * If we do not take the tasklist_lock, the signal-> fields could be read 1873 * Strictly speaking, we do not need to take the siglock if we are current and
1869 * out of order while another thread was just exiting. So we place a 1874 * single threaded, as no one else can take our signal_struct away, no one
1870 * read memory barrier when we avoid the lock. On the writer side, 1875 * else can reap the children to update signal->c* counters, and no one else
1871 * write memory barrier is implied in __exit_signal as __exit_signal releases 1876 * can race with the signal-> fields. If we do not take any lock, the
1872 * the siglock spinlock after updating the signal-> fields. 1877 * signal-> fields could be read out of order while another thread was just
1873 * 1878 * exiting. So we should place a read memory barrier when we avoid the lock.
1874 * We don't really need the siglock when we access the non c* fields 1879 * On the writer side, write memory barrier is implied in __exit_signal
1875 * of the signal_struct (for RUSAGE_SELF) even in multithreaded 1880 * as __exit_signal releases the siglock spinlock after updating the signal->
1876 * case, since we take the tasklist lock for read and the non c* signal-> 1881 * fields. But we don't do this yet to keep things simple.
1877 * fields are updated only in __exit_signal, which is called with
1878 * tasklist_lock taken for write, hence these two threads cannot execute
1879 * concurrently.
1880 * 1882 *
1881 */ 1883 */
1882 1884
@@ -1885,35 +1887,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1885 struct task_struct *t; 1887 struct task_struct *t;
1886 unsigned long flags; 1888 unsigned long flags;
1887 cputime_t utime, stime; 1889 cputime_t utime, stime;
1888 int need_lock = 0;
1889 1890
1890 memset((char *) r, 0, sizeof *r); 1891 memset((char *) r, 0, sizeof *r);
1891 utime = stime = cputime_zero; 1892 utime = stime = cputime_zero;
1892 1893
1893 if (p != current || !thread_group_empty(p)) 1894 rcu_read_lock();
1894 need_lock = 1; 1895 if (!lock_task_sighand(p, &flags)) {
1895 1896 rcu_read_unlock();
1896 if (need_lock) { 1897 return;
1897 read_lock(&tasklist_lock); 1898 }
1898 if (unlikely(!p->signal)) {
1899 read_unlock(&tasklist_lock);
1900 return;
1901 }
1902 } else
1903 /* See locking comments above */
1904 smp_rmb();
1905 1899
1906 switch (who) { 1900 switch (who) {
1907 case RUSAGE_BOTH: 1901 case RUSAGE_BOTH:
1908 case RUSAGE_CHILDREN: 1902 case RUSAGE_CHILDREN:
1909 spin_lock_irqsave(&p->sighand->siglock, flags);
1910 utime = p->signal->cutime; 1903 utime = p->signal->cutime;
1911 stime = p->signal->cstime; 1904 stime = p->signal->cstime;
1912 r->ru_nvcsw = p->signal->cnvcsw; 1905 r->ru_nvcsw = p->signal->cnvcsw;
1913 r->ru_nivcsw = p->signal->cnivcsw; 1906 r->ru_nivcsw = p->signal->cnivcsw;
1914 r->ru_minflt = p->signal->cmin_flt; 1907 r->ru_minflt = p->signal->cmin_flt;
1915 r->ru_majflt = p->signal->cmaj_flt; 1908 r->ru_majflt = p->signal->cmaj_flt;
1916 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1917 1909
1918 if (who == RUSAGE_CHILDREN) 1910 if (who == RUSAGE_CHILDREN)
1919 break; 1911 break;
@@ -1941,8 +1933,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1941 BUG(); 1933 BUG();
1942 } 1934 }
1943 1935
1944 if (need_lock) 1936 unlock_task_sighand(p, &flags);
1945 read_unlock(&tasklist_lock); 1937 rcu_read_unlock();
1938
1946 cputime_to_timeval(utime, &r->ru_utime); 1939 cputime_to_timeval(utime, &r->ru_utime);
1947 cputime_to_timeval(stime, &r->ru_stime); 1940 cputime_to_timeval(stime, &r->ru_stime);
1948} 1941}
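
lock_task_sighand() (paired with unlock_task_sighand()) takes ->siglock only if the task still has a valid sighand, failing cleanly for an exited task, which is what lets the tasklist_lock/need_lock dance above disappear; the surrounding rcu_read_lock() keeps the sighand_struct itself alive across the test. The resulting skeleton, reduced from k_getrusage():

	unsigned long flags;

	rcu_read_lock();
	if (lock_task_sighand(p, &flags)) {	/* NULL if p already exited */
		/* ... read signal->c* counters, walk the threads ... */
		unlock_task_sighand(p, &flags);
	}
	rcu_read_unlock();
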
@@ -2057,6 +2050,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2057 return -EFAULT; 2050 return -EFAULT;
2058 return 0; 2051 return 0;
2059 } 2052 }
2053 case PR_GET_ENDIAN:
2054 error = GET_ENDIAN(current, arg2);
2055 break;
2056 case PR_SET_ENDIAN:
2057 error = SET_ENDIAN(current, arg2);
2058 break;
2059
2060 default: 2060 default:
2061 error = -EINVAL; 2061 error = -EINVAL;
2062 break; 2062 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f1..6991bece67e8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
87cond_syscall(sys_inotify_add_watch); 87cond_syscall(sys_inotify_add_watch);
88cond_syscall(sys_inotify_rm_watch); 88cond_syscall(sys_inotify_rm_watch);
89cond_syscall(sys_migrate_pages); 89cond_syscall(sys_migrate_pages);
90cond_syscall(sys_move_pages);
90cond_syscall(sys_chown16); 91cond_syscall(sys_chown16);
91cond_syscall(sys_fchown16); 92cond_syscall(sys_fchown16);
92cond_syscall(sys_getegid16); 93cond_syscall(sys_getegid16);
@@ -132,3 +133,4 @@ cond_syscall(sys_mincore);
132cond_syscall(sys_madvise); 133cond_syscall(sys_madvise);
133cond_syscall(sys_mremap); 134cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages); 135cond_syscall(sys_remap_file_pages);
136cond_syscall(compat_sys_move_pages);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e82726faeeff..f1a4eb1a655e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
59extern int C_A_D; 59extern int C_A_D;
60extern int sysctl_overcommit_memory; 60extern int sysctl_overcommit_memory;
61extern int sysctl_overcommit_ratio; 61extern int sysctl_overcommit_ratio;
62extern int sysctl_panic_on_oom;
62extern int max_threads; 63extern int max_threads;
63extern int sysrq_enabled; 64extern int sysrq_enabled;
64extern int core_uses_pid; 65extern int core_uses_pid;
@@ -72,6 +73,7 @@ extern int printk_ratelimit_burst;
72extern int pid_max_min, pid_max_max; 73extern int pid_max_min, pid_max_max;
73extern int sysctl_drop_caches; 74extern int sysctl_drop_caches;
74extern int percpu_pagelist_fraction; 75extern int percpu_pagelist_fraction;
76extern int compat_log;
75 77
76#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 78#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
77int unknown_nmi_panic; 79int unknown_nmi_panic;
@@ -142,7 +144,6 @@ static struct ctl_table_header root_table_header =
142 144
143static ctl_table kern_table[]; 145static ctl_table kern_table[];
144static ctl_table vm_table[]; 146static ctl_table vm_table[];
145static ctl_table proc_table[];
146static ctl_table fs_table[]; 147static ctl_table fs_table[];
147static ctl_table debug_table[]; 148static ctl_table debug_table[];
148static ctl_table dev_table[]; 149static ctl_table dev_table[];
@@ -150,7 +151,7 @@ extern ctl_table random_table[];
150#ifdef CONFIG_UNIX98_PTYS 151#ifdef CONFIG_UNIX98_PTYS
151extern ctl_table pty_table[]; 152extern ctl_table pty_table[];
152#endif 153#endif
153#ifdef CONFIG_INOTIFY 154#ifdef CONFIG_INOTIFY_USER
154extern ctl_table inotify_table[]; 155extern ctl_table inotify_table[];
155#endif 156#endif
156 157
@@ -202,12 +203,6 @@ static ctl_table root_table[] = {
202 }, 203 },
203#endif 204#endif
204 { 205 {
205 .ctl_name = CTL_PROC,
206 .procname = "proc",
207 .mode = 0555,
208 .child = proc_table,
209 },
210 {
211 .ctl_name = CTL_FS, 206 .ctl_name = CTL_FS,
212 .procname = "fs", 207 .procname = "fs",
213 .mode = 0555, 208 .mode = 0555,
@@ -398,7 +393,7 @@ static ctl_table kern_table[] = {
398 .strategy = &sysctl_string, 393 .strategy = &sysctl_string,
399 }, 394 },
400#endif 395#endif
401#ifdef CONFIG_HOTPLUG 396#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
402 { 397 {
403 .ctl_name = KERN_HOTPLUG, 398 .ctl_name = KERN_HOTPLUG,
404 .procname = "hotplug", 399 .procname = "hotplug",
@@ -683,6 +678,16 @@ static ctl_table kern_table[] = {
683 .proc_handler = &proc_dointvec, 678 .proc_handler = &proc_dointvec,
684 }, 679 },
685#endif 680#endif
681#ifdef CONFIG_COMPAT
682 {
683 .ctl_name = KERN_COMPAT_LOG,
684 .procname = "compat-log",
685 .data = &compat_log,
686 .maxlen = sizeof (int),
687 .mode = 0644,
688 .proc_handler = &proc_dointvec,
689 },
690#endif
686 { .ctl_name = 0 } 691 { .ctl_name = 0 }
687}; 692};
688 693
@@ -702,6 +707,14 @@ static ctl_table vm_table[] = {
702 .proc_handler = &proc_dointvec, 707 .proc_handler = &proc_dointvec,
703 }, 708 },
704 { 709 {
710 .ctl_name = VM_PANIC_ON_OOM,
711 .procname = "panic_on_oom",
712 .data = &sysctl_panic_on_oom,
713 .maxlen = sizeof(sysctl_panic_on_oom),
714 .mode = 0644,
715 .proc_handler = &proc_dointvec,
716 },
717 {
705 .ctl_name = VM_OVERCOMMIT_RATIO, 718 .ctl_name = VM_OVERCOMMIT_RATIO,
706 .procname = "overcommit_ratio", 719 .procname = "overcommit_ratio",
707 .data = &sysctl_overcommit_ratio, 720 .data = &sysctl_overcommit_ratio,
@@ -918,10 +931,6 @@ static ctl_table vm_table[] = {
918 { .ctl_name = 0 } 931 { .ctl_name = 0 }
919}; 932};
920 933
921static ctl_table proc_table[] = {
922 { .ctl_name = 0 }
923};
924
925static ctl_table fs_table[] = { 934static ctl_table fs_table[] = {
926 { 935 {
927 .ctl_name = FS_NRINODE, 936 .ctl_name = FS_NRINODE,
@@ -1028,7 +1037,7 @@ static ctl_table fs_table[] = {
1028 .mode = 0644, 1037 .mode = 0644,
1029 .proc_handler = &proc_doulongvec_minmax, 1038 .proc_handler = &proc_doulongvec_minmax,
1030 }, 1039 },
1031#ifdef CONFIG_INOTIFY 1040#ifdef CONFIG_INOTIFY_USER
1032 { 1041 {
1033 .ctl_name = FS_INOTIFY, 1042 .ctl_name = FS_INOTIFY,
1034 .procname = "inotify", 1043 .procname = "inotify",
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
523 523
524 524
525#else 525#else
526#ifndef CONFIG_GENERIC_TIME
526/* 527/*
527 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 528 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
528 * and therefore only yields usec accuracy 529 * and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
537} 538}
538EXPORT_SYMBOL_GPL(getnstimeofday); 539EXPORT_SYMBOL_GPL(getnstimeofday);
539#endif 540#endif
541#endif
540 542
541/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 543/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
542 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 544 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..74eca5939bd9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,349 @@
1/*
2 * linux/kernel/time/clocksource.c
3 *
4 * This file contains the functions which manage clocksource drivers.
5 *
6 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */
26
27#include <linux/clocksource.h>
28#include <linux/sysdev.h>
29#include <linux/init.h>
30#include <linux/module.h>
31
32/* XXX - Would like a better way for initializing curr_clocksource */
33extern struct clocksource clocksource_jiffies;
34
35/*[Clocksource internal variables]---------
36 * curr_clocksource:
37 * currently selected clocksource. Initialized to clocksource_jiffies.
38 * next_clocksource:
39 * pending next selected clocksource.
40 * clocksource_list:
41 * linked list with the registered clocksources
42 * clocksource_lock:
43 * protects manipulations to curr_clocksource and next_clocksource
44 * and the clocksource_list
45 * override_name:
46 * Name of the user-specified clocksource.
47 */
48static struct clocksource *curr_clocksource = &clocksource_jiffies;
49static struct clocksource *next_clocksource;
50static LIST_HEAD(clocksource_list);
51static DEFINE_SPINLOCK(clocksource_lock);
52static char override_name[32];
53static int finished_booting;
54
55/* clocksource_done_booting - Called near the end of bootup
56 *
57 * Hack to avoid lots of clocksource churn at boot time
58 */
59static int __init clocksource_done_booting(void)
60{
61 finished_booting = 1;
62 return 0;
63}
64
65late_initcall(clocksource_done_booting);
66
67/**
68 * clocksource_get_next - Returns the selected clocksource
69 *
70 */
71struct clocksource *clocksource_get_next(void)
72{
73 unsigned long flags;
74
75 spin_lock_irqsave(&clocksource_lock, flags);
76 if (next_clocksource && finished_booting) {
77 curr_clocksource = next_clocksource;
78 next_clocksource = NULL;
79 }
80 spin_unlock_irqrestore(&clocksource_lock, flags);
81
82 return curr_clocksource;
83}
84
85/**
86 * select_clocksource - Finds the best registered clocksource.
87 *
88 * Private function. Must hold clocksource_lock when called.
89 *
90 * Looks through the list of registered clocksources, returning
91 * the one with the highest rating value. If there is a clocksource
92 * name that matches the override string, it returns that clocksource.
93 */
94static struct clocksource *select_clocksource(void)
95{
96 struct clocksource *best = NULL;
97 struct list_head *tmp;
98
99 list_for_each(tmp, &clocksource_list) {
100 struct clocksource *src;
101
102 src = list_entry(tmp, struct clocksource, list);
103 if (!best)
104 best = src;
105
106 /* check for override: */
107 if (strlen(src->name) == strlen(override_name) &&
108 !strcmp(src->name, override_name)) {
109 best = src;
110 break;
111 }
112 /* pick the highest rating: */
113 if (src->rating > best->rating)
114 best = src;
115 }
116
117 return best;
118}
119
120/**
121 * is_registered_source - Checks if clocksource is registered
122 * @c: pointer to a clocksource
123 *
124 * Private helper function. Must hold clocksource_lock when called.
125 *
126 * Returns one if the clocksource is already registered, zero otherwise.
127 */
128static int is_registered_source(struct clocksource *c)
129{
130 int len = strlen(c->name);
131 struct list_head *tmp;
132
133 list_for_each(tmp, &clocksource_list) {
134 struct clocksource *src;
135
136 src = list_entry(tmp, struct clocksource, list);
137 if (strlen(src->name) == len && !strcmp(src->name, c->name))
138 return 1;
139 }
140
141 return 0;
142}
143
144/**
145 * clocksource_register - Used to install new clocksources
 146 * @c: clocksource to be registered
147 *
148 * Returns -EBUSY if registration fails, zero otherwise.
149 */
150int clocksource_register(struct clocksource *c)
151{
152 int ret = 0;
153 unsigned long flags;
154
155 spin_lock_irqsave(&clocksource_lock, flags);
156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name);
160 ret = -EBUSY;
161 } else {
162 /* register it */
163 list_add(&c->list, &clocksource_list);
164 /* scan the registered clocksources, and pick the best one */
165 next_clocksource = select_clocksource();
166 }
167 spin_unlock_irqrestore(&clocksource_lock, flags);
168 return ret;
169}
170EXPORT_SYMBOL(clocksource_register);
171
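As a usage illustration (a hedged sketch, not part of this patch), a hypothetical timer driver fills in a struct clocksource and hands it to clocksource_register(). The field layout mirrors the jiffies clocksource added later in this series; the device name, rating, counter frequency and read callback are invented, and clocksource_hz2mult() is assumed to be the helper from clocksource.h:

static cycle_t mytimer_read(void)
{
	return (cycle_t)mytimer_read_counter();	/* hypothetical hardware read */
}

static struct clocksource clocksource_mytimer = {
	.name		= "mytimer",	/* listed in sysfs, matched by clocksource= */
	.rating		= 200,		/* above jiffies (0), below a good TSC */
	.read		= mytimer_read,
	.mask		= 0xffffffff,	/* 32-bit free-running counter */
	.shift		= 20,
	.is_continuous	= 1,
};

static int __init mytimer_clocksource_init(void)
{
	/* cycles -> ns multiplier for a hypothetical 10 MHz counter */
	clocksource_mytimer.mult = clocksource_hz2mult(10000000, 20);
	return clocksource_register(&clocksource_mytimer);
}
module_init(mytimer_clocksource_init);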
172/**
173 * clocksource_reselect - Rescan list for next clocksource
174 *
175 * A quick helper function to be used if a clocksource changes its
176 * rating. Forces the clocksource list to be re-scanned for the best
177 * clocksource.
178 */
179void clocksource_reselect(void)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&clocksource_lock, flags);
184 next_clocksource = select_clocksource();
185 spin_unlock_irqrestore(&clocksource_lock, flags);
186}
187EXPORT_SYMBOL(clocksource_reselect);
188
189/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused
192 * @buf: char buffer to be filled with clocksource list
193 *
194 * Provides sysfs interface for listing current clocksource.
195 */
196static ssize_t
197sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
198{
199 char *curr = buf;
200
201 spin_lock_irq(&clocksource_lock);
202 curr += sprintf(curr, "%s ", curr_clocksource->name);
203 spin_unlock_irq(&clocksource_lock);
204
205 curr += sprintf(curr, "\n");
206
207 return curr - buf;
208}
209
210/**
211 * sysfs_override_clocksource - interface for manually overriding clocksource
212 * @dev: unused
213 * @buf: name of override clocksource
214 * @count: length of buffer
215 *
216 * Takes input from sysfs interface for manually overriding the default
 217 * clocksource selection.
218 */
219static ssize_t sysfs_override_clocksource(struct sys_device *dev,
220 const char *buf, size_t count)
221{
222 size_t ret = count;
223 /* strings from sysfs write are not 0 terminated! */
224 if (count >= sizeof(override_name))
225 return -EINVAL;
226
 227 /* strip off \n: */
228 if (buf[count-1] == '\n')
229 count--;
230 if (count < 1)
231 return -EINVAL;
232
233 spin_lock_irq(&clocksource_lock);
234
235 /* copy the name given: */
236 memcpy(override_name, buf, count);
237 override_name[count] = 0;
238
239 /* try to select it: */
240 next_clocksource = select_clocksource();
241
242 spin_unlock_irq(&clocksource_lock);
243
244 return ret;
245}
246
247/**
 248 * sysfs_show_available_clocksources - sysfs interface for listing clocksources
249 * @dev: unused
250 * @buf: char buffer to be filled with clocksource list
251 *
252 * Provides sysfs interface for listing registered clocksources
253 */
254static ssize_t
255sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
256{
257 struct list_head *tmp;
258 char *curr = buf;
259
260 spin_lock_irq(&clocksource_lock);
261 list_for_each(tmp, &clocksource_list) {
262 struct clocksource *src;
263
264 src = list_entry(tmp, struct clocksource, list);
265 curr += sprintf(curr, "%s ", src->name);
266 }
267 spin_unlock_irq(&clocksource_lock);
268
269 curr += sprintf(curr, "\n");
270
271 return curr - buf;
272}
273
274/*
275 * Sysfs setup bits:
276 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource);
279
280static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL);
282
283static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"),
285};
286
287static struct sys_device device_clocksource = {
288 .id = 0,
289 .cls = &clocksource_sysclass,
290};
291
292static int __init init_clocksource_sysfs(void)
293{
294 int error = sysdev_class_register(&clocksource_sysclass);
295
296 if (!error)
297 error = sysdev_register(&device_clocksource);
298 if (!error)
299 error = sysdev_create_file(
300 &device_clocksource,
301 &attr_current_clocksource);
302 if (!error)
303 error = sysdev_create_file(
304 &device_clocksource,
305 &attr_available_clocksource);
306 return error;
307}
308
309device_initcall(init_clocksource_sysfs);
310
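Given the sysdev class "clocksource" and device id 0 registered above, the two attributes should surface under /sys/devices/system/clocksource/clocksource0/ (the path is inferred from the code, not stated in the patch). A small user-space sketch exercising them:

#include <stdio.h>

#define SYSFS_DIR "/sys/devices/system/clocksource/clocksource0/"

int main(void)
{
	char buf[64];
	FILE *f = fopen(SYSFS_DIR "available_clocksource", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("available: %s", buf);
	if (f)
		fclose(f);

	/* request an override; applied via select_clocksource() */
	f = fopen(SYSFS_DIR "current_clocksource", "w");
	if (f) {
		fputs("jiffies\n", f);	/* any registered name */
		fclose(f);
	}
	return 0;
}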
311/**
312 * boot_override_clocksource - boot clock override
313 * @str: override name
314 *
315 * Takes a clocksource= boot argument and uses it
316 * as the clocksource override name.
317 */
318static int __init boot_override_clocksource(char* str)
319{
320 unsigned long flags;
321 spin_lock_irqsave(&clocksource_lock, flags);
322 if (str)
323 strlcpy(override_name, str, sizeof(override_name));
324 spin_unlock_irqrestore(&clocksource_lock, flags);
325 return 1;
326}
327
328__setup("clocksource=", boot_override_clocksource);
329
330/**
331 * boot_override_clock - Compatibility layer for deprecated boot option
332 * @str: override name
333 *
334 * DEPRECATED! Takes a clock= boot argument and uses it
335 * as the clocksource override name
336 */
337static int __init boot_override_clock(char* str)
338{
339 if (!strcmp(str, "pmtmr")) {
340 printk("Warning: clock=pmtmr is deprecated. "
341 "Use clocksource=acpi_pm.\n");
342 return boot_override_clocksource("acpi_pm");
343 }
344 printk("Warning! clock= boot option is deprecated. "
345 "Use clocksource=xyz\n");
346 return boot_override_clocksource(str);
347}
348
349__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..126bb30c4afe
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
1/***********************************************************************
2* linux/kernel/time/jiffies.c
3*
4* This file contains the jiffies based clocksource.
5*
6* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7*
8* This program is free software; you can redistribute it and/or modify
9* it under the terms of the GNU General Public License as published by
10* the Free Software Foundation; either version 2 of the License, or
11* (at your option) any later version.
12*
13* This program is distributed in the hope that it will be useful,
14* but WITHOUT ANY WARRANTY; without even the implied warranty of
15* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16* GNU General Public License for more details.
17*
18* You should have received a copy of the GNU General Public License
19* along with this program; if not, write to the Free Software
20* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*
22************************************************************************/
23#include <linux/clocksource.h>
24#include <linux/jiffies.h>
25#include <linux/init.h>
26
27/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as
30 * the timer interrupt frequency HZ and it suffers
31 * inaccuracies caused by missed or lost timer
 32 * interrupts and the inability of the timer
 33 * interrupt hardware to accurately tick at the
 34 * requested HZ value. It is also not recommended
35 * for "tick-less" systems.
36 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
38
39/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
40 * conversion, the .shift value could be zero. However
41 * this would make NTP adjustments impossible as they are
42 * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
 43 * shift both the numerator and denominator the same
44 * amount, and give ntp adjustments in units of 1/2^8
45 *
46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as
 48 * HZ shrinks, so values greater than 8 overflow 32 bits when
49 * HZ=100.
50 */
51#define JIFFIES_SHIFT 8
52
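A standalone arithmetic check of the overflow bound claimed above, approximating ACTHZ as HZ in 8-bit fixed point: at HZ=100 one jiffy is 10,000,000 ns, so shift 8 yields a mult of 2,560,000,000 (fits in 32 bits) while shift 9 would need 5,120,000,000 and overflow:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t nsec_per_sec = 1000000000ULL;
	uint64_t acthz = 100ULL << 8;		/* HZ=100 in 8-bit fixed point */
	uint32_t nsec_per_jiffy = (nsec_per_sec << 8) / acthz;	/* 10,000,000 */

	printf("shift 8: %llu\n", (unsigned long long)nsec_per_jiffy << 8); /* 2560000000, fits u32 */
	printf("shift 9: %llu\n", (unsigned long long)nsec_per_jiffy << 9); /* 5120000000, > 2^32-1 */
	return 0;
}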
53static cycle_t jiffies_read(void)
54{
55 return (cycle_t) jiffies;
56}
57
58struct clocksource clocksource_jiffies = {
59 .name = "jiffies",
 60 .rating = 0, /* lowest rating */
 61 .read = jiffies_read,
 62 .mask = 0xffffffff, /* 32 bits */
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66};
67
68static int __init init_jiffies_clocksource(void)
69{
70 return clocksource_register(&clocksource_jiffies);
71}
72
73module_init(init_jiffies_clocksource);
diff --git a/kernel/timer.c b/kernel/timer.c
index 9e49deed468c..5bb6b7976eec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
146void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
147{ 147{
148 timer->entry.next = NULL; 148 timer->entry.next = NULL;
149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); 149 timer->base = __raw_get_cpu_var(tvec_bases);
150} 150}
151EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
152 152
@@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync);
383static int cascade(tvec_base_t *base, tvec_t *tv, int index) 383static int cascade(tvec_base_t *base, tvec_t *tv, int index)
384{ 384{
385 /* cascade all the timers from tv up one level */ 385 /* cascade all the timers from tv up one level */
386 struct list_head *head, *curr; 386 struct timer_list *timer, *tmp;
387 struct list_head tv_list;
388
389 list_replace_init(tv->vec + index, &tv_list);
387 390
388 head = tv->vec + index;
389 curr = head->next;
390 /* 391 /*
391 * We are removing _all_ timers from the list, so we don't have to 392 * We are removing _all_ timers from the list, so we
392 * detach them individually, just clear the list afterwards. 393 * don't have to detach them individually.
393 */ 394 */
394 while (curr != head) { 395 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
395 struct timer_list *tmp; 396 BUG_ON(timer->base != base);
396 397 internal_add_timer(base, timer);
397 tmp = list_entry(curr, struct timer_list, entry);
398 BUG_ON(tmp->base != base);
399 curr = curr->next;
400 internal_add_timer(base, tmp);
401 } 398 }
402 INIT_LIST_HEAD(head);
403 399
404 return index; 400 return index;
405} 401}
@@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base)
419 415
420 spin_lock_irq(&base->lock); 416 spin_lock_irq(&base->lock);
421 while (time_after_eq(jiffies, base->timer_jiffies)) { 417 while (time_after_eq(jiffies, base->timer_jiffies)) {
422 struct list_head work_list = LIST_HEAD_INIT(work_list); 418 struct list_head work_list;
423 struct list_head *head = &work_list; 419 struct list_head *head = &work_list;
424 int index = base->timer_jiffies & TVR_MASK; 420 int index = base->timer_jiffies & TVR_MASK;
425 421
426 /* 422 /*
427 * Cascade timers: 423 * Cascade timers:
428 */ 424 */
@@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base)
431 (!cascade(base, &base->tv3, INDEX(1))) && 427 (!cascade(base, &base->tv3, INDEX(1))) &&
432 !cascade(base, &base->tv4, INDEX(2))) 428 !cascade(base, &base->tv4, INDEX(2)))
433 cascade(base, &base->tv5, INDEX(3)); 429 cascade(base, &base->tv5, INDEX(3));
434 ++base->timer_jiffies; 430 ++base->timer_jiffies;
435 list_splice_init(base->tv1.vec + index, &work_list); 431 list_replace_init(base->tv1.vec + index, &work_list);
436 while (!list_empty(head)) { 432 while (!list_empty(head)) {
437 void (*fn)(unsigned long); 433 void (*fn)(unsigned long);
438 unsigned long data; 434 unsigned long data;
@@ -601,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
601long time_precision = 1; /* clock precision (us) */ 597long time_precision = 1; /* clock precision (us) */
602long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 598long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
603long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 599long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
604static long time_phase; /* phase offset (scaled us) */
605long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 600long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
606 /* frequency offset (scaled ppm)*/ 601 /* frequency offset (scaled ppm)*/
607static long time_adj; /* tick adjust (scaled 1 / HZ) */ 602static long time_adj; /* tick adjust (scaled 1 / HZ) */
@@ -751,27 +746,14 @@ static long adjtime_adjustment(void)
751} 746}
752 747
753/* in the NTP reference this is called "hardclock()" */ 748/* in the NTP reference this is called "hardclock()" */
754static void update_wall_time_one_tick(void) 749static void update_ntp_one_tick(void)
755{ 750{
756 long time_adjust_step, delta_nsec; 751 long time_adjust_step;
757 752
758 time_adjust_step = adjtime_adjustment(); 753 time_adjust_step = adjtime_adjustment();
759 if (time_adjust_step) 754 if (time_adjust_step)
760 /* Reduce by this step the amount of time left */ 755 /* Reduce by this step the amount of time left */
761 time_adjust -= time_adjust_step; 756 time_adjust -= time_adjust_step;
762 delta_nsec = tick_nsec + time_adjust_step * 1000;
763 /*
764 * Advance the phase, once it gets to one microsecond, then
765 * advance the tick more.
766 */
767 time_phase += time_adj;
768 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
769 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
770 time_phase -= ltemp << (SHIFT_SCALE - 10);
771 delta_nsec += ltemp;
772 }
773 xtime.tv_nsec += delta_nsec;
774 time_interpolator_update(delta_nsec);
775 757
776 /* Changes by adjtime() do not take effect till next tick. */ 758 /* Changes by adjtime() do not take effect till next tick. */
777 if (time_next_adjust != 0) { 759 if (time_next_adjust != 0) {
@@ -784,36 +766,378 @@ static void update_wall_time_one_tick(void)
784 * Return how long ticks are at the moment, that is, how much time 766 * Return how long ticks are at the moment, that is, how much time
785 * update_wall_time_one_tick will add to xtime next time we call it 767 * update_wall_time_one_tick will add to xtime next time we call it
786 * (assuming no calls to do_adjtimex in the meantime). 768 * (assuming no calls to do_adjtimex in the meantime).
787 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 769 * The return value is in fixed-point nanoseconds shifted by the
788 * bits to the right of the binary point. 770 * specified number of bits to the right of the binary point.
789 * This function has no side-effects. 771 * This function has no side-effects.
790 */ 772 */
791u64 current_tick_length(void) 773u64 current_tick_length(void)
792{ 774{
793 long delta_nsec; 775 long delta_nsec;
776 u64 ret;
794 777
 778 /* calculate the finest interval NTP will allow,
 779 * i.e. a nanosecond value shifted by (SHIFT_SCALE - 10)
780 */
795 delta_nsec = tick_nsec + adjtime_adjustment() * 1000; 781 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
796 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; 782 ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
783 ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
784
785 return ret;
797} 786}
798 787
799/* 788/* XXX - all of this timekeeping code should be later moved to time.c */
800 * Using a loop looks inefficient, but "ticks" is 789#include <linux/clocksource.h>
801 * usually just one (we shouldn't be losing ticks, 790static struct clocksource *clock; /* pointer to current clocksource */
802 * we're doing this this way mainly for interrupt 791
803 * latency reasons, not because we think we'll 792#ifdef CONFIG_GENERIC_TIME
804 * have lots of lost timer ticks 793/**
 794 * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time
 795 *
 796 * Private function, must hold xtime_lock when being
 797 * called. Returns the number of nanoseconds since the
 798 * last call to update_wall_time() (adjusted by NTP scaling)
799 */
800static inline s64 __get_nsec_offset(void)
801{
802 cycle_t cycle_now, cycle_delta;
803 s64 ns_offset;
804
805 /* read clocksource: */
806 cycle_now = clocksource_read(clock);
807
808 /* calculate the delta since the last update_wall_time: */
809 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
810
811 /* convert to nanoseconds: */
812 ns_offset = cyc2ns(clock, cycle_delta);
813
814 return ns_offset;
815}
816
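cyc2ns() is provided by clocksource.h; a minimal sketch of the conversion it is assumed to perform, the usual (cycles * mult) >> shift fixed-point form:

/* sketch of the cycles->nanoseconds conversion used above, assuming
 * the (cycles * mult) >> shift fixed-point form from clocksource.h */
static inline s64 sketch_cyc2ns(u32 mult, u32 shift, cycle_t cycle_delta)
{
	return ((s64)cycle_delta * mult) >> shift;
}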
817/**
818 * __get_realtime_clock_ts - Returns the time of day in a timespec
819 * @ts: pointer to the timespec to be set
820 *
821 * Returns the time of day in a timespec. Used by
 822 * do_gettimeofday() and getnstimeofday().
823 */
824static inline void __get_realtime_clock_ts(struct timespec *ts)
825{
826 unsigned long seq;
827 s64 nsecs;
828
829 do {
830 seq = read_seqbegin(&xtime_lock);
831
832 *ts = xtime;
833 nsecs = __get_nsec_offset();
834
835 } while (read_seqretry(&xtime_lock, seq));
836
837 timespec_add_ns(ts, nsecs);
838}
839
840/**
841 * getnstimeofday - Returns the time of day in a timespec
842 * @ts: pointer to the timespec to be set
843 *
844 * Returns the time of day in a timespec.
845 */
846void getnstimeofday(struct timespec *ts)
847{
848 __get_realtime_clock_ts(ts);
849}
850
851EXPORT_SYMBOL(getnstimeofday);
852
853/**
854 * do_gettimeofday - Returns the time of day in a timeval
855 * @tv: pointer to the timeval to be set
856 *
 857 * NOTE: Users should be converted to using getnstimeofday()
858 */
859void do_gettimeofday(struct timeval *tv)
860{
861 struct timespec now;
862
863 __get_realtime_clock_ts(&now);
864 tv->tv_sec = now.tv_sec;
865 tv->tv_usec = now.tv_nsec/1000;
866}
867
868EXPORT_SYMBOL(do_gettimeofday);
869/**
870 * do_settimeofday - Sets the time of day
871 * @tv: pointer to the timespec variable containing the new time
872 *
 873 * Sets the time of day to the new time, updates NTP and notifies hrtimers
874 */
875int do_settimeofday(struct timespec *tv)
876{
877 unsigned long flags;
878 time_t wtm_sec, sec = tv->tv_sec;
879 long wtm_nsec, nsec = tv->tv_nsec;
880
881 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
882 return -EINVAL;
883
884 write_seqlock_irqsave(&xtime_lock, flags);
885
886 nsec -= __get_nsec_offset();
887
888 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
889 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
890
891 set_normalized_timespec(&xtime, sec, nsec);
892 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
893
894 ntp_clear();
895
896 write_sequnlock_irqrestore(&xtime_lock, flags);
897
898 /* signal hrtimers about time change */
899 clock_was_set();
900
901 return 0;
902}
903
904EXPORT_SYMBOL(do_settimeofday);
905
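The opposite-signed updates to wall_to_monotonic above keep the monotonic clock from jumping when the wall clock is set: the sum xtime + wall_to_monotonic is held constant. A sketch of the invariant (not part of the patch):

/* invariant preserved by do_settimeofday(), a sketch: setting the wall
 * clock forward by D seconds moves wall_to_monotonic back by D, so
 * monotonic = xtime + wall_to_monotonic is unchanged */
static void sketch_monotonic(struct timespec *mono)
{
	set_normalized_timespec(mono,
			xtime.tv_sec + wall_to_monotonic.tv_sec,
			xtime.tv_nsec + wall_to_monotonic.tv_nsec);
}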
906/**
907 * change_clocksource - Swaps clocksources if a new one is available
908 *
909 * Accumulates current time interval and initializes new clocksource
910 */
911static int change_clocksource(void)
912{
913 struct clocksource *new;
914 cycle_t now;
915 u64 nsec;
916 new = clocksource_get_next();
917 if (clock != new) {
918 now = clocksource_read(new);
919 nsec = __get_nsec_offset();
920 timespec_add_ns(&xtime, nsec);
921
922 clock = new;
923 clock->cycle_last = now;
924 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
925 clock->name);
926 return 1;
927 } else if (clock->update_callback) {
928 return clock->update_callback();
929 }
930 return 0;
931}
932#else
933#define change_clocksource() (0)
934#endif
935
936/**
 937 * timekeeping_is_continuous - check to see if timekeeping is free running
805 */ 938 */
806static void update_wall_time(unsigned long ticks) 939int timekeeping_is_continuous(void)
807{ 940{
941 unsigned long seq;
942 int ret;
943
808 do { 944 do {
809 ticks--; 945 seq = read_seqbegin(&xtime_lock);
810 update_wall_time_one_tick(); 946
811 if (xtime.tv_nsec >= 1000000000) { 947 ret = clock->is_continuous;
812 xtime.tv_nsec -= 1000000000; 948
949 } while (read_seqretry(&xtime_lock, seq));
950
951 return ret;
952}
953
954/*
955 * timekeeping_init - Initializes the clocksource and common timekeeping values
956 */
957void __init timekeeping_init(void)
958{
959 unsigned long flags;
960
961 write_seqlock_irqsave(&xtime_lock, flags);
962 clock = clocksource_get_next();
963 clocksource_calculate_interval(clock, tick_nsec);
964 clock->cycle_last = clocksource_read(clock);
965 ntp_clear();
966 write_sequnlock_irqrestore(&xtime_lock, flags);
967}
968
969
970/*
971 * timekeeping_resume - Resumes the generic timekeeping subsystem.
972 * @dev: unused
973 *
974 * This is for the generic clocksource timekeeping.
975 * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
976 * still managed by arch specific suspend/resume code.
977 */
978static int timekeeping_resume(struct sys_device *dev)
979{
980 unsigned long flags;
981
982 write_seqlock_irqsave(&xtime_lock, flags);
983 /* restart the last cycle value */
984 clock->cycle_last = clocksource_read(clock);
985 write_sequnlock_irqrestore(&xtime_lock, flags);
986 return 0;
987}
988
989/* sysfs resume/suspend bits for timekeeping */
990static struct sysdev_class timekeeping_sysclass = {
991 .resume = timekeeping_resume,
992 set_kset_name("timekeeping"),
993};
994
995static struct sys_device device_timer = {
996 .id = 0,
997 .cls = &timekeeping_sysclass,
998};
999
1000static int __init timekeeping_init_device(void)
1001{
1002 int error = sysdev_class_register(&timekeeping_sysclass);
1003 if (!error)
1004 error = sysdev_register(&device_timer);
1005 return error;
1006}
1007
1008device_initcall(timekeeping_init_device);
1009
1010/*
 1011 * If the error is already larger than one adjustment step, we look
 1012 * ahead another tick, to compensate for late or lost adjustments.
1013 */
1014static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
1015{
1016 int adj;
1017
1018 /*
1019 * As soon as the machine is synchronized to the external time
1020 * source this should be the common case.
1021 */
1022 error >>= 2;
1023 if (likely(sign > 0 ? error <= *interval : error >= *interval))
1024 return sign;
1025
1026 /*
1027 * An extra look ahead dampens the effect of the current error,
 1028 * which can grow quite large with continuously late updates, as
1029 * it would dominate the adjustment value and can lead to
1030 * oscillation.
1031 */
1032 error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1033 error -= clock->xtime_interval >> 1;
1034
1035 adj = 0;
1036 while (1) {
1037 error >>= 1;
1038 if (sign > 0 ? error <= *interval : error >= *interval)
1039 break;
1040 adj++;
1041 }
1042
1043 /*
1044 * Add the current adjustments to the error and take the offset
 1045 * into account; the latter can leave the error barely
 1046 * reduced at the next tick. Check the error again if there's
1047 * room for another adjustment, thus further reducing the error
1048 * which otherwise had to be corrected at the next update.
1049 */
1050 error = (error << 1) - *interval + *offset;
1051 if (sign > 0 ? error > *interval : error < *interval)
1052 adj++;
1053
1054 *interval <<= adj;
1055 *offset <<= adj;
1056 return sign << adj;
1057}
1058
1059/*
1060 * Adjust the multiplier to reduce the error value,
1061 * this is optimized for the most common adjustments of -1,0,1,
1062 * for other values we can do a bit more work.
1063 */
1064static void clocksource_adjust(struct clocksource *clock, s64 offset)
1065{
1066 s64 error, interval = clock->cycle_interval;
1067 int adj;
1068
1069 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1070 if (error > interval) {
1071 adj = clocksource_bigadjust(1, error, &interval, &offset);
1072 } else if (error < -interval) {
1073 interval = -interval;
1074 offset = -offset;
1075 adj = clocksource_bigadjust(-1, error, &interval, &offset);
1076 } else
1077 return;
1078
1079 clock->mult += adj;
1080 clock->xtime_interval += interval;
1081 clock->xtime_nsec -= offset;
1082 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1083}
1084
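Why a unit change to mult is paired with a cycle_interval-sized change to xtime_interval: xtime_interval is cycle_interval scaled by mult (in shifted nanoseconds), so bumping mult by adj adds exactly adj * cycle_interval per accumulated interval. A sketch of the relation, assuming the clocksource_calculate_interval() setup:

/* relation behind clocksource_adjust(), a sketch: per accumulated
 * interval, xtime_interval == cycle_interval * mult (in ns << shift),
 * so mult += adj implies xtime_interval += adj * cycle_interval */
static u64 sketch_xtime_interval(u64 cycle_interval, u32 mult)
{
	return cycle_interval * mult;
}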
1085/*
1086 * update_wall_time - Uses the current clocksource to increment the wall time
1087 *
1088 * Called from the timer interrupt, must hold a write on xtime_lock.
1089 */
1090static void update_wall_time(void)
1091{
1092 cycle_t offset;
1093
1094 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1095
1096#ifdef CONFIG_GENERIC_TIME
1097 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1098#else
1099 offset = clock->cycle_interval;
1100#endif
1101
1102 /* normally this loop will run just once, however in the
1103 * case of lost or late ticks, it will accumulate correctly.
1104 */
1105 while (offset >= clock->cycle_interval) {
1106 /* accumulate one interval */
1107 clock->xtime_nsec += clock->xtime_interval;
1108 clock->cycle_last += clock->cycle_interval;
1109 offset -= clock->cycle_interval;
1110
1111 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1112 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
813 xtime.tv_sec++; 1113 xtime.tv_sec++;
814 second_overflow(); 1114 second_overflow();
815 } 1115 }
816 } while (ticks); 1116
1117 /* interpolator bits */
1118 time_interpolator_update(clock->xtime_interval
1119 >> clock->shift);
1120 /* increment the NTP state machine */
1121 update_ntp_one_tick();
1122
1123 /* accumulate error between NTP and clock interval */
1124 clock->error += current_tick_length();
1125 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1126 }
1127
1128 /* correct the clock when NTP error is too big */
1129 clocksource_adjust(clock, offset);
1130
1131 /* store full nanoseconds into xtime */
1132 xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
1133 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1134
1135 /* check to see if there is a new clocksource to use */
1136 if (change_clocksource()) {
1137 clock->error = 0;
1138 clock->xtime_nsec = 0;
1139 clocksource_calculate_interval(clock, tick_nsec);
1140 }
817} 1141}
818 1142
819/* 1143/*
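To see the accumulation loop absorb a late tick, a standalone sketch with invented round numbers: with cycle_interval = 1000 cycles and offset = 2000 (two elapsed intervals), the loop runs twice, advancing cycle_last and xtime_nsec by two intervals and leaving offset < cycle_interval for clocksource_adjust():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* invented values: 1000 cycles per tick, 10 ms per tick at shift 8 */
	uint64_t cycle_interval = 1000, xtime_interval = 10000000ULL << 8;
	uint64_t offset = 2000, cycle_last = 0, xtime_nsec = 0;

	while (offset >= cycle_interval) {	/* runs twice: one on-time, one late tick */
		xtime_nsec += xtime_interval;
		cycle_last += cycle_interval;
		offset -= cycle_interval;
	}
	printf("cycle_last=%llu offset=%llu xtime_nsec=%llu\n",
	       (unsigned long long)cycle_last, (unsigned long long)offset,
	       (unsigned long long)xtime_nsec);	/* 2000, 0, two intervals */
	return 0;
}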
@@ -919,10 +1243,8 @@ static inline void update_times(void)
919 unsigned long ticks; 1243 unsigned long ticks;
920 1244
921 ticks = jiffies - wall_jiffies; 1245 ticks = jiffies - wall_jiffies;
922 if (ticks) { 1246 wall_jiffies += ticks;
923 wall_jiffies += ticks; 1247 update_wall_time();
924 update_wall_time(ticks);
925 }
926 calc_load(ticks); 1248 calc_load(ticks);
927} 1249}
928 1250
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..f69c804c8e62
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,918 @@
1/*
2 * Copyright (C) 2002-2006 Novell, Inc.
3 * Jan Beulich <jbeulich@novell.com>
4 * This code is released under version 2 of the GNU GPL.
5 *
6 * A simple API for unwinding kernel stacks. This is used for
7 * debugging and error reporting purposes. The kernel doesn't need
8 * full-blown stack unwinding with all the bells and whistles, so there
9 * is not much point in implementing the full Dwarf2 unwind API.
10 */
11
12#include <linux/unwind.h>
13#include <linux/module.h>
14#include <linux/delay.h>
15#include <linux/stop_machine.h>
16#include <asm/sections.h>
17#include <asm/uaccess.h>
18#include <asm/unaligned.h>
19
20extern char __start_unwind[], __end_unwind[];
21
22#define MAX_STACK_DEPTH 8
23
24#define EXTRA_INFO(f) { \
25 BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
26 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
27 + offsetof(struct unwind_frame_info, f) \
28 / FIELD_SIZEOF(struct unwind_frame_info, f), \
29 FIELD_SIZEOF(struct unwind_frame_info, f) \
30 }
31#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
32
33static const struct {
34 unsigned offs:BITS_PER_LONG / 2;
35 unsigned width:BITS_PER_LONG / 2;
36} reg_info[] = {
37 UNW_REGISTER_INFO
38};
39
40#undef PTREGS_INFO
41#undef EXTRA_INFO
42
43#ifndef REG_INVALID
44#define REG_INVALID(r) (reg_info[r].width == 0)
45#endif
46
47#define DW_CFA_nop 0x00
48#define DW_CFA_set_loc 0x01
49#define DW_CFA_advance_loc1 0x02
50#define DW_CFA_advance_loc2 0x03
51#define DW_CFA_advance_loc4 0x04
52#define DW_CFA_offset_extended 0x05
53#define DW_CFA_restore_extended 0x06
54#define DW_CFA_undefined 0x07
55#define DW_CFA_same_value 0x08
56#define DW_CFA_register 0x09
57#define DW_CFA_remember_state 0x0a
58#define DW_CFA_restore_state 0x0b
59#define DW_CFA_def_cfa 0x0c
60#define DW_CFA_def_cfa_register 0x0d
61#define DW_CFA_def_cfa_offset 0x0e
62#define DW_CFA_def_cfa_expression 0x0f
63#define DW_CFA_expression 0x10
64#define DW_CFA_offset_extended_sf 0x11
65#define DW_CFA_def_cfa_sf 0x12
66#define DW_CFA_def_cfa_offset_sf 0x13
67#define DW_CFA_val_offset 0x14
68#define DW_CFA_val_offset_sf 0x15
69#define DW_CFA_val_expression 0x16
70#define DW_CFA_lo_user 0x1c
71#define DW_CFA_GNU_window_save 0x2d
72#define DW_CFA_GNU_args_size 0x2e
73#define DW_CFA_GNU_negative_offset_extended 0x2f
74#define DW_CFA_hi_user 0x3f
75
76#define DW_EH_PE_FORM 0x07
77#define DW_EH_PE_native 0x00
78#define DW_EH_PE_leb128 0x01
79#define DW_EH_PE_data2 0x02
80#define DW_EH_PE_data4 0x03
81#define DW_EH_PE_data8 0x04
82#define DW_EH_PE_signed 0x08
83#define DW_EH_PE_ADJUST 0x70
84#define DW_EH_PE_abs 0x00
85#define DW_EH_PE_pcrel 0x10
86#define DW_EH_PE_textrel 0x20
87#define DW_EH_PE_datarel 0x30
88#define DW_EH_PE_funcrel 0x40
89#define DW_EH_PE_aligned 0x50
90#define DW_EH_PE_indirect 0x80
91#define DW_EH_PE_omit 0xff
92
93typedef unsigned long uleb128_t;
94typedef signed long sleb128_t;
95
96static struct unwind_table {
97 struct {
98 unsigned long pc;
99 unsigned long range;
100 } core, init;
101 const void *address;
102 unsigned long size;
103 struct unwind_table *link;
104 const char *name;
105} root_table, *last_table;
106
107struct unwind_item {
108 enum item_location {
109 Nowhere,
110 Memory,
111 Register,
112 Value
113 } where;
114 uleb128_t value;
115};
116
117struct unwind_state {
118 uleb128_t loc, org;
119 const u8 *cieStart, *cieEnd;
120 uleb128_t codeAlign;
121 sleb128_t dataAlign;
122 struct cfa {
123 uleb128_t reg, offs;
124 } cfa;
125 struct unwind_item regs[ARRAY_SIZE(reg_info)];
126 unsigned stackDepth:8;
127 unsigned version:8;
128 const u8 *label;
129 const u8 *stack[MAX_STACK_DEPTH];
130};
131
132static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
133
134static struct unwind_table *find_table(unsigned long pc)
135{
136 struct unwind_table *table;
137
138 for (table = &root_table; table; table = table->link)
139 if ((pc >= table->core.pc
140 && pc < table->core.pc + table->core.range)
141 || (pc >= table->init.pc
142 && pc < table->init.pc + table->init.range))
143 break;
144
145 return table;
146}
147
148static void init_unwind_table(struct unwind_table *table,
149 const char *name,
150 const void *core_start,
151 unsigned long core_size,
152 const void *init_start,
153 unsigned long init_size,
154 const void *table_start,
155 unsigned long table_size)
156{
157 table->core.pc = (unsigned long)core_start;
158 table->core.range = core_size;
159 table->init.pc = (unsigned long)init_start;
160 table->init.range = init_size;
161 table->address = table_start;
162 table->size = table_size;
163 table->link = NULL;
164 table->name = name;
165}
166
167void __init unwind_init(void)
168{
169 init_unwind_table(&root_table, "kernel",
170 _text, _end - _text,
171 NULL, 0,
172 __start_unwind, __end_unwind - __start_unwind);
173}
174
175#ifdef CONFIG_MODULES
176
177/* Must be called with module_mutex held. */
178void *unwind_add_table(struct module *module,
179 const void *table_start,
180 unsigned long table_size)
181{
182 struct unwind_table *table;
183
184 if (table_size <= 0)
185 return NULL;
186
187 table = kmalloc(sizeof(*table), GFP_KERNEL);
188 if (!table)
189 return NULL;
190
191 init_unwind_table(table, module->name,
192 module->module_core, module->core_size,
193 module->module_init, module->init_size,
194 table_start, table_size);
195
196 if (last_table)
197 last_table->link = table;
198 else
199 root_table.link = table;
200 last_table = table;
201
202 return table;
203}
204
205struct unlink_table_info
206{
207 struct unwind_table *table;
208 int init_only;
209};
210
211static int unlink_table(void *arg)
212{
213 struct unlink_table_info *info = arg;
214 struct unwind_table *table = info->table, *prev;
215
216 for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
217 ;
218
219 if (prev->link) {
220 if (info->init_only) {
221 table->init.pc = 0;
222 table->init.range = 0;
223 info->table = NULL;
224 } else {
225 prev->link = table->link;
226 if (!prev->link)
227 last_table = prev;
228 }
229 } else
230 info->table = NULL;
231
232 return 0;
233}
234
235/* Must be called with module_mutex held. */
236void unwind_remove_table(void *handle, int init_only)
237{
238 struct unwind_table *table = handle;
239 struct unlink_table_info info;
240
241 if (!table || table == &root_table)
242 return;
243
244 if (init_only && table == last_table) {
245 table->init.pc = 0;
246 table->init.range = 0;
247 return;
248 }
249
250 info.table = table;
251 info.init_only = init_only;
252 stop_machine_run(unlink_table, &info, NR_CPUS);
253
254 if (info.table)
255 kfree(table);
256}
257
258#endif /* CONFIG_MODULES */
259
260static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
261{
262 const u8 *cur = *pcur;
263 uleb128_t value;
264 unsigned shift;
265
266 for (shift = 0, value = 0; cur < end; shift += 7) {
267 if (shift + 7 > 8 * sizeof(value)
268 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
269 cur = end + 1;
270 break;
271 }
272 value |= (uleb128_t)(*cur & 0x7f) << shift;
273 if (!(*cur++ & 0x80))
274 break;
275 }
276 *pcur = cur;
277
278 return value;
279}
280
281static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
282{
283 const u8 *cur = *pcur;
284 sleb128_t value;
285 unsigned shift;
286
287 for (shift = 0, value = 0; cur < end; shift += 7) {
288 if (shift + 7 > 8 * sizeof(value)
289 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
290 cur = end + 1;
291 break;
292 }
293 value |= (sleb128_t)(*cur & 0x7f) << shift;
294 if (!(*cur & 0x80)) {
295 value |= -(*cur++ & 0x40) << shift;
296 break;
297 }
298 }
299 *pcur = cur;
300
301 return value;
302}
303
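DWARF's canonical ULEB128 example encodes 624485 as the bytes e5 8e 26; a standalone check mirroring the decode loop above:

#include <stdio.h>

int main(void)
{
	const unsigned char buf[] = { 0xe5, 0x8e, 0x26 };	/* DWARF spec example */
	unsigned long value = 0;
	unsigned i = 0, shift;

	for (shift = 0; i < sizeof(buf); shift += 7) {
		value |= (unsigned long)(buf[i] & 0x7f) << shift;
		if (!(buf[i++] & 0x80))
			break;
	}
	printf("%lu\n", value);	/* prints 624485 */
	return 0;
}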
304static unsigned long read_pointer(const u8 **pLoc,
305 const void *end,
306 signed ptrType)
307{
308 unsigned long value = 0;
309 union {
310 const u8 *p8;
311 const u16 *p16u;
312 const s16 *p16s;
313 const u32 *p32u;
314 const s32 *p32s;
315 const unsigned long *pul;
316 } ptr;
317
318 if (ptrType < 0 || ptrType == DW_EH_PE_omit)
319 return 0;
320 ptr.p8 = *pLoc;
321 switch(ptrType & DW_EH_PE_FORM) {
322 case DW_EH_PE_data2:
323 if (end < (const void *)(ptr.p16u + 1))
324 return 0;
325 if(ptrType & DW_EH_PE_signed)
326 value = get_unaligned(ptr.p16s++);
327 else
328 value = get_unaligned(ptr.p16u++);
329 break;
330 case DW_EH_PE_data4:
331#ifdef CONFIG_64BIT
332 if (end < (const void *)(ptr.p32u + 1))
333 return 0;
334 if(ptrType & DW_EH_PE_signed)
335 value = get_unaligned(ptr.p32s++);
336 else
337 value = get_unaligned(ptr.p32u++);
338 break;
339 case DW_EH_PE_data8:
340 BUILD_BUG_ON(sizeof(u64) != sizeof(value));
341#else
342 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
343#endif
344 case DW_EH_PE_native:
345 if (end < (const void *)(ptr.pul + 1))
346 return 0;
347 value = get_unaligned(ptr.pul++);
348 break;
349 case DW_EH_PE_leb128:
350 BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
351 value = ptrType & DW_EH_PE_signed
352 ? get_sleb128(&ptr.p8, end)
353 : get_uleb128(&ptr.p8, end);
354 if ((const void *)ptr.p8 > end)
355 return 0;
356 break;
357 default:
358 return 0;
359 }
360 switch(ptrType & DW_EH_PE_ADJUST) {
361 case DW_EH_PE_abs:
362 break;
363 case DW_EH_PE_pcrel:
364 value += (unsigned long)*pLoc;
365 break;
366 default:
367 return 0;
368 }
369 if ((ptrType & DW_EH_PE_indirect)
370 && __get_user(value, (unsigned long *)value))
371 return 0;
372 *pLoc = ptr.p8;
373
374 return value;
375}
376
377static signed fde_pointer_type(const u32 *cie)
378{
379 const u8 *ptr = (const u8 *)(cie + 2);
380 unsigned version = *ptr;
381
382 if (version != 1)
383 return -1; /* unsupported */
384 if (*++ptr) {
385 const char *aug;
386 const u8 *end = (const u8 *)(cie + 1) + *cie;
387 uleb128_t len;
388
389 /* check if augmentation size is first (and thus present) */
390 if (*ptr != 'z')
391 return -1;
392 /* check if augmentation string is nul-terminated */
393 if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
394 return -1;
395 ++ptr; /* skip terminator */
396 get_uleb128(&ptr, end); /* skip code alignment */
397 get_sleb128(&ptr, end); /* skip data alignment */
398 /* skip return address column */
399 version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
400 len = get_uleb128(&ptr, end); /* augmentation length */
401 if (ptr + len < ptr || ptr + len > end)
402 return -1;
403 end = ptr + len;
404 while (*++aug) {
405 if (ptr >= end)
406 return -1;
407 switch(*aug) {
408 case 'L':
409 ++ptr;
410 break;
411 case 'P': {
412 signed ptrType = *ptr++;
413
414 if (!read_pointer(&ptr, end, ptrType) || ptr > end)
415 return -1;
416 }
417 break;
418 case 'R':
419 return *ptr;
420 default:
421 return -1;
422 }
423 }
424 }
425 return DW_EH_PE_native|DW_EH_PE_abs;
426}
427
428static int advance_loc(unsigned long delta, struct unwind_state *state)
429{
430 state->loc += delta * state->codeAlign;
431
432 return delta > 0;
433}
434
435static void set_rule(uleb128_t reg,
436 enum item_location where,
437 uleb128_t value,
438 struct unwind_state *state)
439{
440 if (reg < ARRAY_SIZE(state->regs)) {
441 state->regs[reg].where = where;
442 state->regs[reg].value = value;
443 }
444}
445
446static int processCFI(const u8 *start,
447 const u8 *end,
448 unsigned long targetLoc,
449 signed ptrType,
450 struct unwind_state *state)
451{
452 union {
453 const u8 *p8;
454 const u16 *p16;
455 const u32 *p32;
456 } ptr;
457 int result = 1;
458
459 if (start != state->cieStart) {
460 state->loc = state->org;
461 result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
462 if (targetLoc == 0 && state->label == NULL)
463 return result;
464 }
465 for (ptr.p8 = start; result && ptr.p8 < end; ) {
466 switch(*ptr.p8 >> 6) {
467 uleb128_t value;
468
469 case 0:
470 switch(*ptr.p8++) {
471 case DW_CFA_nop:
472 break;
473 case DW_CFA_set_loc:
474 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
475 result = 0;
476 break;
477 case DW_CFA_advance_loc1:
478 result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
479 break;
480 case DW_CFA_advance_loc2:
481 result = ptr.p8 <= end + 2
482 && advance_loc(*ptr.p16++, state);
483 break;
484 case DW_CFA_advance_loc4:
485 result = ptr.p8 <= end + 4
486 && advance_loc(*ptr.p32++, state);
487 break;
488 case DW_CFA_offset_extended:
489 value = get_uleb128(&ptr.p8, end);
490 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
491 break;
492 case DW_CFA_val_offset:
493 value = get_uleb128(&ptr.p8, end);
494 set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
495 break;
496 case DW_CFA_offset_extended_sf:
497 value = get_uleb128(&ptr.p8, end);
498 set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
499 break;
500 case DW_CFA_val_offset_sf:
501 value = get_uleb128(&ptr.p8, end);
502 set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
503 break;
504 case DW_CFA_restore_extended:
505 case DW_CFA_undefined:
506 case DW_CFA_same_value:
507 set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
508 break;
509 case DW_CFA_register:
510 value = get_uleb128(&ptr.p8, end);
511 set_rule(value,
512 Register,
513 get_uleb128(&ptr.p8, end), state);
514 break;
515 case DW_CFA_remember_state:
516 if (ptr.p8 == state->label) {
517 state->label = NULL;
518 return 1;
519 }
520 if (state->stackDepth >= MAX_STACK_DEPTH)
521 return 0;
522 state->stack[state->stackDepth++] = ptr.p8;
523 break;
524 case DW_CFA_restore_state:
525 if (state->stackDepth) {
526 const uleb128_t loc = state->loc;
527 const u8 *label = state->label;
528
529 state->label = state->stack[state->stackDepth - 1];
530 memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
531 memset(state->regs, 0, sizeof(state->regs));
532 state->stackDepth = 0;
533 result = processCFI(start, end, 0, ptrType, state);
534 state->loc = loc;
535 state->label = label;
536 } else
537 return 0;
538 break;
539 case DW_CFA_def_cfa:
540 state->cfa.reg = get_uleb128(&ptr.p8, end);
541 /*nobreak*/
542 case DW_CFA_def_cfa_offset:
543 state->cfa.offs = get_uleb128(&ptr.p8, end);
544 break;
545 case DW_CFA_def_cfa_sf:
546 state->cfa.reg = get_uleb128(&ptr.p8, end);
547 /*nobreak*/
548 case DW_CFA_def_cfa_offset_sf:
549 state->cfa.offs = get_sleb128(&ptr.p8, end)
550 * state->dataAlign;
551 break;
552 case DW_CFA_def_cfa_register:
553 state->cfa.reg = get_uleb128(&ptr.p8, end);
554 break;
555 /*todo case DW_CFA_def_cfa_expression: */
556 /*todo case DW_CFA_expression: */
557 /*todo case DW_CFA_val_expression: */
558 case DW_CFA_GNU_args_size:
559 get_uleb128(&ptr.p8, end);
560 break;
561 case DW_CFA_GNU_negative_offset_extended:
562 value = get_uleb128(&ptr.p8, end);
563 set_rule(value,
564 Memory,
565 (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
566 break;
567 case DW_CFA_GNU_window_save:
568 default:
569 result = 0;
570 break;
571 }
572 break;
573 case 1:
574 result = advance_loc(*ptr.p8++ & 0x3f, state);
575 break;
576 case 2:
577 value = *ptr.p8++ & 0x3f;
578 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
579 break;
580 case 3:
581 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
582 break;
583 }
584 if (ptr.p8 > end)
585 result = 0;
586 if (result && targetLoc != 0 && targetLoc < state->loc)
587 return 1;
588 }
589
590 return result
591 && ptr.p8 == end
592 && (targetLoc == 0
593 || (/*todo While in theory this should apply, gcc in practice omits
594 everything past the function prolog, and hence the location
595 never reaches the end of the function.
596 targetLoc < state->loc &&*/ state->label == NULL));
597}
598
 599/* Unwind to previous frame. Returns 0 if successful, negative
600 * number in case of an error. */
601int unwind(struct unwind_frame_info *frame)
602{
603#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
604 const u32 *fde = NULL, *cie = NULL;
605 const u8 *ptr = NULL, *end = NULL;
606 unsigned long startLoc = 0, endLoc = 0, cfa;
607 unsigned i;
608 signed ptrType = -1;
609 uleb128_t retAddrReg = 0;
610 struct unwind_table *table;
611 struct unwind_state state;
612
613 if (UNW_PC(frame) == 0)
614 return -EINVAL;
615 if ((table = find_table(UNW_PC(frame))) != NULL
616 && !(table->size & (sizeof(*fde) - 1))) {
617 unsigned long tableSize = table->size;
618
619 for (fde = table->address;
620 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
621 tableSize -= sizeof(*fde) + *fde,
622 fde += 1 + *fde / sizeof(*fde)) {
623 if (!*fde || (*fde & (sizeof(*fde) - 1)))
624 break;
625 if (!fde[1])
626 continue; /* this is a CIE */
627 if ((fde[1] & (sizeof(*fde) - 1))
628 || fde[1] > (unsigned long)(fde + 1)
629 - (unsigned long)table->address)
630 continue; /* this is not a valid FDE */
631 cie = fde + 1 - fde[1] / sizeof(*fde);
632 if (*cie <= sizeof(*cie) + 4
633 || *cie >= fde[1] - sizeof(*fde)
634 || (*cie & (sizeof(*cie) - 1))
635 || cie[1]
636 || (ptrType = fde_pointer_type(cie)) < 0) {
637 cie = NULL; /* this is not a (valid) CIE */
638 continue;
639 }
640 ptr = (const u8 *)(fde + 2);
641 startLoc = read_pointer(&ptr,
642 (const u8 *)(fde + 1) + *fde,
643 ptrType);
644 endLoc = startLoc
645 + read_pointer(&ptr,
646 (const u8 *)(fde + 1) + *fde,
647 ptrType & DW_EH_PE_indirect
648 ? ptrType
649 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
650 if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
651 break;
652 cie = NULL;
653 }
654 }
655 if (cie != NULL) {
656 memset(&state, 0, sizeof(state));
657 state.cieEnd = ptr; /* keep here temporarily */
658 ptr = (const u8 *)(cie + 2);
659 end = (const u8 *)(cie + 1) + *cie;
660 if ((state.version = *ptr) != 1)
661 cie = NULL; /* unsupported version */
662 else if (*++ptr) {
663 /* check if augmentation size is first (and thus present) */
664 if (*ptr == 'z') {
665 /* check for ignorable (or already handled)
666 * nul-terminated augmentation string */
667 while (++ptr < end && *ptr)
668 if (strchr("LPR", *ptr) == NULL)
669 break;
670 }
671 if (ptr >= end || *ptr)
672 cie = NULL;
673 }
674 ++ptr;
675 }
676 if (cie != NULL) {
 677 /* get code alignment factor */
678 state.codeAlign = get_uleb128(&ptr, end);
 679 /* get data alignment factor */
680 state.dataAlign = get_sleb128(&ptr, end);
681 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
682 cie = NULL;
683 else {
684 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
685 /* skip augmentation */
686 if (((const char *)(cie + 2))[1] == 'z')
687 ptr += get_uleb128(&ptr, end);
688 if (ptr > end
689 || retAddrReg >= ARRAY_SIZE(reg_info)
690 || REG_INVALID(retAddrReg)
691 || reg_info[retAddrReg].width != sizeof(unsigned long))
692 cie = NULL;
693 }
694 }
695 if (cie != NULL) {
696 state.cieStart = ptr;
697 ptr = state.cieEnd;
698 state.cieEnd = end;
699 end = (const u8 *)(fde + 1) + *fde;
700 /* skip augmentation */
701 if (((const char *)(cie + 2))[1] == 'z') {
702 uleb128_t augSize = get_uleb128(&ptr, end);
703
704 if ((ptr += augSize) > end)
705 fde = NULL;
706 }
707 }
708 if (cie == NULL || fde == NULL) {
709#ifdef CONFIG_FRAME_POINTER
710 unsigned long top, bottom;
711#endif
712
713#ifdef CONFIG_FRAME_POINTER
714 top = STACK_TOP(frame->task);
715 bottom = STACK_BOTTOM(frame->task);
716# if FRAME_RETADDR_OFFSET < 0
717 if (UNW_SP(frame) < top
718 && UNW_FP(frame) <= UNW_SP(frame)
719 && bottom < UNW_FP(frame)
720# else
721 if (UNW_SP(frame) > top
722 && UNW_FP(frame) >= UNW_SP(frame)
723 && bottom > UNW_FP(frame)
724# endif
725 && !((UNW_SP(frame) | UNW_FP(frame))
726 & (sizeof(unsigned long) - 1))) {
727 unsigned long link;
728
729 if (!__get_user(link,
730 (unsigned long *)(UNW_FP(frame)
731 + FRAME_LINK_OFFSET))
732# if FRAME_RETADDR_OFFSET < 0
733 && link > bottom && link < UNW_FP(frame)
734# else
735 && link > UNW_FP(frame) && link < bottom
736# endif
737 && !(link & (sizeof(link) - 1))
738 && !__get_user(UNW_PC(frame),
739 (unsigned long *)(UNW_FP(frame)
740 + FRAME_RETADDR_OFFSET))) {
741 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
742# if FRAME_RETADDR_OFFSET < 0
743 -
744# else
745 +
746# endif
747 sizeof(UNW_PC(frame));
748 UNW_FP(frame) = link;
749 return 0;
750 }
751 }
752#endif
753 return -ENXIO;
754 }
755 state.org = startLoc;
756 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
757 /* process instructions */
758 if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
759 || state.loc > endLoc
760 || state.regs[retAddrReg].where == Nowhere
761 || state.cfa.reg >= ARRAY_SIZE(reg_info)
762 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
763 || state.cfa.offs % sizeof(unsigned long))
764 return -EIO;
765 /* update frame */
766 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
767 startLoc = min((unsigned long)UNW_SP(frame), cfa);
768 endLoc = max((unsigned long)UNW_SP(frame), cfa);
769 if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
770 startLoc = min(STACK_LIMIT(cfa), cfa);
771 endLoc = max(STACK_LIMIT(cfa), cfa);
772 }
773#ifndef CONFIG_64BIT
774# define CASES CASE(8); CASE(16); CASE(32)
775#else
776# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
777#endif
778 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
779 if (REG_INVALID(i)) {
780 if (state.regs[i].where == Nowhere)
781 continue;
782 return -EIO;
783 }
784 switch(state.regs[i].where) {
785 default:
786 break;
787 case Register:
788 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
789 || REG_INVALID(state.regs[i].value)
790 || reg_info[i].width > reg_info[state.regs[i].value].width)
791 return -EIO;
792 switch(reg_info[state.regs[i].value].width) {
793#define CASE(n) \
794 case sizeof(u##n): \
795 state.regs[i].value = FRAME_REG(state.regs[i].value, \
796 const u##n); \
797 break
798 CASES;
799#undef CASE
800 default:
801 return -EIO;
802 }
803 break;
804 }
805 }
806 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
807 if (REG_INVALID(i))
808 continue;
809 switch(state.regs[i].where) {
810 case Nowhere:
811 if (reg_info[i].width != sizeof(UNW_SP(frame))
812 || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
813 != &UNW_SP(frame))
814 continue;
815 UNW_SP(frame) = cfa;
816 break;
817 case Register:
818 switch(reg_info[i].width) {
819#define CASE(n) case sizeof(u##n): \
820 FRAME_REG(i, u##n) = state.regs[i].value; \
821 break
822 CASES;
823#undef CASE
824 default:
825 return -EIO;
826 }
827 break;
828 case Value:
829 if (reg_info[i].width != sizeof(unsigned long))
830 return -EIO;
831 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
832 * state.dataAlign;
833 break;
834 case Memory: {
835 unsigned long addr = cfa + state.regs[i].value
836 * state.dataAlign;
837
838 if ((state.regs[i].value * state.dataAlign)
839 % sizeof(unsigned long)
840 || addr < startLoc
841 || addr + sizeof(unsigned long) < addr
842 || addr + sizeof(unsigned long) > endLoc)
843 return -EIO;
844 switch(reg_info[i].width) {
845#define CASE(n) case sizeof(u##n): \
846 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
847 break
848 CASES;
849#undef CASE
850 default:
851 return -EIO;
852 }
853 }
854 break;
855 }
856 }
857
858 return 0;
859#undef CASES
860#undef FRAME_REG
861}
862EXPORT_SYMBOL(unwind);
863
864int unwind_init_frame_info(struct unwind_frame_info *info,
865 struct task_struct *tsk,
866 /*const*/ struct pt_regs *regs)
867{
868 info->task = tsk;
869 arch_unw_init_frame_info(info, regs);
870
871 return 0;
872}
873EXPORT_SYMBOL(unwind_init_frame_info);
874
875/*
876 * Prepare to unwind a blocked task.
877 */
878int unwind_init_blocked(struct unwind_frame_info *info,
879 struct task_struct *tsk)
880{
881 info->task = tsk;
882 arch_unw_init_blocked(info);
883
884 return 0;
885}
886EXPORT_SYMBOL(unwind_init_blocked);
887
888/*
889 * Prepare to unwind the currently running thread.
890 */
891int unwind_init_running(struct unwind_frame_info *info,
892 asmlinkage int (*callback)(struct unwind_frame_info *,
893 void *arg),
894 void *arg)
895{
896 info->task = current;
897
898 return arch_unwind_init_running(info, callback, arg);
899}
900EXPORT_SYMBOL(unwind_init_running);
901
902/*
903 * Unwind until the return pointer is in user-land (or until an error
904 * occurs). Returns 0 if successful, negative number in case of
905 * error.
906 */
907int unwind_to_user(struct unwind_frame_info *info)
908{
909 while (!arch_unw_user_mode(info)) {
910 int err = unwind(info);
911
912 if (err < 0)
913 return err;
914 }
915
916 return 0;
917}
918EXPORT_SYMBOL(unwind_to_user);
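A hypothetical caller, for example an arch backtrace path, would combine these entry points as below; the loop and printk format are invented for the sketch, with UNW_PC() used as it is throughout this file:

/* hypothetical backtrace sketch using the API above */
static void sketch_show_trace(struct task_struct *tsk)
{
	struct unwind_frame_info info;

	if (unwind_init_blocked(&info, tsk))
		return;
	while (unwind(&info) == 0 && UNW_PC(&info))
		printk(" [<%016lx>]\n", UNW_PC(&info));
}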
diff --git a/kernel/user.c b/kernel/user.c
index 2116642f42c6..6408c0424291 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid)
140 atomic_set(&new->processes, 0); 140 atomic_set(&new->processes, 0);
141 atomic_set(&new->files, 0); 141 atomic_set(&new->files, 0);
142 atomic_set(&new->sigpending, 0); 142 atomic_set(&new->sigpending, 0);
143#ifdef CONFIG_INOTIFY 143#ifdef CONFIG_INOTIFY_USER
144 atomic_set(&new->inotify_watches, 0); 144 atomic_set(&new->inotify_watches, 0);
145 atomic_set(&new->inotify_devs, 0); 145 atomic_set(&new->inotify_devs, 0);
146#endif 146#endif
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid)
148 new->mq_bytes = 0; 148 new->mq_bytes = 0;
149 new->locked_shm = 0; 149 new->locked_shm = 0;
150 150
151 if (alloc_uid_keyring(new) < 0) { 151 if (alloc_uid_keyring(new, current) < 0) {
152 kmem_cache_free(uid_cachep, new); 152 kmem_cache_free(uid_cachep, new);
153 return NULL; 153 return NULL;
154 } 154 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 880fb415a8f6..565cf7a1febd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu,
428 return ret; 428 return ret;
429} 429}
430 430
431int schedule_on_each_cpu(void (*func) (void *info), void *info) 431/**
432 * schedule_on_each_cpu - call a function on each online CPU from keventd
433 * @func: the function to call
434 * @info: a pointer to pass to func()
435 *
436 * Returns zero on success.
437 * Returns -ve errno on failure.
438 *
439 * Appears to be racy against CPU hotplug.
440 *
441 * schedule_on_each_cpu() is very slow.
442 */
443int schedule_on_each_cpu(void (*func)(void *info), void *info)
432{ 444{
433 int cpu; 445 int cpu;
434 struct work_struct *work; 446 struct work_struct *works;
435 447
436 work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); 448 works = alloc_percpu(struct work_struct);
437 449 if (!works)
438 if (!work)
439 return -ENOMEM; 450 return -ENOMEM;
451
440 for_each_online_cpu(cpu) { 452 for_each_online_cpu(cpu) {
441 INIT_WORK(work + cpu, func, info); 453 INIT_WORK(per_cpu_ptr(works, cpu), func, info);
442 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 454 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
443 work + cpu); 455 per_cpu_ptr(works, cpu));
444 } 456 }
445 flush_workqueue(keventd_wq); 457 flush_workqueue(keventd_wq);
446 kfree(work); 458 free_percpu(works);
447 return 0; 459 return 0;
448} 460}
449 461
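Matching the signature above, a hypothetical caller that bumps a counter once on each online CPU (all names invented for the example):

/* hypothetical usage sketch for schedule_on_each_cpu() */
static atomic_t hits = ATOMIC_INIT(0);

static void bump(void *info)
{
	atomic_inc((atomic_t *)info);
}

static int run_everywhere(void)
{
	int ret = schedule_on_each_cpu(bump, &hits);

	if (!ret)
		printk("ran on %d cpus\n", atomic_read(&hits));
	return ret;
}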
@@ -531,11 +543,11 @@ int current_is_keventd(void)
531static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 543static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
532{ 544{
533 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 545 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
534 LIST_HEAD(list); 546 struct list_head list;
535 struct work_struct *work; 547 struct work_struct *work;
536 548
537 spin_lock_irq(&cwq->lock); 549 spin_lock_irq(&cwq->lock);
538 list_splice_init(&cwq->worklist, &list); 550 list_replace_init(&cwq->worklist, &list);
539 551
540 while (!list_empty(&list)) { 552 while (!list_empty(&list)) {
541 printk("Taking work for %s\n", wq->name); 553 printk("Taking work for %s\n", wq->name);
@@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
578 590
579 case CPU_UP_CANCELED: 591 case CPU_UP_CANCELED:
580 list_for_each_entry(wq, &workqueues, list) { 592 list_for_each_entry(wq, &workqueues, list) {
593 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
594 continue;
581 /* Unbind so it can run. */ 595 /* Unbind so it can run. */
582 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 596 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
583 any_online_cpu(cpu_online_map)); 597 any_online_cpu(cpu_online_map));