aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore5
-rw-r--r--kernel/Makefile8
-rw-r--r--kernel/acct.c17
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/compat.c48
-rw-r--r--kernel/configs.c2
-rw-r--r--kernel/cpuset.c582
-rw-r--r--kernel/crash_dump.c61
-rw-r--r--kernel/exit.c63
-rw-r--r--kernel/fork.c50
-rw-r--r--kernel/futex.c13
-rw-r--r--kernel/hrtimer.c826
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/irq/proc.c6
-rw-r--r--kernel/itimer.c106
-rw-r--r--kernel/kexec.c21
-rw-r--r--kernel/kprobes.c157
-rw-r--r--kernel/ksysfs.c37
-rw-r--r--kernel/module.c60
-rw-r--r--kernel/mutex-debug.c462
-rw-r--r--kernel/mutex-debug.h134
-rw-r--r--kernel/mutex.c315
-rw-r--r--kernel/mutex.h35
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/params.c2
-rw-r--r--kernel/pid.c22
-rw-r--r--kernel/posix-cpu-timers.c76
-rw-r--r--kernel/posix-timers.c895
-rw-r--r--kernel/power/Kconfig2
-rw-r--r--kernel/power/disk.c101
-rw-r--r--kernel/power/main.c4
-rw-r--r--kernel/power/power.h24
-rw-r--r--kernel/power/snapshot.c89
-rw-r--r--kernel/power/swsusp.c1020
-rw-r--r--kernel/printk.c8
-rw-r--r--kernel/ptrace.c78
-rw-r--r--kernel/rcupdate.c190
-rw-r--r--kernel/rcutorture.c102
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/sched.c561
-rw-r--r--kernel/signal.c168
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sys.c103
-rw-r--r--kernel/sys_ni.c24
-rw-r--r--kernel/sysctl.c77
-rw-r--r--kernel/time.c106
-rw-r--r--kernel/timer.c58
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/workqueue.c42
51 files changed, 4773 insertions, 2013 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
new file mode 100644
index 000000000000..f2ab70073bd4
--- /dev/null
+++ b/kernel/.gitignore
@@ -0,0 +1,5 @@
1#
2# Generated files
3#
4config_data.h
5config_data.gz
diff --git a/kernel/Makefile b/kernel/Makefile
index 4f5a1453093a..4ae0fbde815d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,15 +6,18 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
6 exit.o itimer.o time.o softirq.o resource.o \ 6 exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o intermodule.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o
11 12
13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
12obj-$(CONFIG_FUTEX) += futex.o 14obj-$(CONFIG_FUTEX) += futex.o
13obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 15obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
14obj-$(CONFIG_SMP) += cpu.o spinlock.o 16obj-$(CONFIG_SMP) += cpu.o spinlock.o
15obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 17obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
16obj-$(CONFIG_UID16) += uid16.o 18obj-$(CONFIG_UID16) += uid16.o
17obj-$(CONFIG_MODULES) += module.o 19obj-$(CONFIG_MODULES) += module.o
20obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
18obj-$(CONFIG_KALLSYMS) += kallsyms.o 21obj-$(CONFIG_KALLSYMS) += kallsyms.o
19obj-$(CONFIG_PM) += power/ 22obj-$(CONFIG_PM) += power/
20obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 23obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
@@ -29,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
29obj-$(CONFIG_SYSFS) += ksysfs.o 32obj-$(CONFIG_SYSFS) += ksysfs.o
30obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 33obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
31obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 34obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
32obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
33obj-$(CONFIG_SECCOMP) += seccomp.o 35obj-$(CONFIG_SECCOMP) += seccomp.o
34obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 36obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
35 37
diff --git a/kernel/acct.c b/kernel/acct.c
index 6312d6bd43e3..065d8b4e51ef 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -47,6 +47,7 @@
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/acct.h> 49#include <linux/acct.h>
50#include <linux/capability.h>
50#include <linux/file.h> 51#include <linux/file.h>
51#include <linux/tty.h> 52#include <linux/tty.h>
52#include <linux/security.h> 53#include <linux/security.h>
@@ -427,6 +428,7 @@ static void do_acct_process(long exitcode, struct file *file)
427 u64 elapsed; 428 u64 elapsed;
428 u64 run_time; 429 u64 run_time;
429 struct timespec uptime; 430 struct timespec uptime;
431 unsigned long jiffies;
430 432
431 /* 433 /*
432 * First check to see if there is enough free_space to continue 434 * First check to see if there is enough free_space to continue
@@ -467,12 +469,12 @@ static void do_acct_process(long exitcode, struct file *file)
467#endif 469#endif
468 do_div(elapsed, AHZ); 470 do_div(elapsed, AHZ);
469 ac.ac_btime = xtime.tv_sec - elapsed; 471 ac.ac_btime = xtime.tv_sec - elapsed;
470 ac.ac_utime = encode_comp_t(jiffies_to_AHZ( 472 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
471 current->signal->utime + 473 current->signal->utime));
472 current->group_leader->utime)); 474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
473 ac.ac_stime = encode_comp_t(jiffies_to_AHZ( 475 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
474 current->signal->stime + 476 current->signal->stime));
475 current->group_leader->stime)); 477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
476 /* we really need to bite the bullet and change layout */ 478 /* we really need to bite the bullet and change layout */
477 ac.ac_uid = current->uid; 479 ac.ac_uid = current->uid;
478 ac.ac_gid = current->gid; 480 ac.ac_gid = current->gid;
@@ -580,7 +582,8 @@ void acct_process(long exitcode)
580void acct_update_integrals(struct task_struct *tsk) 582void acct_update_integrals(struct task_struct *tsk)
581{ 583{
582 if (likely(tsk->mm)) { 584 if (likely(tsk->mm)) {
583 long delta = tsk->stime - tsk->acct_stimexpd; 585 long delta =
586 cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
584 587
585 if (delta == 0) 588 if (delta == 0)
586 return; 589 return;
diff --git a/kernel/audit.c b/kernel/audit.c
index 0c56320d38dc..0a813d2883e5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -42,8 +42,8 @@
42 */ 42 */
43 43
44#include <linux/init.h> 44#include <linux/init.h>
45#include <asm/atomic.h>
46#include <asm/types.h> 45#include <asm/types.h>
46#include <asm/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/err.h> 49#include <linux/err.h>
@@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid)
267 return old; 267 return old;
268} 268}
269 269
270int kauditd_thread(void *dummy) 270static int kauditd_thread(void *dummy)
271{ 271{
272 struct sk_buff *skb; 272 struct sk_buff *skb;
273 273
@@ -291,8 +291,10 @@ int kauditd_thread(void *dummy)
291 set_current_state(TASK_INTERRUPTIBLE); 291 set_current_state(TASK_INTERRUPTIBLE);
292 add_wait_queue(&kauditd_wait, &wait); 292 add_wait_queue(&kauditd_wait, &wait);
293 293
294 if (!skb_queue_len(&audit_skb_queue)) 294 if (!skb_queue_len(&audit_skb_queue)) {
295 try_to_freeze();
295 schedule(); 296 schedule();
297 }
296 298
297 __set_current_state(TASK_RUNNING); 299 __set_current_state(TASK_RUNNING);
298 remove_wait_queue(&kauditd_wait, &wait); 300 remove_wait_queue(&kauditd_wait, &wait);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d8a68509e729..685c25175d96 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -30,8 +30,8 @@
30 */ 30 */
31 31
32#include <linux/init.h> 32#include <linux/init.h>
33#include <asm/atomic.h>
34#include <asm/types.h> 33#include <asm/types.h>
34#include <asm/atomic.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/mount.h> 37#include <linux/mount.h>
diff --git a/kernel/capability.c b/kernel/capability.c
index 8986a37a67ea..bfa3c92e16f2 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> 7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
8 */ 8 */
9 9
10#include <linux/capability.h>
10#include <linux/mm.h> 11#include <linux/mm.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/security.h> 13#include <linux/security.h>
diff --git a/kernel/compat.c b/kernel/compat.c
index 102296e21ea8..1867290c37e3 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -514,6 +514,24 @@ static int put_compat_itimerspec(struct compat_itimerspec __user *dst,
514 return 0; 514 return 0;
515} 515}
516 516
517long compat_sys_timer_create(clockid_t which_clock,
518 struct compat_sigevent __user *timer_event_spec,
519 timer_t __user *created_timer_id)
520{
521 struct sigevent __user *event = NULL;
522
523 if (timer_event_spec) {
524 struct sigevent kevent;
525
526 event = compat_alloc_user_space(sizeof(*event));
527 if (get_compat_sigevent(&kevent, timer_event_spec) ||
528 copy_to_user(event, &kevent, sizeof(*event)))
529 return -EFAULT;
530 }
531
532 return sys_timer_create(which_clock, event, created_timer_id);
533}
534
517long compat_sys_timer_settime(timer_t timer_id, int flags, 535long compat_sys_timer_settime(timer_t timer_id, int flags,
518 struct compat_itimerspec __user *new, 536 struct compat_itimerspec __user *new,
519 struct compat_itimerspec __user *old) 537 struct compat_itimerspec __user *old)
@@ -649,8 +667,6 @@ int get_compat_sigevent(struct sigevent *event,
649 ? -EFAULT : 0; 667 ? -EFAULT : 0;
650} 668}
651 669
652/* timer_create is architecture specific because it needs sigevent conversion */
653
654long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, 670long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
655 unsigned long bitmap_size) 671 unsigned long bitmap_size)
656{ 672{
@@ -855,3 +871,31 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
855} 871}
856 872
857#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ 873#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
874
875#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
876asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
877{
878 sigset_t newset;
879 compat_sigset_t newset32;
880
881 /* XXX: Don't preclude handling different sized sigset_t's. */
882 if (sigsetsize != sizeof(sigset_t))
883 return -EINVAL;
884
885 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
886 return -EFAULT;
887 sigset_from_compat(&newset, &newset32);
888 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
889
890 spin_lock_irq(&current->sighand->siglock);
891 current->saved_sigmask = current->blocked;
892 current->blocked = newset;
893 recalc_sigpending();
894 spin_unlock_irq(&current->sighand->siglock);
895
896 current->state = TASK_INTERRUPTIBLE;
897 schedule();
898 set_thread_flag(TIF_RESTORE_SIGMASK);
899 return -ERESTARTNOHAND;
900}
901#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
diff --git a/kernel/configs.c b/kernel/configs.c
index 986f7af31e0a..009e1ebdcb88 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -3,7 +3,7 @@
3 * Echo the kernel .config file used to build the kernel 3 * Echo the kernel .config file used to build the kernel
4 * 4 *
5 * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> 5 * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
6 * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org> 6 * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
7 * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> 7 * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
8 * Copyright (C) 2002 Hewlett-Packard Company 8 * Copyright (C) 2002 Hewlett-Packard Company
9 * 9 *
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f9816..fe2f71f92ae0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -39,6 +39,7 @@
39#include <linux/namei.h> 39#include <linux/namei.h>
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/proc_fs.h> 41#include <linux/proc_fs.h>
42#include <linux/rcupdate.h>
42#include <linux/sched.h> 43#include <linux/sched.h>
43#include <linux/seq_file.h> 44#include <linux/seq_file.h>
44#include <linux/slab.h> 45#include <linux/slab.h>
@@ -54,7 +55,23 @@
54#include <asm/atomic.h> 55#include <asm/atomic.h>
55#include <asm/semaphore.h> 56#include <asm/semaphore.h>
56 57
57#define CPUSET_SUPER_MAGIC 0x27e0eb 58#define CPUSET_SUPER_MAGIC 0x27e0eb
59
60/*
61 * Tracks how many cpusets are currently defined in system.
62 * When there is only one cpuset (the root cpuset) we can
63 * short circuit some hooks.
64 */
65int number_of_cpusets __read_mostly;
66
67/* See "Frequency meter" comments, below. */
68
69struct fmeter {
70 int cnt; /* unprocessed events count */
71 int val; /* most recent output value */
72 time_t time; /* clock (secs) when val computed */
73 spinlock_t lock; /* guards read or write of above */
74};
58 75
59struct cpuset { 76struct cpuset {
60 unsigned long flags; /* "unsigned long" so bitops work */ 77 unsigned long flags; /* "unsigned long" so bitops work */
@@ -80,13 +97,16 @@ struct cpuset {
80 * Copy of global cpuset_mems_generation as of the most 97 * Copy of global cpuset_mems_generation as of the most
81 * recent time this cpuset changed its mems_allowed. 98 * recent time this cpuset changed its mems_allowed.
82 */ 99 */
83 int mems_generation; 100 int mems_generation;
101
102 struct fmeter fmeter; /* memory_pressure filter */
84}; 103};
85 104
86/* bits in struct cpuset flags field */ 105/* bits in struct cpuset flags field */
87typedef enum { 106typedef enum {
88 CS_CPU_EXCLUSIVE, 107 CS_CPU_EXCLUSIVE,
89 CS_MEM_EXCLUSIVE, 108 CS_MEM_EXCLUSIVE,
109 CS_MEMORY_MIGRATE,
90 CS_REMOVED, 110 CS_REMOVED,
91 CS_NOTIFY_ON_RELEASE 111 CS_NOTIFY_ON_RELEASE
92} cpuset_flagbits_t; 112} cpuset_flagbits_t;
@@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs)
112 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 132 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
113} 133}
114 134
135static inline int is_memory_migrate(const struct cpuset *cs)
136{
137 return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
138}
139
115/* 140/*
116 * Increment this atomic integer every time any cpuset changes its 141 * Increment this atomic integer every time any cpuset changes its
117 * mems_allowed value. Users of cpusets can track this generation 142 * mems_allowed value. Users of cpusets can track this generation
@@ -137,13 +162,10 @@ static struct cpuset top_cpuset = {
137 .count = ATOMIC_INIT(0), 162 .count = ATOMIC_INIT(0),
138 .sibling = LIST_HEAD_INIT(top_cpuset.sibling), 163 .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
139 .children = LIST_HEAD_INIT(top_cpuset.children), 164 .children = LIST_HEAD_INIT(top_cpuset.children),
140 .parent = NULL,
141 .dentry = NULL,
142 .mems_generation = 0,
143}; 165};
144 166
145static struct vfsmount *cpuset_mount; 167static struct vfsmount *cpuset_mount;
146static struct super_block *cpuset_sb = NULL; 168static struct super_block *cpuset_sb;
147 169
148/* 170/*
149 * We have two global cpuset semaphores below. They can nest. 171 * We have two global cpuset semaphores below. They can nest.
@@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL;
227 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock 249 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
228 * (task->alloc_lock) already in the task_struct routinely used for 250 * (task->alloc_lock) already in the task_struct routinely used for
229 * such matters. 251 * such matters.
252 *
253 * P.S. One more locking exception. RCU is used to guard the
254 * update of a tasks cpuset pointer by attach_task() and the
255 * access of task->cpuset->mems_generation via that pointer in
256 * the routine cpuset_update_task_memory_state().
230 */ 257 */
231 258
232static DECLARE_MUTEX(manage_sem); 259static DECLARE_MUTEX(manage_sem);
@@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
304 spin_lock(&dcache_lock); 331 spin_lock(&dcache_lock);
305 node = dentry->d_subdirs.next; 332 node = dentry->d_subdirs.next;
306 while (node != &dentry->d_subdirs) { 333 while (node != &dentry->d_subdirs) {
307 struct dentry *d = list_entry(node, struct dentry, d_child); 334 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
308 list_del_init(node); 335 list_del_init(node);
309 if (d->d_inode) { 336 if (d->d_inode) {
310 d = dget_locked(d); 337 d = dget_locked(d);
@@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
316 } 343 }
317 node = dentry->d_subdirs.next; 344 node = dentry->d_subdirs.next;
318 } 345 }
319 list_del_init(&dentry->d_child); 346 list_del_init(&dentry->d_u.d_child);
320 spin_unlock(&dcache_lock); 347 spin_unlock(&dcache_lock);
321 remove_dir(dentry); 348 remove_dir(dentry);
322} 349}
@@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
570 BUG_ON(!nodes_intersects(*pmask, node_online_map)); 597 BUG_ON(!nodes_intersects(*pmask, node_online_map));
571} 598}
572 599
573/* 600/**
574 * Refresh current tasks mems_allowed and mems_generation from current 601 * cpuset_update_task_memory_state - update task memory placement
575 * tasks cpuset. 602 *
603 * If the current tasks cpusets mems_allowed changed behind our
604 * backs, update current->mems_allowed, mems_generation and task NUMA
605 * mempolicy to the new value.
576 * 606 *
577 * Call without callback_sem or task_lock() held. May be called with 607 * Task mempolicy is updated by rebinding it relative to the
578 * or without manage_sem held. Will acquire task_lock() and might 608 * current->cpuset if a task has its memory placement changed.
579 * acquire callback_sem during call. 609 * Do not call this routine if in_interrupt().
580 * 610 *
581 * The task_lock() is required to dereference current->cpuset safely. 611 * Call without callback_sem or task_lock() held. May be called
582 * Without it, we could pick up the pointer value of current->cpuset 612 * with or without manage_sem held. Doesn't need task_lock to guard
583 * in one instruction, and then attach_task could give us a different 613 * against another task changing a non-NULL cpuset pointer to NULL,
584 * cpuset, and then the cpuset we had could be removed and freed, 614 * as that is only done by a task on itself, and if the current task
585 * and then on our next instruction, we could dereference a no longer 615 * is here, it is not simultaneously in the exit code NULL'ing its
586 * valid cpuset pointer to get its mems_generation field. 616 * cpuset pointer. This routine also might acquire callback_sem and
617 * current->mm->mmap_sem during call.
618 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock
620 * to guard the current->cpuset derefence, because it is guarded
621 * from concurrent freeing of current->cpuset by attach_task(),
622 * using RCU.
623 *
624 * The rcu_dereference() is technically probably not needed,
625 * as I don't actually mind if I see a new cpuset pointer but
626 * an old value of mems_generation. However this really only
627 * matters on alpha systems using cpusets heavily. If I dropped
628 * that rcu_dereference(), it would save them a memory barrier.
629 * For all other arch's, rcu_dereference is a no-op anyway, and for
630 * alpha systems not using cpusets, another planned optimization,
631 * avoiding the rcu critical section for tasks in the root cpuset
632 * which is statically allocated, so can't vanish, will make this
633 * irrelevant. Better to use RCU as intended, than to engage in
634 * some cute trick to save a memory barrier that is impossible to
635 * test, for alpha systems using cpusets heavily, which might not
636 * even exist.
587 * 637 *
588 * This routine is needed to update the per-task mems_allowed data, 638 * This routine is needed to update the per-task mems_allowed data,
589 * within the tasks context, when it is trying to allocate memory 639 * within the tasks context, when it is trying to allocate memory
@@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
591 * task has been modifying its cpuset. 641 * task has been modifying its cpuset.
592 */ 642 */
593 643
594static void refresh_mems(void) 644void cpuset_update_task_memory_state()
595{ 645{
596 int my_cpusets_mem_gen; 646 int my_cpusets_mem_gen;
647 struct task_struct *tsk = current;
648 struct cpuset *cs;
597 649
598 task_lock(current); 650 if (tsk->cpuset == &top_cpuset) {
599 my_cpusets_mem_gen = current->cpuset->mems_generation; 651 /* Don't need rcu for top_cpuset. It's never freed. */
600 task_unlock(current); 652 my_cpusets_mem_gen = top_cpuset.mems_generation;
601 653 } else {
602 if (current->cpuset_mems_generation != my_cpusets_mem_gen) { 654 rcu_read_lock();
603 struct cpuset *cs; 655 cs = rcu_dereference(tsk->cpuset);
604 nodemask_t oldmem = current->mems_allowed; 656 my_cpusets_mem_gen = cs->mems_generation;
657 rcu_read_unlock();
658 }
605 659
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
606 down(&callback_sem); 661 down(&callback_sem);
607 task_lock(current); 662 task_lock(tsk);
608 cs = current->cpuset; 663 cs = tsk->cpuset; /* Maybe changed when task not locked */
609 guarantee_online_mems(cs, &current->mems_allowed); 664 guarantee_online_mems(cs, &tsk->mems_allowed);
610 current->cpuset_mems_generation = cs->mems_generation; 665 tsk->cpuset_mems_generation = cs->mems_generation;
611 task_unlock(current); 666 task_unlock(tsk);
612 up(&callback_sem); 667 up(&callback_sem);
613 if (!nodes_equal(oldmem, current->mems_allowed)) 668 mpol_rebind_task(tsk, &tsk->mems_allowed);
614 numa_policy_rebind(&oldmem, &current->mems_allowed);
615 } 669 }
616} 670}
617 671
@@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf)
766} 820}
767 821
768/* 822/*
823 * Handle user request to change the 'mems' memory placement
824 * of a cpuset. Needs to validate the request, update the
825 * cpusets mems_allowed and mems_generation, and for each
826 * task in the cpuset, rebind any vma mempolicies and if
827 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory.
829 *
769 * Call with manage_sem held. May take callback_sem during call. 830 * Call with manage_sem held. May take callback_sem during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 * their mempolicies to the cpusets new mems_allowed.
770 */ 834 */
771 835
772static int update_nodemask(struct cpuset *cs, char *buf) 836static int update_nodemask(struct cpuset *cs, char *buf)
773{ 837{
774 struct cpuset trialcs; 838 struct cpuset trialcs;
839 nodemask_t oldmem;
840 struct task_struct *g, *p;
841 struct mm_struct **mmarray;
842 int i, n, ntasks;
843 int migrate;
844 int fudge;
775 int retval; 845 int retval;
776 846
777 trialcs = *cs; 847 trialcs = *cs;
778 retval = nodelist_parse(buf, trialcs.mems_allowed); 848 retval = nodelist_parse(buf, trialcs.mems_allowed);
779 if (retval < 0) 849 if (retval < 0)
780 return retval; 850 goto done;
781 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); 851 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
782 if (nodes_empty(trialcs.mems_allowed)) 852 oldmem = cs->mems_allowed;
783 return -ENOSPC; 853 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
854 retval = 0; /* Too easy - nothing to do */
855 goto done;
856 }
857 if (nodes_empty(trialcs.mems_allowed)) {
858 retval = -ENOSPC;
859 goto done;
860 }
784 retval = validate_change(cs, &trialcs); 861 retval = validate_change(cs, &trialcs);
785 if (retval == 0) { 862 if (retval < 0)
786 down(&callback_sem); 863 goto done;
787 cs->mems_allowed = trialcs.mems_allowed; 864
788 atomic_inc(&cpuset_mems_generation); 865 down(&callback_sem);
789 cs->mems_generation = atomic_read(&cpuset_mems_generation); 866 cs->mems_allowed = trialcs.mems_allowed;
790 up(&callback_sem); 867 atomic_inc(&cpuset_mems_generation);
868 cs->mems_generation = atomic_read(&cpuset_mems_generation);
869 up(&callback_sem);
870
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872
873 fudge = 10; /* spare mmarray[] slots */
874 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
875 retval = -ENOMEM;
876
877 /*
878 * Allocate mmarray[] to hold mm reference for each task
879 * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
880 * tasklist_lock. We could use GFP_ATOMIC, but with a
881 * few more lines of code, we can retry until we get a big
882 * enough mmarray[] w/o using GFP_ATOMIC.
883 */
884 while (1) {
885 ntasks = atomic_read(&cs->count); /* guess */
886 ntasks += fudge;
887 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
888 if (!mmarray)
889 goto done;
890 write_lock_irq(&tasklist_lock); /* block fork */
891 if (atomic_read(&cs->count) <= ntasks)
892 break; /* got enough */
893 write_unlock_irq(&tasklist_lock); /* try again */
894 kfree(mmarray);
791 } 895 }
896
897 n = 0;
898
899 /* Load up mmarray[] with mm reference for each task in cpuset. */
900 do_each_thread(g, p) {
901 struct mm_struct *mm;
902
903 if (n >= ntasks) {
904 printk(KERN_WARNING
905 "Cpuset mempolicy rebind incomplete.\n");
906 continue;
907 }
908 if (p->cpuset != cs)
909 continue;
910 mm = get_task_mm(p);
911 if (!mm)
912 continue;
913 mmarray[n++] = mm;
914 } while_each_thread(g, p);
915 write_unlock_irq(&tasklist_lock);
916
917 /*
918 * Now that we've dropped the tasklist spinlock, we can
919 * rebind the vma mempolicies of each mm in mmarray[] to their
920 * new cpuset, and release that mm. The mpol_rebind_mm()
921 * call takes mmap_sem, which we couldn't take while holding
922 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes.
929 */
930 migrate = is_memory_migrate(cs);
931 for (i = 0; i < n; i++) {
932 struct mm_struct *mm = mmarray[i];
933
934 mpol_rebind_mm(mm, &cs->mems_allowed);
935 if (migrate) {
936 do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
937 MPOL_MF_MOVE_ALL);
938 }
939 mmput(mm);
940 }
941
942 /* We're done rebinding vma's to this cpusets new mems_allowed. */
943 kfree(mmarray);
944 set_cpuset_being_rebound(NULL);
945 retval = 0;
946done:
792 return retval; 947 return retval;
793} 948}
794 949
795/* 950/*
951 * Call with manage_sem held.
952 */
953
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
955{
956 if (simple_strtoul(buf, NULL, 10) != 0)
957 cpuset_memory_pressure_enabled = 1;
958 else
959 cpuset_memory_pressure_enabled = 0;
960 return 0;
961}
962
963/*
796 * update_flag - read a 0 or a 1 in a file and update associated flag 964 * update_flag - read a 0 or a 1 in a file and update associated flag
797 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 965 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
798 * CS_NOTIFY_ON_RELEASE) 966 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
799 * cs: the cpuset to update 967 * cs: the cpuset to update
800 * buf: the buffer where we read the 0 or 1 968 * buf: the buffer where we read the 0 or 1
801 * 969 *
@@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
834} 1002}
835 1003
836/* 1004/*
1005 * Frequency meter - How fast is some event occurring?
1006 *
1007 * These routines manage a digitally filtered, constant time based,
1008 * event frequency meter. There are four routines:
1009 * fmeter_init() - initialize a frequency meter.
1010 * fmeter_markevent() - called each time the event happens.
1011 * fmeter_getrate() - returns the recent rate of such events.
1012 * fmeter_update() - internal routine used to update fmeter.
1013 *
1014 * A common data structure is passed to each of these routines,
1015 * which is used to keep track of the state required to manage the
1016 * frequency meter and its digital filter.
1017 *
1018 * The filter works on the number of events marked per unit time.
1019 * The filter is single-pole low-pass recursive (IIR). The time unit
1020 * is 1 second. Arithmetic is done using 32-bit integers scaled to
1021 * simulate 3 decimal digits of precision (multiplied by 1000).
1022 *
1023 * With an FM_COEF of 933, and a time base of 1 second, the filter
1024 * has a half-life of 10 seconds, meaning that if the events quit
1025 * happening, then the rate returned from the fmeter_getrate()
1026 * will be cut in half each 10 seconds, until it converges to zero.
1027 *
1028 * It is not worth doing a real infinitely recursive filter. If more
1029 * than FM_MAXTICKS ticks have elapsed since the last filter event,
1030 * just compute FM_MAXTICKS ticks worth, by which point the level
1031 * will be stable.
1032 *
1033 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
1034 * arithmetic overflow in the fmeter_update() routine.
1035 *
1036 * Given the simple 32 bit integer arithmetic used, this meter works
1037 * best for reporting rates between one per millisecond (msec) and
1038 * one per 32 (approx) seconds. At constant rates faster than one
1039 * per msec it maxes out at values just under 1,000,000. At constant
1040 * rates between one per msec, and one per second it will stabilize
1041 * to a value N*1000, where N is the rate of events per second.
1042 * At constant rates between one per second and one per 32 seconds,
1043 * it will be choppy, moving up on the seconds that have an event,
1044 * and then decaying until the next event. At rates slower than
1045 * about one in 32 seconds, it decays all the way back to zero between
1046 * each event.
1047 */
1048
1049#define FM_COEF 933 /* coefficient for half-life of 10 secs */
1050#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
1051#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
1052#define FM_SCALE 1000 /* faux fixed point scale */
1053
1054/* Initialize a frequency meter */
1055static void fmeter_init(struct fmeter *fmp)
1056{
1057 fmp->cnt = 0;
1058 fmp->val = 0;
1059 fmp->time = 0;
1060 spin_lock_init(&fmp->lock);
1061}
1062
1063/* Internal meter update - process cnt events and update value */
1064static void fmeter_update(struct fmeter *fmp)
1065{
1066 time_t now = get_seconds();
1067 time_t ticks = now - fmp->time;
1068
1069 if (ticks == 0)
1070 return;
1071
1072 ticks = min(FM_MAXTICKS, ticks);
1073 while (ticks-- > 0)
1074 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1075 fmp->time = now;
1076
1077 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1078 fmp->cnt = 0;
1079}
1080
1081/* Process any previous ticks, then bump cnt by one (times scale). */
1082static void fmeter_markevent(struct fmeter *fmp)
1083{
1084 spin_lock(&fmp->lock);
1085 fmeter_update(fmp);
1086 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1087 spin_unlock(&fmp->lock);
1088}
1089
1090/* Process any previous ticks, then return current value. */
1091static int fmeter_getrate(struct fmeter *fmp)
1092{
1093 int val;
1094
1095 spin_lock(&fmp->lock);
1096 fmeter_update(fmp);
1097 val = fmp->val;
1098 spin_unlock(&fmp->lock);
1099 return val;
1100}
1101
1102/*
837 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly 1103 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
838 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
839 * notified on release. 1105 * notified on release.
@@ -848,6 +1114,8 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
848 struct task_struct *tsk; 1114 struct task_struct *tsk;
849 struct cpuset *oldcs; 1115 struct cpuset *oldcs;
850 cpumask_t cpus; 1116 cpumask_t cpus;
1117 nodemask_t from, to;
1118 struct mm_struct *mm;
851 1119
852 if (sscanf(pidbuf, "%d", &pid) != 1) 1120 if (sscanf(pidbuf, "%d", &pid) != 1)
853 return -EIO; 1121 return -EIO;
@@ -887,14 +1155,27 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
887 return -ESRCH; 1155 return -ESRCH;
888 } 1156 }
889 atomic_inc(&cs->count); 1157 atomic_inc(&cs->count);
890 tsk->cpuset = cs; 1158 rcu_assign_pointer(tsk->cpuset, cs);
891 task_unlock(tsk); 1159 task_unlock(tsk);
892 1160
893 guarantee_online_cpus(cs, &cpus); 1161 guarantee_online_cpus(cs, &cpus);
894 set_cpus_allowed(tsk, cpus); 1162 set_cpus_allowed(tsk, cpus);
895 1163
1164 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed;
1166
896 up(&callback_sem); 1167 up(&callback_sem);
1168
1169 mm = get_task_mm(tsk);
1170 if (mm) {
1171 mpol_rebind_mm(mm, &to);
1172 mmput(mm);
1173 }
1174
1175 if (is_memory_migrate(cs))
1176 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
897 put_task_struct(tsk); 1177 put_task_struct(tsk);
1178 synchronize_rcu();
898 if (atomic_dec_and_test(&oldcs->count)) 1179 if (atomic_dec_and_test(&oldcs->count))
899 check_for_release(oldcs, ppathbuf); 1180 check_for_release(oldcs, ppathbuf);
900 return 0; 1181 return 0;
@@ -905,11 +1186,14 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
905typedef enum { 1186typedef enum {
906 FILE_ROOT, 1187 FILE_ROOT,
907 FILE_DIR, 1188 FILE_DIR,
1189 FILE_MEMORY_MIGRATE,
908 FILE_CPULIST, 1190 FILE_CPULIST,
909 FILE_MEMLIST, 1191 FILE_MEMLIST,
910 FILE_CPU_EXCLUSIVE, 1192 FILE_CPU_EXCLUSIVE,
911 FILE_MEM_EXCLUSIVE, 1193 FILE_MEM_EXCLUSIVE,
912 FILE_NOTIFY_ON_RELEASE, 1194 FILE_NOTIFY_ON_RELEASE,
1195 FILE_MEMORY_PRESSURE_ENABLED,
1196 FILE_MEMORY_PRESSURE,
913 FILE_TASKLIST, 1197 FILE_TASKLIST,
914} cpuset_filetype_t; 1198} cpuset_filetype_t;
915 1199
@@ -960,6 +1244,15 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
960 case FILE_NOTIFY_ON_RELEASE: 1244 case FILE_NOTIFY_ON_RELEASE:
961 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); 1245 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
962 break; 1246 break;
1247 case FILE_MEMORY_MIGRATE:
1248 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
1249 break;
1250 case FILE_MEMORY_PRESSURE_ENABLED:
1251 retval = update_memory_pressure_enabled(cs, buffer);
1252 break;
1253 case FILE_MEMORY_PRESSURE:
1254 retval = -EACCES;
1255 break;
963 case FILE_TASKLIST: 1256 case FILE_TASKLIST:
964 retval = attach_task(cs, buffer, &pathbuf); 1257 retval = attach_task(cs, buffer, &pathbuf);
965 break; 1258 break;
@@ -1060,6 +1353,15 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1060 case FILE_NOTIFY_ON_RELEASE: 1353 case FILE_NOTIFY_ON_RELEASE:
1061 *s++ = notify_on_release(cs) ? '1' : '0'; 1354 *s++ = notify_on_release(cs) ? '1' : '0';
1062 break; 1355 break;
1356 case FILE_MEMORY_MIGRATE:
1357 *s++ = is_memory_migrate(cs) ? '1' : '0';
1358 break;
1359 case FILE_MEMORY_PRESSURE_ENABLED:
1360 *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1361 break;
1362 case FILE_MEMORY_PRESSURE:
1363 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1364 break;
1063 default: 1365 default:
1064 retval = -EINVAL; 1366 retval = -EINVAL;
1065 goto out; 1367 goto out;
@@ -1178,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
1178 1480
1179/* 1481/*
1180 * cpuset_create_dir - create a directory for an object. 1482 * cpuset_create_dir - create a directory for an object.
1181 * cs: the cpuset we create the directory for. 1483 * cs: the cpuset we create the directory for.
1182 * It must have a valid ->parent field 1484 * It must have a valid ->parent field
1183 * And we are going to fill its ->dentry field. 1485 * And we are going to fill its ->dentry field.
1184 * name: The name to give to the cpuset directory. Will be copied. 1486 * name: The name to give to the cpuset directory. Will be copied.
@@ -1211,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1211 struct dentry *dentry; 1513 struct dentry *dentry;
1212 int error; 1514 int error;
1213 1515
1214 down(&dir->d_inode->i_sem); 1516 mutex_lock(&dir->d_inode->i_mutex);
1215 dentry = cpuset_get_dentry(dir, cft->name); 1517 dentry = cpuset_get_dentry(dir, cft->name);
1216 if (!IS_ERR(dentry)) { 1518 if (!IS_ERR(dentry)) {
1217 error = cpuset_create_file(dentry, 0644 | S_IFREG); 1519 error = cpuset_create_file(dentry, 0644 | S_IFREG);
@@ -1220,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1220 dput(dentry); 1522 dput(dentry);
1221 } else 1523 } else
1222 error = PTR_ERR(dentry); 1524 error = PTR_ERR(dentry);
1223 up(&dir->d_inode->i_sem); 1525 mutex_unlock(&dir->d_inode->i_mutex);
1224 return error; 1526 return error;
1225} 1527}
1226 1528
@@ -1252,7 +1554,7 @@ struct ctr_struct {
1252 * when reading out p->cpuset, as we don't really care if it changes 1554 * when reading out p->cpuset, as we don't really care if it changes
1253 * on the next cycle, and we are not going to try to dereference it. 1555 * on the next cycle, and we are not going to try to dereference it.
1254 */ 1556 */
1255static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) 1557static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1256{ 1558{
1257 int n = 0; 1559 int n = 0;
1258 struct task_struct *g, *p; 1560 struct task_struct *g, *p;
@@ -1408,6 +1710,21 @@ static struct cftype cft_notify_on_release = {
1408 .private = FILE_NOTIFY_ON_RELEASE, 1710 .private = FILE_NOTIFY_ON_RELEASE,
1409}; 1711};
1410 1712
1713static struct cftype cft_memory_migrate = {
1714 .name = "memory_migrate",
1715 .private = FILE_MEMORY_MIGRATE,
1716};
1717
1718static struct cftype cft_memory_pressure_enabled = {
1719 .name = "memory_pressure_enabled",
1720 .private = FILE_MEMORY_PRESSURE_ENABLED,
1721};
1722
1723static struct cftype cft_memory_pressure = {
1724 .name = "memory_pressure",
1725 .private = FILE_MEMORY_PRESSURE,
1726};
1727
1411static int cpuset_populate_dir(struct dentry *cs_dentry) 1728static int cpuset_populate_dir(struct dentry *cs_dentry)
1412{ 1729{
1413 int err; 1730 int err;
@@ -1422,6 +1739,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1422 return err; 1739 return err;
1423 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) 1740 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
1424 return err; 1741 return err;
1742 if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
1743 return err;
1744 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
1745 return err;
1425 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1746 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1426 return err; 1747 return err;
1427 return 0; 1748 return 0;
@@ -1446,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1446 return -ENOMEM; 1767 return -ENOMEM;
1447 1768
1448 down(&manage_sem); 1769 down(&manage_sem);
1449 refresh_mems(); 1770 cpuset_update_task_memory_state();
1450 cs->flags = 0; 1771 cs->flags = 0;
1451 if (notify_on_release(parent)) 1772 if (notify_on_release(parent))
1452 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1773 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1457,11 +1778,13 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1457 INIT_LIST_HEAD(&cs->children); 1778 INIT_LIST_HEAD(&cs->children);
1458 atomic_inc(&cpuset_mems_generation); 1779 atomic_inc(&cpuset_mems_generation);
1459 cs->mems_generation = atomic_read(&cpuset_mems_generation); 1780 cs->mems_generation = atomic_read(&cpuset_mems_generation);
1781 fmeter_init(&cs->fmeter);
1460 1782
1461 cs->parent = parent; 1783 cs->parent = parent;
1462 1784
1463 down(&callback_sem); 1785 down(&callback_sem);
1464 list_add(&cs->sibling, &cs->parent->children); 1786 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++;
1465 up(&callback_sem); 1788 up(&callback_sem);
1466 1789
1467 err = cpuset_create_dir(cs, name, mode); 1790 err = cpuset_create_dir(cs, name, mode);
@@ -1470,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1470 1793
1471 /* 1794 /*
1472 * Release manage_sem before cpuset_populate_dir() because it 1795 * Release manage_sem before cpuset_populate_dir() because it
1473 * will down() this new directory's i_sem and if we race with 1796 * will down() this new directory's i_mutex and if we race with
1474 * another mkdir, we might deadlock. 1797 * another mkdir, we might deadlock.
1475 */ 1798 */
1476 up(&manage_sem); 1799 up(&manage_sem);
@@ -1489,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1489{ 1812{
1490 struct cpuset *c_parent = dentry->d_parent->d_fsdata; 1813 struct cpuset *c_parent = dentry->d_parent->d_fsdata;
1491 1814
1492 /* the vfs holds inode->i_sem already */ 1815 /* the vfs holds inode->i_mutex already */
1493 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); 1816 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
1494} 1817}
1495 1818
@@ -1500,10 +1823,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1500 struct cpuset *parent; 1823 struct cpuset *parent;
1501 char *pathbuf = NULL; 1824 char *pathbuf = NULL;
1502 1825
1503 /* the vfs holds both inode->i_sem already */ 1826 /* the vfs holds both inode->i_mutex already */
1504 1827
1505 down(&manage_sem); 1828 down(&manage_sem);
1506 refresh_mems(); 1829 cpuset_update_task_memory_state();
1507 if (atomic_read(&cs->count) > 0) { 1830 if (atomic_read(&cs->count) > 0) {
1508 up(&manage_sem); 1831 up(&manage_sem);
1509 return -EBUSY; 1832 return -EBUSY;
@@ -1524,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1524 spin_unlock(&d->d_lock); 1847 spin_unlock(&d->d_lock);
1525 cpuset_d_remove_dir(d); 1848 cpuset_d_remove_dir(d);
1526 dput(d); 1849 dput(d);
1850 number_of_cpusets--;
1527 up(&callback_sem); 1851 up(&callback_sem);
1528 if (list_empty(&parent->children)) 1852 if (list_empty(&parent->children))
1529 check_for_release(parent, &pathbuf); 1853 check_for_release(parent, &pathbuf);
@@ -1532,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1532 return 0; 1856 return 0;
1533} 1857}
1534 1858
1859/*
1860 * cpuset_init_early - just enough so that the calls to
1861 * cpuset_update_task_memory_state() in early init code
1862 * are harmless.
1863 */
1864
1865int __init cpuset_init_early(void)
1866{
1867 struct task_struct *tsk = current;
1868
1869 tsk->cpuset = &top_cpuset;
1870 tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
1871 return 0;
1872}
1873
1535/** 1874/**
1536 * cpuset_init - initialize cpusets at system boot 1875 * cpuset_init - initialize cpusets at system boot
1537 * 1876 *
@@ -1546,6 +1885,7 @@ int __init cpuset_init(void)
1546 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1885 top_cpuset.cpus_allowed = CPU_MASK_ALL;
1547 top_cpuset.mems_allowed = NODE_MASK_ALL; 1886 top_cpuset.mems_allowed = NODE_MASK_ALL;
1548 1887
1888 fmeter_init(&top_cpuset.fmeter);
1549 atomic_inc(&cpuset_mems_generation); 1889 atomic_inc(&cpuset_mems_generation);
1550 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); 1890 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
1551 1891
@@ -1566,7 +1906,11 @@ int __init cpuset_init(void)
1566 root->d_inode->i_nlink++; 1906 root->d_inode->i_nlink++;
1567 top_cpuset.dentry = root; 1907 top_cpuset.dentry = root;
1568 root->d_inode->i_op = &cpuset_dir_inode_operations; 1908 root->d_inode->i_op = &cpuset_dir_inode_operations;
1909 number_of_cpusets = 1;
1569 err = cpuset_populate_dir(root); 1910 err = cpuset_populate_dir(root);
1911 /* memory_pressure_enabled is in root cpuset only */
1912 if (err == 0)
1913 err = cpuset_add_file(root, &cft_memory_pressure_enabled);
1570out: 1914out:
1571 return err; 1915 return err;
1572} 1916}
@@ -1632,15 +1976,13 @@ void cpuset_fork(struct task_struct *child)
1632 * 1976 *
1633 * We don't need to task_lock() this reference to tsk->cpuset, 1977 * We don't need to task_lock() this reference to tsk->cpuset,
1634 * because tsk is already marked PF_EXITING, so attach_task() won't 1978 * because tsk is already marked PF_EXITING, so attach_task() won't
1635 * mess with it. 1979 * mess with it, or task is a failed fork, never visible to attach_task.
1636 **/ 1980 **/
1637 1981
1638void cpuset_exit(struct task_struct *tsk) 1982void cpuset_exit(struct task_struct *tsk)
1639{ 1983{
1640 struct cpuset *cs; 1984 struct cpuset *cs;
1641 1985
1642 BUG_ON(!(tsk->flags & PF_EXITING));
1643
1644 cs = tsk->cpuset; 1986 cs = tsk->cpuset;
1645 tsk->cpuset = NULL; 1987 tsk->cpuset = NULL;
1646 1988
@@ -1667,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk)
1667 * tasks cpuset. 2009 * tasks cpuset.
1668 **/ 2010 **/
1669 2011
1670cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) 2012cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
1671{ 2013{
1672 cpumask_t mask; 2014 cpumask_t mask;
1673 2015
1674 down(&callback_sem); 2016 down(&callback_sem);
1675 task_lock((struct task_struct *)tsk); 2017 task_lock(tsk);
1676 guarantee_online_cpus(tsk->cpuset, &mask); 2018 guarantee_online_cpus(tsk->cpuset, &mask);
1677 task_unlock((struct task_struct *)tsk); 2019 task_unlock(tsk);
1678 up(&callback_sem); 2020 up(&callback_sem);
1679 2021
1680 return mask; 2022 return mask;
@@ -1686,43 +2028,26 @@ void cpuset_init_current_mems_allowed(void)
1686} 2028}
1687 2029
1688/** 2030/**
1689 * cpuset_update_current_mems_allowed - update mems parameters to new values 2031 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
1690 * 2032 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
1691 * If the current tasks cpusets mems_allowed changed behind our backs,
1692 * update current->mems_allowed and mems_generation to the new value.
1693 * Do not call this routine if in_interrupt().
1694 * 2033 *
1695 * Call without callback_sem or task_lock() held. May be called 2034 * Description: Returns the nodemask_t mems_allowed of the cpuset
1696 * with or without manage_sem held. Unless exiting, it will acquire 2035 * attached to the specified @tsk. Guaranteed to return some non-empty
1697 * task_lock(). Also might acquire callback_sem during call to 2036 * subset of node_online_map, even if this means going outside the
1698 * refresh_mems(). 2037 * tasks cpuset.
1699 */ 2038 **/
1700 2039
1701void cpuset_update_current_mems_allowed(void) 2040nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
1702{ 2041{
1703 struct cpuset *cs; 2042 nodemask_t mask;
1704 int need_to_refresh = 0;
1705 2043
1706 task_lock(current); 2044 down(&callback_sem);
1707 cs = current->cpuset; 2045 task_lock(tsk);
1708 if (!cs) 2046 guarantee_online_mems(tsk->cpuset, &mask);
1709 goto done; 2047 task_unlock(tsk);
1710 if (current->cpuset_mems_generation != cs->mems_generation) 2048 up(&callback_sem);
1711 need_to_refresh = 1;
1712done:
1713 task_unlock(current);
1714 if (need_to_refresh)
1715 refresh_mems();
1716}
1717 2049
1718/** 2050 return mask;
1719 * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
1720 * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
1721 */
1722void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
1723{
1724 bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
1725 MAX_NUMNODES);
1726} 2051}
1727 2052
1728/** 2053/**
@@ -1795,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1795 * GFP_USER - only nodes in current tasks mems allowed ok. 2120 * GFP_USER - only nodes in current tasks mems allowed ok.
1796 **/ 2121 **/
1797 2122
1798int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2123int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1799{ 2124{
1800 int node; /* node that zone z is on */ 2125 int node; /* node that zone z is on */
1801 const struct cpuset *cs; /* current cpuset ancestors */ 2126 const struct cpuset *cs; /* current cpuset ancestors */
@@ -1825,6 +2150,33 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1825} 2150}
1826 2151
1827/** 2152/**
2153 * cpuset_lock - lock out any changes to cpuset structures
2154 *
2155 * The out of memory (oom) code needs to lock down cpusets
2156 * from being changed while it scans the tasklist looking for a
2157 * task in an overlapping cpuset. Expose callback_sem via this
2158 * cpuset_lock() routine, so the oom code can lock it, before
2159 * locking the task list. The tasklist_lock is a spinlock, so
2160 * must be taken inside callback_sem.
2161 */
2162
2163void cpuset_lock(void)
2164{
2165 down(&callback_sem);
2166}
2167
2168/**
2169 * cpuset_unlock - release lock on cpuset changes
2170 *
2171 * Undo the lock taken in a previous cpuset_lock() call.
2172 */
2173
2174void cpuset_unlock(void)
2175{
2176 up(&callback_sem);
2177}
2178
2179/**
1828 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? 2180 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1829 * @p: pointer to task_struct of some other task. 2181 * @p: pointer to task_struct of some other task.
1830 * 2182 *
@@ -1833,7 +2185,7 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1833 * determine if task @p's memory usage might impact the memory 2185 * determine if task @p's memory usage might impact the memory
1834 * available to the current task. 2186 * available to the current task.
1835 * 2187 *
1836 * Acquires callback_sem - not suitable for calling from a fast path. 2188 * Call while holding callback_sem.
1837 **/ 2189 **/
1838 2190
1839int cpuset_excl_nodes_overlap(const struct task_struct *p) 2191int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1841,8 +2193,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1841 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 2193 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1842 int overlap = 0; /* do cpusets overlap? */ 2194 int overlap = 0; /* do cpusets overlap? */
1843 2195
1844 down(&callback_sem);
1845
1846 task_lock(current); 2196 task_lock(current);
1847 if (current->flags & PF_EXITING) { 2197 if (current->flags & PF_EXITING) {
1848 task_unlock(current); 2198 task_unlock(current);
@@ -1861,12 +2211,46 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1861 2211
1862 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 2212 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1863done: 2213done:
1864 up(&callback_sem);
1865
1866 return overlap; 2214 return overlap;
1867} 2215}
1868 2216
1869/* 2217/*
2218 * Collection of memory_pressure is suppressed unless
2219 * this flag is enabled by writing "1" to the special
2220 * cpuset file 'memory_pressure_enabled' in the root cpuset.
2221 */
2222
2223int cpuset_memory_pressure_enabled __read_mostly;
2224
2225/**
2226 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
2227 *
2228 * Keep a running average of the rate of synchronous (direct)
2229 * page reclaim efforts initiated by tasks in each cpuset.
2230 *
2231 * This represents the rate at which some task in the cpuset
2232 * ran low on memory on all nodes it was allowed to use, and
2233 * had to enter the kernels page reclaim code in an effort to
2234 * create more free memory by tossing clean pages or swapping
2235 * or writing dirty pages.
2236 *
2237 * Display to user space in the per-cpuset read-only file
2238 * "memory_pressure". Value displayed is an integer
2239 * representing the recent rate of entry into the synchronous
2240 * (direct) page reclaim by any task attached to the cpuset.
2241 **/
2242
2243void __cpuset_memory_pressure_bump(void)
2244{
2245 struct cpuset *cs;
2246
2247 task_lock(current);
2248 cs = current->cpuset;
2249 fmeter_markevent(&cs->fmeter);
2250 task_unlock(current);
2251}
2252
2253/*
1870 * proc_cpuset_show() 2254 * proc_cpuset_show()
1871 * - Print tasks cpuset path into seq_file. 2255 * - Print tasks cpuset path into seq_file.
1872 * - Used for /proc/<pid>/cpuset. 2256 * - Used for /proc/<pid>/cpuset.
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
deleted file mode 100644
index 334c37f5218a..000000000000
--- a/kernel/crash_dump.c
+++ /dev/null
@@ -1,61 +0,0 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/smp_lock.h>
9#include <linux/errno.h>
10#include <linux/proc_fs.h>
11#include <linux/bootmem.h>
12#include <linux/highmem.h>
13#include <linux/crash_dump.h>
14
15#include <asm/io.h>
16#include <asm/uaccess.h>
17
18/* Stores the physical address of elf header of crash image. */
19unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
20
21/**
22 * copy_oldmem_page - copy one page from "oldmem"
23 * @pfn: page frame number to be copied
24 * @buf: target memory address for the copy; this can be in kernel address
25 * space or user address space (see @userbuf)
26 * @csize: number of bytes to copy
27 * @offset: offset in bytes into the page (based on pfn) to begin the copy
28 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
29 * otherwise @buf is in kernel address space, use memcpy().
30 *
31 * Copy a page from "oldmem". For this page, there is no pte mapped
32 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
33 */
34ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
35 size_t csize, unsigned long offset, int userbuf)
36{
37 void *page, *vaddr;
38
39 if (!csize)
40 return 0;
41
42 page = kmalloc(PAGE_SIZE, GFP_KERNEL);
43 if (!page)
44 return -ENOMEM;
45
46 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
47 copy_page(page, vaddr);
48 kunmap_atomic(vaddr, KM_PTE0);
49
50 if (userbuf) {
51 if (copy_to_user(buf, (page + offset), csize)) {
52 kfree(page);
53 return -EFAULT;
54 }
55 } else {
56 memcpy(buf, (page + offset), csize);
57 }
58
59 kfree(page);
60 return csize;
61}
diff --git a/kernel/exit.c b/kernel/exit.c
index ee515683b92d..93cee3671332 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -10,6 +10,7 @@
10#include <linux/interrupt.h> 10#include <linux/interrupt.h>
11#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/capability.h>
13#include <linux/completion.h> 14#include <linux/completion.h>
14#include <linux/personality.h> 15#include <linux/personality.h>
15#include <linux/tty.h> 16#include <linux/tty.h>
@@ -29,6 +30,7 @@
29#include <linux/syscalls.h> 30#include <linux/syscalls.h>
30#include <linux/signal.h> 31#include <linux/signal.h>
31#include <linux/cn_proc.h> 32#include <linux/cn_proc.h>
33#include <linux/mutex.h>
32 34
33#include <asm/uaccess.h> 35#include <asm/uaccess.h>
34#include <asm/unistd.h> 36#include <asm/unistd.h>
@@ -72,7 +74,6 @@ repeat:
72 __ptrace_unlink(p); 74 __ptrace_unlink(p);
73 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 75 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
74 __exit_signal(p); 76 __exit_signal(p);
75 __exit_sighand(p);
76 /* 77 /*
77 * Note that the fastpath in sys_times depends on __exit_signal having 78 * Note that the fastpath in sys_times depends on __exit_signal having
78 * updated the counters before a task is removed from the tasklist of 79 * updated the counters before a task is removed from the tasklist of
@@ -192,7 +193,7 @@ int is_orphaned_pgrp(int pgrp)
192 return retval; 193 return retval;
193} 194}
194 195
195static inline int has_stopped_jobs(int pgrp) 196static int has_stopped_jobs(int pgrp)
196{ 197{
197 int retval = 0; 198 int retval = 0;
198 struct task_struct *p; 199 struct task_struct *p;
@@ -229,7 +230,7 @@ static inline int has_stopped_jobs(int pgrp)
229 * 230 *
230 * NOTE that reparent_to_init() gives the caller full capabilities. 231 * NOTE that reparent_to_init() gives the caller full capabilities.
231 */ 232 */
232static inline void reparent_to_init(void) 233static void reparent_to_init(void)
233{ 234{
234 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
235 236
@@ -243,7 +244,9 @@ static inline void reparent_to_init(void)
243 /* Set the exit signal to SIGCHLD so we signal init on exit */ 244 /* Set the exit signal to SIGCHLD so we signal init on exit */
244 current->exit_signal = SIGCHLD; 245 current->exit_signal = SIGCHLD;
245 246
246 if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) 247 if ((current->policy == SCHED_NORMAL ||
248 current->policy == SCHED_BATCH)
249 && (task_nice(current) < 0))
247 set_user_nice(current, 0); 250 set_user_nice(current, 0);
248 /* cpus_allowed? */ 251 /* cpus_allowed? */
249 /* rt_priority? */ 252 /* rt_priority? */
@@ -258,7 +261,7 @@ static inline void reparent_to_init(void)
258 261
259void __set_special_pids(pid_t session, pid_t pgrp) 262void __set_special_pids(pid_t session, pid_t pgrp)
260{ 263{
261 struct task_struct *curr = current; 264 struct task_struct *curr = current->group_leader;
262 265
263 if (curr->signal->session != session) { 266 if (curr->signal->session != session) {
264 detach_pid(curr, PIDTYPE_SID); 267 detach_pid(curr, PIDTYPE_SID);
@@ -366,7 +369,7 @@ void daemonize(const char *name, ...)
366 369
367EXPORT_SYMBOL(daemonize); 370EXPORT_SYMBOL(daemonize);
368 371
369static inline void close_files(struct files_struct * files) 372static void close_files(struct files_struct * files)
370{ 373{
371 int i, j; 374 int i, j;
372 struct fdtable *fdt; 375 struct fdtable *fdt;
@@ -540,7 +543,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re
540 p->real_parent = reaper; 543 p->real_parent = reaper;
541} 544}
542 545
543static inline void reparent_thread(task_t *p, task_t *father, int traced) 546static void reparent_thread(task_t *p, task_t *father, int traced)
544{ 547{
545 /* We don't want people slaying init. */ 548 /* We don't want people slaying init. */
546 if (p->exit_signal != -1) 549 if (p->exit_signal != -1)
@@ -604,7 +607,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
604 * group, and if no such member exists, give it to 607 * group, and if no such member exists, give it to
605 * the global child reaper process (ie "init") 608 * the global child reaper process (ie "init")
606 */ 609 */
607static inline void forget_original_parent(struct task_struct * father, 610static void forget_original_parent(struct task_struct * father,
608 struct list_head *to_release) 611 struct list_head *to_release)
609{ 612{
610 struct task_struct *p, *reaper = father; 613 struct task_struct *p, *reaper = father;
@@ -842,7 +845,7 @@ fastcall NORET_TYPE void do_exit(long code)
842 } 845 }
843 group_dead = atomic_dec_and_test(&tsk->signal->live); 846 group_dead = atomic_dec_and_test(&tsk->signal->live);
844 if (group_dead) { 847 if (group_dead) {
845 del_timer_sync(&tsk->signal->real_timer); 848 hrtimer_cancel(&tsk->signal->real_timer);
846 exit_itimers(tsk->signal); 849 exit_itimers(tsk->signal);
847 acct_process(code); 850 acct_process(code);
848 } 851 }
@@ -870,6 +873,10 @@ fastcall NORET_TYPE void do_exit(long code)
870 mpol_free(tsk->mempolicy); 873 mpol_free(tsk->mempolicy);
871 tsk->mempolicy = NULL; 874 tsk->mempolicy = NULL;
872#endif 875#endif
876 /*
877 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
878 */
879 mutex_debug_check_no_locks_held(tsk);
873 880
874 /* PF_DEAD causes final put_task_struct after we schedule. */ 881 /* PF_DEAD causes final put_task_struct after we schedule. */
875 preempt_disable(); 882 preempt_disable();
@@ -926,7 +933,6 @@ do_group_exit(int exit_code)
926 /* Another thread got here before we took the lock. */ 933 /* Another thread got here before we took the lock. */
927 exit_code = sig->group_exit_code; 934 exit_code = sig->group_exit_code;
928 else { 935 else {
929 sig->flags = SIGNAL_GROUP_EXIT;
930 sig->group_exit_code = exit_code; 936 sig->group_exit_code = exit_code;
931 zap_other_threads(current); 937 zap_other_threads(current);
932 } 938 }
@@ -1068,6 +1074,9 @@ static int wait_task_zombie(task_t *p, int noreap,
1068 } 1074 }
1069 1075
1070 if (likely(p->real_parent == p->parent) && likely(p->signal)) { 1076 if (likely(p->real_parent == p->parent) && likely(p->signal)) {
1077 struct signal_struct *psig;
1078 struct signal_struct *sig;
1079
1071 /* 1080 /*
1072 * The resource counters for the group leader are in its 1081 * The resource counters for the group leader are in its
1073 * own task_struct. Those for dead threads in the group 1082 * own task_struct. Those for dead threads in the group
@@ -1084,24 +1093,26 @@ static int wait_task_zombie(task_t *p, int noreap,
1084 * here reaping other children at the same time. 1093 * here reaping other children at the same time.
1085 */ 1094 */
1086 spin_lock_irq(&p->parent->sighand->siglock); 1095 spin_lock_irq(&p->parent->sighand->siglock);
1087 p->parent->signal->cutime = 1096 psig = p->parent->signal;
1088 cputime_add(p->parent->signal->cutime, 1097 sig = p->signal;
1098 psig->cutime =
1099 cputime_add(psig->cutime,
1089 cputime_add(p->utime, 1100 cputime_add(p->utime,
1090 cputime_add(p->signal->utime, 1101 cputime_add(sig->utime,
1091 p->signal->cutime))); 1102 sig->cutime)));
1092 p->parent->signal->cstime = 1103 psig->cstime =
1093 cputime_add(p->parent->signal->cstime, 1104 cputime_add(psig->cstime,
1094 cputime_add(p->stime, 1105 cputime_add(p->stime,
1095 cputime_add(p->signal->stime, 1106 cputime_add(sig->stime,
1096 p->signal->cstime))); 1107 sig->cstime)));
1097 p->parent->signal->cmin_flt += 1108 psig->cmin_flt +=
1098 p->min_flt + p->signal->min_flt + p->signal->cmin_flt; 1109 p->min_flt + sig->min_flt + sig->cmin_flt;
1099 p->parent->signal->cmaj_flt += 1110 psig->cmaj_flt +=
1100 p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt; 1111 p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1101 p->parent->signal->cnvcsw += 1112 psig->cnvcsw +=
1102 p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw; 1113 p->nvcsw + sig->nvcsw + sig->cnvcsw;
1103 p->parent->signal->cnivcsw += 1114 psig->cnivcsw +=
1104 p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw; 1115 p->nivcsw + sig->nivcsw + sig->cnivcsw;
1105 spin_unlock_irq(&p->parent->sighand->siglock); 1116 spin_unlock_irq(&p->parent->sighand->siglock);
1106 } 1117 }
1107 1118
diff --git a/kernel/fork.c b/kernel/fork.c
index fb8572a42297..4ae8cfc1c89c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,7 @@
28#include <linux/binfmts.h> 28#include <linux/binfmts.h>
29#include <linux/mman.h> 29#include <linux/mman.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/capability.h>
31#include <linux/cpu.h> 32#include <linux/cpu.h>
32#include <linux/cpuset.h> 33#include <linux/cpuset.h>
33#include <linux/security.h> 34#include <linux/security.h>
@@ -743,6 +744,14 @@ int unshare_files(void)
743 744
744EXPORT_SYMBOL(unshare_files); 745EXPORT_SYMBOL(unshare_files);
745 746
747void sighand_free_cb(struct rcu_head *rhp)
748{
749 struct sighand_struct *sp;
750
751 sp = container_of(rhp, struct sighand_struct, rcu);
752 kmem_cache_free(sighand_cachep, sp);
753}
754
746static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 755static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
747{ 756{
748 struct sighand_struct *sig; 757 struct sighand_struct *sig;
@@ -752,7 +761,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
752 return 0; 761 return 0;
753 } 762 }
754 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); 763 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
755 tsk->sighand = sig; 764 rcu_assign_pointer(tsk->sighand, sig);
756 if (!sig) 765 if (!sig)
757 return -ENOMEM; 766 return -ENOMEM;
758 spin_lock_init(&sig->siglock); 767 spin_lock_init(&sig->siglock);
@@ -793,19 +802,16 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
793 init_sigpending(&sig->shared_pending); 802 init_sigpending(&sig->shared_pending);
794 INIT_LIST_HEAD(&sig->posix_timers); 803 INIT_LIST_HEAD(&sig->posix_timers);
795 804
796 sig->it_real_value = sig->it_real_incr = 0; 805 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC);
806 sig->it_real_incr.tv64 = 0;
797 sig->real_timer.function = it_real_fn; 807 sig->real_timer.function = it_real_fn;
798 sig->real_timer.data = (unsigned long) tsk; 808 sig->real_timer.data = tsk;
799 init_timer(&sig->real_timer);
800 809
801 sig->it_virt_expires = cputime_zero; 810 sig->it_virt_expires = cputime_zero;
802 sig->it_virt_incr = cputime_zero; 811 sig->it_virt_incr = cputime_zero;
803 sig->it_prof_expires = cputime_zero; 812 sig->it_prof_expires = cputime_zero;
804 sig->it_prof_incr = cputime_zero; 813 sig->it_prof_incr = cputime_zero;
805 814
806 sig->tty = current->signal->tty;
807 sig->pgrp = process_group(current);
808 sig->session = current->signal->session;
809 sig->leader = 0; /* session leadership doesn't inherit */ 815 sig->leader = 0; /* session leadership doesn't inherit */
810 sig->tty_old_pgrp = 0; 816 sig->tty_old_pgrp = 0;
811 817
@@ -964,15 +970,20 @@ static task_t *copy_process(unsigned long clone_flags,
964 p->io_context = NULL; 970 p->io_context = NULL;
965 p->io_wait = NULL; 971 p->io_wait = NULL;
966 p->audit_context = NULL; 972 p->audit_context = NULL;
973 cpuset_fork(p);
967#ifdef CONFIG_NUMA 974#ifdef CONFIG_NUMA
968 p->mempolicy = mpol_copy(p->mempolicy); 975 p->mempolicy = mpol_copy(p->mempolicy);
969 if (IS_ERR(p->mempolicy)) { 976 if (IS_ERR(p->mempolicy)) {
970 retval = PTR_ERR(p->mempolicy); 977 retval = PTR_ERR(p->mempolicy);
971 p->mempolicy = NULL; 978 p->mempolicy = NULL;
972 goto bad_fork_cleanup; 979 goto bad_fork_cleanup_cpuset;
973 } 980 }
974#endif 981#endif
975 982
983#ifdef CONFIG_DEBUG_MUTEXES
984 p->blocked_on = NULL; /* not blocked yet */
985#endif
986
976 p->tgid = p->pid; 987 p->tgid = p->pid;
977 if (clone_flags & CLONE_THREAD) 988 if (clone_flags & CLONE_THREAD)
978 p->tgid = current->tgid; 989 p->tgid = current->tgid;
@@ -1127,25 +1138,19 @@ static task_t *copy_process(unsigned long clone_flags,
1127 attach_pid(p, PIDTYPE_PID, p->pid); 1138 attach_pid(p, PIDTYPE_PID, p->pid);
1128 attach_pid(p, PIDTYPE_TGID, p->tgid); 1139 attach_pid(p, PIDTYPE_TGID, p->tgid);
1129 if (thread_group_leader(p)) { 1140 if (thread_group_leader(p)) {
1141 p->signal->tty = current->signal->tty;
1142 p->signal->pgrp = process_group(current);
1143 p->signal->session = current->signal->session;
1130 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1144 attach_pid(p, PIDTYPE_PGID, process_group(p));
1131 attach_pid(p, PIDTYPE_SID, p->signal->session); 1145 attach_pid(p, PIDTYPE_SID, p->signal->session);
1132 if (p->pid) 1146 if (p->pid)
1133 __get_cpu_var(process_counts)++; 1147 __get_cpu_var(process_counts)++;
1134 } 1148 }
1135 1149
1136 if (!current->signal->tty && p->signal->tty)
1137 p->signal->tty = NULL;
1138
1139 nr_threads++; 1150 nr_threads++;
1140 total_forks++; 1151 total_forks++;
1141 write_unlock_irq(&tasklist_lock); 1152 write_unlock_irq(&tasklist_lock);
1142 proc_fork_connector(p); 1153 proc_fork_connector(p);
1143 cpuset_fork(p);
1144 retval = 0;
1145
1146fork_out:
1147 if (retval)
1148 return ERR_PTR(retval);
1149 return p; 1154 return p;
1150 1155
1151bad_fork_cleanup_namespace: 1156bad_fork_cleanup_namespace:
@@ -1172,7 +1177,9 @@ bad_fork_cleanup_security:
1172bad_fork_cleanup_policy: 1177bad_fork_cleanup_policy:
1173#ifdef CONFIG_NUMA 1178#ifdef CONFIG_NUMA
1174 mpol_free(p->mempolicy); 1179 mpol_free(p->mempolicy);
1180bad_fork_cleanup_cpuset:
1175#endif 1181#endif
1182 cpuset_exit(p);
1176bad_fork_cleanup: 1183bad_fork_cleanup:
1177 if (p->binfmt) 1184 if (p->binfmt)
1178 module_put(p->binfmt->module); 1185 module_put(p->binfmt->module);
@@ -1184,7 +1191,8 @@ bad_fork_cleanup_count:
1184 free_uid(p->user); 1191 free_uid(p->user);
1185bad_fork_free: 1192bad_fork_free:
1186 free_task(p); 1193 free_task(p);
1187 goto fork_out; 1194fork_out:
1195 return ERR_PTR(retval);
1188} 1196}
1189 1197
1190struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1198struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
@@ -1290,6 +1298,10 @@ long do_fork(unsigned long clone_flags,
1290 return pid; 1298 return pid;
1291} 1299}
1292 1300
1301#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1302#define ARCH_MIN_MMSTRUCT_ALIGN 0
1303#endif
1304
1293void __init proc_caches_init(void) 1305void __init proc_caches_init(void)
1294{ 1306{
1295 sighand_cachep = kmem_cache_create("sighand_cache", 1307 sighand_cachep = kmem_cache_create("sighand_cache",
@@ -1308,6 +1320,6 @@ void __init proc_caches_init(void)
1308 sizeof(struct vm_area_struct), 0, 1320 sizeof(struct vm_area_struct), 0,
1309 SLAB_PANIC, NULL, NULL); 1321 SLAB_PANIC, NULL, NULL);
1310 mm_cachep = kmem_cache_create("mm_struct", 1322 mm_cachep = kmem_cache_create("mm_struct",
1311 sizeof(struct mm_struct), 0, 1323 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1312 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1324 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1313} 1325}
diff --git a/kernel/futex.c b/kernel/futex.c
index 5872e3507f35..5efa2f978032 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -270,7 +270,13 @@ static void wake_futex(struct futex_q *q)
270 /* 270 /*
271 * The waiting task can free the futex_q as soon as this is written, 271 * The waiting task can free the futex_q as soon as this is written,
272 * without taking any locks. This must come last. 272 * without taking any locks. This must come last.
273 *
274 * A memory barrier is required here to prevent the following store
275 * to lock_ptr from getting ahead of the wakeup. Clearing the lock
276 * at the end of wake_up_all() does not prevent this store from
277 * moving.
273 */ 278 */
279 wmb();
274 q->lock_ptr = NULL; 280 q->lock_ptr = NULL;
275} 281}
276 282
@@ -350,6 +356,13 @@ retry:
350 if (bh1 != bh2) 356 if (bh1 != bh2)
351 spin_unlock(&bh2->lock); 357 spin_unlock(&bh2->lock);
352 358
359#ifndef CONFIG_MMU
360 /* we don't get EFAULT from MMU faults if we don't have an MMU,
361 * but we might get them from range checking */
362 ret = op_ret;
363 goto out;
364#endif
365
353 if (unlikely(op_ret != -EFAULT)) { 366 if (unlikely(op_ret != -EFAULT)) {
354 ret = op_ret; 367 ret = op_ret;
355 goto out; 368 goto out;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
new file mode 100644
index 000000000000..f1c4155b49ac
--- /dev/null
+++ b/kernel/hrtimer.c
@@ -0,0 +1,826 @@
1/*
2 * linux/kernel/hrtimer.c
3 *
4 * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
6 *
7 * High-resolution kernel timers
8 *
9 * In contrast to the low-resolution timeout API implemented in
10 * kernel/timer.c, hrtimers provide finer resolution and accuracy
11 * depending on system configuration and capabilities.
12 *
13 * These timers are currently used for:
14 * - itimers
15 * - POSIX timers
16 * - nanosleep
17 * - precise in-kernel timing
18 *
19 * Started by: Thomas Gleixner and Ingo Molnar
20 *
21 * Credits:
22 * based on kernel/timer.c
23 *
24 * For licencing details see kernel-base/COPYING
25 */
26
27#include <linux/cpu.h>
28#include <linux/module.h>
29#include <linux/percpu.h>
30#include <linux/hrtimer.h>
31#include <linux/notifier.h>
32#include <linux/syscalls.h>
33#include <linux/interrupt.h>
34
35#include <asm/uaccess.h>
36
37/**
38 * ktime_get - get the monotonic time in ktime_t format
39 *
40 * returns the time in ktime_t format
41 */
42static ktime_t ktime_get(void)
43{
44 struct timespec now;
45
46 ktime_get_ts(&now);
47
48 return timespec_to_ktime(now);
49}
50
51/**
52 * ktime_get_real - get the real (wall-) time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56static ktime_t ktime_get_real(void)
57{
58 struct timespec now;
59
60 getnstimeofday(&now);
61
62 return timespec_to_ktime(now);
63}
64
65EXPORT_SYMBOL_GPL(ktime_get_real);
66
/*
 * The timer bases:
 *
 * Every CPU owns an array of MAX_HRTIMER_BASES bases. hrtimer_rebase()
 * and hrtimer_get_res() index this array directly with a clockid_t, so
 * each entry has to sit at the array position equal to its clock id.
 */

#define MAX_HRTIMER_BASES 2

static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
{
	{
		/* wall clock base, time is read through ktime_get_real() */
		.index = CLOCK_REALTIME,
		.get_time = &ktime_get_real,
		.resolution = KTIME_REALTIME_RES,
	},
	{
		/* monotonic clock base, time is read through ktime_get() */
		.index = CLOCK_MONOTONIC,
		.get_time = &ktime_get,
		.resolution = KTIME_MONOTONIC_RES,
	},
};
86
87/**
88 * ktime_get_ts - get the monotonic clock in timespec format
89 *
90 * @ts: pointer to timespec variable
91 *
92 * The function calculates the monotonic clock from the realtime
93 * clock and the wall_to_monotonic offset and stores the result
94 * in normalized timespec format in the variable pointed to by ts.
95 */
96void ktime_get_ts(struct timespec *ts)
97{
98 struct timespec tomono;
99 unsigned long seq;
100
101 do {
102 seq = read_seqbegin(&xtime_lock);
103 getnstimeofday(ts);
104 tomono = wall_to_monotonic;
105
106 } while (read_seqretry(&xtime_lock, seq));
107
108 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
109 ts->tv_nsec + tomono.tv_nsec);
110}
111EXPORT_SYMBOL_GPL(ktime_get_ts);
112
113/*
114 * Functions and macros which are different for UP/SMP systems are kept in a
115 * single place
116 */
117#ifdef CONFIG_SMP
118
119#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)
120
/*
 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
 * means that all timers which are tied to this base via timer->base are
 * locked, and the base itself is locked too.
 *
 * So run_hrtimer_queue()/migrate_hrtimers() can safely modify all timers
 * which could be found on the lists/queues.
 *
 * When the timer's base is locked, and the timer removed from list, it is
 * possible to set timer->base = NULL and drop the lock: the timer remains
 * locked.
 *
 * Returns the locked base of @timer; the interrupt state saved by
 * spin_lock_irqsave() is stored in *flags.
 */
static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
					      unsigned long *flags)
{
	struct hrtimer_base *base;

	for (;;) {
		base = timer->base;
		/* A NULL base means the timer is between two bases: retry */
		if (likely(base != NULL)) {
			spin_lock_irqsave(&base->lock, *flags);
			/* Re-check under the lock that the base is unchanged */
			if (likely(base == timer->base))
				return base;
			/* The timer has migrated to another CPU: */
			spin_unlock_irqrestore(&base->lock, *flags);
		}
		cpu_relax();
	}
}
150
/*
 * Switch the timer base to the current CPU when possible.
 *
 * Entered with base->lock held (it is dropped here when the base is
 * switched). Returns the base the timer is tied to afterwards; that
 * base's lock is held on return.
 */
static inline struct hrtimer_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
{
	struct hrtimer_base *new_base;

	/* The base with the same index (clock) on the local CPU */
	new_base = &__get_cpu_var(hrtimer_bases[base->index]);

	if (base != new_base) {
		/*
		 * We are trying to schedule the timer on the local CPU.
		 * However we can't change timer's base while it is running,
		 * so we keep it on the same CPU. No hassle vs. reprogramming
		 * the event source in the high resolution case. The softirq
		 * code will take care of this when the timer function has
		 * completed. There is no conflict as we hold the lock until
		 * the timer is enqueued.
		 */
		if (unlikely(base->curr_timer == timer))
			return base;

		/* See the comment in lock_hrtimer_base() */
		timer->base = NULL;
		spin_unlock(&base->lock);
		spin_lock(&new_base->lock);
		timer->base = new_base;
	}
	return new_base;
}
182
183#else /* CONFIG_SMP */
184
185#define set_curr_timer(b, t) do { } while (0)
186
187static inline struct hrtimer_base *
188lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
189{
190 struct hrtimer_base *base = timer->base;
191
192 spin_lock_irqsave(&base->lock, *flags);
193
194 return base;
195}
196
197#define switch_hrtimer_base(t, b) (b)
198
199#endif /* !CONFIG_SMP */
200
201/*
202 * Functions for the union type storage format of ktime_t which are
203 * too large for inlining:
204 */
205#if BITS_PER_LONG < 64
206# ifndef CONFIG_KTIME_SCALAR
207/**
208 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
209 *
210 * @kt: addend
211 * @nsec: the scalar nsec value to add
212 *
213 * Returns the sum of kt and nsec in ktime_t format
214 */
215ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
216{
217 ktime_t tmp;
218
219 if (likely(nsec < NSEC_PER_SEC)) {
220 tmp.tv64 = nsec;
221 } else {
222 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
223
224 tmp = ktime_set((long)nsec, rem);
225 }
226
227 return ktime_add(kt, tmp);
228}
229
230#else /* CONFIG_KTIME_SCALAR */
231
232# endif /* !CONFIG_KTIME_SCALAR */
233
234/*
235 * Divide a ktime value by a nanosecond value
236 */
237static unsigned long ktime_divns(const ktime_t kt, nsec_t div)
238{
239 u64 dclc, inc, dns;
240 int sft = 0;
241
242 dclc = dns = ktime_to_ns(kt);
243 inc = div;
244 /* Make sure the divisor is less than 2^32: */
245 while (div >> 32) {
246 sft++;
247 div >>= 1;
248 }
249 dclc >>= sft;
250 do_div(dclc, (unsigned long) div);
251
252 return (unsigned long) dclc;
253}
254
255#else /* BITS_PER_LONG < 64 */
256# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
257#endif /* BITS_PER_LONG >= 64 */
258
259/*
260 * Counterpart to lock_timer_base above:
261 */
262static inline
263void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
264{
265 spin_unlock_irqrestore(&timer->base->lock, *flags);
266}
267
268/**
269 * hrtimer_forward - forward the timer expiry
270 *
271 * @timer: hrtimer to forward
272 * @interval: the interval to forward
273 *
274 * Forward the timer expiry so it will expire in the future.
275 * Returns the number of overruns.
276 */
277unsigned long
278hrtimer_forward(struct hrtimer *timer, ktime_t interval)
279{
280 unsigned long orun = 1;
281 ktime_t delta, now;
282
283 now = timer->base->get_time();
284
285 delta = ktime_sub(now, timer->expires);
286
287 if (delta.tv64 < 0)
288 return 0;
289
290 if (interval.tv64 < timer->base->resolution.tv64)
291 interval.tv64 = timer->base->resolution.tv64;
292
293 if (unlikely(delta.tv64 >= interval.tv64)) {
294 nsec_t incr = ktime_to_ns(interval);
295
296 orun = ktime_divns(delta, incr);
297 timer->expires = ktime_add_ns(timer->expires, incr * orun);
298 if (timer->expires.tv64 > now.tv64)
299 return orun;
300 /*
301 * This (and the ktime_add() below) is the
302 * correction for exact:
303 */
304 orun++;
305 }
306 timer->expires = ktime_add(timer->expires, interval);
307
308 return orun;
309}
310
311/*
312 * enqueue_hrtimer - internal function to (re)start a timer
313 *
314 * The timer is inserted in expiry order. Insertion into the
315 * red black tree is O(log(n)). Must hold the base lock.
316 */
317static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
318{
319 struct rb_node **link = &base->active.rb_node;
320 struct rb_node *parent = NULL;
321 struct hrtimer *entry;
322
323 /*
324 * Find the right place in the rbtree:
325 */
326 while (*link) {
327 parent = *link;
328 entry = rb_entry(parent, struct hrtimer, node);
329 /*
330 * We dont care about collisions. Nodes with
331 * the same expiry time stay together.
332 */
333 if (timer->expires.tv64 < entry->expires.tv64)
334 link = &(*link)->rb_left;
335 else
336 link = &(*link)->rb_right;
337 }
338
339 /*
340 * Insert the timer to the rbtree and check whether it
341 * replaces the first pending timer
342 */
343 rb_link_node(&timer->node, parent, link);
344 rb_insert_color(&timer->node, &base->active);
345
346 timer->state = HRTIMER_PENDING;
347
348 if (!base->first || timer->expires.tv64 <
349 rb_entry(base->first, struct hrtimer, node)->expires.tv64)
350 base->first = &timer->node;
351}
352
353/*
354 * __remove_hrtimer - internal function to remove a timer
355 *
356 * Caller must hold the base lock.
357 */
358static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
359{
360 /*
361 * Remove the timer from the rbtree and replace the
362 * first entry pointer if necessary.
363 */
364 if (base->first == &timer->node)
365 base->first = rb_next(&timer->node);
366 rb_erase(&timer->node, &base->active);
367}
368
369/*
370 * remove hrtimer, called with base lock held
371 */
372static inline int
373remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
374{
375 if (hrtimer_active(timer)) {
376 __remove_hrtimer(timer, base);
377 timer->state = HRTIMER_INACTIVE;
378 return 1;
379 }
380 return 0;
381}
382
383/**
384 * hrtimer_start - (re)start an relative timer on the current CPU
385 *
386 * @timer: the timer to be added
387 * @tim: expiry time
388 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
389 *
390 * Returns:
391 * 0 on success
392 * 1 when the timer was active
393 */
394int
395hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
396{
397 struct hrtimer_base *base, *new_base;
398 unsigned long flags;
399 int ret;
400
401 base = lock_hrtimer_base(timer, &flags);
402
403 /* Remove an active timer from the queue: */
404 ret = remove_hrtimer(timer, base);
405
406 /* Switch the timer base, if necessary: */
407 new_base = switch_hrtimer_base(timer, base);
408
409 if (mode == HRTIMER_REL)
410 tim = ktime_add(tim, new_base->get_time());
411 timer->expires = tim;
412
413 enqueue_hrtimer(timer, new_base);
414
415 unlock_hrtimer_base(timer, &flags);
416
417 return ret;
418}
419
420/**
421 * hrtimer_try_to_cancel - try to deactivate a timer
422 *
423 * @timer: hrtimer to stop
424 *
425 * Returns:
426 * 0 when the timer was not active
427 * 1 when the timer was active
428 * -1 when the timer is currently excuting the callback function and
429 * can not be stopped
430 */
431int hrtimer_try_to_cancel(struct hrtimer *timer)
432{
433 struct hrtimer_base *base;
434 unsigned long flags;
435 int ret = -1;
436
437 base = lock_hrtimer_base(timer, &flags);
438
439 if (base->curr_timer != timer)
440 ret = remove_hrtimer(timer, base);
441
442 unlock_hrtimer_base(timer, &flags);
443
444 return ret;
445
446}
447
/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 *
 * @timer:	the timer to be cancelled
 *
 * Calls hrtimer_try_to_cancel() until it no longer reports that the
 * callback function of the timer is running.
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
	for (;;) {
		int ret = hrtimer_try_to_cancel(timer);

		if (ret >= 0)
			return ret;
		/*
		 * The callback is still running: be nice to the CPU
		 * (and to an SMT sibling) while we spin waiting for it.
		 */
		cpu_relax();
	}
}
466
467/**
468 * hrtimer_get_remaining - get remaining time for the timer
469 *
470 * @timer: the timer to read
471 */
472ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
473{
474 struct hrtimer_base *base;
475 unsigned long flags;
476 ktime_t rem;
477
478 base = lock_hrtimer_base(timer, &flags);
479 rem = ktime_sub(timer->expires, timer->base->get_time());
480 unlock_hrtimer_base(timer, &flags);
481
482 return rem;
483}
484
485/**
486 * hrtimer_rebase - rebase an initialized hrtimer to a different base
487 *
488 * @timer: the timer to be rebased
489 * @clock_id: the clock to be used
490 */
491void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id)
492{
493 struct hrtimer_base *bases;
494
495 bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
496 timer->base = &bases[clock_id];
497}
498
499/**
500 * hrtimer_init - initialize a timer to the given clock
501 *
502 * @timer: the timer to be initialized
503 * @clock_id: the clock to be used
504 */
505void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id)
506{
507 memset(timer, 0, sizeof(struct hrtimer));
508 hrtimer_rebase(timer, clock_id);
509}
510
511/**
512 * hrtimer_get_res - get the timer resolution for a clock
513 *
514 * @which_clock: which clock to query
515 * @tp: pointer to timespec variable to store the resolution
516 *
517 * Store the resolution of the clock selected by which_clock in the
518 * variable pointed to by tp.
519 */
520int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
521{
522 struct hrtimer_base *bases;
523
524 bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
525 *tp = ktime_to_timespec(bases[which_clock].resolution);
526
527 return 0;
528}
529
/*
 * Expire the per base hrtimer-queue:
 *
 * Runs the callback of every queued timer whose expiry time lies before
 * the time sampled on entry. The base lock is dropped while a callback
 * runs; during that time the timer is published as the current timer of
 * the base (SMP only, see set_curr_timer()).
 */
static inline void run_hrtimer_queue(struct hrtimer_base *base)
{
	/* Sampled once, before the lock is taken, and used for all timers */
	ktime_t now = base->get_time();
	struct rb_node *node;

	spin_lock_irq(&base->lock);

	while ((node = base->first)) {
		struct hrtimer *timer;
		int (*fn)(void *);
		int restart;
		void *data;

		timer = rb_entry(node, struct hrtimer, node);
		/* base->first is the earliest timer: stop at the first
		 * one which has not expired yet */
		if (now.tv64 <= timer->expires.tv64)
			break;

		/* Fetch callback and argument while the base is locked */
		fn = timer->function;
		data = timer->data;
		set_curr_timer(base, timer);
		__remove_hrtimer(timer, base);
		spin_unlock_irq(&base->lock);

		/*
		 * fn == NULL is special case for the simplest timer
		 * variant - wake up process and do not restart:
		 */
		if (!fn) {
			wake_up_process(data);
			restart = HRTIMER_NORESTART;
		} else
			restart = fn(data);

		spin_lock_irq(&base->lock);

		/* Requeue on request of the callback, else mark expired */
		if (restart == HRTIMER_RESTART)
			enqueue_hrtimer(timer, base);
		else
			timer->state = HRTIMER_EXPIRED;
	}
	set_curr_timer(base, NULL);
	spin_unlock_irq(&base->lock);
}
576
577/*
578 * Called from timer softirq every jiffy, expire hrtimers:
579 */
580void hrtimer_run_queues(void)
581{
582 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
583 int i;
584
585 for (i = 0; i < MAX_HRTIMER_BASES; i++)
586 run_hrtimer_queue(&base[i]);
587}
588
589/*
590 * Sleep related functions:
591 */
592
593/**
594 * schedule_hrtimer - sleep until timeout
595 *
596 * @timer: hrtimer variable initialized with the correct clock base
597 * @mode: timeout value is abs/rel
598 *
599 * Make the current task sleep until @timeout is
600 * elapsed.
601 *
602 * You can set the task state as follows -
603 *
604 * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
605 * pass before the routine returns. The routine will return 0
606 *
607 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
608 * delivered to the current task. In this case the remaining time
609 * will be returned
610 *
611 * The current task state is guaranteed to be TASK_RUNNING when this
612 * routine returns.
613 */
614static ktime_t __sched
615schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
616{
617 /* fn stays NULL, meaning single-shot wakeup: */
618 timer->data = current;
619
620 hrtimer_start(timer, timer->expires, mode);
621
622 schedule();
623 hrtimer_cancel(timer);
624
625 /* Return the remaining time: */
626 if (timer->state != HRTIMER_EXPIRED)
627 return ktime_sub(timer->expires, timer->base->get_time());
628 else
629 return (ktime_t) {.tv64 = 0 };
630}
631
/*
 * Sleep on @timer in TASK_INTERRUPTIBLE state: the task state is set
 * first, then schedule_hrtimer() starts the timer and calls schedule().
 * Returns the value of schedule_hrtimer().
 */
static inline ktime_t __sched
schedule_hrtimer_interruptible(struct hrtimer *timer,
			       const enum hrtimer_mode mode)
{
	set_current_state(TASK_INTERRUPTIBLE);

	return schedule_hrtimer(timer, mode);
}
640
641static long __sched
642nanosleep_restart(struct restart_block *restart, clockid_t clockid)
643{
644 struct timespec __user *rmtp;
645 struct timespec tu;
646 void *rfn_save = restart->fn;
647 struct hrtimer timer;
648 ktime_t rem;
649
650 restart->fn = do_no_restart_syscall;
651
652 hrtimer_init(&timer, clockid);
653
654 timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
655
656 rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
657
658 if (rem.tv64 <= 0)
659 return 0;
660
661 rmtp = (struct timespec __user *) restart->arg2;
662 tu = ktime_to_timespec(rem);
663 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
664 return -EFAULT;
665
666 restart->fn = rfn_save;
667
668 /* The other values in restart are already filled in */
669 return -ERESTART_RESTARTBLOCK;
670}
671
/* Restart handler for sleeps on CLOCK_MONOTONIC, see nanosleep_restart() */
static long __sched nanosleep_restart_mono(struct restart_block *restart)
{
	return nanosleep_restart(restart, CLOCK_MONOTONIC);
}
676
/* Restart handler for sleeps on CLOCK_REALTIME, see nanosleep_restart() */
static long __sched nanosleep_restart_real(struct restart_block *restart)
{
	return nanosleep_restart(restart, CLOCK_REALTIME);
}
681
682long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
683 const enum hrtimer_mode mode, const clockid_t clockid)
684{
685 struct restart_block *restart;
686 struct hrtimer timer;
687 struct timespec tu;
688 ktime_t rem;
689
690 hrtimer_init(&timer, clockid);
691
692 timer.expires = timespec_to_ktime(*rqtp);
693
694 rem = schedule_hrtimer_interruptible(&timer, mode);
695 if (rem.tv64 <= 0)
696 return 0;
697
698 /* Absolute timers do not update the rmtp value: */
699 if (mode == HRTIMER_ABS)
700 return -ERESTARTNOHAND;
701
702 tu = ktime_to_timespec(rem);
703
704 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
705 return -EFAULT;
706
707 restart = &current_thread_info()->restart_block;
708 restart->fn = (clockid == CLOCK_MONOTONIC) ?
709 nanosleep_restart_mono : nanosleep_restart_real;
710 restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
711 restart->arg1 = timer.expires.tv64 >> 32;
712 restart->arg2 = (unsigned long) rmtp;
713
714 return -ERESTART_RESTARTBLOCK;
715}
716
717asmlinkage long
718sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
719{
720 struct timespec tu;
721
722 if (copy_from_user(&tu, rqtp, sizeof(tu)))
723 return -EFAULT;
724
725 if (!timespec_valid(&tu))
726 return -EINVAL;
727
728 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC);
729}
730
731/*
732 * Functions related to boot-time initialization:
733 */
734static void __devinit init_hrtimers_cpu(int cpu)
735{
736 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
737 int i;
738
739 for (i = 0; i < MAX_HRTIMER_BASES; i++) {
740 spin_lock_init(&base->lock);
741 base++;
742 }
743}
744
745#ifdef CONFIG_HOTPLUG_CPU
746
747static void migrate_hrtimer_list(struct hrtimer_base *old_base,
748 struct hrtimer_base *new_base)
749{
750 struct hrtimer *timer;
751 struct rb_node *node;
752
753 while ((node = rb_first(&old_base->active))) {
754 timer = rb_entry(node, struct hrtimer, node);
755 __remove_hrtimer(timer, old_base);
756 timer->base = new_base;
757 enqueue_hrtimer(timer, new_base);
758 }
759}
760
761static void migrate_hrtimers(int cpu)
762{
763 struct hrtimer_base *old_base, *new_base;
764 int i;
765
766 BUG_ON(cpu_online(cpu));
767 old_base = per_cpu(hrtimer_bases, cpu);
768 new_base = get_cpu_var(hrtimer_bases);
769
770 local_irq_disable();
771
772 for (i = 0; i < MAX_HRTIMER_BASES; i++) {
773
774 spin_lock(&new_base->lock);
775 spin_lock(&old_base->lock);
776
777 BUG_ON(old_base->curr_timer);
778
779 migrate_hrtimer_list(old_base, new_base);
780
781 spin_unlock(&old_base->lock);
782 spin_unlock(&new_base->lock);
783 old_base++;
784 new_base++;
785 }
786
787 local_irq_enable();
788 put_cpu_var(hrtimer_bases);
789}
790#endif /* CONFIG_HOTPLUG_CPU */
791
792static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
793 unsigned long action, void *hcpu)
794{
795 long cpu = (long)hcpu;
796
797 switch (action) {
798
799 case CPU_UP_PREPARE:
800 init_hrtimers_cpu(cpu);
801 break;
802
803#ifdef CONFIG_HOTPLUG_CPU
804 case CPU_DEAD:
805 migrate_hrtimers(cpu);
806 break;
807#endif
808
809 default:
810 break;
811 }
812
813 return NOTIFY_OK;
814}
815
/* Notifier block which hrtimers_init() registers as CPU notifier */
static struct notifier_block __devinitdata hrtimers_nb = {
	.notifier_call = hrtimer_cpu_notify,
};
819
820void __init hrtimers_init(void)
821{
822 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
823 (void *)(long)smp_processor_id());
824 register_cpu_notifier(&hrtimers_nb);
825}
826
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 81c49a4d679e..97d5559997d2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -366,6 +366,8 @@ int request_irq(unsigned int irq,
366 action->next = NULL; 366 action->next = NULL;
367 action->dev_id = dev_id; 367 action->dev_id = dev_id;
368 368
369 select_smp_affinity(irq);
370
369 retval = setup_irq(irq, action); 371 retval = setup_irq(irq, action);
370 if (retval) 372 if (retval)
371 kfree(action); 373 kfree(action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f26e534c6585..d03b5eef8ce0 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -10,6 +10,8 @@
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12 12
13#include "internals.h"
14
13static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; 15static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
14 16
15#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
@@ -68,7 +70,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
68 */ 70 */
69 cpus_and(tmp, new_value, cpu_online_map); 71 cpus_and(tmp, new_value, cpu_online_map);
70 if (cpus_empty(tmp)) 72 if (cpus_empty(tmp))
71 return -EINVAL; 73 /* Special case for empty set - allow the architecture
74 code to set default SMP affinity. */
75 return select_smp_affinity(irq) ? -EINVAL : full_count;
72 76
73 proc_set_irq_affinity(irq, new_value); 77 proc_set_irq_affinity(irq, new_value);
74 78
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 7c1b25e25e47..c2c05c4ff28d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -12,36 +12,46 @@
12#include <linux/syscalls.h> 12#include <linux/syscalls.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/posix-timers.h> 14#include <linux/posix-timers.h>
15#include <linux/hrtimer.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
18static unsigned long it_real_value(struct signal_struct *sig) 19/**
20 * itimer_get_remtime - get remaining time for the timer
21 *
22 * @timer: the timer to read
23 *
24 * Returns the delta between the expiry time and now, which can be
25 * less than zero or 1usec for an pending expired timer
26 */
27static struct timeval itimer_get_remtime(struct hrtimer *timer)
19{ 28{
20 unsigned long val = 0; 29 ktime_t rem = hrtimer_get_remaining(timer);
21 if (timer_pending(&sig->real_timer)) {
22 val = sig->real_timer.expires - jiffies;
23 30
24 /* look out for negative/zero itimer.. */ 31 /*
25 if ((long) val <= 0) 32 * Racy but safe: if the itimer expires after the above
26 val = 1; 33 * hrtimer_get_remtime() call but before this condition
27 } 34 * then we return 0 - which is correct.
28 return val; 35 */
36 if (hrtimer_active(timer)) {
37 if (rem.tv64 <= 0)
38 rem.tv64 = NSEC_PER_USEC;
39 } else
40 rem.tv64 = 0;
41
42 return ktime_to_timeval(rem);
29} 43}
30 44
31int do_getitimer(int which, struct itimerval *value) 45int do_getitimer(int which, struct itimerval *value)
32{ 46{
33 struct task_struct *tsk = current; 47 struct task_struct *tsk = current;
34 unsigned long interval, val;
35 cputime_t cinterval, cval; 48 cputime_t cinterval, cval;
36 49
37 switch (which) { 50 switch (which) {
38 case ITIMER_REAL: 51 case ITIMER_REAL:
39 spin_lock_irq(&tsk->sighand->siglock); 52 value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
40 interval = tsk->signal->it_real_incr; 53 value->it_interval =
41 val = it_real_value(tsk->signal); 54 ktime_to_timeval(tsk->signal->it_real_incr);
42 spin_unlock_irq(&tsk->sighand->siglock);
43 jiffies_to_timeval(val, &value->it_value);
44 jiffies_to_timeval(interval, &value->it_interval);
45 break; 55 break;
46 case ITIMER_VIRTUAL: 56 case ITIMER_VIRTUAL:
47 read_lock(&tasklist_lock); 57 read_lock(&tasklist_lock);
@@ -113,59 +123,45 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
113} 123}
114 124
115 125
116void it_real_fn(unsigned long __data) 126/*
127 * The timer is automagically restarted, when interval != 0
128 */
129int it_real_fn(void *data)
117{ 130{
118 struct task_struct * p = (struct task_struct *) __data; 131 struct task_struct *tsk = (struct task_struct *) data;
119 unsigned long inc = p->signal->it_real_incr;
120 132
121 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); 133 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk);
122 134
123 /* 135 if (tsk->signal->it_real_incr.tv64 != 0) {
124 * Now restart the timer if necessary. We don't need any locking 136 hrtimer_forward(&tsk->signal->real_timer,
125 * here because do_setitimer makes sure we have finished running 137 tsk->signal->it_real_incr);
126 * before it touches anything. 138
127 * Note, we KNOW we are (or should be) at a jiffie edge here so 139 return HRTIMER_RESTART;
128 * we don't need the +1 stuff. Also, we want to use the prior 140 }
129 * expire value so as to not "slip" a jiffie if we are late. 141 return HRTIMER_NORESTART;
130 * Deal with requesting a time prior to "now" here rather than
131 * in add_timer.
132 */
133 if (!inc)
134 return;
135 while (time_before_eq(p->signal->real_timer.expires, jiffies))
136 p->signal->real_timer.expires += inc;
137 add_timer(&p->signal->real_timer);
138} 142}
139 143
140int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) 144int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
141{ 145{
142 struct task_struct *tsk = current; 146 struct task_struct *tsk = current;
143 unsigned long val, interval, expires; 147 struct hrtimer *timer;
148 ktime_t expires;
144 cputime_t cval, cinterval, nval, ninterval; 149 cputime_t cval, cinterval, nval, ninterval;
145 150
146 switch (which) { 151 switch (which) {
147 case ITIMER_REAL: 152 case ITIMER_REAL:
148again: 153 timer = &tsk->signal->real_timer;
149 spin_lock_irq(&tsk->sighand->siglock); 154 hrtimer_cancel(timer);
150 interval = tsk->signal->it_real_incr;
151 val = it_real_value(tsk->signal);
152 /* We are sharing ->siglock with it_real_fn() */
153 if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) {
154 spin_unlock_irq(&tsk->sighand->siglock);
155 goto again;
156 }
157 tsk->signal->it_real_incr =
158 timeval_to_jiffies(&value->it_interval);
159 expires = timeval_to_jiffies(&value->it_value);
160 if (expires)
161 mod_timer(&tsk->signal->real_timer,
162 jiffies + 1 + expires);
163 spin_unlock_irq(&tsk->sighand->siglock);
164 if (ovalue) { 155 if (ovalue) {
165 jiffies_to_timeval(val, &ovalue->it_value); 156 ovalue->it_value = itimer_get_remtime(timer);
166 jiffies_to_timeval(interval, 157 ovalue->it_interval
167 &ovalue->it_interval); 158 = ktime_to_timeval(tsk->signal->it_real_incr);
168 } 159 }
160 tsk->signal->it_real_incr =
161 timeval_to_ktime(value->it_interval);
162 expires = timeval_to_ktime(value->it_value);
163 if (expires.tv64 != 0)
164 hrtimer_start(timer, expires, HRTIMER_REL);
169 break; 165 break;
170 case ITIMER_VIRTUAL: 166 case ITIMER_VIRTUAL:
171 nval = timeval_to_cputime(&value->it_value); 167 nval = timeval_to_cputime(&value->it_value);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2c95848fbce8..bf39d28e4c0e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,7 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#include <linux/capability.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/file.h> 11#include <linux/file.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
@@ -26,6 +27,9 @@
26#include <asm/system.h> 27#include <asm/system.h>
27#include <asm/semaphore.h> 28#include <asm/semaphore.h>
28 29
30/* Per cpu memory for storing cpu states in case of system crash. */
31note_buf_t* crash_notes;
32
29/* Location of the reserved area for the crash kernel */ 33/* Location of the reserved area for the crash kernel */
30struct resource crashk_res = { 34struct resource crashk_res = {
31 .name = "Crash kernel", 35 .name = "Crash kernel",
@@ -1054,9 +1058,24 @@ void crash_kexec(struct pt_regs *regs)
1054 if (!locked) { 1058 if (!locked) {
1055 image = xchg(&kexec_crash_image, NULL); 1059 image = xchg(&kexec_crash_image, NULL);
1056 if (image) { 1060 if (image) {
1057 machine_crash_shutdown(regs); 1061 struct pt_regs fixed_regs;
1062 crash_setup_regs(&fixed_regs, regs);
1063 machine_crash_shutdown(&fixed_regs);
1058 machine_kexec(image); 1064 machine_kexec(image);
1059 } 1065 }
1060 xchg(&kexec_lock, 0); 1066 xchg(&kexec_lock, 0);
1061 } 1067 }
1062} 1068}
1069
1070static int __init crash_notes_memory_init(void)
1071{
1072 /* Allocate memory for saving cpu registers. */
1073 crash_notes = alloc_percpu(note_buf_t);
1074 if (!crash_notes) {
1075 printk("Kexec: Memory allocation for saving cpu register"
1076 " states failed\n");
1077 return -ENOMEM;
1078 }
1079 return 0;
1080}
1081module_init(crash_notes_memory_init)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5beda378cc75..3ea6325228da 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,10 +48,11 @@
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */ 51DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
55/* 56/*
56 * kprobe->ainsn.insn points to the copy of the instruction to be 57 * kprobe->ainsn.insn points to the copy of the instruction to be
57 * single-stepped. x86_64, POWER4 and above have no-exec support and 58 * single-stepped. x86_64, POWER4 and above have no-exec support and
@@ -151,6 +152,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
151 } 152 }
152 } 153 }
153} 154}
155#endif
154 156
155/* We have preemption disabled.. so it is safe to use __ versions */ 157/* We have preemption disabled.. so it is safe to use __ versions */
156static inline void set_kprobe_instance(struct kprobe *kp) 158static inline void set_kprobe_instance(struct kprobe *kp)
@@ -165,7 +167,7 @@ static inline void reset_kprobe_instance(void)
165 167
166/* 168/*
167 * This routine is called either: 169 * This routine is called either:
168 * - under the kprobe_lock spinlock - during kprobe_[un]register() 170 * - under the kprobe_mutex - during kprobe_[un]register()
169 * OR 171 * OR
170 * - with preemption disabled - from arch/xxx/kernel/kprobes.c 172 * - with preemption disabled - from arch/xxx/kernel/kprobes.c
171 */ 173 */
@@ -246,6 +248,19 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
246 return ret; 248 return ret;
247} 249}
248 250
251/* Walks the list and increments nmissed count for multiprobe case */
252void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
253{
254 struct kprobe *kp;
255 if (p->pre_handler != aggr_pre_handler) {
256 p->nmissed++;
257 } else {
258 list_for_each_entry_rcu(kp, &p->list, list)
259 kp->nmissed++;
260 }
261 return;
262}
263
249/* Called with kretprobe_lock held */ 264/* Called with kretprobe_lock held */
250struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) 265struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
251{ 266{
@@ -399,16 +414,12 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
399 INIT_LIST_HEAD(&ap->list); 414 INIT_LIST_HEAD(&ap->list);
400 list_add_rcu(&p->list, &ap->list); 415 list_add_rcu(&p->list, &ap->list);
401 416
402 INIT_HLIST_NODE(&ap->hlist); 417 hlist_replace_rcu(&p->hlist, &ap->hlist);
403 hlist_del_rcu(&p->hlist);
404 hlist_add_head_rcu(&ap->hlist,
405 &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
406} 418}
407 419
408/* 420/*
409 * This is the second or subsequent kprobe at the address - handle 421 * This is the second or subsequent kprobe at the address - handle
410 * the intricacies 422 * the intricacies
411 * TODO: Move kcalloc outside the spin_lock
412 */ 423 */
413static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 424static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
414 struct kprobe *p) 425 struct kprobe *p)
@@ -420,7 +431,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
420 copy_kprobe(old_p, p); 431 copy_kprobe(old_p, p);
421 ret = add_new_kprobe(old_p, p); 432 ret = add_new_kprobe(old_p, p);
422 } else { 433 } else {
423 ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); 434 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
424 if (!ap) 435 if (!ap)
425 return -ENOMEM; 436 return -ENOMEM;
426 add_aggr_kprobe(ap, old_p); 437 add_aggr_kprobe(ap, old_p);
@@ -430,25 +441,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
430 return ret; 441 return ret;
431} 442}
432 443
433/* kprobe removal house-keeping routines */
434static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
435{
436 arch_disarm_kprobe(p);
437 hlist_del_rcu(&p->hlist);
438 spin_unlock_irqrestore(&kprobe_lock, flags);
439 arch_remove_kprobe(p);
440}
441
442static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
443 struct kprobe *p, unsigned long flags)
444{
445 list_del_rcu(&p->list);
446 if (list_empty(&old_p->list))
447 cleanup_kprobe(old_p, flags);
448 else
449 spin_unlock_irqrestore(&kprobe_lock, flags);
450}
451
452static int __kprobes in_kprobes_functions(unsigned long addr) 444static int __kprobes in_kprobes_functions(unsigned long addr)
453{ 445{
454 if (addr >= (unsigned long)__kprobes_text_start 446 if (addr >= (unsigned long)__kprobes_text_start
@@ -457,26 +449,44 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
457 return 0; 449 return 0;
458} 450}
459 451
460int __kprobes register_kprobe(struct kprobe *p) 452static int __kprobes __register_kprobe(struct kprobe *p,
453 unsigned long called_from)
461{ 454{
462 int ret = 0; 455 int ret = 0;
463 unsigned long flags = 0;
464 struct kprobe *old_p; 456 struct kprobe *old_p;
457 struct module *probed_mod;
465 458
466 if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) 459 if ((!kernel_text_address((unsigned long) p->addr)) ||
467 return ret; 460 in_kprobes_functions((unsigned long) p->addr))
468 if ((ret = arch_prepare_kprobe(p)) != 0) 461 return -EINVAL;
469 goto rm_kprobe; 462
463 p->mod_refcounted = 0;
464 /* Check whether we are probing a module */
465 if ((probed_mod = module_text_address((unsigned long) p->addr))) {
466 struct module *calling_mod = module_text_address(called_from);
467 /* We must allow modules to probe themselves and
468 * in this case avoid incrementing the module refcount,
469 * so as to allow unloading of self probing modules.
470 */
471 if (calling_mod && (calling_mod != probed_mod)) {
472 if (unlikely(!try_module_get(probed_mod)))
473 return -EINVAL;
474 p->mod_refcounted = 1;
475 } else
476 probed_mod = NULL;
477 }
470 478
471 p->nmissed = 0; 479 p->nmissed = 0;
472 spin_lock_irqsave(&kprobe_lock, flags); 480 down(&kprobe_mutex);
473 old_p = get_kprobe(p->addr); 481 old_p = get_kprobe(p->addr);
474 if (old_p) { 482 if (old_p) {
475 ret = register_aggr_kprobe(old_p, p); 483 ret = register_aggr_kprobe(old_p, p);
476 goto out; 484 goto out;
477 } 485 }
478 486
479 arch_copy_kprobe(p); 487 if ((ret = arch_prepare_kprobe(p)) != 0)
488 goto out;
489
480 INIT_HLIST_NODE(&p->hlist); 490 INIT_HLIST_NODE(&p->hlist);
481 hlist_add_head_rcu(&p->hlist, 491 hlist_add_head_rcu(&p->hlist,
482 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 492 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
@@ -484,33 +494,66 @@ int __kprobes register_kprobe(struct kprobe *p)
484 arch_arm_kprobe(p); 494 arch_arm_kprobe(p);
485 495
486out: 496out:
487 spin_unlock_irqrestore(&kprobe_lock, flags); 497 up(&kprobe_mutex);
488rm_kprobe: 498
489 if (ret == -EEXIST) 499 if (ret && probed_mod)
490 arch_remove_kprobe(p); 500 module_put(probed_mod);
491 return ret; 501 return ret;
492} 502}
493 503
504int __kprobes register_kprobe(struct kprobe *p)
505{
506 return __register_kprobe(p,
507 (unsigned long)__builtin_return_address(0));
508}
509
494void __kprobes unregister_kprobe(struct kprobe *p) 510void __kprobes unregister_kprobe(struct kprobe *p)
495{ 511{
496 unsigned long flags; 512 struct module *mod;
497 struct kprobe *old_p; 513 struct kprobe *old_p, *list_p;
514 int cleanup_p;
498 515
499 spin_lock_irqsave(&kprobe_lock, flags); 516 down(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 517 old_p = get_kprobe(p->addr);
501 if (old_p) { 518 if (unlikely(!old_p)) {
502 /* cleanup_*_kprobe() does the spin_unlock_irqrestore */ 519 up(&kprobe_mutex);
503 if (old_p->pre_handler == aggr_pre_handler) 520 return;
504 cleanup_aggr_kprobe(old_p, p, flags); 521 }
505 else 522 if (p != old_p) {
506 cleanup_kprobe(p, flags); 523 list_for_each_entry_rcu(list_p, &old_p->list, list)
507 524 if (list_p == p)
508 synchronize_sched(); 525 /* kprobe p is a valid probe */
509 if (old_p->pre_handler == aggr_pre_handler && 526 goto valid_p;
510 list_empty(&old_p->list)) 527 up(&kprobe_mutex);
528 return;
529 }
530valid_p:
531 if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
532 (p->list.next == &old_p->list) &&
533 (p->list.prev == &old_p->list))) {
534 /* Only probe on the hash list */
535 arch_disarm_kprobe(p);
536 hlist_del_rcu(&old_p->hlist);
537 cleanup_p = 1;
538 } else {
539 list_del_rcu(&p->list);
540 cleanup_p = 0;
541 }
542
543 up(&kprobe_mutex);
544
545 synchronize_sched();
546 if (p->mod_refcounted &&
547 (mod = module_text_address((unsigned long)p->addr)))
548 module_put(mod);
549
550 if (cleanup_p) {
551 if (p != old_p) {
552 list_del_rcu(&p->list);
511 kfree(old_p); 553 kfree(old_p);
512 } else 554 }
513 spin_unlock_irqrestore(&kprobe_lock, flags); 555 arch_remove_kprobe(p);
556 }
514} 557}
515 558
516static struct notifier_block kprobe_exceptions_nb = { 559static struct notifier_block kprobe_exceptions_nb = {
@@ -524,7 +567,8 @@ int __kprobes register_jprobe(struct jprobe *jp)
524 jp->kp.pre_handler = setjmp_pre_handler; 567 jp->kp.pre_handler = setjmp_pre_handler;
525 jp->kp.break_handler = longjmp_break_handler; 568 jp->kp.break_handler = longjmp_break_handler;
526 569
527 return register_kprobe(&jp->kp); 570 return __register_kprobe(&jp->kp,
571 (unsigned long)__builtin_return_address(0));
528} 572}
529 573
530void __kprobes unregister_jprobe(struct jprobe *jp) 574void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -564,7 +608,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
564 608
565 rp->nmissed = 0; 609 rp->nmissed = 0;
566 /* Establish function entry probe point */ 610 /* Establish function entry probe point */
567 if ((ret = register_kprobe(&rp->kp)) != 0) 611 if ((ret = __register_kprobe(&rp->kp,
612 (unsigned long)__builtin_return_address(0))) != 0)
568 free_rp_inst(rp); 613 free_rp_inst(rp);
569 return ret; 614 return ret;
570} 615}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 015fb69ad94d..d5eeae0fa5bc 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,6 +15,9 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/init.h> 16#include <linux/init.h>
17 17
18u64 uevent_seqnum;
19char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
20
18#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 22static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
20 23
@@ -23,21 +26,29 @@ static struct subsys_attribute _name##_attr = \
23 __ATTR(_name, 0644, _name##_show, _name##_store) 26 __ATTR(_name, 0644, _name##_show, _name##_store)
24 27
25#ifdef CONFIG_HOTPLUG 28#ifdef CONFIG_HOTPLUG
26static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) 29/* current uevent sequence number */
30static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
27{ 31{
28 return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); 32 return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
29} 33}
30KERNEL_ATTR_RO(hotplug_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
31#endif
32
33#ifdef CONFIG_KEXEC
34#include <asm/kexec.h>
35 35
36static ssize_t crash_notes_show(struct subsystem *subsys, char *page) 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
37{ 38{
38 return sprintf(page, "%p\n", (void *)crash_notes); 39 return sprintf(page, "%s\n", uevent_helper);
39} 40}
40KERNEL_ATTR_RO(crash_notes); 41static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
42{
43 if (count+1 > UEVENT_HELPER_PATH_LEN)
44 return -ENOENT;
45 memcpy(uevent_helper, page, count);
46 uevent_helper[count] = '\0';
47 if (count && uevent_helper[count-1] == '\n')
48 uevent_helper[count-1] = '\0';
49 return count;
50}
51KERNEL_ATTR_RW(uevent_helper);
41#endif 52#endif
42 53
43decl_subsys(kernel, NULL, NULL); 54decl_subsys(kernel, NULL, NULL);
@@ -45,10 +56,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys);
45 56
46static struct attribute * kernel_attrs[] = { 57static struct attribute * kernel_attrs[] = {
47#ifdef CONFIG_HOTPLUG 58#ifdef CONFIG_HOTPLUG
48 &hotplug_seqnum_attr.attr, 59 &uevent_seqnum_attr.attr,
49#endif 60 &uevent_helper_attr.attr,
50#ifdef CONFIG_KEXEC
51 &crash_notes_attr.attr,
52#endif 61#endif
53 NULL 62 NULL
54}; 63};
diff --git a/kernel/module.c b/kernel/module.c
index 2ea929d51ad0..618ed6e23ecc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -28,6 +28,7 @@
28#include <linux/syscalls.h> 28#include <linux/syscalls.h>
29#include <linux/fcntl.h> 29#include <linux/fcntl.h>
30#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
31#include <linux/capability.h>
31#include <linux/cpu.h> 32#include <linux/cpu.h>
32#include <linux/moduleparam.h> 33#include <linux/moduleparam.h>
33#include <linux/errno.h> 34#include <linux/errno.h>
@@ -496,15 +497,15 @@ static void module_unload_free(struct module *mod)
496} 497}
497 498
498#ifdef CONFIG_MODULE_FORCE_UNLOAD 499#ifdef CONFIG_MODULE_FORCE_UNLOAD
499static inline int try_force(unsigned int flags) 500static inline int try_force_unload(unsigned int flags)
500{ 501{
501 int ret = (flags & O_TRUNC); 502 int ret = (flags & O_TRUNC);
502 if (ret) 503 if (ret)
503 add_taint(TAINT_FORCED_MODULE); 504 add_taint(TAINT_FORCED_RMMOD);
504 return ret; 505 return ret;
505} 506}
506#else 507#else
507static inline int try_force(unsigned int flags) 508static inline int try_force_unload(unsigned int flags)
508{ 509{
509 return 0; 510 return 0;
510} 511}
@@ -524,7 +525,7 @@ static int __try_stop_module(void *_sref)
524 525
525 /* If it's not unused, quit unless we are told to block. */ 526 /* If it's not unused, quit unless we are told to block. */
526 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { 527 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
527 if (!(*sref->forced = try_force(sref->flags))) 528 if (!(*sref->forced = try_force_unload(sref->flags)))
528 return -EWOULDBLOCK; 529 return -EWOULDBLOCK;
529 } 530 }
530 531
@@ -609,7 +610,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
609 /* If it has an init func, it must have an exit func to unload */ 610 /* If it has an init func, it must have an exit func to unload */
610 if ((mod->init != NULL && mod->exit == NULL) 611 if ((mod->init != NULL && mod->exit == NULL)
611 || mod->unsafe) { 612 || mod->unsafe) {
612 forced = try_force(flags); 613 forced = try_force_unload(flags);
613 if (!forced) { 614 if (!forced) {
614 /* This module can't be removed */ 615 /* This module can't be removed */
615 ret = -EBUSY; 616 ret = -EBUSY;
@@ -958,7 +959,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
958 unsigned long ret; 959 unsigned long ret;
959 const unsigned long *crc; 960 const unsigned long *crc;
960 961
961 spin_lock_irq(&modlist_lock);
962 ret = __find_symbol(name, &owner, &crc, mod->license_gplok); 962 ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
963 if (ret) { 963 if (ret) {
964 /* use_module can fail due to OOM, or module unloading */ 964 /* use_module can fail due to OOM, or module unloading */
@@ -966,7 +966,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
966 !use_module(mod, owner)) 966 !use_module(mod, owner))
967 ret = 0; 967 ret = 0;
968 } 968 }
969 spin_unlock_irq(&modlist_lock);
970 return ret; 969 return ret;
971} 970}
972 971
@@ -1204,6 +1203,39 @@ void *__symbol_get(const char *symbol)
1204} 1203}
1205EXPORT_SYMBOL_GPL(__symbol_get); 1204EXPORT_SYMBOL_GPL(__symbol_get);
1206 1205
1206/*
1207 * Ensure that an exported symbol [global namespace] does not already exist
1208 * in the Kernel or in some other modules exported symbol table.
1209 */
1210static int verify_export_symbols(struct module *mod)
1211{
1212 const char *name = NULL;
1213 unsigned long i, ret = 0;
1214 struct module *owner;
1215 const unsigned long *crc;
1216
1217 for (i = 0; i < mod->num_syms; i++)
1218 if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) {
1219 name = mod->syms[i].name;
1220 ret = -ENOEXEC;
1221 goto dup;
1222 }
1223
1224 for (i = 0; i < mod->num_gpl_syms; i++)
1225 if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) {
1226 name = mod->gpl_syms[i].name;
1227 ret = -ENOEXEC;
1228 goto dup;
1229 }
1230
1231dup:
1232 if (ret)
1233 printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n",
1234 mod->name, name, module_name(owner));
1235
1236 return ret;
1237}
1238
1207/* Change all symbols so that sh_value encodes the pointer directly. */ 1239/* Change all symbols so that sh_value encodes the pointer directly. */
1208static int simplify_symbols(Elf_Shdr *sechdrs, 1240static int simplify_symbols(Elf_Shdr *sechdrs,
1209 unsigned int symindex, 1241 unsigned int symindex,
@@ -1715,6 +1747,11 @@ static struct module *load_module(void __user *umod,
1715 /* Set up license info based on the info section */ 1747 /* Set up license info based on the info section */
1716 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1748 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1717 1749
1750 if (strcmp(mod->name, "ndiswrapper") == 0)
1751 add_taint(TAINT_PROPRIETARY_MODULE);
1752 if (strcmp(mod->name, "driverloader") == 0)
1753 add_taint(TAINT_PROPRIETARY_MODULE);
1754
1718#ifdef CONFIG_MODULE_UNLOAD 1755#ifdef CONFIG_MODULE_UNLOAD
1719 /* Set up MODINFO_ATTR fields */ 1756 /* Set up MODINFO_ATTR fields */
1720 setup_modinfo(mod, sechdrs, infoindex); 1757 setup_modinfo(mod, sechdrs, infoindex);
@@ -1767,6 +1804,12 @@ static struct module *load_module(void __user *umod,
1767 goto cleanup; 1804 goto cleanup;
1768 } 1805 }
1769 1806
1807 /* Find duplicate symbols */
1808 err = verify_export_symbols(mod);
1809
1810 if (err < 0)
1811 goto cleanup;
1812
1770 /* Set up and sort exception table */ 1813 /* Set up and sort exception table */
1771 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); 1814 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
1772 mod->extable = extable = (void *)sechdrs[exindex].sh_addr; 1815 mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
@@ -1854,8 +1897,7 @@ static struct module *load_module(void __user *umod,
1854 kfree(args); 1897 kfree(args);
1855 free_hdr: 1898 free_hdr:
1856 vfree(hdr); 1899 vfree(hdr);
1857 if (err < 0) return ERR_PTR(err); 1900 return ERR_PTR(err);
1858 else return ptr;
1859 1901
1860 truncated: 1902 truncated:
1861 printk(KERN_ERR "Module len %lu truncated\n", len); 1903 printk(KERN_ERR "Module len %lu truncated\n", len);
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
new file mode 100644
index 000000000000..f4913c376950
--- /dev/null
+++ b/kernel/mutex-debug.c
@@ -0,0 +1,462 @@
1/*
2 * kernel/mutex-debug.c
3 *
4 * Debugging code for mutexes
5 *
6 * Started by Ingo Molnar:
7 *
8 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 *
10 * lock debugging, locking tree, deadlock detection started by:
11 *
12 * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
13 * Released under the General Public License (GPL).
14 */
15#include <linux/mutex.h>
16#include <linux/sched.h>
17#include <linux/delay.h>
18#include <linux/module.h>
19#include <linux/spinlock.h>
20#include <linux/kallsyms.h>
21#include <linux/interrupt.h>
22
23#include "mutex-debug.h"
24
25/*
26 * We need a global lock when we walk through the multi-process
27 * lock tree. Only used in the deadlock-debugging case.
28 */
29DEFINE_SPINLOCK(debug_mutex_lock);
30
31/*
32 * All locks held by all tasks, in a single global list:
33 */
34LIST_HEAD(debug_mutex_held_locks);
35
36/*
37 * In the debug case we carry the caller's instruction pointer into
38 * other functions, but we dont want the function argument overhead
39 * in the nondebug case - hence these macros:
40 */
41#define __IP_DECL__ , unsigned long ip
42#define __IP__ , ip
43#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
44
45/*
46 * "mutex debugging enabled" flag. We turn it off when we detect
47 * the first problem because we don't want to recurse back
48 * into the tracing code when doing error printk or
49 * executing a BUG():
50 */
51int debug_mutex_on = 1;
52
53static void printk_task(struct task_struct *p)
54{
55 if (p)
56 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
57 else
58 printk("<none>");
59}
60
61static void printk_ti(struct thread_info *ti)
62{
63 if (ti)
64 printk_task(ti->task);
65 else
66 printk("<none>");
67}
68
69static void printk_task_short(struct task_struct *p)
70{
71 if (p)
72 printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
73 else
74 printk("<none>");
75}
76
77static void printk_lock(struct mutex *lock, int print_owner)
78{
79 printk(" [%p] {%s}\n", lock, lock->name);
80
81 if (print_owner && lock->owner) {
82 printk(".. held by: ");
83 printk_ti(lock->owner);
84 printk("\n");
85 }
86 if (lock->owner) {
87 printk("... acquired at: ");
88 print_symbol("%s\n", lock->acquire_ip);
89 }
90}
91
92/*
93 * printk locks held by a task:
94 */
95static void show_task_locks(struct task_struct *p)
96{
97 switch (p->state) {
98 case TASK_RUNNING: printk("R"); break;
99 case TASK_INTERRUPTIBLE: printk("S"); break;
100 case TASK_UNINTERRUPTIBLE: printk("D"); break;
101 case TASK_STOPPED: printk("T"); break;
102 case EXIT_ZOMBIE: printk("Z"); break;
103 case EXIT_DEAD: printk("X"); break;
104 default: printk("?"); break;
105 }
106 printk_task(p);
107 if (p->blocked_on) {
108 struct mutex *lock = p->blocked_on->lock;
109
110 printk(" blocked on mutex:");
111 printk_lock(lock, 1);
112 } else
113 printk(" (not blocked on mutex)\n");
114}
115
116/*
117 * printk all locks held in the system (if filter == NULL),
118 * or all locks belonging to a single task (if filter != NULL):
119 */
120void show_held_locks(struct task_struct *filter)
121{
122 struct list_head *curr, *cursor = NULL;
123 struct mutex *lock;
124 struct thread_info *t;
125 unsigned long flags;
126 int count = 0;
127
128 if (filter) {
129 printk("------------------------------\n");
130 printk("| showing all locks held by: | (");
131 printk_task_short(filter);
132 printk("):\n");
133 printk("------------------------------\n");
134 } else {
135 printk("---------------------------\n");
136 printk("| showing all locks held: |\n");
137 printk("---------------------------\n");
138 }
139
140 /*
141 * Play safe and acquire the global trace lock. We
142 * cannot printk with that lock held so we iterate
143 * very carefully:
144 */
145next:
146 debug_spin_lock_save(&debug_mutex_lock, flags);
147 list_for_each(curr, &debug_mutex_held_locks) {
148 if (cursor && curr != cursor)
149 continue;
150 lock = list_entry(curr, struct mutex, held_list);
151 t = lock->owner;
152 if (filter && (t != filter->thread_info))
153 continue;
154 count++;
155 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags);
157
158 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1);
160 goto next;
161 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags);
163 printk("\n");
164}
165
166void mutex_debug_show_all_locks(void)
167{
168 struct task_struct *g, *p;
169 int count = 10;
170 int unlock = 1;
171
172 printk("\nShowing all blocking locks in the system:\n");
173
174 /*
175 * Here we try to get the tasklist_lock as hard as possible,
176 * if not successful after 2 seconds we ignore it (but keep
177 * trying). This is to enable a debug printout even if a
178 * tasklist_lock-holding task deadlocks or crashes.
179 */
180retry:
181 if (!read_trylock(&tasklist_lock)) {
182 if (count == 10)
183 printk("hm, tasklist_lock locked, retrying... ");
184 if (count) {
185 count--;
186 printk(" #%d", 10-count);
187 mdelay(200);
188 goto retry;
189 }
190 printk(" ignoring it.\n");
191 unlock = 0;
192 }
193 if (count != 10)
194 printk(" locked it.\n");
195
196 do_each_thread(g, p) {
197 show_task_locks(p);
198 if (!unlock)
199 if (read_trylock(&tasklist_lock))
200 unlock = 1;
201 } while_each_thread(g, p);
202
203 printk("\n");
204 show_held_locks(NULL);
205 printk("=============================================\n\n");
206
207 if (unlock)
208 read_unlock(&tasklist_lock);
209}
210
211static void report_deadlock(struct task_struct *task, struct mutex *lock,
212 struct mutex *lockblk, unsigned long ip)
213{
214 printk("\n%s/%d is trying to acquire this lock:\n",
215 current->comm, current->pid);
216 printk_lock(lock, 1);
217 printk("... trying at: ");
218 print_symbol("%s\n", ip);
219 show_held_locks(current);
220
221 if (lockblk) {
222 printk("but %s/%d is deadlocking current task %s/%d!\n\n",
223 task->comm, task->pid, current->comm, current->pid);
224 printk("\n%s/%d is blocked on this lock:\n",
225 task->comm, task->pid);
226 printk_lock(lockblk, 1);
227
228 show_held_locks(task);
229
230 printk("\n%s/%d's [blocked] stackdump:\n\n",
231 task->comm, task->pid);
232 show_stack(task, NULL);
233 }
234
235 printk("\n%s/%d's [current] stackdump:\n\n",
236 current->comm, current->pid);
237 dump_stack();
238 mutex_debug_show_all_locks();
239 printk("[ turning off deadlock detection. Please report this. ]\n\n");
240 local_irq_disable();
241}
242
243/*
244 * Recursively check for mutex deadlocks:
245 */
246static int check_deadlock(struct mutex *lock, int depth,
247 struct thread_info *ti, unsigned long ip)
248{
249 struct mutex *lockblk;
250 struct task_struct *task;
251
252 if (!debug_mutex_on)
253 return 0;
254
255 ti = lock->owner;
256 if (!ti)
257 return 0;
258
259 task = ti->task;
260 lockblk = NULL;
261 if (task->blocked_on)
262 lockblk = task->blocked_on->lock;
263
264 /* Self-deadlock: */
265 if (current == task) {
266 DEBUG_OFF();
267 if (depth)
268 return 1;
269 printk("\n==========================================\n");
270 printk( "[ BUG: lock recursion deadlock detected! |\n");
271 printk( "------------------------------------------\n");
272 report_deadlock(task, lock, NULL, ip);
273 return 0;
274 }
275
276 /* Ugh, something corrupted the lock data structure? */
277 if (depth > 20) {
278 DEBUG_OFF();
279 printk("\n===========================================\n");
280 printk( "[ BUG: infinite lock dependency detected!? |\n");
281 printk( "-------------------------------------------\n");
282 report_deadlock(task, lock, lockblk, ip);
283 return 0;
284 }
285
286 /* Recursively check for dependencies: */
287 if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) {
288 printk("\n============================================\n");
289 printk( "[ BUG: circular locking deadlock detected! ]\n");
290 printk( "--------------------------------------------\n");
291 report_deadlock(task, lock, lockblk, ip);
292 return 0;
293 }
294 return 0;
295}
296
297/*
298 * Called when a task exits, this function checks whether the
299 * task is holding any locks, and reports the first one if so:
300 */
301void mutex_debug_check_no_locks_held(struct task_struct *task)
302{
303 struct list_head *curr, *next;
304 struct thread_info *t;
305 unsigned long flags;
306 struct mutex *lock;
307
308 if (!debug_mutex_on)
309 return;
310
311 debug_spin_lock_save(&debug_mutex_lock, flags);
312 list_for_each_safe(curr, next, &debug_mutex_held_locks) {
313 lock = list_entry(curr, struct mutex, held_list);
314 t = lock->owner;
315 if (t != task->thread_info)
316 continue;
317 list_del_init(curr);
318 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags);
320
321 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid);
323 printk_lock(lock, 1);
324 if (lock->owner != task->thread_info)
325 printk("exiting task is not even the owner??\n");
326 return;
327 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags);
329}
330
331/*
332 * Called when kernel memory is freed (or unmapped), or if a mutex
333 * is destroyed or reinitialized - this code checks whether there is
334 * any held lock in the memory range of <from> to <to>:
335 */
336void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
337{
338 struct list_head *curr, *next;
339 const void *to = from + len;
340 unsigned long flags;
341 struct mutex *lock;
342 void *lock_addr;
343
344 if (!debug_mutex_on)
345 return;
346
347 debug_spin_lock_save(&debug_mutex_lock, flags);
348 list_for_each_safe(curr, next, &debug_mutex_held_locks) {
349 lock = list_entry(curr, struct mutex, held_list);
350 lock_addr = lock;
351 if (lock_addr < from || lock_addr >= to)
352 continue;
353 list_del_init(curr);
354 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags);
356
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to);
359 dump_stack();
360 printk_lock(lock, 1);
361 if (lock->owner != current_thread_info())
362 printk("freeing task is not even the owner??\n");
363 return;
364 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags);
366}
367
368/*
369 * Must be called with lock->wait_lock held.
370 */
371void debug_mutex_set_owner(struct mutex *lock,
372 struct thread_info *new_owner __IP_DECL__)
373{
374 lock->owner = new_owner;
375 DEBUG_WARN_ON(!list_empty(&lock->held_list));
376 if (debug_mutex_on) {
377 list_add_tail(&lock->held_list, &debug_mutex_held_locks);
378 lock->acquire_ip = ip;
379 }
380}
381
382void debug_mutex_init_waiter(struct mutex_waiter *waiter)
383{
384 memset(waiter, 0x11, sizeof(*waiter));
385 waiter->magic = waiter;
386 INIT_LIST_HEAD(&waiter->list);
387}
388
389void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
390{
391 SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
392 DEBUG_WARN_ON(list_empty(&lock->wait_list));
393 DEBUG_WARN_ON(waiter->magic != waiter);
394 DEBUG_WARN_ON(list_empty(&waiter->list));
395}
396
397void debug_mutex_free_waiter(struct mutex_waiter *waiter)
398{
399 DEBUG_WARN_ON(!list_empty(&waiter->list));
400 memset(waiter, 0x22, sizeof(*waiter));
401}
402
403void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
404 struct thread_info *ti __IP_DECL__)
405{
406 SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
407 check_deadlock(lock, 0, ti, ip);
408 /* Mark the current thread as blocked on the lock: */
409 ti->task->blocked_on = waiter;
410 waiter->lock = lock;
411}
412
413void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
414 struct thread_info *ti)
415{
416 DEBUG_WARN_ON(list_empty(&waiter->list));
417 DEBUG_WARN_ON(waiter->task != ti->task);
418 DEBUG_WARN_ON(ti->task->blocked_on != waiter);
419 ti->task->blocked_on = NULL;
420
421 list_del_init(&waiter->list);
422 waiter->task = NULL;
423}
424
425void debug_mutex_unlock(struct mutex *lock)
426{
427 DEBUG_WARN_ON(lock->magic != lock);
428 DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
429 DEBUG_WARN_ON(lock->owner != current_thread_info());
430 if (debug_mutex_on) {
431 DEBUG_WARN_ON(list_empty(&lock->held_list));
432 list_del_init(&lock->held_list);
433 }
434}
435
436void debug_mutex_init(struct mutex *lock, const char *name)
437{
438 /*
439 * Make sure we are not reinitializing a held lock:
440 */
441 mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock));
442 lock->owner = NULL;
443 INIT_LIST_HEAD(&lock->held_list);
444 lock->name = name;
445 lock->magic = lock;
446}
447
448/***
449 * mutex_destroy - mark a mutex unusable
450 * @lock: the mutex to be destroyed
451 *
452 * This function marks the mutex uninitialized, and any subsequent
453 * use of the mutex is forbidden. The mutex must not be locked when
454 * this function is called.
455 */
456void fastcall mutex_destroy(struct mutex *lock)
457{
458 DEBUG_WARN_ON(mutex_is_locked(lock));
459 lock->magic = NULL;
460}
461
462EXPORT_SYMBOL_GPL(mutex_destroy);
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
new file mode 100644
index 000000000000..fd384050acb1
--- /dev/null
+++ b/kernel/mutex-debug.h
@@ -0,0 +1,134 @@
1/*
2 * Mutexes: blocking mutual exclusion locks
3 *
4 * started by Ingo Molnar:
5 *
6 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 *
8 * This file contains mutex debugging related internal declarations,
9 * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
10 * More details are in kernel/mutex-debug.c.
11 */
12
13extern spinlock_t debug_mutex_lock;
14extern struct list_head debug_mutex_held_locks;
15extern int debug_mutex_on;
16
17/*
18 * In the debug case we carry the caller's instruction pointer into
19 * other functions, but we don't want the function argument overhead
20 * in the nondebug case - hence these macros:
21 */
22#define __IP_DECL__ , unsigned long ip
23#define __IP__ , ip
24#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
25
26/*
27 * This must be called with lock->wait_lock held.
28 */
29extern void debug_mutex_set_owner(struct mutex *lock,
30 struct thread_info *new_owner __IP_DECL__);
31
32static inline void debug_mutex_clear_owner(struct mutex *lock)
33{
34 lock->owner = NULL;
35}
36
37extern void debug_mutex_init_waiter(struct mutex_waiter *waiter);
38extern void debug_mutex_wake_waiter(struct mutex *lock,
39 struct mutex_waiter *waiter);
40extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
41extern void debug_mutex_add_waiter(struct mutex *lock,
42 struct mutex_waiter *waiter,
43 struct thread_info *ti __IP_DECL__);
44extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
45 struct thread_info *ti);
46extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name);
48
/*
 * Lock/unlock helpers for a debug spinlock.
 *
 * Local interrupts are always disabled/enabled (plain variants) or
 * saved/restored (_save/_restore variants). The spinlock itself is only
 * taken or released while debug_mutex_on is set. Both unlock variants
 * finish with preempt_check_resched().
 */
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55
56#define debug_spin_unlock(lock) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \
65 do { \
66 local_irq_save(flags); \
67 if (debug_mutex_on) \
68 spin_lock(lock); \
69 } while (0)
70
71#define debug_spin_lock_restore(lock, flags) \
72 do { \
73 if (debug_mutex_on) \
74 spin_unlock(lock); \
75 local_irq_restore(flags); \
76 preempt_check_resched(); \
77 } while (0)
78
/*
 * Debug variants of the wait_lock helpers. @lock must be the wait_lock
 * member of a struct mutex (container_of() is applied to it).
 *
 * Locking warns when called from interrupt context, takes the global
 * debug_mutex_lock first (via debug_spin_lock(), which also disables
 * local interrupts), then the wait_lock itself, and finally checks the
 * magic of the mutex. Unlocking releases in the reverse order.
 */
79#define spin_lock_mutex(lock) \
80 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \
83 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \
85 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \
87 } while (0)
88
89#define spin_unlock_mutex(lock) \
90 do { \
91 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \
93 } while (0)
94
/*
 * Switch mutex debugging off. Does nothing if it is off already;
 * otherwise clears debug_mutex_on, calls console_verbose() and releases
 * debug_mutex_lock if spin_is_locked() reports it as locked.
 */
95#define DEBUG_OFF() \
96do { \
97 if (debug_mutex_on) { \
98 debug_mutex_on = 0; \
99 console_verbose(); \
100 if (spin_is_locked(&debug_mutex_lock)) \
101 spin_unlock(&debug_mutex_lock); \
102 } \
103} while (0)
104
/*
 * If mutex debugging is still on: switch it off via DEBUG_OFF(), then
 * BUG(). A no-op once debugging has been turned off.
 */
105#define DEBUG_BUG() \
106do { \
107 if (debug_mutex_on) { \
108 DEBUG_OFF(); \
109 BUG(); \
110 } \
111} while (0)
112
/*
 * Warn (once) if condition @c holds while mutex debugging is enabled:
 * debugging is switched off via DEBUG_OFF() before WARN_ON(1) fires, so
 * later DEBUG_WARN_ON() invocations stay silent.
 *
 * @c is parenthesized so that a compound condition such as (a || b) is
 * combined with debug_mutex_on as a whole. @c is evaluated exactly once,
 * and before debug_mutex_on is tested.
 */
#define DEBUG_WARN_ON(c)				\
do {							\
	if (unlikely((c) && debug_mutex_on)) {		\
		DEBUG_OFF();				\
		WARN_ON(1);				\
	}						\
} while (0)
120
/*
 * Invoke DEBUG_BUG() if condition @c holds. The debug_mutex_on test is
 * done inside DEBUG_BUG(), i.e. after @c has been evaluated.
 */
121# define DEBUG_BUG_ON(c) \
122do { \
123 if (unlikely(c)) \
124 DEBUG_BUG(); \
125} while (0)
126
/*
 * SMP-only variants of the checks above: with CONFIG_SMP they map to
 * DEBUG_WARN_ON()/DEBUG_BUG_ON(), otherwise they expand to an empty
 * statement and the condition is not evaluated at all.
 */
127#ifdef CONFIG_SMP
128# define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c)
129# define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c)
130#else
131# define SMP_DEBUG_WARN_ON(c) do { } while (0)
132# define SMP_DEBUG_BUG_ON(c) do { } while (0)
133#endif
134
diff --git a/kernel/mutex.c b/kernel/mutex.c
new file mode 100644
index 000000000000..5449b210d9ed
--- /dev/null
+++ b/kernel/mutex.c
@@ -0,0 +1,315 @@
1/*
2 * kernel/mutex.c
3 *
4 * Mutexes: blocking mutual exclusion locks
5 *
6 * Started by Ingo Molnar:
7 *
8 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 *
10 * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
11 * David Howells for suggestions and improvements.
12 *
13 * Also see Documentation/mutex-design.txt.
14 */
15#include <linux/mutex.h>
16#include <linux/sched.h>
17#include <linux/module.h>
18#include <linux/spinlock.h>
19#include <linux/interrupt.h>
20
21/*
22 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
23 * which forces all calls into the slowpath:
24 */
25#ifdef CONFIG_DEBUG_MUTEXES
26# include "mutex-debug.h"
27# include <asm-generic/mutex-null.h>
28#else
29# include "mutex.h"
30# include <asm/mutex.h>
31#endif
32
33/***
34 * mutex_init - initialize the mutex
35 * @lock: the mutex to be initialized
36 *
37 * Initialize the mutex to unlocked state.
38 *
39 * It is not allowed to initialize an already locked mutex.
40 */
41void fastcall __mutex_init(struct mutex *lock, const char *name)
42{
43 atomic_set(&lock->count, 1);
44 spin_lock_init(&lock->wait_lock);
45 INIT_LIST_HEAD(&lock->wait_list);
46
47 debug_mutex_init(lock, name);
48}
49
50EXPORT_SYMBOL(__mutex_init);
51
52/*
53 * We split the mutex lock/unlock logic into separate fastpath and
54 * slowpath functions, to reduce the register pressure on the fastpath.
55 * We also put the fastpath first in the kernel image, to make sure the
56 * branch is predicted by the CPU as default-untaken.
57 */
58static void fastcall noinline __sched
59__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
60
61/***
62 * mutex_lock - acquire the mutex
63 * @lock: the mutex to be acquired
64 *
65 * Lock the mutex exclusively for this task. If the mutex is not
66 * available right now, it will sleep until it can get it.
67 *
68 * The mutex must later on be released by the same task that
69 * acquired it. Recursive locking is not allowed. The task
70 * may not exit without first unlocking the mutex. Also, kernel
71 * memory where the mutex resides mutex must not be freed with
72 * the mutex still locked. The mutex must first be initialized
73 * (or statically defined) before it can be locked. memset()-ing
74 * the mutex to 0 is not allowed.
75 *
76 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
77 * checks that will enforce the restrictions and will also do
78 * deadlock debugging. )
79 *
80 * This function is similar to (but not equivalent to) down().
81 */
82void fastcall __sched mutex_lock(struct mutex *lock)
83{
84 might_sleep();
85 /*
86 * The locking fastpath is the 1->0 transition from
87 * 'unlocked' into 'locked' state.
88 */
89 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
90}
91
92EXPORT_SYMBOL(mutex_lock);
93
94static void fastcall noinline __sched
95__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__);
96
97/***
98 * mutex_unlock - release the mutex
99 * @lock: the mutex to be released
100 *
101 * Unlock a mutex that has been locked by this task previously.
102 *
103 * This function must not be used in interrupt context. Unlocking
104 * of a not locked mutex is not allowed.
105 *
106 * This function is similar to (but not equivalent to) up().
107 */
108void fastcall __sched mutex_unlock(struct mutex *lock)
109{
110 /*
111 * The unlocking fastpath is the 0->1 transition from 'locked'
112 * into 'unlocked' state:
113 */
114 __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
115}
116
117EXPORT_SYMBOL(mutex_unlock);
118
119/*
120 * Lock a mutex (possibly interruptible), slowpath:
121 */
122static inline int __sched
123__mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
124{
125 struct task_struct *task = current;
126 struct mutex_waiter waiter;
127 unsigned int old_val;
128
129 debug_mutex_init_waiter(&waiter);
130
131 spin_lock_mutex(&lock->wait_lock);
132
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
134
135 /* add waiting tasks to the end of the waitqueue (FIFO): */
136 list_add_tail(&waiter.list, &lock->wait_list);
137 waiter.task = task;
138
139 for (;;) {
140 /*
141 * Lets try to take the lock again - this is needed even if
142 * we get here for the first time (shortly after failing to
143 * acquire the lock), to make sure that we get a wakeup once
144 * it's unlocked. Later on, if we sleep, this is the
145 * operation that gives us the lock. We xchg it to -1, so
146 * that when we release the lock, we properly wake up the
147 * other waiters:
148 */
149 old_val = atomic_xchg(&lock->count, -1);
150 if (old_val == 1)
151 break;
152
153 /*
154 * got a signal? (This code gets eliminated in the
155 * TASK_UNINTERRUPTIBLE case.)
156 */
157 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock);
161
162 debug_mutex_free_waiter(&waiter);
163 return -EINTR;
164 }
165 __set_task_state(task, state);
166
167 /* didnt get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock);
169 schedule();
170 spin_lock_mutex(&lock->wait_lock);
171 }
172
173 /* got the lock - rejoice! */
174 mutex_remove_waiter(lock, &waiter, task->thread_info);
175 debug_mutex_set_owner(lock, task->thread_info __IP__);
176
177 /* set it to 0 if there are no waiters left: */
178 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0);
180
181 spin_unlock_mutex(&lock->wait_lock);
182
183 debug_mutex_free_waiter(&waiter);
184
185 DEBUG_WARN_ON(list_empty(&lock->held_list));
186 DEBUG_WARN_ON(lock->owner != task->thread_info);
187
188 return 0;
189}
190
191static void fastcall noinline __sched
192__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__)
193{
194 struct mutex *lock = container_of(lock_count, struct mutex, count);
195
196 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__);
197}
198
199/*
200 * Release the lock, slowpath:
201 */
202static fastcall noinline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
204{
205 struct mutex *lock = container_of(lock_count, struct mutex, count);
206
207 DEBUG_WARN_ON(lock->owner != current_thread_info());
208
209 spin_lock_mutex(&lock->wait_lock);
210
211 /*
212 * some architectures leave the lock unlocked in the fastpath failure
213 * case, others need to leave it locked. In the later case we have to
214 * unlock it here
215 */
216 if (__mutex_slowpath_needs_to_unlock())
217 atomic_set(&lock->count, 1);
218
219 debug_mutex_unlock(lock);
220
221 if (!list_empty(&lock->wait_list)) {
222 /* get the first entry from the wait-list: */
223 struct mutex_waiter *waiter =
224 list_entry(lock->wait_list.next,
225 struct mutex_waiter, list);
226
227 debug_mutex_wake_waiter(lock, waiter);
228
229 wake_up_process(waiter->task);
230 }
231
232 debug_mutex_clear_owner(lock);
233
234 spin_unlock_mutex(&lock->wait_lock);
235}
236
237/*
238 * Here come the less common (and hence less performance-critical) APIs:
239 * mutex_lock_interruptible() and mutex_trylock().
240 */
241static int fastcall noinline __sched
242__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__);
243
244/***
245 * mutex_lock_interruptible - acquire the mutex, interruptable
246 * @lock: the mutex to be acquired
247 *
248 * Lock the mutex like mutex_lock(), and return 0 if the mutex has
249 * been acquired or sleep until the mutex becomes available. If a
250 * signal arrives while waiting for the lock then this function
251 * returns -EINTR.
252 *
253 * This function is similar to (but not equivalent to) down_interruptible().
254 */
255int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
256{
257 might_sleep();
258 return __mutex_fastpath_lock_retval
259 (&lock->count, __mutex_lock_interruptible_slowpath);
260}
261
262EXPORT_SYMBOL(mutex_lock_interruptible);
263
264static int fastcall noinline __sched
265__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
266{
267 struct mutex *lock = container_of(lock_count, struct mutex, count);
268
269 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__);
270}
271
272/*
273 * Spinlock based trylock, we take the spinlock and check whether we
274 * can get the lock:
275 */
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{
278 struct mutex *lock = container_of(lock_count, struct mutex, count);
279 int prev;
280
281 spin_lock_mutex(&lock->wait_lock);
282
283 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1))
285 debug_mutex_set_owner(lock, current_thread_info() __RET_IP__);
286 /* Set it back to 0 if there are no waiters: */
287 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0);
289
290 spin_unlock_mutex(&lock->wait_lock);
291
292 return prev == 1;
293}
294
295/***
296 * mutex_trylock - try acquire the mutex, without waiting
297 * @lock: the mutex to be acquired
298 *
299 * Try to acquire the mutex atomically. Returns 1 if the mutex
300 * has been acquired successfully, and 0 on contention.
301 *
302 * NOTE: this function follows the spin_trylock() convention, so
303 * it is negated to the down_trylock() return values! Be careful
304 * about this when converting semaphore users to mutexes.
305 *
306 * This function must not be used in interrupt context. The
307 * mutex must be released by the same task that acquired it.
308 */
309int fastcall mutex_trylock(struct mutex *lock)
310{
311 return __mutex_fastpath_trylock(&lock->count,
312 __mutex_trylock_slowpath);
313}
314
315EXPORT_SYMBOL(mutex_trylock);
diff --git a/kernel/mutex.h b/kernel/mutex.h
new file mode 100644
index 000000000000..00fe84e7b672
--- /dev/null
+++ b/kernel/mutex.h
@@ -0,0 +1,35 @@
1/*
2 * Mutexes: blocking mutual exclusion locks
3 *
4 * started by Ingo Molnar:
5 *
6 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 *
8 * This file contains mutex debugging related internal prototypes, for the
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */
11
/*
 * Non-debug mappings: the wait_lock helpers are plain spin_lock() and
 * spin_unlock(), mutex_remove_waiter() only unlinks the waiter's list
 * entry via __list_del() (its lock and ti arguments are unused), and
 * all debug_mutex_*() hooks as well as DEBUG_WARN_ON() expand to an
 * empty statement, so their arguments are never evaluated.
 */
12#define spin_lock_mutex(lock) spin_lock(lock)
13#define spin_unlock_mutex(lock) spin_unlock(lock)
14#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next)
16
17#define DEBUG_WARN_ON(c) do { } while (0)
18#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
19#define debug_mutex_clear_owner(lock) do { } while (0)
20#define debug_mutex_init_waiter(waiter) do { } while (0)
21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
22#define debug_mutex_free_waiter(waiter) do { } while (0)
23#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0)
24#define debug_mutex_unlock(lock) do { } while (0)
25#define debug_mutex_init(lock, name) do { } while (0)
26
27/*
28 * Return-address parameters/declarations. They are very useful for
29 * debugging, but add overhead in the !DEBUG case - so we go to the
30 * trouble of using this not too elegant but zero-cost solution:
31 */
32#define __IP_DECL__
33#define __IP__
34#define __RET_IP__
35
diff --git a/kernel/panic.c b/kernel/panic.c
index aabc5f86fa3f..c5c4ab255834 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...)
60 long i; 60 long i;
61 static char buf[1024]; 61 static char buf[1024];
62 va_list args; 62 va_list args;
63#if defined(CONFIG_ARCH_S390) 63#if defined(CONFIG_S390)
64 unsigned long caller = (unsigned long) __builtin_return_address(0); 64 unsigned long caller = (unsigned long) __builtin_return_address(0);
65#endif 65#endif
66 66
@@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...)
125 printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); 125 printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
126 } 126 }
127#endif 127#endif
128#if defined(CONFIG_ARCH_S390) 128#if defined(CONFIG_S390)
129 disabled_wait(caller); 129 disabled_wait(caller);
130#endif 130#endif
131 local_irq_enable(); 131 local_irq_enable();
diff --git a/kernel/params.c b/kernel/params.c
index 47ba69547945..c76ad25e6a21 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -619,7 +619,7 @@ static void __init param_sysfs_builtin(void)
619 619
620 620
621/* module-related sysfs stuff */ 621/* module-related sysfs stuff */
622#ifdef CONFIG_MODULES 622#ifdef CONFIG_SYSFS
623 623
624#define to_module_attr(n) container_of(n, struct module_attribute, attr); 624#define to_module_attr(n) container_of(n, struct module_attribute, attr);
625#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 625#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
diff --git a/kernel/pid.c b/kernel/pid.c
index edba31c681ac..1acc07246991 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
136 struct hlist_node *elem; 136 struct hlist_node *elem;
137 struct pid *pid; 137 struct pid *pid;
138 138
139 hlist_for_each_entry(pid, elem, 139 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 140 &pid_hash[type][pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 141 if (pid->nr == nr)
142 return pid; 142 return pid;
@@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
150 150
151 task_pid = &task->pids[type]; 151 task_pid = &task->pids[type];
152 pid = find_pid(type, nr); 152 pid = find_pid(type, nr);
153 task_pid->nr = nr;
153 if (pid == NULL) { 154 if (pid == NULL) {
154 hlist_add_head(&task_pid->pid_chain,
155 &pid_hash[type][pid_hashfn(nr)]);
156 INIT_LIST_HEAD(&task_pid->pid_list); 155 INIT_LIST_HEAD(&task_pid->pid_list);
156 hlist_add_head_rcu(&task_pid->pid_chain,
157 &pid_hash[type][pid_hashfn(nr)]);
157 } else { 158 } else {
158 INIT_HLIST_NODE(&task_pid->pid_chain); 159 INIT_HLIST_NODE(&task_pid->pid_chain);
159 list_add_tail(&task_pid->pid_list, &pid->pid_list); 160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
160 } 161 }
161 task_pid->nr = nr;
162 162
163 return 0; 163 return 0;
164} 164}
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type)
170 170
171 pid = &task->pids[type]; 171 pid = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) { 172 if (!hlist_unhashed(&pid->pid_chain)) {
173 hlist_del(&pid->pid_chain);
174 173
175 if (list_empty(&pid->pid_list)) 174 if (list_empty(&pid->pid_list)) {
176 nr = pid->nr; 175 nr = pid->nr;
177 else { 176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next, 178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list); 179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */ 180 /* insert next pid from pid_list to hash */
181 hlist_add_head(&pid_next->pid_chain, 181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_hash[type][pid_hashfn(pid_next->nr)]); 182 &pid_next->pid_chain);
183 } 183 }
184 } 184 }
185 185
186 list_del(&pid->pid_list); 186 list_del_rcu(&pid->pid_list);
187 pid->nr = 0; 187 pid->nr = 0;
188 188
189 return nr; 189 return nr;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index cae4f5728997..520f6c59948d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,7 +7,7 @@
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9 9
10static int check_clock(clockid_t which_clock) 10static int check_clock(const clockid_t which_clock)
11{ 11{
12 int error = 0; 12 int error = 0;
13 struct task_struct *p; 13 struct task_struct *p;
@@ -31,7 +31,7 @@ static int check_clock(clockid_t which_clock)
31} 31}
32 32
33static inline union cpu_time_count 33static inline union cpu_time_count
34timespec_to_sample(clockid_t which_clock, const struct timespec *tp) 34timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
35{ 35{
36 union cpu_time_count ret; 36 union cpu_time_count ret;
37 ret.sched = 0; /* high half always zero when .cpu used */ 37 ret.sched = 0; /* high half always zero when .cpu used */
@@ -43,7 +43,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
43 return ret; 43 return ret;
44} 44}
45 45
46static void sample_to_timespec(clockid_t which_clock, 46static void sample_to_timespec(const clockid_t which_clock,
47 union cpu_time_count cpu, 47 union cpu_time_count cpu,
48 struct timespec *tp) 48 struct timespec *tp)
49{ 49{
@@ -55,7 +55,7 @@ static void sample_to_timespec(clockid_t which_clock,
55 } 55 }
56} 56}
57 57
58static inline int cpu_time_before(clockid_t which_clock, 58static inline int cpu_time_before(const clockid_t which_clock,
59 union cpu_time_count now, 59 union cpu_time_count now,
60 union cpu_time_count then) 60 union cpu_time_count then)
61{ 61{
@@ -65,7 +65,7 @@ static inline int cpu_time_before(clockid_t which_clock,
65 return cputime_lt(now.cpu, then.cpu); 65 return cputime_lt(now.cpu, then.cpu);
66 } 66 }
67} 67}
68static inline void cpu_time_add(clockid_t which_clock, 68static inline void cpu_time_add(const clockid_t which_clock,
69 union cpu_time_count *acc, 69 union cpu_time_count *acc,
70 union cpu_time_count val) 70 union cpu_time_count val)
71{ 71{
@@ -75,7 +75,7 @@ static inline void cpu_time_add(clockid_t which_clock,
75 acc->cpu = cputime_add(acc->cpu, val.cpu); 75 acc->cpu = cputime_add(acc->cpu, val.cpu);
76 } 76 }
77} 77}
78static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, 78static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
79 union cpu_time_count a, 79 union cpu_time_count a,
80 union cpu_time_count b) 80 union cpu_time_count b)
81{ 81{
@@ -151,7 +151,7 @@ static inline unsigned long long sched_ns(struct task_struct *p)
151 return (p == current) ? current_sched_time(p) : p->sched_time; 151 return (p == current) ? current_sched_time(p) : p->sched_time;
152} 152}
153 153
154int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) 154int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
155{ 155{
156 int error = check_clock(which_clock); 156 int error = check_clock(which_clock);
157 if (!error) { 157 if (!error) {
@@ -169,7 +169,7 @@ int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
169 return error; 169 return error;
170} 170}
171 171
172int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) 172int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
173{ 173{
174 /* 174 /*
175 * You can never reset a CPU clock, but we check for other errors 175 * You can never reset a CPU clock, but we check for other errors
@@ -186,7 +186,7 @@ int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
186/* 186/*
187 * Sample a per-thread clock for the given task. 187 * Sample a per-thread clock for the given task.
188 */ 188 */
189static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, 189static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
190 union cpu_time_count *cpu) 190 union cpu_time_count *cpu)
191{ 191{
192 switch (CPUCLOCK_WHICH(which_clock)) { 192 switch (CPUCLOCK_WHICH(which_clock)) {
@@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
238 while ((t = next_thread(t)) != p) { 238 while ((t = next_thread(t)) != p) {
239 cpu->sched += t->sched_time; 239 cpu->sched += t->sched_time;
240 } 240 }
241 if (p->tgid == current->tgid) { 241 cpu->sched += sched_ns(p);
242 /*
243 * We're sampling ourselves, so include the
244 * cycles not yet banked. We still omit
245 * other threads running on other CPUs,
246 * so the total can always be behind as
247 * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
248 */
249 cpu->sched += current_sched_time(current);
250 } else {
251 cpu->sched += p->sched_time;
252 }
253 break; 242 break;
254 } 243 }
255 return 0; 244 return 0;
@@ -259,7 +248,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
259 * Sample a process (thread group) clock for the given group_leader task. 248 * Sample a process (thread group) clock for the given group_leader task.
260 * Must be called with tasklist_lock held for reading. 249 * Must be called with tasklist_lock held for reading.
261 */ 250 */
262static int cpu_clock_sample_group(clockid_t which_clock, 251static int cpu_clock_sample_group(const clockid_t which_clock,
263 struct task_struct *p, 252 struct task_struct *p,
264 union cpu_time_count *cpu) 253 union cpu_time_count *cpu)
265{ 254{
@@ -273,7 +262,7 @@ static int cpu_clock_sample_group(clockid_t which_clock,
273} 262}
274 263
275 264
276int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) 265int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
277{ 266{
278 const pid_t pid = CPUCLOCK_PID(which_clock); 267 const pid_t pid = CPUCLOCK_PID(which_clock);
279 int error = -EINVAL; 268 int error = -EINVAL;
@@ -1410,8 +1399,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1410 1399
1411static long posix_cpu_clock_nanosleep_restart(struct restart_block *); 1400static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
1412 1401
1413int posix_cpu_nsleep(clockid_t which_clock, int flags, 1402int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1414 struct timespec *rqtp) 1403 struct timespec *rqtp, struct timespec __user *rmtp)
1415{ 1404{
1416 struct restart_block *restart_block = 1405 struct restart_block *restart_block =
1417 &current_thread_info()->restart_block; 1406 &current_thread_info()->restart_block;
@@ -1436,7 +1425,6 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
1436 error = posix_cpu_timer_create(&timer); 1425 error = posix_cpu_timer_create(&timer);
1437 timer.it_process = current; 1426 timer.it_process = current;
1438 if (!error) { 1427 if (!error) {
1439 struct timespec __user *rmtp;
1440 static struct itimerspec zero_it; 1428 static struct itimerspec zero_it;
1441 struct itimerspec it = { .it_value = *rqtp, 1429 struct itimerspec it = { .it_value = *rqtp,
1442 .it_interval = {} }; 1430 .it_interval = {} };
@@ -1483,7 +1471,6 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
1483 /* 1471 /*
1484 * Report back to the user the time still remaining. 1472 * Report back to the user the time still remaining.
1485 */ 1473 */
1486 rmtp = (struct timespec __user *) restart_block->arg1;
1487 if (rmtp != NULL && !(flags & TIMER_ABSTIME) && 1474 if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
1488 copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1475 copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1489 return -EFAULT; 1476 return -EFAULT;
@@ -1491,6 +1478,7 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
1491 restart_block->fn = posix_cpu_clock_nanosleep_restart; 1478 restart_block->fn = posix_cpu_clock_nanosleep_restart;
1492 /* Caller already set restart_block->arg1 */ 1479 /* Caller already set restart_block->arg1 */
1493 restart_block->arg0 = which_clock; 1480 restart_block->arg0 = which_clock;
1481 restart_block->arg1 = (unsigned long) rmtp;
1494 restart_block->arg2 = rqtp->tv_sec; 1482 restart_block->arg2 = rqtp->tv_sec;
1495 restart_block->arg3 = rqtp->tv_nsec; 1483 restart_block->arg3 = rqtp->tv_nsec;
1496 1484
@@ -1504,21 +1492,28 @@ static long
1504posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) 1492posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
1505{ 1493{
1506 clockid_t which_clock = restart_block->arg0; 1494 clockid_t which_clock = restart_block->arg0;
1507 struct timespec t = { .tv_sec = restart_block->arg2, 1495 struct timespec __user *rmtp;
1508 .tv_nsec = restart_block->arg3 }; 1496 struct timespec t;
1497
1498 rmtp = (struct timespec __user *) restart_block->arg1;
1499 t.tv_sec = restart_block->arg2;
1500 t.tv_nsec = restart_block->arg3;
1501
1509 restart_block->fn = do_no_restart_syscall; 1502 restart_block->fn = do_no_restart_syscall;
1510 return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); 1503 return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
1511} 1504}
1512 1505
1513 1506
1514#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1507#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1515#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1508#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1516 1509
1517static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) 1510static int process_cpu_clock_getres(const clockid_t which_clock,
1511 struct timespec *tp)
1518{ 1512{
1519 return posix_cpu_clock_getres(PROCESS_CLOCK, tp); 1513 return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1520} 1514}
1521static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) 1515static int process_cpu_clock_get(const clockid_t which_clock,
1516 struct timespec *tp)
1522{ 1517{
1523 return posix_cpu_clock_get(PROCESS_CLOCK, tp); 1518 return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1524} 1519}
@@ -1527,16 +1522,19 @@ static int process_cpu_timer_create(struct k_itimer *timer)
1527 timer->it_clock = PROCESS_CLOCK; 1522 timer->it_clock = PROCESS_CLOCK;
1528 return posix_cpu_timer_create(timer); 1523 return posix_cpu_timer_create(timer);
1529} 1524}
1530static int process_cpu_nsleep(clockid_t which_clock, int flags, 1525static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1531 struct timespec *rqtp) 1526 struct timespec *rqtp,
1527 struct timespec __user *rmtp)
1532{ 1528{
1533 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); 1529 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1534} 1530}
1535static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) 1531static int thread_cpu_clock_getres(const clockid_t which_clock,
1532 struct timespec *tp)
1536{ 1533{
1537 return posix_cpu_clock_getres(THREAD_CLOCK, tp); 1534 return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1538} 1535}
1539static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) 1536static int thread_cpu_clock_get(const clockid_t which_clock,
1537 struct timespec *tp)
1540{ 1538{
1541 return posix_cpu_clock_get(THREAD_CLOCK, tp); 1539 return posix_cpu_clock_get(THREAD_CLOCK, tp);
1542} 1540}
@@ -1545,8 +1543,8 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1545 timer->it_clock = THREAD_CLOCK; 1543 timer->it_clock = THREAD_CLOCK;
1546 return posix_cpu_timer_create(timer); 1544 return posix_cpu_timer_create(timer);
1547} 1545}
1548static int thread_cpu_nsleep(clockid_t which_clock, int flags, 1546static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
1549 struct timespec *rqtp) 1547 struct timespec *rqtp, struct timespec __user *rmtp)
1550{ 1548{
1551 return -EINVAL; 1549 return -EINVAL;
1552} 1550}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5870efb3e200..197208b3aa2a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,21 +48,6 @@
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/module.h> 49#include <linux/module.h>
50 50
51#ifndef div_long_long_rem
52#include <asm/div64.h>
53
54#define div_long_long_rem(dividend,divisor,remainder) ({ \
55 u64 result = dividend; \
56 *remainder = do_div(result,divisor); \
57 result; })
58
59#endif
60#define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */
61
62static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2)
63{
64 return (u64)mpy1 * mpy2;
65}
66/* 51/*
67 * Management arrays for POSIX timers. Timers are kept in slab memory 52 * Management arrays for POSIX timers. Timers are kept in slab memory
68 * Timer ids are allocated by an external routine that keeps track of the 53 * Timer ids are allocated by an external routine that keeps track of the
@@ -148,18 +133,18 @@ static DEFINE_SPINLOCK(idr_lock);
148 */ 133 */
149 134
150static struct k_clock posix_clocks[MAX_CLOCKS]; 135static struct k_clock posix_clocks[MAX_CLOCKS];
136
151/* 137/*
152 * We only have one real clock that can be set so we need only one abs list, 138 * These ones are defined below.
153 * even if we should want to have several clocks with differing resolutions.
154 */ 139 */
155static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), 140static int common_nsleep(const clockid_t, int flags, struct timespec *t,
156 .lock = SPIN_LOCK_UNLOCKED}; 141 struct timespec __user *rmtp);
142static void common_timer_get(struct k_itimer *, struct itimerspec *);
143static int common_timer_set(struct k_itimer *, int,
144 struct itimerspec *, struct itimerspec *);
145static int common_timer_del(struct k_itimer *timer);
157 146
158static void posix_timer_fn(unsigned long); 147static int posix_timer_fn(void *data);
159static u64 do_posix_clock_monotonic_gettime_parts(
160 struct timespec *tp, struct timespec *mo);
161int do_posix_clock_monotonic_gettime(struct timespec *tp);
162static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp);
163 148
164static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 149static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
165 150
@@ -184,7 +169,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
184 * the function pointer CALL in struct k_clock. 169 * the function pointer CALL in struct k_clock.
185 */ 170 */
186 171
187static inline int common_clock_getres(clockid_t which_clock, 172static inline int common_clock_getres(const clockid_t which_clock,
188 struct timespec *tp) 173 struct timespec *tp)
189{ 174{
190 tp->tv_sec = 0; 175 tp->tv_sec = 0;
@@ -192,39 +177,33 @@ static inline int common_clock_getres(clockid_t which_clock,
192 return 0; 177 return 0;
193} 178}
194 179
195static inline int common_clock_get(clockid_t which_clock, struct timespec *tp) 180/*
181 * Get real time for posix timers
182 */
183static int common_clock_get(clockid_t which_clock, struct timespec *tp)
196{ 184{
197 getnstimeofday(tp); 185 ktime_get_real_ts(tp);
198 return 0; 186 return 0;
199} 187}
200 188
201static inline int common_clock_set(clockid_t which_clock, struct timespec *tp) 189static inline int common_clock_set(const clockid_t which_clock,
190 struct timespec *tp)
202{ 191{
203 return do_sys_settimeofday(tp, NULL); 192 return do_sys_settimeofday(tp, NULL);
204} 193}
205 194
206static inline int common_timer_create(struct k_itimer *new_timer) 195static int common_timer_create(struct k_itimer *new_timer)
207{ 196{
208 INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); 197 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock);
209 init_timer(&new_timer->it.real.timer); 198 new_timer->it.real.timer.data = new_timer;
210 new_timer->it.real.timer.data = (unsigned long) new_timer;
211 new_timer->it.real.timer.function = posix_timer_fn; 199 new_timer->it.real.timer.function = posix_timer_fn;
212 return 0; 200 return 0;
213} 201}
214 202
215/* 203/*
216 * These ones are defined below. 204 * Return nonzero if we know a priori this clockid_t value is bogus.
217 */
218static int common_nsleep(clockid_t, int flags, struct timespec *t);
219static void common_timer_get(struct k_itimer *, struct itimerspec *);
220static int common_timer_set(struct k_itimer *, int,
221 struct itimerspec *, struct itimerspec *);
222static int common_timer_del(struct k_itimer *timer);
223
224/*
225 * Return nonzero iff we know a priori this clockid_t value is bogus.
226 */ 205 */
227static inline int invalid_clockid(clockid_t which_clock) 206static inline int invalid_clockid(const clockid_t which_clock)
228{ 207{
229 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 208 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
230 return 0; 209 return 0;
@@ -232,26 +211,32 @@ static inline int invalid_clockid(clockid_t which_clock)
232 return 1; 211 return 1;
233 if (posix_clocks[which_clock].clock_getres != NULL) 212 if (posix_clocks[which_clock].clock_getres != NULL)
234 return 0; 213 return 0;
235#ifndef CLOCK_DISPATCH_DIRECT
236 if (posix_clocks[which_clock].res != 0) 214 if (posix_clocks[which_clock].res != 0)
237 return 0; 215 return 0;
238#endif
239 return 1; 216 return 1;
240} 217}
241 218
219/*
220 * Get monotonic time for posix timers
221 */
222static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
223{
224 ktime_get_ts(tp);
225 return 0;
226}
242 227
243/* 228/*
244 * Initialize everything, well, just everything in Posix clocks/timers ;) 229 * Initialize everything, well, just everything in Posix clocks/timers ;)
245 */ 230 */
246static __init int init_posix_timers(void) 231static __init int init_posix_timers(void)
247{ 232{
248 struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES, 233 struct k_clock clock_realtime = {
249 .abs_struct = &abs_list 234 .clock_getres = hrtimer_get_res,
250 }; 235 };
251 struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, 236 struct k_clock clock_monotonic = {
252 .abs_struct = NULL, 237 .clock_getres = hrtimer_get_res,
253 .clock_get = do_posix_clock_monotonic_get, 238 .clock_get = posix_ktime_get_ts,
254 .clock_set = do_posix_clock_nosettime 239 .clock_set = do_posix_clock_nosettime,
255 }; 240 };
256 241
257 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 242 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -265,117 +250,17 @@ static __init int init_posix_timers(void)
265 250
266__initcall(init_posix_timers); 251__initcall(init_posix_timers);
267 252
268static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
269{
270 long sec = tp->tv_sec;
271 long nsec = tp->tv_nsec + res - 1;
272
273 if (nsec >= NSEC_PER_SEC) {
274 sec++;
275 nsec -= NSEC_PER_SEC;
276 }
277
278 /*
279 * The scaling constants are defined in <linux/time.h>
280 * The difference between there and here is that we do the
281 * res rounding and compute a 64-bit result (well so does that
282 * but it then throws away the high bits).
283 */
284 *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) +
285 (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >>
286 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
287}
288
289/*
290 * This function adjusts the timer as needed as a result of the clock
291 * being set. It should only be called for absolute timers, and then
292 * under the abs_list lock. It computes the time difference and sets
293 * the new jiffies value in the timer. It also updates the timers
294 * reference wall_to_monotonic value. It is complicated by the fact
295 * that tstojiffies() only handles positive times and it needs to work
296 * with both positive and negative times. Also, for negative offsets,
297 * we need to defeat the res round up.
298 *
299 * Return is true if there is a new time, else false.
300 */
301static long add_clockset_delta(struct k_itimer *timr,
302 struct timespec *new_wall_to)
303{
304 struct timespec delta;
305 int sign = 0;
306 u64 exp;
307
308 set_normalized_timespec(&delta,
309 new_wall_to->tv_sec -
310 timr->it.real.wall_to_prev.tv_sec,
311 new_wall_to->tv_nsec -
312 timr->it.real.wall_to_prev.tv_nsec);
313 if (likely(!(delta.tv_sec | delta.tv_nsec)))
314 return 0;
315 if (delta.tv_sec < 0) {
316 set_normalized_timespec(&delta,
317 -delta.tv_sec,
318 1 - delta.tv_nsec -
319 posix_clocks[timr->it_clock].res);
320 sign++;
321 }
322 tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp);
323 timr->it.real.wall_to_prev = *new_wall_to;
324 timr->it.real.timer.expires += (sign ? -exp : exp);
325 return 1;
326}
327
328static void remove_from_abslist(struct k_itimer *timr)
329{
330 if (!list_empty(&timr->it.real.abs_timer_entry)) {
331 spin_lock(&abs_list.lock);
332 list_del_init(&timr->it.real.abs_timer_entry);
333 spin_unlock(&abs_list.lock);
334 }
335}
336
337static void schedule_next_timer(struct k_itimer *timr) 253static void schedule_next_timer(struct k_itimer *timr)
338{ 254{
339 struct timespec new_wall_to; 255 if (timr->it.real.interval.tv64 == 0)
340 struct now_struct now;
341 unsigned long seq;
342
343 /*
344 * Set up the timer for the next interval (if there is one).
345 * Note: this code uses the abs_timer_lock to protect
346 * it.real.wall_to_prev and must hold it until exp is set, not exactly
347 * obvious...
348
349 * This function is used for CLOCK_REALTIME* and
350 * CLOCK_MONOTONIC* timers. If we ever want to handle other
351 * CLOCKs, the calling code (do_schedule_next_timer) would need
352 * to pull the "clock" info from the timer and dispatch the
353 * "other" CLOCKs "next timer" code (which, I suppose should
354 * also be added to the k_clock structure).
355 */
356 if (!timr->it.real.incr)
357 return; 256 return;
358 257
359 do { 258 timr->it_overrun += hrtimer_forward(&timr->it.real.timer,
360 seq = read_seqbegin(&xtime_lock); 259 timr->it.real.interval);
361 new_wall_to = wall_to_monotonic;
362 posix_get_now(&now);
363 } while (read_seqretry(&xtime_lock, seq));
364
365 if (!list_empty(&timr->it.real.abs_timer_entry)) {
366 spin_lock(&abs_list.lock);
367 add_clockset_delta(timr, &new_wall_to);
368
369 posix_bump_timer(timr, now);
370
371 spin_unlock(&abs_list.lock);
372 } else {
373 posix_bump_timer(timr, now);
374 }
375 timr->it_overrun_last = timr->it_overrun; 260 timr->it_overrun_last = timr->it_overrun;
376 timr->it_overrun = -1; 261 timr->it_overrun = -1;
377 ++timr->it_requeue_pending; 262 ++timr->it_requeue_pending;
378 add_timer(&timr->it.real.timer); 263 hrtimer_restart(&timr->it.real.timer);
379} 264}
380 265
381/* 266/*
@@ -396,31 +281,23 @@ void do_schedule_next_timer(struct siginfo *info)
396 281
397 timr = lock_timer(info->si_tid, &flags); 282 timr = lock_timer(info->si_tid, &flags);
398 283
399 if (!timr || timr->it_requeue_pending != info->si_sys_private) 284 if (timr && timr->it_requeue_pending == info->si_sys_private) {
400 goto exit; 285 if (timr->it_clock < 0)
286 posix_cpu_timer_schedule(timr);
287 else
288 schedule_next_timer(timr);
401 289
402 if (timr->it_clock < 0) /* CPU clock */ 290 info->si_overrun = timr->it_overrun_last;
403 posix_cpu_timer_schedule(timr); 291 }
404 else 292
405 schedule_next_timer(timr); 293 unlock_timer(timr, flags);
406 info->si_overrun = timr->it_overrun_last;
407exit:
408 if (timr)
409 unlock_timer(timr, flags);
410} 294}
411 295
412int posix_timer_event(struct k_itimer *timr,int si_private) 296int posix_timer_event(struct k_itimer *timr,int si_private)
413{ 297{
414 memset(&timr->sigq->info, 0, sizeof(siginfo_t)); 298 memset(&timr->sigq->info, 0, sizeof(siginfo_t));
415 timr->sigq->info.si_sys_private = si_private; 299 timr->sigq->info.si_sys_private = si_private;
416 /* 300 /* Send signal to the process that owns this timer.*/
417 * Send signal to the process that owns this timer.
418
419 * This code assumes that all the possible abs_lists share the
420 * same lock (there is only one list at this time). If this is
421 * not the case, the CLOCK info would need to be used to find
422 * the proper abs list lock.
423 */
424 301
425 timr->sigq->info.si_signo = timr->it_sigev_signo; 302 timr->sigq->info.si_signo = timr->it_sigev_signo;
426 timr->sigq->info.si_errno = 0; 303 timr->sigq->info.si_errno = 0;
@@ -454,66 +331,37 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
454 331
455 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 332 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
456 */ 333 */
457static void posix_timer_fn(unsigned long __data) 334static int posix_timer_fn(void *data)
458{ 335{
459 struct k_itimer *timr = (struct k_itimer *) __data; 336 struct k_itimer *timr = data;
460 unsigned long flags; 337 unsigned long flags;
461 unsigned long seq; 338 int si_private = 0;
462 struct timespec delta, new_wall_to; 339 int ret = HRTIMER_NORESTART;
463 u64 exp = 0;
464 int do_notify = 1;
465 340
466 spin_lock_irqsave(&timr->it_lock, flags); 341 spin_lock_irqsave(&timr->it_lock, flags);
467 if (!list_empty(&timr->it.real.abs_timer_entry)) {
468 spin_lock(&abs_list.lock);
469 do {
470 seq = read_seqbegin(&xtime_lock);
471 new_wall_to = wall_to_monotonic;
472 } while (read_seqretry(&xtime_lock, seq));
473 set_normalized_timespec(&delta,
474 new_wall_to.tv_sec -
475 timr->it.real.wall_to_prev.tv_sec,
476 new_wall_to.tv_nsec -
477 timr->it.real.wall_to_prev.tv_nsec);
478 if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) {
479 /* do nothing, timer is on time */
480 } else if (delta.tv_sec < 0) {
481 /* do nothing, timer is already late */
482 } else {
483 /* timer is early due to a clock set */
484 tstojiffie(&delta,
485 posix_clocks[timr->it_clock].res,
486 &exp);
487 timr->it.real.wall_to_prev = new_wall_to;
488 timr->it.real.timer.expires += exp;
489 add_timer(&timr->it.real.timer);
490 do_notify = 0;
491 }
492 spin_unlock(&abs_list.lock);
493 342
494 } 343 if (timr->it.real.interval.tv64 != 0)
495 if (do_notify) { 344 si_private = ++timr->it_requeue_pending;
496 int si_private=0;
497 345
498 if (timr->it.real.incr) 346 if (posix_timer_event(timr, si_private)) {
499 si_private = ++timr->it_requeue_pending; 347 /*
500 else { 348 * signal was not sent because of sig_ignor
501 remove_from_abslist(timr); 349 * we will not get a call back to restart it AND
350 * it should be restarted.
351 */
352 if (timr->it.real.interval.tv64 != 0) {
353 timr->it_overrun +=
354 hrtimer_forward(&timr->it.real.timer,
355 timr->it.real.interval);
356 ret = HRTIMER_RESTART;
502 } 357 }
503
504 if (posix_timer_event(timr, si_private))
505 /*
506 * signal was not sent because of sig_ignor
507 * we will not get a call back to restart it AND
508 * it should be restarted.
509 */
510 schedule_next_timer(timr);
511 } 358 }
512 unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */
513}
514 359
360 unlock_timer(timr, flags);
361 return ret;
362}
515 363
516static inline struct task_struct * good_sigevent(sigevent_t * event) 364static struct task_struct * good_sigevent(sigevent_t * event)
517{ 365{
518 struct task_struct *rtn = current->group_leader; 366 struct task_struct *rtn = current->group_leader;
519 367
@@ -530,7 +378,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event)
530 return rtn; 378 return rtn;
531} 379}
532 380
533void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) 381void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
534{ 382{
535 if ((unsigned) clock_id >= MAX_CLOCKS) { 383 if ((unsigned) clock_id >= MAX_CLOCKS) {
536 printk("POSIX clock register failed for clock_id %d\n", 384 printk("POSIX clock register failed for clock_id %d\n",
@@ -576,7 +424,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
576/* Create a POSIX.1b interval timer. */ 424/* Create a POSIX.1b interval timer. */
577 425
578asmlinkage long 426asmlinkage long
579sys_timer_create(clockid_t which_clock, 427sys_timer_create(const clockid_t which_clock,
580 struct sigevent __user *timer_event_spec, 428 struct sigevent __user *timer_event_spec,
581 timer_t __user * created_timer_id) 429 timer_t __user * created_timer_id)
582{ 430{
@@ -602,8 +450,7 @@ sys_timer_create(clockid_t which_clock,
602 goto out; 450 goto out;
603 } 451 }
604 spin_lock_irq(&idr_lock); 452 spin_lock_irq(&idr_lock);
605 error = idr_get_new(&posix_timers_id, 453 error = idr_get_new(&posix_timers_id, (void *) new_timer,
606 (void *) new_timer,
607 &new_timer_id); 454 &new_timer_id);
608 spin_unlock_irq(&idr_lock); 455 spin_unlock_irq(&idr_lock);
609 if (error == -EAGAIN) 456 if (error == -EAGAIN)
@@ -704,27 +551,6 @@ out:
704} 551}
705 552
706/* 553/*
707 * good_timespec
708 *
709 * This function checks the elements of a timespec structure.
710 *
711 * Arguments:
712 * ts : Pointer to the timespec structure to check
713 *
714 * Return value:
715 * If a NULL pointer was passed in, or the tv_nsec field was less than 0
716 * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0,
717 * this function returns 0. Otherwise it returns 1.
718 */
719static int good_timespec(const struct timespec *ts)
720{
721 if ((!ts) || (ts->tv_sec < 0) ||
722 ((unsigned) ts->tv_nsec >= NSEC_PER_SEC))
723 return 0;
724 return 1;
725}
726
727/*
728 * Locking issues: We need to protect the result of the id look up until 554 * Locking issues: We need to protect the result of the id look up until
729 * we get the timer locked down so it is not deleted under us. The 555 * we get the timer locked down so it is not deleted under us. The
730 * removal is done under the idr spinlock so we use that here to bridge 556 * removal is done under the idr spinlock so we use that here to bridge
@@ -776,39 +602,39 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
776static void 602static void
777common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) 603common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
778{ 604{
779 unsigned long expires; 605 ktime_t remaining;
780 struct now_struct now; 606 struct hrtimer *timer = &timr->it.real.timer;
781 607
782 do 608 memset(cur_setting, 0, sizeof(struct itimerspec));
783 expires = timr->it.real.timer.expires; 609 remaining = hrtimer_get_remaining(timer);
784 while ((volatile long) (timr->it.real.timer.expires) != expires);
785
786 posix_get_now(&now);
787
788 if (expires &&
789 ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) &&
790 !timr->it.real.incr &&
791 posix_time_before(&timr->it.real.timer, &now))
792 timr->it.real.timer.expires = expires = 0;
793 if (expires) {
794 if (timr->it_requeue_pending & REQUEUE_PENDING ||
795 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
796 posix_bump_timer(timr, now);
797 expires = timr->it.real.timer.expires;
798 }
799 else
800 if (!timer_pending(&timr->it.real.timer))
801 expires = 0;
802 if (expires)
803 expires -= now.jiffies;
804 }
805 jiffies_to_timespec(expires, &cur_setting->it_value);
806 jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval);
807 610
808 if (cur_setting->it_value.tv_sec < 0) { 611 /* Time left ? or timer pending */
612 if (remaining.tv64 > 0 || hrtimer_active(timer))
613 goto calci;
614 /* interval timer ? */
615 if (timr->it.real.interval.tv64 == 0)
616 return;
617 /*
618 * When a requeue is pending or this is a SIGEV_NONE timer
619 * move the expiry time forward by intervals, so expiry is >
620 * now.
621 */
622 if (timr->it_requeue_pending & REQUEUE_PENDING ||
623 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
624 timr->it_overrun +=
625 hrtimer_forward(timer, timr->it.real.interval);
626 remaining = hrtimer_get_remaining(timer);
627 }
628 calci:
629 /* interval timer ? */
630 if (timr->it.real.interval.tv64 != 0)
631 cur_setting->it_interval =
632 ktime_to_timespec(timr->it.real.interval);
633 /* Return 0 only, when the timer is expired and not pending */
634 if (remaining.tv64 <= 0)
809 cur_setting->it_value.tv_nsec = 1; 635 cur_setting->it_value.tv_nsec = 1;
810 cur_setting->it_value.tv_sec = 0; 636 else
811 } 637 cur_setting->it_value = ktime_to_timespec(remaining);
812} 638}
813 639
814/* Get the time remaining on a POSIX.1b interval timer. */ 640/* Get the time remaining on a POSIX.1b interval timer. */
@@ -832,6 +658,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
832 658
833 return 0; 659 return 0;
834} 660}
661
835/* 662/*
836 * Get the number of overruns of a POSIX.1b interval timer. This is to 663 * Get the number of overruns of a POSIX.1b interval timer. This is to
837 * be the overrun of the timer last delivered. At the same time we are 664 * be the overrun of the timer last delivered. At the same time we are
@@ -841,7 +668,6 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
841 * the call back to do_schedule_next_timer(). So all we need to do is 668 * the call back to do_schedule_next_timer(). So all we need to do is
842 * to pick up the frozen overrun. 669 * to pick up the frozen overrun.
843 */ 670 */
844
845asmlinkage long 671asmlinkage long
846sys_timer_getoverrun(timer_t timer_id) 672sys_timer_getoverrun(timer_t timer_id)
847{ 673{
@@ -858,153 +684,55 @@ sys_timer_getoverrun(timer_t timer_id)
858 684
859 return overrun; 685 return overrun;
860} 686}
861/*
862 * Adjust for absolute time
863 *
864 * If absolute time is given and it is not CLOCK_MONOTONIC, we need to
865 * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and
866 * what ever clock he is using.
867 *
868 * If it is relative time, we need to add the current (CLOCK_MONOTONIC)
869 * time to it to get the proper time for the timer.
870 */
871static int adjust_abs_time(struct k_clock *clock, struct timespec *tp,
872 int abs, u64 *exp, struct timespec *wall_to)
873{
874 struct timespec now;
875 struct timespec oc = *tp;
876 u64 jiffies_64_f;
877 int rtn =0;
878
879 if (abs) {
880 /*
881 * The mask pick up the 4 basic clocks
882 */
883 if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) {
884 jiffies_64_f = do_posix_clock_monotonic_gettime_parts(
885 &now, wall_to);
886 /*
887 * If we are doing a MONOTONIC clock
888 */
889 if((clock - &posix_clocks[0]) & CLOCKS_MONO){
890 now.tv_sec += wall_to->tv_sec;
891 now.tv_nsec += wall_to->tv_nsec;
892 }
893 } else {
894 /*
895 * Not one of the basic clocks
896 */
897 clock->clock_get(clock - posix_clocks, &now);
898 jiffies_64_f = get_jiffies_64();
899 }
900 /*
901 * Take away now to get delta and normalize
902 */
903 set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec,
904 oc.tv_nsec - now.tv_nsec);
905 }else{
906 jiffies_64_f = get_jiffies_64();
907 }
908 /*
909 * Check if the requested time is prior to now (if so set now)
910 */
911 if (oc.tv_sec < 0)
912 oc.tv_sec = oc.tv_nsec = 0;
913
914 if (oc.tv_sec | oc.tv_nsec)
915 set_normalized_timespec(&oc, oc.tv_sec,
916 oc.tv_nsec + clock->res);
917 tstojiffie(&oc, clock->res, exp);
918
919 /*
920 * Check if the requested time is more than the timer code
921 * can handle (if so we error out but return the value too).
922 */
923 if (*exp > ((u64)MAX_JIFFY_OFFSET))
924 /*
925 * This is a considered response, not exactly in
926 * line with the standard (in fact it is silent on
927 * possible overflows). We assume such a large
928 * value is ALMOST always a programming error and
929 * try not to compound it by setting a really dumb
930 * value.
931 */
932 rtn = -EINVAL;
933 /*
934 * return the actual jiffies expire time, full 64 bits
935 */
936 *exp += jiffies_64_f;
937 return rtn;
938}
939 687
940/* Set a POSIX.1b interval timer. */ 688/* Set a POSIX.1b interval timer. */
941/* timr->it_lock is taken. */ 689/* timr->it_lock is taken. */
942static inline int 690static int
943common_timer_set(struct k_itimer *timr, int flags, 691common_timer_set(struct k_itimer *timr, int flags,
944 struct itimerspec *new_setting, struct itimerspec *old_setting) 692 struct itimerspec *new_setting, struct itimerspec *old_setting)
945{ 693{
946 struct k_clock *clock = &posix_clocks[timr->it_clock]; 694 struct hrtimer *timer = &timr->it.real.timer;
947 u64 expire_64;
948 695
949 if (old_setting) 696 if (old_setting)
950 common_timer_get(timr, old_setting); 697 common_timer_get(timr, old_setting);
951 698
952 /* disable the timer */ 699 /* disable the timer */
953 timr->it.real.incr = 0; 700 timr->it.real.interval.tv64 = 0;
954 /* 701 /*
955 * careful here. If smp we could be in the "fire" routine which will 702 * careful here. If smp we could be in the "fire" routine which will
956 * be spinning as we hold the lock. But this is ONLY an SMP issue. 703 * be spinning as we hold the lock. But this is ONLY an SMP issue.
957 */ 704 */
958 if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { 705 if (hrtimer_try_to_cancel(timer) < 0)
959#ifdef CONFIG_SMP
960 /*
961 * It can only be active if on an other cpu. Since
962 * we have cleared the interval stuff above, it should
963 * clear once we release the spin lock. Of course once
964 * we do that anything could happen, including the
965 * complete melt down of the timer. So return with
966 * a "retry" exit status.
967 */
968 return TIMER_RETRY; 706 return TIMER_RETRY;
969#endif
970 }
971
972 remove_from_abslist(timr);
973 707
974 timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 708 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
975 ~REQUEUE_PENDING; 709 ~REQUEUE_PENDING;
976 timr->it_overrun_last = 0; 710 timr->it_overrun_last = 0;
977 timr->it_overrun = -1;
978 /*
979 *switch off the timer when it_value is zero
980 */
981 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) {
982 timr->it.real.timer.expires = 0;
983 return 0;
984 }
985 711
986 if (adjust_abs_time(clock, 712 /* switch off the timer when it_value is zero */
987 &new_setting->it_value, flags & TIMER_ABSTIME, 713 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
988 &expire_64, &(timr->it.real.wall_to_prev))) { 714 return 0;
989 return -EINVAL;
990 }
991 timr->it.real.timer.expires = (unsigned long)expire_64;
992 tstojiffie(&new_setting->it_interval, clock->res, &expire_64);
993 timr->it.real.incr = (unsigned long)expire_64;
994 715
995 /* 716 /* Posix madness. Only absolute CLOCK_REALTIME timers
996 * We do not even queue SIGEV_NONE timers! But we do put them 717 * are affected by clock sets. So we must reiniatilize
997 * in the abs list so we can do that right. 718 * the timer.
998 */ 719 */
999 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) 720 if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME))
1000 add_timer(&timr->it.real.timer); 721 hrtimer_rebase(timer, CLOCK_REALTIME);
1001 722 else
1002 if (flags & TIMER_ABSTIME && clock->abs_struct) { 723 hrtimer_rebase(timer, CLOCK_MONOTONIC);
1003 spin_lock(&clock->abs_struct->lock); 724
1004 list_add_tail(&(timr->it.real.abs_timer_entry), 725 timer->expires = timespec_to_ktime(new_setting->it_value);
1005 &(clock->abs_struct->list)); 726
1006 spin_unlock(&clock->abs_struct->lock); 727 /* Convert interval */
1007 } 728 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
729
730 /* SIGEV_NONE timers are not queued ! See common_timer_get */
731 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
732 return 0;
733
734 hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ?
735 HRTIMER_ABS : HRTIMER_REL);
1008 return 0; 736 return 0;
1009} 737}
1010 738
@@ -1026,8 +754,8 @@ sys_timer_settime(timer_t timer_id, int flags,
1026 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) 754 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
1027 return -EFAULT; 755 return -EFAULT;
1028 756
1029 if ((!good_timespec(&new_spec.it_interval)) || 757 if (!timespec_valid(&new_spec.it_interval) ||
1030 (!good_timespec(&new_spec.it_value))) 758 !timespec_valid(&new_spec.it_value))
1031 return -EINVAL; 759 return -EINVAL;
1032retry: 760retry:
1033 timr = lock_timer(timer_id, &flag); 761 timr = lock_timer(timer_id, &flag);
@@ -1043,8 +771,8 @@ retry:
1043 goto retry; 771 goto retry;
1044 } 772 }
1045 773
1046 if (old_setting && !error && copy_to_user(old_setting, 774 if (old_setting && !error &&
1047 &old_spec, sizeof (old_spec))) 775 copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
1048 error = -EFAULT; 776 error = -EFAULT;
1049 777
1050 return error; 778 return error;
@@ -1052,24 +780,10 @@ retry:
1052 780
1053static inline int common_timer_del(struct k_itimer *timer) 781static inline int common_timer_del(struct k_itimer *timer)
1054{ 782{
1055 timer->it.real.incr = 0; 783 timer->it.real.interval.tv64 = 0;
1056 784
1057 if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { 785 if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
1058#ifdef CONFIG_SMP
1059 /*
1060 * It can only be active if on an other cpu. Since
1061 * we have cleared the interval stuff above, it should
1062 * clear once we release the spin lock. Of course once
1063 * we do that anything could happen, including the
1064 * complete melt down of the timer. So return with
1065 * a "retry" exit status.
1066 */
1067 return TIMER_RETRY; 786 return TIMER_RETRY;
1068#endif
1069 }
1070
1071 remove_from_abslist(timer);
1072
1073 return 0; 787 return 0;
1074} 788}
1075 789
@@ -1085,24 +799,16 @@ sys_timer_delete(timer_t timer_id)
1085 struct k_itimer *timer; 799 struct k_itimer *timer;
1086 long flags; 800 long flags;
1087 801
1088#ifdef CONFIG_SMP
1089 int error;
1090retry_delete: 802retry_delete:
1091#endif
1092 timer = lock_timer(timer_id, &flags); 803 timer = lock_timer(timer_id, &flags);
1093 if (!timer) 804 if (!timer)
1094 return -EINVAL; 805 return -EINVAL;
1095 806
1096#ifdef CONFIG_SMP 807 if (timer_delete_hook(timer) == TIMER_RETRY) {
1097 error = timer_delete_hook(timer);
1098
1099 if (error == TIMER_RETRY) {
1100 unlock_timer(timer, flags); 808 unlock_timer(timer, flags);
1101 goto retry_delete; 809 goto retry_delete;
1102 } 810 }
1103#else 811
1104 timer_delete_hook(timer);
1105#endif
1106 spin_lock(&current->sighand->siglock); 812 spin_lock(&current->sighand->siglock);
1107 list_del(&timer->list); 813 list_del(&timer->list);
1108 spin_unlock(&current->sighand->siglock); 814 spin_unlock(&current->sighand->siglock);
@@ -1119,29 +825,21 @@ retry_delete:
1119 release_posix_timer(timer, IT_ID_SET); 825 release_posix_timer(timer, IT_ID_SET);
1120 return 0; 826 return 0;
1121} 827}
828
1122/* 829/*
1123 * return timer owned by the process, used by exit_itimers 830 * return timer owned by the process, used by exit_itimers
1124 */ 831 */
1125static inline void itimer_delete(struct k_itimer *timer) 832static void itimer_delete(struct k_itimer *timer)
1126{ 833{
1127 unsigned long flags; 834 unsigned long flags;
1128 835
1129#ifdef CONFIG_SMP
1130 int error;
1131retry_delete: 836retry_delete:
1132#endif
1133 spin_lock_irqsave(&timer->it_lock, flags); 837 spin_lock_irqsave(&timer->it_lock, flags);
1134 838
1135#ifdef CONFIG_SMP 839 if (timer_delete_hook(timer) == TIMER_RETRY) {
1136 error = timer_delete_hook(timer);
1137
1138 if (error == TIMER_RETRY) {
1139 unlock_timer(timer, flags); 840 unlock_timer(timer, flags);
1140 goto retry_delete; 841 goto retry_delete;
1141 } 842 }
1142#else
1143 timer_delete_hook(timer);
1144#endif
1145 list_del(&timer->list); 843 list_del(&timer->list);
1146 /* 844 /*
1147 * This keeps any tasks waiting on the spin lock from thinking 845 * This keeps any tasks waiting on the spin lock from thinking
@@ -1170,57 +868,8 @@ void exit_itimers(struct signal_struct *sig)
1170 } 868 }
1171} 869}
1172 870
1173/* 871/* Not available / possible... functions */
1174 * And now for the "clock" calls 872int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
1175 *
1176 * These functions are called both from timer functions (with the timer
1177 * spin_lock_irq() held and from clock calls with no locking. They must
1178 * use the save flags versions of locks.
1179 */
1180
1181/*
1182 * We do ticks here to avoid the irq lock ( they take sooo long).
1183 * The seqlock is great here. Since we a reader, we don't really care
1184 * if we are interrupted since we don't take lock that will stall us or
1185 * any other cpu. Voila, no irq lock is needed.
1186 *
1187 */
1188
1189static u64 do_posix_clock_monotonic_gettime_parts(
1190 struct timespec *tp, struct timespec *mo)
1191{
1192 u64 jiff;
1193 unsigned int seq;
1194
1195 do {
1196 seq = read_seqbegin(&xtime_lock);
1197 getnstimeofday(tp);
1198 *mo = wall_to_monotonic;
1199 jiff = jiffies_64;
1200
1201 } while(read_seqretry(&xtime_lock, seq));
1202
1203 return jiff;
1204}
1205
1206static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
1207{
1208 struct timespec wall_to_mono;
1209
1210 do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
1211
1212 set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec,
1213 tp->tv_nsec + wall_to_mono.tv_nsec);
1214
1215 return 0;
1216}
1217
1218int do_posix_clock_monotonic_gettime(struct timespec *tp)
1219{
1220 return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp);
1221}
1222
1223int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp)
1224{ 873{
1225 return -EINVAL; 874 return -EINVAL;
1226} 875}
@@ -1232,7 +881,8 @@ int do_posix_clock_notimer_create(struct k_itimer *timer)
1232} 881}
1233EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); 882EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
1234 883
1235int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) 884int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
885 struct timespec *t, struct timespec __user *r)
1236{ 886{
1237#ifndef ENOTSUP 887#ifndef ENOTSUP
1238 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ 888 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
@@ -1242,8 +892,8 @@ int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t)
1242} 892}
1243EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); 893EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
1244 894
1245asmlinkage long 895asmlinkage long sys_clock_settime(const clockid_t which_clock,
1246sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) 896 const struct timespec __user *tp)
1247{ 897{
1248 struct timespec new_tp; 898 struct timespec new_tp;
1249 899
@@ -1256,7 +906,7 @@ sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp)
1256} 906}
1257 907
1258asmlinkage long 908asmlinkage long
1259sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) 909sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp)
1260{ 910{
1261 struct timespec kernel_tp; 911 struct timespec kernel_tp;
1262 int error; 912 int error;
@@ -1273,7 +923,7 @@ sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp)
1273} 923}
1274 924
1275asmlinkage long 925asmlinkage long
1276sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) 926sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
1277{ 927{
1278 struct timespec rtn_tp; 928 struct timespec rtn_tp;
1279 int error; 929 int error;
@@ -1292,117 +942,34 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
1292} 942}
1293 943
1294/* 944/*
1295 * The standard says that an absolute nanosleep call MUST wake up at 945 * nanosleep for monotonic and realtime clocks
1296 * the requested time in spite of clock settings. Here is what we do:
1297 * For each nanosleep call that needs it (only absolute and not on
1298 * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure
1299 * into the "nanosleep_abs_list". All we need is the task_struct pointer.
1300 * When ever the clock is set we just wake up all those tasks. The rest
1301 * is done by the while loop in clock_nanosleep().
1302 *
1303 * On locking, clock_was_set() is called from update_wall_clock which
1304 * holds (or has held for it) a write_lock_irq( xtime_lock) and is
1305 * called from the timer bh code. Thus we need the irq save locks.
1306 *
1307 * Also, on the call from update_wall_clock, that is done as part of a
1308 * softirq thing. We don't want to delay the system that much (possibly
1309 * long list of timers to fix), so we defer that work to keventd.
1310 */ 946 */
1311 947static int common_nsleep(const clockid_t which_clock, int flags,
1312static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue); 948 struct timespec *tsave, struct timespec __user *rmtp)
1313static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL); 949{
1314 950 int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
1315static DECLARE_MUTEX(clock_was_set_lock); 951 int clockid = which_clock;
1316 952
1317void clock_was_set(void) 953 switch (which_clock) {
1318{ 954 case CLOCK_REALTIME:
1319 struct k_itimer *timr; 955 /* Posix madness. Only absolute timers on clock realtime
1320 struct timespec new_wall_to; 956 are affected by clock set. */
1321 LIST_HEAD(cws_list); 957 if (mode != HRTIMER_ABS)
1322 unsigned long seq; 958 clockid = CLOCK_MONOTONIC;
1323 959 case CLOCK_MONOTONIC:
1324 960 break;
1325 if (unlikely(in_interrupt())) { 961 default:
1326 schedule_work(&clock_was_set_work); 962 return -EINVAL;
1327 return;
1328 } 963 }
1329 wake_up_all(&nanosleep_abs_wqueue); 964 return hrtimer_nanosleep(tsave, rmtp, mode, clockid);
1330
1331 /*
1332 * Check if there exist TIMER_ABSTIME timers to correct.
1333 *
1334 * Notes on locking: This code is run in task context with irq
1335 * on. We CAN be interrupted! All other usage of the abs list
1336 * lock is under the timer lock which holds the irq lock as
1337 * well. We REALLY don't want to scan the whole list with the
1338 * interrupt system off, AND we would like a sequence lock on
1339 * this code as well. Since we assume that the clock will not
1340 * be set often, it seems ok to take and release the irq lock
1341 * for each timer. In fact add_timer will do this, so this is
1342 * not an issue. So we know when we are done, we will move the
1343 * whole list to a new location. Then as we process each entry,
1344 * we will move it to the actual list again. This way, when our
1345 * copy is empty, we are done. We are not all that concerned
1346 * about preemption so we will use a semaphore lock to protect
1347 * aginst reentry. This way we will not stall another
1348 * processor. It is possible that this may delay some timers
1349 * that should have expired, given the new clock, but even this
1350 * will be minimal as we will always update to the current time,
1351 * even if it was set by a task that is waiting for entry to
1352 * this code. Timers that expire too early will be caught by
1353 * the expire code and restarted.
1354
1355 * Absolute timers that repeat are left in the abs list while
1356 * waiting for the task to pick up the signal. This means we
1357 * may find timers that are not in the "add_timer" list, but are
1358 * in the abs list. We do the same thing for these, save
1359 * putting them back in the "add_timer" list. (Note, these are
1360 * left in the abs list mainly to indicate that they are
1361 * ABSOLUTE timers, a fact that is used by the re-arm code, and
1362 * for which we have no other flag.)
1363
1364 */
1365
1366 down(&clock_was_set_lock);
1367 spin_lock_irq(&abs_list.lock);
1368 list_splice_init(&abs_list.list, &cws_list);
1369 spin_unlock_irq(&abs_list.lock);
1370 do {
1371 do {
1372 seq = read_seqbegin(&xtime_lock);
1373 new_wall_to = wall_to_monotonic;
1374 } while (read_seqretry(&xtime_lock, seq));
1375
1376 spin_lock_irq(&abs_list.lock);
1377 if (list_empty(&cws_list)) {
1378 spin_unlock_irq(&abs_list.lock);
1379 break;
1380 }
1381 timr = list_entry(cws_list.next, struct k_itimer,
1382 it.real.abs_timer_entry);
1383
1384 list_del_init(&timr->it.real.abs_timer_entry);
1385 if (add_clockset_delta(timr, &new_wall_to) &&
1386 del_timer(&timr->it.real.timer)) /* timer run yet? */
1387 add_timer(&timr->it.real.timer);
1388 list_add(&timr->it.real.abs_timer_entry, &abs_list.list);
1389 spin_unlock_irq(&abs_list.lock);
1390 } while (1);
1391
1392 up(&clock_was_set_lock);
1393} 965}
1394 966
1395long clock_nanosleep_restart(struct restart_block *restart_block);
1396
1397asmlinkage long 967asmlinkage long
1398sys_clock_nanosleep(clockid_t which_clock, int flags, 968sys_clock_nanosleep(const clockid_t which_clock, int flags,
1399 const struct timespec __user *rqtp, 969 const struct timespec __user *rqtp,
1400 struct timespec __user *rmtp) 970 struct timespec __user *rmtp)
1401{ 971{
1402 struct timespec t; 972 struct timespec t;
1403 struct restart_block *restart_block =
1404 &(current_thread_info()->restart_block);
1405 int ret;
1406 973
1407 if (invalid_clockid(which_clock)) 974 if (invalid_clockid(which_clock))
1408 return -EINVAL; 975 return -EINVAL;
@@ -1410,125 +977,9 @@ sys_clock_nanosleep(clockid_t which_clock, int flags,
1410 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 977 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1411 return -EFAULT; 978 return -EFAULT;
1412 979
1413 if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) 980 if (!timespec_valid(&t))
1414 return -EINVAL; 981 return -EINVAL;
1415 982
1416 /* 983 return CLOCK_DISPATCH(which_clock, nsleep,
1417 * Do this here as nsleep function does not have the real address. 984 (which_clock, flags, &t, rmtp));
1418 */
1419 restart_block->arg1 = (unsigned long)rmtp;
1420
1421 ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t));
1422
1423 if ((ret == -ERESTART_RESTARTBLOCK) && rmtp &&
1424 copy_to_user(rmtp, &t, sizeof (t)))
1425 return -EFAULT;
1426 return ret;
1427}
1428
1429
1430static int common_nsleep(clockid_t which_clock,
1431 int flags, struct timespec *tsave)
1432{
1433 struct timespec t, dum;
1434 DECLARE_WAITQUEUE(abs_wqueue, current);
1435 u64 rq_time = (u64)0;
1436 s64 left;
1437 int abs;
1438 struct restart_block *restart_block =
1439 &current_thread_info()->restart_block;
1440
1441 abs_wqueue.flags = 0;
1442 abs = flags & TIMER_ABSTIME;
1443
1444 if (restart_block->fn == clock_nanosleep_restart) {
1445 /*
1446 * Interrupted by a non-delivered signal, pick up remaining
1447 * time and continue. Remaining time is in arg2 & 3.
1448 */
1449 restart_block->fn = do_no_restart_syscall;
1450
1451 rq_time = restart_block->arg3;
1452 rq_time = (rq_time << 32) + restart_block->arg2;
1453 if (!rq_time)
1454 return -EINTR;
1455 left = rq_time - get_jiffies_64();
1456 if (left <= (s64)0)
1457 return 0; /* Already passed */
1458 }
1459
1460 if (abs && (posix_clocks[which_clock].clock_get !=
1461 posix_clocks[CLOCK_MONOTONIC].clock_get))
1462 add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue);
1463
1464 do {
1465 t = *tsave;
1466 if (abs || !rq_time) {
1467 adjust_abs_time(&posix_clocks[which_clock], &t, abs,
1468 &rq_time, &dum);
1469 }
1470
1471 left = rq_time - get_jiffies_64();
1472 if (left >= (s64)MAX_JIFFY_OFFSET)
1473 left = (s64)MAX_JIFFY_OFFSET;
1474 if (left < (s64)0)
1475 break;
1476
1477 schedule_timeout_interruptible(left);
1478
1479 left = rq_time - get_jiffies_64();
1480 } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
1481
1482 if (abs_wqueue.task_list.next)
1483 finish_wait(&nanosleep_abs_wqueue, &abs_wqueue);
1484
1485 if (left > (s64)0) {
1486
1487 /*
1488 * Always restart abs calls from scratch to pick up any
1489 * clock shifting that happened while we are away.
1490 */
1491 if (abs)
1492 return -ERESTARTNOHAND;
1493
1494 left *= TICK_NSEC;
1495 tsave->tv_sec = div_long_long_rem(left,
1496 NSEC_PER_SEC,
1497 &tsave->tv_nsec);
1498 /*
1499 * Restart works by saving the time remaing in
1500 * arg2 & 3 (it is 64-bits of jiffies). The other
1501 * info we need is the clock_id (saved in arg0).
1502 * The sys_call interface needs the users
1503 * timespec return address which _it_ saves in arg1.
1504 * Since we have cast the nanosleep call to a clock_nanosleep
1505 * both can be restarted with the same code.
1506 */
1507 restart_block->fn = clock_nanosleep_restart;
1508 restart_block->arg0 = which_clock;
1509 /*
1510 * Caller sets arg1
1511 */
1512 restart_block->arg2 = rq_time & 0xffffffffLL;
1513 restart_block->arg3 = rq_time >> 32;
1514
1515 return -ERESTART_RESTARTBLOCK;
1516 }
1517
1518 return 0;
1519}
1520/*
1521 * This will restart clock_nanosleep.
1522 */
1523long
1524clock_nanosleep_restart(struct restart_block *restart_block)
1525{
1526 struct timespec t;
1527 int ret = common_nsleep(restart_block->arg0, 0, &t);
1528
1529 if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 &&
1530 copy_to_user((struct timespec __user *)(restart_block->arg1), &t,
1531 sizeof (t)))
1532 return -EFAULT;
1533 return ret;
1534} 985}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5ec248cb7f4a..9fd8d4f03595 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -38,7 +38,7 @@ config PM_DEBUG
38 38
39config SOFTWARE_SUSPEND 39config SOFTWARE_SUSPEND
40 bool "Software Suspend" 40 bool "Software Suspend"
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP) 41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
42 ---help--- 42 ---help---
43 Enable the possibility of suspending the machine. 43 Enable the possibility of suspending the machine.
44 It doesn't need APM. 44 It doesn't need APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 027322a564f4..e03d85e55291 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -24,10 +24,11 @@
24 24
25extern suspend_disk_method_t pm_disk_mode; 25extern suspend_disk_method_t pm_disk_mode;
26 26
27extern int swsusp_shrink_memory(void);
27extern int swsusp_suspend(void); 28extern int swsusp_suspend(void);
28extern int swsusp_write(void); 29extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
29extern int swsusp_check(void); 30extern int swsusp_check(void);
30extern int swsusp_read(void); 31extern int swsusp_read(struct pbe **pblist_ptr);
31extern void swsusp_close(void); 32extern void swsusp_close(void);
32extern int swsusp_resume(void); 33extern int swsusp_resume(void);
33 34
@@ -52,7 +53,7 @@ static void power_down(suspend_disk_method_t mode)
52 53
53 switch(mode) { 54 switch(mode) {
54 case PM_DISK_PLATFORM: 55 case PM_DISK_PLATFORM:
55 kernel_power_off_prepare(); 56 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
56 error = pm_ops->enter(PM_SUSPEND_DISK); 57 error = pm_ops->enter(PM_SUSPEND_DISK);
57 break; 58 break;
58 case PM_DISK_SHUTDOWN: 59 case PM_DISK_SHUTDOWN:
@@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode)
73static int in_suspend __nosavedata = 0; 74static int in_suspend __nosavedata = 0;
74 75
75 76
76/**
77 * free_some_memory - Try to free as much memory as possible
78 *
79 * ... but do not OOM-kill anyone
80 *
81 * Notice: all userland should be stopped at this point, or
82 * livelock is possible.
83 */
84
85static void free_some_memory(void)
86{
87 unsigned int i = 0;
88 unsigned int tmp;
89 unsigned long pages = 0;
90 char *p = "-\\|/";
91
92 printk("Freeing memory... ");
93 while ((tmp = shrink_all_memory(10000))) {
94 pages += tmp;
95 printk("\b%c", p[i++ % 4]);
96 }
97 printk("\bdone (%li pages freed)\n", pages);
98}
99
100
101static inline void platform_finish(void) 77static inline void platform_finish(void)
102{ 78{
103 if (pm_disk_mode == PM_DISK_PLATFORM) { 79 if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -119,16 +95,9 @@ static int prepare_processes(void)
119 goto thaw; 95 goto thaw;
120 } 96 }
121 97
122 if (pm_disk_mode == PM_DISK_PLATFORM) {
123 if (pm_ops && pm_ops->prepare) {
124 if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
125 goto thaw;
126 }
127 }
128
129 /* Free memory before shutting down devices. */ 98 /* Free memory before shutting down devices. */
130 free_some_memory(); 99 if (!(error = swsusp_shrink_memory()))
131 return 0; 100 return 0;
132thaw: 101thaw:
133 thaw_processes(); 102 thaw_processes();
134 enable_nonboot_cpus(); 103 enable_nonboot_cpus();
@@ -176,7 +145,7 @@ int pm_suspend_disk(void)
176 if (in_suspend) { 145 if (in_suspend) {
177 device_resume(); 146 device_resume();
178 pr_debug("PM: writing image.\n"); 147 pr_debug("PM: writing image.\n");
179 error = swsusp_write(); 148 error = swsusp_write(pagedir_nosave, nr_copy_pages);
180 if (!error) 149 if (!error)
181 power_down(pm_disk_mode); 150 power_down(pm_disk_mode);
182 else { 151 else {
@@ -247,7 +216,7 @@ static int software_resume(void)
247 216
248 pr_debug("PM: Reading swsusp image.\n"); 217 pr_debug("PM: Reading swsusp image.\n");
249 218
250 if ((error = swsusp_read())) { 219 if ((error = swsusp_read(&pagedir_nosave))) {
251 swsusp_free(); 220 swsusp_free();
252 goto Thaw; 221 goto Thaw;
253 } 222 }
@@ -363,37 +332,55 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf)
363 MINOR(swsusp_resume_device)); 332 MINOR(swsusp_resume_device));
364} 333}
365 334
366static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) 335static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
367{ 336{
368 int len;
369 char *p;
370 unsigned int maj, min; 337 unsigned int maj, min;
371 int error = -EINVAL;
372 dev_t res; 338 dev_t res;
339 int ret = -EINVAL;
373 340
374 p = memchr(buf, '\n', n); 341 if (sscanf(buf, "%u:%u", &maj, &min) != 2)
375 len = p ? p - buf : n; 342 goto out;
376 343
377 if (sscanf(buf, "%u:%u", &maj, &min) == 2) { 344 res = MKDEV(maj,min);
378 res = MKDEV(maj,min); 345 if (maj != MAJOR(res) || min != MINOR(res))
379 if (maj == MAJOR(res) && min == MINOR(res)) { 346 goto out;
380 down(&pm_sem);
381 swsusp_resume_device = res;
382 up(&pm_sem);
383 printk("Attempting manual resume\n");
384 noresume = 0;
385 software_resume();
386 }
387 }
388 347
389 return error >= 0 ? n : error; 348 down(&pm_sem);
349 swsusp_resume_device = res;
350 up(&pm_sem);
351 printk("Attempting manual resume\n");
352 noresume = 0;
353 software_resume();
354 ret = n;
355out:
356 return ret;
390} 357}
391 358
392power_attr(resume); 359power_attr(resume);
393 360
361static ssize_t image_size_show(struct subsystem * subsys, char *buf)
362{
363 return sprintf(buf, "%u\n", image_size);
364}
365
366static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
367{
368 unsigned int size;
369
370 if (sscanf(buf, "%u", &size) == 1) {
371 image_size = size;
372 return n;
373 }
374
375 return -EINVAL;
376}
377
378power_attr(image_size);
379
394static struct attribute * g[] = { 380static struct attribute * g[] = {
395 &disk_attr.attr, 381 &disk_attr.attr,
396 &resume_attr.attr, 382 &resume_attr.attr,
383 &image_size_attr.attr,
397 NULL, 384 NULL,
398}; 385};
399 386
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d253f3ae2fa5..9cb235cba4a9 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -133,10 +133,10 @@ static int suspend_enter(suspend_state_t state)
133static void suspend_finish(suspend_state_t state) 133static void suspend_finish(suspend_state_t state)
134{ 134{
135 device_resume(); 135 device_resume();
136 if (pm_ops && pm_ops->finish)
137 pm_ops->finish(state);
138 thaw_processes(); 136 thaw_processes();
139 enable_nonboot_cpus(); 137 enable_nonboot_cpus();
138 if (pm_ops && pm_ops->finish)
139 pm_ops->finish(state);
140 pm_restore_console(); 140 pm_restore_console();
141} 141}
142 142
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6c042b5ee14b..7e8492fd1423 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -9,19 +9,13 @@
9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
10#endif 10#endif
11 11
12#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \
13 - 4 - 3*sizeof(unsigned long) - sizeof(int) \
14 - sizeof(void *)) / sizeof(swp_entry_t))
15
16struct swsusp_info { 12struct swsusp_info {
17 struct new_utsname uts; 13 struct new_utsname uts;
18 u32 version_code; 14 u32 version_code;
19 unsigned long num_physpages; 15 unsigned long num_physpages;
20 int cpus; 16 int cpus;
21 unsigned long image_pages; 17 unsigned long image_pages;
22 unsigned long pagedir_pages; 18 unsigned long pages;
23 suspend_pagedir_t * suspend_pagedir;
24 swp_entry_t pagedir[MAX_PBES];
25} __attribute__((aligned(PAGE_SIZE))); 19} __attribute__((aligned(PAGE_SIZE)));
26 20
27 21
@@ -48,25 +42,27 @@ static struct subsys_attribute _name##_attr = { \
48 42
49extern struct subsystem power_subsys; 43extern struct subsystem power_subsys;
50 44
51extern int freeze_processes(void);
52extern void thaw_processes(void);
53
54extern int pm_prepare_console(void); 45extern int pm_prepare_console(void);
55extern void pm_restore_console(void); 46extern void pm_restore_console(void);
56 47
57
58/* References to section boundaries */ 48/* References to section boundaries */
59extern const void __nosave_begin, __nosave_end; 49extern const void __nosave_begin, __nosave_end;
60 50
61extern unsigned int nr_copy_pages; 51extern unsigned int nr_copy_pages;
62extern suspend_pagedir_t *pagedir_nosave; 52extern struct pbe *pagedir_nosave;
63extern suspend_pagedir_t *pagedir_save; 53
54/* Preferred image size in MB (default 500) */
55extern unsigned int image_size;
64 56
65extern asmlinkage int swsusp_arch_suspend(void); 57extern asmlinkage int swsusp_arch_suspend(void);
66extern asmlinkage int swsusp_arch_resume(void); 58extern asmlinkage int swsusp_arch_resume(void);
67 59
60extern unsigned int count_data_pages(void);
68extern void free_pagedir(struct pbe *pblist); 61extern void free_pagedir(struct pbe *pblist);
62extern void release_eaten_pages(void);
69extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); 63extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
70extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
71extern void swsusp_free(void); 64extern void swsusp_free(void);
72extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); 65extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
66extern unsigned int snapshot_nr_pages(void);
67extern struct pbe *snapshot_pblist(void);
68extern void snapshot_pblist_set(struct pbe *pblist);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4a6dbcefd378..41f66365f0d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -33,7 +33,35 @@
33 33
34#include "power.h" 34#include "power.h"
35 35
36struct pbe *pagedir_nosave;
37unsigned int nr_copy_pages;
38
36#ifdef CONFIG_HIGHMEM 39#ifdef CONFIG_HIGHMEM
40unsigned int count_highmem_pages(void)
41{
42 struct zone *zone;
43 unsigned long zone_pfn;
44 unsigned int n = 0;
45
46 for_each_zone (zone)
47 if (is_highmem(zone)) {
48 mark_free_pages(zone);
49 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
50 struct page *page;
51 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
52 if (!pfn_valid(pfn))
53 continue;
54 page = pfn_to_page(pfn);
55 if (PageReserved(page))
56 continue;
57 if (PageNosaveFree(page))
58 continue;
59 n++;
60 }
61 }
62 return n;
63}
64
37struct highmem_page { 65struct highmem_page {
38 char *data; 66 char *data;
39 struct page *page; 67 struct page *page;
@@ -149,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn)
149 BUG_ON(PageReserved(page) && PageNosave(page)); 177 BUG_ON(PageReserved(page) && PageNosave(page));
150 if (PageNosave(page)) 178 if (PageNosave(page))
151 return 0; 179 return 0;
152 if (PageReserved(page) && pfn_is_nosave(pfn)) { 180 if (PageReserved(page) && pfn_is_nosave(pfn))
153 pr_debug("[nosave pfn 0x%lx]", pfn);
154 return 0; 181 return 0;
155 }
156 if (PageNosaveFree(page)) 182 if (PageNosaveFree(page))
157 return 0; 183 return 0;
158 184
159 return 1; 185 return 1;
160} 186}
161 187
162static unsigned count_data_pages(void) 188unsigned int count_data_pages(void)
163{ 189{
164 struct zone *zone; 190 struct zone *zone;
165 unsigned long zone_pfn; 191 unsigned long zone_pfn;
@@ -244,7 +270,7 @@ static inline void fill_pb_page(struct pbe *pbpage)
244 * of memory pages allocated with alloc_pagedir() 270 * of memory pages allocated with alloc_pagedir()
245 */ 271 */
246 272
247void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) 273static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
248{ 274{
249 struct pbe *pbpage, *p; 275 struct pbe *pbpage, *p;
250 unsigned int num = PBES_PER_PAGE; 276 unsigned int num = PBES_PER_PAGE;
@@ -261,7 +287,35 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
261 p->next = p + 1; 287 p->next = p + 1;
262 p->next = NULL; 288 p->next = NULL;
263 } 289 }
264 pr_debug("create_pbe_list(): initialized %d PBEs\n", num); 290}
291
292/**
293 * On resume it is necessary to trace and eventually free the unsafe
294 * pages that have been allocated, because they are needed for I/O
295 * (on x86-64 we likely will "eat" these pages once again while
296 * creating the temporary page translation tables)
297 */
298
299struct eaten_page {
300 struct eaten_page *next;
301 char padding[PAGE_SIZE - sizeof(void *)];
302};
303
304static struct eaten_page *eaten_pages = NULL;
305
306void release_eaten_pages(void)
307{
308 struct eaten_page *p, *q;
309
310 p = eaten_pages;
311 while (p) {
312 q = p->next;
313 /* We don't want swsusp_free() to free this page again */
314 ClearPageNosave(virt_to_page(p));
315 free_page((unsigned long)p);
316 p = q;
317 }
318 eaten_pages = NULL;
265} 319}
266 320
267/** 321/**
@@ -282,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
282 if (safe_needed) 336 if (safe_needed)
283 do { 337 do {
284 res = (void *)get_zeroed_page(gfp_mask); 338 res = (void *)get_zeroed_page(gfp_mask);
285 if (res && PageNosaveFree(virt_to_page(res))) 339 if (res && PageNosaveFree(virt_to_page(res))) {
286 /* This is for swsusp_free() */ 340 /* This is for swsusp_free() */
287 SetPageNosave(virt_to_page(res)); 341 SetPageNosave(virt_to_page(res));
342 ((struct eaten_page *)res)->next = eaten_pages;
343 eaten_pages = res;
344 }
288 } while (res && PageNosaveFree(virt_to_page(res))); 345 } while (res && PageNosaveFree(virt_to_page(res)));
289 else 346 else
290 res = (void *)get_zeroed_page(gfp_mask); 347 res = (void *)get_zeroed_page(gfp_mask);
@@ -332,7 +389,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
332 if (!pbe) { /* get_zeroed_page() failed */ 389 if (!pbe) { /* get_zeroed_page() failed */
333 free_pagedir(pblist); 390 free_pagedir(pblist);
334 pblist = NULL; 391 pblist = NULL;
335 } 392 } else
393 create_pbe_list(pblist, nr_pages);
336 return pblist; 394 return pblist;
337} 395}
338 396
@@ -370,8 +428,14 @@ void swsusp_free(void)
370 428
371static int enough_free_mem(unsigned int nr_pages) 429static int enough_free_mem(unsigned int nr_pages)
372{ 430{
373 pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); 431 struct zone *zone;
374 return nr_free_pages() > (nr_pages + PAGES_FOR_IO + 432 unsigned int n = 0;
433
434 for_each_zone (zone)
435 if (!is_highmem(zone))
436 n += zone->free_pages;
437 pr_debug("swsusp: available memory: %u pages\n", n);
438 return n > (nr_pages + PAGES_FOR_IO +
375 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 439 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
376} 440}
377 441
@@ -395,7 +459,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages)
395 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 459 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
396 return NULL; 460 return NULL;
397 } 461 }
398 create_pbe_list(pblist, nr_pages);
399 462
400 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { 463 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
401 printk(KERN_ERR "suspend: Allocating image pages failed.\n"); 464 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
@@ -421,10 +484,6 @@ asmlinkage int swsusp_save(void)
421 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, 484 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
422 PAGES_FOR_IO, nr_free_pages()); 485 PAGES_FOR_IO, nr_free_pages());
423 486
424 /* This is needed because of the fixed size of swsusp_info */
425 if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
426 return -ENOSPC;
427
428 if (!enough_free_mem(nr_pages)) { 487 if (!enough_free_mem(nr_pages)) {
429 printk(KERN_ERR "swsusp: Not enough free memory\n"); 488 printk(KERN_ERR "swsusp: Not enough free memory\n");
430 return -ENOMEM; 489 return -ENOMEM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c05f46e7348f..55a18d26abed 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -30,8 +30,8 @@
30 * Alex Badea <vampire@go.ro>: 30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init 31 * Fixed runaway init
32 * 32 *
33 * Andreas Steinmetz <ast@domdv.de>: 33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Added encrypted suspend option 34 * Added the swap map data structure and reworked the handling of swap
35 * 35 *
36 * More state savers are welcome. Especially for the scsi layer... 36 * More state savers are welcome. Especially for the scsi layer...
37 * 37 *
@@ -67,44 +67,33 @@
67#include <asm/tlbflush.h> 67#include <asm/tlbflush.h>
68#include <asm/io.h> 68#include <asm/io.h>
69 69
70#include <linux/random.h>
71#include <linux/crypto.h>
72#include <asm/scatterlist.h>
73
74#include "power.h" 70#include "power.h"
75 71
72/*
73 * Preferred image size in MB (tunable via /sys/power/image_size).
74 * When it is set to N, swsusp will do its best to ensure the image
75 * size will not exceed N MB, but if that is impossible, it will
76 * try to create the smallest image possible.
77 */
78unsigned int image_size = 500;
79
76#ifdef CONFIG_HIGHMEM 80#ifdef CONFIG_HIGHMEM
81unsigned int count_highmem_pages(void);
77int save_highmem(void); 82int save_highmem(void);
78int restore_highmem(void); 83int restore_highmem(void);
79#else 84#else
80static int save_highmem(void) { return 0; } 85static int save_highmem(void) { return 0; }
81static int restore_highmem(void) { return 0; } 86static int restore_highmem(void) { return 0; }
87static unsigned int count_highmem_pages(void) { return 0; }
82#endif 88#endif
83 89
84#define CIPHER "aes"
85#define MAXKEY 32
86#define MAXIV 32
87
88extern char resume_file[]; 90extern char resume_file[];
89 91
90/* Local variables that should not be affected by save */
91unsigned int nr_copy_pages __nosavedata = 0;
92
93/* Suspend pagedir is allocated before final copy, therefore it
94 must be freed after resume
95
96 Warning: this is even more evil than it seems. Pagedirs this file
97 talks about are completely different from page directories used by
98 MMU hardware.
99 */
100suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
101
102#define SWSUSP_SIG "S1SUSPEND" 92#define SWSUSP_SIG "S1SUSPEND"
103 93
104static struct swsusp_header { 94static struct swsusp_header {
105 char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; 95 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
106 u8 key_iv[MAXKEY+MAXIV]; 96 swp_entry_t image;
107 swp_entry_t swsusp_info;
108 char orig_sig[10]; 97 char orig_sig[10];
109 char sig[10]; 98 char sig[10];
110} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 99} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
@@ -115,140 +104,9 @@ static struct swsusp_info swsusp_info;
115 * Saving part... 104 * Saving part...
116 */ 105 */
117 106
118/* We memorize in swapfile_used what swap devices are used for suspension */ 107static unsigned short root_swap = 0xffff;
119#define SWAPFILE_UNUSED 0
120#define SWAPFILE_SUSPEND 1 /* This is the suspending device */
121#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */
122
123static unsigned short swapfile_used[MAX_SWAPFILES];
124static unsigned short root_swap;
125
126static int write_page(unsigned long addr, swp_entry_t *loc);
127static int bio_read_page(pgoff_t page_off, void *page);
128
129static u8 key_iv[MAXKEY+MAXIV];
130
131#ifdef CONFIG_SWSUSP_ENCRYPT
132
133static int crypto_init(int mode, void **mem)
134{
135 int error = 0;
136 int len;
137 char *modemsg;
138 struct crypto_tfm *tfm;
139
140 modemsg = mode ? "suspend not possible" : "resume not possible";
141
142 tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
143 if(!tfm) {
144 printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
145 error = -EINVAL;
146 goto out;
147 }
148
149 if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
150 printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
151 error = -ENOKEY;
152 goto fail;
153 }
154
155 if (mode)
156 get_random_bytes(key_iv, MAXKEY+MAXIV);
157
158 len = crypto_tfm_alg_max_keysize(tfm);
159 if (len > MAXKEY)
160 len = MAXKEY;
161
162 if (crypto_cipher_setkey(tfm, key_iv, len)) {
163 printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
164 error = -EKEYREJECTED;
165 goto fail;
166 }
167
168 len = crypto_tfm_alg_ivsize(tfm);
169
170 if (MAXIV < len) {
171 printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
172 error = -EOVERFLOW;
173 goto fail;
174 }
175
176 crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
177
178 *mem=(void *)tfm;
179
180 goto out;
181
182fail: crypto_free_tfm(tfm);
183out: return error;
184}
185
186static __inline__ void crypto_exit(void *mem)
187{
188 crypto_free_tfm((struct crypto_tfm *)mem);
189}
190
191static __inline__ int crypto_write(struct pbe *p, void *mem)
192{
193 int error = 0;
194 struct scatterlist src, dst;
195
196 src.page = virt_to_page(p->address);
197 src.offset = 0;
198 src.length = PAGE_SIZE;
199 dst.page = virt_to_page((void *)&swsusp_header);
200 dst.offset = 0;
201 dst.length = PAGE_SIZE;
202
203 error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
204 PAGE_SIZE);
205
206 if (!error)
207 error = write_page((unsigned long)&swsusp_header,
208 &(p->swap_address));
209 return error;
210}
211
212static __inline__ int crypto_read(struct pbe *p, void *mem)
213{
214 int error = 0;
215 struct scatterlist src, dst;
216
217 error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
218 if (!error) {
219 src.offset = 0;
220 src.length = PAGE_SIZE;
221 dst.offset = 0;
222 dst.length = PAGE_SIZE;
223 src.page = dst.page = virt_to_page((void *)p->address);
224
225 error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
226 &src, PAGE_SIZE);
227 }
228 return error;
229}
230#else
231static __inline__ int crypto_init(int mode, void *mem)
232{
233 return 0;
234}
235
236static __inline__ void crypto_exit(void *mem)
237{
238}
239
240static __inline__ int crypto_write(struct pbe *p, void *mem)
241{
242 return write_page(p->address, &(p->swap_address));
243}
244 108
245static __inline__ int crypto_read(struct pbe *p, void *mem) 109static int mark_swapfiles(swp_entry_t start)
246{
247 return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
248}
249#endif
250
251static int mark_swapfiles(swp_entry_t prev)
252{ 110{
253 int error; 111 int error;
254 112
@@ -259,8 +117,7 @@ static int mark_swapfiles(swp_entry_t prev)
259 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 117 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
260 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 118 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
261 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 119 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
262 memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); 120 swsusp_header.image = start;
263 swsusp_header.swsusp_info = prev;
264 error = rw_swap_page_sync(WRITE, 121 error = rw_swap_page_sync(WRITE,
265 swp_entry(root_swap, 0), 122 swp_entry(root_swap, 0),
266 virt_to_page((unsigned long) 123 virt_to_page((unsigned long)
@@ -283,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
283 * devfs, since the resume code can only recognize the form /dev/hda4, 140 * devfs, since the resume code can only recognize the form /dev/hda4,
284 * but the suspend code would see the long name.) 141 * but the suspend code would see the long name.)
285 */ 142 */
286static int is_resume_device(const struct swap_info_struct *swap_info) 143static inline int is_resume_device(const struct swap_info_struct *swap_info)
287{ 144{
288 struct file *file = swap_info->swap_file; 145 struct file *file = swap_info->swap_file;
289 struct inode *inode = file->f_dentry->d_inode; 146 struct inode *inode = file->f_dentry->d_inode;
@@ -294,54 +151,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
294 151
295static int swsusp_swap_check(void) /* This is called before saving image */ 152static int swsusp_swap_check(void) /* This is called before saving image */
296{ 153{
297 int i, len;
298
299 len=strlen(resume_file);
300 root_swap = 0xFFFF;
301
302 spin_lock(&swap_lock);
303 for (i=0; i<MAX_SWAPFILES; i++) {
304 if (!(swap_info[i].flags & SWP_WRITEOK)) {
305 swapfile_used[i]=SWAPFILE_UNUSED;
306 } else {
307 if (!len) {
308 printk(KERN_WARNING "resume= option should be used to set suspend device" );
309 if (root_swap == 0xFFFF) {
310 swapfile_used[i] = SWAPFILE_SUSPEND;
311 root_swap = i;
312 } else
313 swapfile_used[i] = SWAPFILE_IGNORED;
314 } else {
315 /* we ignore all swap devices that are not the resume_file */
316 if (is_resume_device(&swap_info[i])) {
317 swapfile_used[i] = SWAPFILE_SUSPEND;
318 root_swap = i;
319 } else {
320 swapfile_used[i] = SWAPFILE_IGNORED;
321 }
322 }
323 }
324 }
325 spin_unlock(&swap_lock);
326 return (root_swap != 0xffff) ? 0 : -ENODEV;
327}
328
329/**
330 * This is called after saving image so modification
331 * will be lost after resume... and that's what we want.
332 * we make the device unusable. A new call to
333 * lock_swapdevices can unlock the devices.
334 */
335static void lock_swapdevices(void)
336{
337 int i; 154 int i;
338 155
156 if (!swsusp_resume_device)
157 return -ENODEV;
339 spin_lock(&swap_lock); 158 spin_lock(&swap_lock);
340 for (i = 0; i< MAX_SWAPFILES; i++) 159 for (i = 0; i < MAX_SWAPFILES; i++) {
341 if (swapfile_used[i] == SWAPFILE_IGNORED) { 160 if (!(swap_info[i].flags & SWP_WRITEOK))
342 swap_info[i].flags ^= SWP_WRITEOK; 161 continue;
162 if (is_resume_device(swap_info + i)) {
163 spin_unlock(&swap_lock);
164 root_swap = i;
165 return 0;
343 } 166 }
167 }
344 spin_unlock(&swap_lock); 168 spin_unlock(&swap_lock);
169 return -ENODEV;
345} 170}
346 171
347/** 172/**
@@ -359,72 +184,217 @@ static void lock_swapdevices(void)
359static int write_page(unsigned long addr, swp_entry_t *loc) 184static int write_page(unsigned long addr, swp_entry_t *loc)
360{ 185{
361 swp_entry_t entry; 186 swp_entry_t entry;
362 int error = 0; 187 int error = -ENOSPC;
363 188
364 entry = get_swap_page(); 189 entry = get_swap_page_of_type(root_swap);
365 if (swp_offset(entry) && 190 if (swp_offset(entry)) {
366 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { 191 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
367 error = rw_swap_page_sync(WRITE, entry, 192 if (!error || error == -EIO)
368 virt_to_page(addr));
369 if (error == -EIO)
370 error = 0;
371 if (!error)
372 *loc = entry; 193 *loc = entry;
373 } else 194 }
374 error = -ENOSPC;
375 return error; 195 return error;
376} 196}
377 197
378/** 198/**
379 * data_free - Free the swap entries used by the saved image. 199 * Swap map-handling functions
200 *
201 * The swap map is a data structure used for keeping track of each page
202 * written to the swap. It consists of many swap_map_page structures
203 * that contain each an array of MAP_PAGE_SIZE swap entries.
204 * These structures are linked together with the help of either the
205 * .next (in memory) or the .next_swap (in swap) member.
380 * 206 *
381 * Walk the list of used swap entries and free each one. 207 * The swap map is created during suspend. At that time we need to keep
382 * This is only used for cleanup when suspend fails. 208 * it in memory, because we have to free all of the allocated swap
209 * entries if an error occurs. The memory needed is preallocated
210 * so that we know in advance if there's enough of it.
211 *
212 * The first swap_map_page structure is filled with the swap entries that
213 * correspond to the first MAP_PAGE_SIZE data pages written to swap and
214 * so on. After the all of the data pages have been written, the order
215 * of the swap_map_page structures in the map is reversed so that they
216 * can be read from swap in the original order. This causes the data
217 * pages to be loaded in exactly the same order in which they have been
218 * saved.
219 *
220 * During resume we only need to use one swap_map_page structure
221 * at a time, which means that we only need to use two memory pages for
222 * reading the image - one for reading the swap_map_page structures
223 * and the second for reading the data pages from swap.
383 */ 224 */
384static void data_free(void) 225
226#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
227 / sizeof(swp_entry_t))
228
229struct swap_map_page {
230 swp_entry_t entries[MAP_PAGE_SIZE];
231 swp_entry_t next_swap;
232 struct swap_map_page *next;
233};
234
235static inline void free_swap_map(struct swap_map_page *swap_map)
385{ 236{
386 swp_entry_t entry; 237 struct swap_map_page *swp;
387 struct pbe *p;
388 238
389 for_each_pbe (p, pagedir_nosave) { 239 while (swap_map) {
390 entry = p->swap_address; 240 swp = swap_map->next;
391 if (entry.val) 241 free_page((unsigned long)swap_map);
392 swap_free(entry); 242 swap_map = swp;
393 else
394 break;
395 } 243 }
396} 244}
397 245
246static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
247{
248 struct swap_map_page *swap_map, *swp;
249 unsigned n = 0;
250
251 if (!nr_pages)
252 return NULL;
253
254 pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
255 swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
256 swp = swap_map;
257 for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
258 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
259 swp = swp->next;
260 if (!swp) {
261 free_swap_map(swap_map);
262 return NULL;
263 }
264 }
265 return swap_map;
266}
267
398/** 268/**
399 * data_write - Write saved image to swap. 269 * reverse_swap_map - reverse the order of pages in the swap map
400 * 270 * @swap_map
401 * Walk the list of pages in the image and sync each one to swap.
402 */ 271 */
403static int data_write(void) 272
273static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
404{ 274{
405 int error = 0, i = 0; 275 struct swap_map_page *prev, *next;
406 unsigned int mod = nr_copy_pages / 100; 276
407 struct pbe *p; 277 prev = NULL;
408 void *tfm; 278 while (swap_map) {
279 next = swap_map->next;
280 swap_map->next = prev;
281 prev = swap_map;
282 swap_map = next;
283 }
284 return prev;
285}
409 286
410 if ((error = crypto_init(1, &tfm))) 287/**
411 return error; 288 * free_swap_map_entries - free the swap entries allocated to store
289 * the swap map @swap_map (this is only called in case of an error)
290 */
291static inline void free_swap_map_entries(struct swap_map_page *swap_map)
292{
293 while (swap_map) {
294 if (swap_map->next_swap.val)
295 swap_free(swap_map->next_swap);
296 swap_map = swap_map->next;
297 }
298}
412 299
413 if (!mod) 300/**
414 mod = 1; 301 * save_swap_map - save the swap map used for tracing the data pages
302 * stored in the swap
303 */
415 304
416 printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); 305static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
417 for_each_pbe (p, pagedir_nosave) { 306{
418 if (!(i%mod)) 307 swp_entry_t entry = (swp_entry_t){0};
419 printk( "\b\b\b\b%3d%%", i / mod ); 308 int error;
420 if ((error = crypto_write(p, tfm))) { 309
421 crypto_exit(tfm); 310 while (swap_map) {
311 swap_map->next_swap = entry;
312 if ((error = write_page((unsigned long)swap_map, &entry)))
422 return error; 313 return error;
423 } 314 swap_map = swap_map->next;
424 i++;
425 } 315 }
426 printk("\b\b\b\bdone\n"); 316 *start = entry;
427 crypto_exit(tfm); 317 return 0;
318}
319
320/**
321 * free_image_entries - free the swap entries allocated to store
322 * the image data pages (this is only called in case of an error)
323 */
324
325static inline void free_image_entries(struct swap_map_page *swp)
326{
327 unsigned k;
328
329 while (swp) {
330 for (k = 0; k < MAP_PAGE_SIZE; k++)
331 if (swp->entries[k].val)
332 swap_free(swp->entries[k]);
333 swp = swp->next;
334 }
335}
336
337/**
338 * The swap_map_handle structure is used for handling the swap map in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 unsigned int k;
345};
346
347static inline void init_swap_map_handle(struct swap_map_handle *handle,
348 struct swap_map_page *map)
349{
350 handle->cur = map;
351 handle->k = 0;
352}
353
354static inline int swap_map_write_page(struct swap_map_handle *handle,
355 unsigned long addr)
356{
357 int error;
358
359 error = write_page(addr, handle->cur->entries + handle->k);
360 if (error)
361 return error;
362 if (++handle->k >= MAP_PAGE_SIZE) {
363 handle->cur = handle->cur->next;
364 handle->k = 0;
365 }
366 return 0;
367}
368
369/**
370 * save_image_data - save the data pages pointed to by the PBEs
371 * from the list @pblist using the swap map handle @handle
372 * (assume there are @nr_pages data pages to save)
373 */
374
375static int save_image_data(struct pbe *pblist,
376 struct swap_map_handle *handle,
377 unsigned int nr_pages)
378{
379 unsigned int m;
380 struct pbe *p;
381 int error = 0;
382
383 printk("Saving image data pages (%u pages) ... ", nr_pages);
384 m = nr_pages / 100;
385 if (!m)
386 m = 1;
387 nr_pages = 0;
388 for_each_pbe (p, pblist) {
389 error = swap_map_write_page(handle, p->address);
390 if (error)
391 break;
392 if (!(nr_pages % m))
393 printk("\b\b\b\b%3d%%", nr_pages / m);
394 nr_pages++;
395 }
396 if (!error)
397 printk("\b\b\b\bdone\n");
428 return error; 398 return error;
429} 399}
430 400
@@ -440,70 +410,70 @@ static void dump_info(void)
440 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); 410 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
441 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); 411 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
442 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); 412 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
443 pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); 413 pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
444} 414}
445 415
446static void init_header(void) 416static void init_header(unsigned int nr_pages)
447{ 417{
448 memset(&swsusp_info, 0, sizeof(swsusp_info)); 418 memset(&swsusp_info, 0, sizeof(swsusp_info));
449 swsusp_info.version_code = LINUX_VERSION_CODE; 419 swsusp_info.version_code = LINUX_VERSION_CODE;
450 swsusp_info.num_physpages = num_physpages; 420 swsusp_info.num_physpages = num_physpages;
451 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); 421 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
452 422
453 swsusp_info.suspend_pagedir = pagedir_nosave;
454 swsusp_info.cpus = num_online_cpus(); 423 swsusp_info.cpus = num_online_cpus();
455 swsusp_info.image_pages = nr_copy_pages; 424 swsusp_info.image_pages = nr_pages;
456} 425 swsusp_info.pages = nr_pages +
457 426 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
458static int close_swap(void)
459{
460 swp_entry_t entry;
461 int error;
462
463 dump_info();
464 error = write_page((unsigned long)&swsusp_info, &entry);
465 if (!error) {
466 printk( "S" );
467 error = mark_swapfiles(entry);
468 printk( "|\n" );
469 }
470 return error;
471} 427}
472 428
473/** 429/**
474 * free_pagedir_entries - Free pages used by the page directory. 430 * pack_orig_addresses - the .orig_address fields of the PBEs from the
475 * 431 * list starting at @pbe are stored in the array @buf[] (1 page)
476 * This is used during suspend for error recovery.
477 */ 432 */
478 433
479static void free_pagedir_entries(void) 434static inline struct pbe *pack_orig_addresses(unsigned long *buf,
435 struct pbe *pbe)
480{ 436{
481 int i; 437 int j;
482 438
483 for (i = 0; i < swsusp_info.pagedir_pages; i++) 439 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
484 swap_free(swsusp_info.pagedir[i]); 440 buf[j] = pbe->orig_address;
441 pbe = pbe->next;
442 }
443 if (!pbe)
444 for (; j < PAGE_SIZE / sizeof(long); j++)
445 buf[j] = 0;
446 return pbe;
485} 447}
486 448
487
488/** 449/**
489 * write_pagedir - Write the array of pages holding the page directory. 450 * save_image_metadata - save the .orig_address fields of the PBEs
490 * @last: Last swap entry we write (needed for header). 451 * from the list @pblist using the swap map handle @handle
491 */ 452 */
492 453
493static int write_pagedir(void) 454static int save_image_metadata(struct pbe *pblist,
455 struct swap_map_handle *handle)
494{ 456{
495 int error = 0; 457 unsigned long *buf;
496 unsigned int n = 0; 458 unsigned int n = 0;
497 struct pbe *pbe; 459 struct pbe *p;
460 int error = 0;
498 461
499 printk( "Writing pagedir..."); 462 printk("Saving image metadata ... ");
500 for_each_pb_page (pbe, pagedir_nosave) { 463 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
501 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) 464 if (!buf)
502 return error; 465 return -ENOMEM;
466 p = pblist;
467 while (p) {
468 p = pack_orig_addresses(buf, p);
469 error = swap_map_write_page(handle, (unsigned long)buf);
470 if (error)
471 break;
472 n++;
503 } 473 }
504 474 free_page((unsigned long)buf);
505 swsusp_info.pagedir_pages = n; 475 if (!error)
506 printk("done (%u pages)\n", n); 476 printk("done (%u pages saved)\n", n);
507 return error; 477 return error;
508} 478}
509 479
@@ -511,75 +481,125 @@ static int write_pagedir(void)
511 * enough_swap - Make sure we have enough swap to save the image. 481 * enough_swap - Make sure we have enough swap to save the image.
512 * 482 *
513 * Returns TRUE or FALSE after checking the total amount of swap 483 * Returns TRUE or FALSE after checking the total amount of swap
514 * space avaiable. 484 * space avaiable from the resume partition.
515 *
516 * FIXME: si_swapinfo(&i) returns all swap devices information.
517 * We should only consider resume_device.
518 */ 485 */
519 486
520static int enough_swap(unsigned int nr_pages) 487static int enough_swap(unsigned int nr_pages)
521{ 488{
522 struct sysinfo i; 489 unsigned int free_swap = swap_info[root_swap].pages -
490 swap_info[root_swap].inuse_pages;
523 491
524 si_swapinfo(&i); 492 pr_debug("swsusp: free swap pages: %u\n", free_swap);
525 pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); 493 return free_swap > (nr_pages + PAGES_FOR_IO +
526 return i.freeswap > (nr_pages + PAGES_FOR_IO +
527 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 494 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
528} 495}
529 496
530/** 497/**
531 * write_suspend_image - Write entire image and metadata. 498 * swsusp_write - Write entire image and metadata.
532 * 499 *
500 * It is important _NOT_ to umount filesystems at this point. We want
501 * them synced (in case something goes wrong) but we DO not want to mark
502 * filesystem clean: it is not. (And it does not matter, if we resume
503 * correctly, we'll mark system clean, anyway.)
533 */ 504 */
534static int write_suspend_image(void) 505
506int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
535{ 507{
508 struct swap_map_page *swap_map;
509 struct swap_map_handle handle;
510 swp_entry_t start;
536 int error; 511 int error;
537 512
538 if (!enough_swap(nr_copy_pages)) { 513 if ((error = swsusp_swap_check())) {
514 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
515 return error;
516 }
517 if (!enough_swap(nr_pages)) {
539 printk(KERN_ERR "swsusp: Not enough free swap\n"); 518 printk(KERN_ERR "swsusp: Not enough free swap\n");
540 return -ENOSPC; 519 return -ENOSPC;
541 } 520 }
542 521
543 init_header(); 522 init_header(nr_pages);
544 if ((error = data_write())) 523 swap_map = alloc_swap_map(swsusp_info.pages);
545 goto FreeData; 524 if (!swap_map)
525 return -ENOMEM;
526 init_swap_map_handle(&handle, swap_map);
527
528 error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
529 if (!error)
530 error = save_image_metadata(pblist, &handle);
531 if (!error)
532 error = save_image_data(pblist, &handle, nr_pages);
533 if (error)
534 goto Free_image_entries;
546 535
547 if ((error = write_pagedir())) 536 swap_map = reverse_swap_map(swap_map);
548 goto FreePagedir; 537 error = save_swap_map(swap_map, &start);
538 if (error)
539 goto Free_map_entries;
549 540
550 if ((error = close_swap())) 541 dump_info();
551 goto FreePagedir; 542 printk( "S" );
552 Done: 543 error = mark_swapfiles(start);
553 memset(key_iv, 0, MAXKEY+MAXIV); 544 printk( "|\n" );
545 if (error)
546 goto Free_map_entries;
547
548Free_swap_map:
549 free_swap_map(swap_map);
554 return error; 550 return error;
555 FreePagedir: 551
556 free_pagedir_entries(); 552Free_map_entries:
557 FreeData: 553 free_swap_map_entries(swap_map);
558 data_free(); 554Free_image_entries:
559 goto Done; 555 free_image_entries(swap_map);
556 goto Free_swap_map;
560} 557}
561 558
562/* It is important _NOT_ to umount filesystems at this point. We want 559/**
563 * them synced (in case something goes wrong) but we DO not want to mark 560 * swsusp_shrink_memory - Try to free as much memory as needed
564 * filesystem clean: it is not. (And it does not matter, if we resume 561 *
565 * correctly, we'll mark system clean, anyway.) 562 * ... but do not OOM-kill anyone
563 *
564 * Notice: all userland should be stopped before it is called, or
565 * livelock is possible.
566 */ 566 */
567int swsusp_write(void)
568{
569 int error;
570 567
571 if ((error = swsusp_swap_check())) { 568#define SHRINK_BITE 10000
572 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
573 return error;
574 }
575 lock_swapdevices();
576 error = write_suspend_image();
577 /* This will unlock ignored swap devices since writing is finished */
578 lock_swapdevices();
579 return error;
580}
581 569
570int swsusp_shrink_memory(void)
571{
572 long size, tmp;
573 struct zone *zone;
574 unsigned long pages = 0;
575 unsigned int i = 0;
576 char *p = "-\\|/";
577
578 printk("Shrinking memory... ");
579 do {
580 size = 2 * count_highmem_pages();
581 size += size / 50 + count_data_pages();
582 size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
583 PAGES_FOR_IO;
584 tmp = size;
585 for_each_zone (zone)
586 if (!is_highmem(zone))
587 tmp -= zone->free_pages;
588 if (tmp > 0) {
589 tmp = shrink_all_memory(SHRINK_BITE);
590 if (!tmp)
591 return -ENOMEM;
592 pages += tmp;
593 } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) {
594 tmp = shrink_all_memory(SHRINK_BITE);
595 pages += tmp;
596 }
597 printk("\b%c", p[i++%4]);
598 } while (tmp > 0);
599 printk("\bdone (%lu pages freed)\n", pages);
582 600
601 return 0;
602}
583 603
584int swsusp_suspend(void) 604int swsusp_suspend(void)
585{ 605{
@@ -677,7 +697,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
677 /* We assume both lists contain the same number of elements */ 697 /* We assume both lists contain the same number of elements */
678 while (src) { 698 while (src) {
679 dst->orig_address = src->orig_address; 699 dst->orig_address = src->orig_address;
680 dst->swap_address = src->swap_address;
681 dst = dst->next; 700 dst = dst->next;
682 src = src->next; 701 src = src->next;
683 } 702 }
@@ -757,198 +776,224 @@ static int bio_write_page(pgoff_t page_off, void *page)
757 return submit(WRITE, page_off, page); 776 return submit(WRITE, page_off, page);
758} 777}
759 778
760/* 779/**
761 * Sanity check if this image makes sense with this kernel/swap context 780 * The following functions allow us to read data using a swap map
762 * I really don't think that it's foolproof but more than nothing.. 781 * in a file-alike way
763 */ 782 */
764 783
765static const char *sanity_check(void) 784static inline void release_swap_map_reader(struct swap_map_handle *handle)
766{ 785{
767 dump_info(); 786 if (handle->cur)
768 if (swsusp_info.version_code != LINUX_VERSION_CODE) 787 free_page((unsigned long)handle->cur);
769 return "kernel version"; 788 handle->cur = NULL;
770 if (swsusp_info.num_physpages != num_physpages)
771 return "memory size";
772 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
773 return "system type";
774 if (strcmp(swsusp_info.uts.release,system_utsname.release))
775 return "kernel release";
776 if (strcmp(swsusp_info.uts.version,system_utsname.version))
777 return "version";
778 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
779 return "machine";
780#if 0
781 /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
782 if (swsusp_info.cpus != num_possible_cpus())
783 return "number of cpus";
784#endif
785 return NULL;
786} 789}
787 790
788 791static inline int get_swap_map_reader(struct swap_map_handle *handle,
789static int check_header(void) 792 swp_entry_t start)
790{ 793{
791 const char *reason = NULL;
792 int error; 794 int error;
793 795
794 if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) 796 if (!swp_offset(start))
797 return -EINVAL;
798 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
799 if (!handle->cur)
800 return -ENOMEM;
801 error = bio_read_page(swp_offset(start), handle->cur);
802 if (error) {
803 release_swap_map_reader(handle);
795 return error; 804 return error;
796
797 /* Is this same machine? */
798 if ((reason = sanity_check())) {
799 printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
800 return -EPERM;
801 } 805 }
802 nr_copy_pages = swsusp_info.image_pages; 806 handle->k = 0;
803 return error; 807 return 0;
804} 808}
805 809
806static int check_sig(void) 810static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
807{ 811{
812 unsigned long offset;
808 int error; 813 int error;
809 814
810 memset(&swsusp_header, 0, sizeof(swsusp_header)); 815 if (!handle->cur)
811 if ((error = bio_read_page(0, &swsusp_header))) 816 return -EINVAL;
812 return error; 817 offset = swp_offset(handle->cur->entries[handle->k]);
813 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 818 if (!offset)
814 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
815 memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
816 memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
817
818 /*
819 * Reset swap signature now.
820 */
821 error = bio_write_page(0, &swsusp_header);
822 } else {
823 return -EINVAL; 819 return -EINVAL;
820 error = bio_read_page(offset, buf);
821 if (error)
822 return error;
823 if (++handle->k >= MAP_PAGE_SIZE) {
824 handle->k = 0;
825 offset = swp_offset(handle->cur->next_swap);
826 if (!offset)
827 release_swap_map_reader(handle);
828 else
829 error = bio_read_page(offset, handle->cur);
824 } 830 }
825 if (!error)
826 pr_debug("swsusp: Signature found, resuming\n");
827 return error; 831 return error;
828} 832}
829 833
830/** 834static int check_header(void)
831 * data_read - Read image pages from swap.
832 *
833 * You do not need to check for overlaps, check_pagedir()
834 * already did that.
835 */
836
837static int data_read(struct pbe *pblist)
838{ 835{
839 struct pbe *p; 836 char *reason = NULL;
840 int error = 0;
841 int i = 0;
842 int mod = swsusp_info.image_pages / 100;
843 void *tfm;
844
845 if ((error = crypto_init(0, &tfm)))
846 return error;
847
848 if (!mod)
849 mod = 1;
850
851 printk("swsusp: Reading image data (%lu pages): ",
852 swsusp_info.image_pages);
853
854 for_each_pbe (p, pblist) {
855 if (!(i % mod))
856 printk("\b\b\b\b%3d%%", i / mod);
857 837
858 if ((error = crypto_read(p, tfm))) { 838 dump_info();
859 crypto_exit(tfm); 839 if (swsusp_info.version_code != LINUX_VERSION_CODE)
860 return error; 840 reason = "kernel version";
861 } 841 if (swsusp_info.num_physpages != num_physpages)
862 842 reason = "memory size";
863 i++; 843 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
844 reason = "system type";
845 if (strcmp(swsusp_info.uts.release,system_utsname.release))
846 reason = "kernel release";
847 if (strcmp(swsusp_info.uts.version,system_utsname.version))
848 reason = "version";
849 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
850 reason = "machine";
851 if (reason) {
852 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
853 return -EPERM;
864 } 854 }
865 printk("\b\b\b\bdone\n"); 855 return 0;
866 crypto_exit(tfm);
867 return error;
868} 856}
869 857
870/** 858/**
871 * read_pagedir - Read page backup list pages from swap 859 * load_image_data - load the image data using the swap map handle
860 * @handle and store them using the page backup list @pblist
861 * (assume there are @nr_pages pages to load)
872 */ 862 */
873 863
874static int read_pagedir(struct pbe *pblist) 864static int load_image_data(struct pbe *pblist,
865 struct swap_map_handle *handle,
866 unsigned int nr_pages)
875{ 867{
876 struct pbe *pbpage, *p;
877 unsigned int i = 0;
878 int error; 868 int error;
869 unsigned int m;
870 struct pbe *p;
879 871
880 if (!pblist) 872 if (!pblist)
881 return -EFAULT; 873 return -EINVAL;
882 874 printk("Loading image data pages (%u pages) ... ", nr_pages);
883 printk("swsusp: Reading pagedir (%lu pages)\n", 875 m = nr_pages / 100;
884 swsusp_info.pagedir_pages); 876 if (!m)
885 877 m = 1;
886 for_each_pb_page (pbpage, pblist) { 878 nr_pages = 0;
887 unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); 879 p = pblist;
888 880 while (p) {
889 error = -EFAULT; 881 error = swap_map_read_page(handle, (void *)p->address);
890 if (offset) {
891 p = (pbpage + PB_PAGE_SKIP)->next;
892 error = bio_read_page(offset, (void *)pbpage);
893 (pbpage + PB_PAGE_SKIP)->next = p;
894 }
895 if (error) 882 if (error)
896 break; 883 break;
884 p = p->next;
885 if (!(nr_pages % m))
886 printk("\b\b\b\b%3d%%", nr_pages / m);
887 nr_pages++;
897 } 888 }
898
899 if (!error) 889 if (!error)
900 BUG_ON(i != swsusp_info.pagedir_pages); 890 printk("\b\b\b\bdone\n");
901
902 return error; 891 return error;
903} 892}
904 893
894/**
895 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
896 * the PBEs in the list starting at @pbe
897 */
905 898
906static int check_suspend_image(void) 899static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
900 struct pbe *pbe)
907{ 901{
908 int error = 0; 902 int j;
909 903
910 if ((error = check_sig())) 904 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
911 return error; 905 pbe->orig_address = buf[j];
912 906 pbe = pbe->next;
913 if ((error = check_header())) 907 }
914 return error; 908 return pbe;
915
916 return 0;
917} 909}
918 910
919static int read_suspend_image(void) 911/**
912 * load_image_metadata - load the image metadata using the swap map
913 * handle @handle and put them into the PBEs in the list @pblist
914 */
915
916static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
920{ 917{
921 int error = 0;
922 struct pbe *p; 918 struct pbe *p;
919 unsigned long *buf;
920 unsigned int n = 0;
921 int error = 0;
923 922
924 if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) 923 printk("Loading image metadata ... ");
924 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
925 if (!buf)
925 return -ENOMEM; 926 return -ENOMEM;
926 927 p = pblist;
927 if ((error = read_pagedir(p))) 928 while (p) {
928 return error; 929 error = swap_map_read_page(handle, buf);
929 create_pbe_list(p, nr_copy_pages); 930 if (error)
930 mark_unsafe_pages(p); 931 break;
931 pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 932 p = unpack_orig_addresses(buf, p);
932 if (pagedir_nosave) { 933 n++;
933 create_pbe_list(pagedir_nosave, nr_copy_pages);
934 copy_page_backup_list(pagedir_nosave, p);
935 } 934 }
936 free_pagedir(p); 935 free_page((unsigned long)buf);
937 if (!pagedir_nosave) 936 if (!error)
938 return -ENOMEM; 937 printk("done (%u pages loaded)\n", n);
938 return error;
939}
939 940
940 /* Allocate memory for the image and read the data from swap */ 941int swsusp_read(struct pbe **pblist_ptr)
942{
943 int error;
944 struct pbe *p, *pblist;
945 struct swap_map_handle handle;
946 unsigned int nr_pages;
941 947
942 error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); 948 if (IS_ERR(resume_bdev)) {
949 pr_debug("swsusp: block device not initialised\n");
950 return PTR_ERR(resume_bdev);
951 }
943 952
953 error = get_swap_map_reader(&handle, swsusp_header.image);
944 if (!error) 954 if (!error)
945 error = data_read(pagedir_nosave); 955 error = swap_map_read_page(&handle, &swsusp_info);
956 if (!error)
957 error = check_header();
958 if (error)
959 return error;
960 nr_pages = swsusp_info.image_pages;
961 p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
962 if (!p)
963 return -ENOMEM;
964 error = load_image_metadata(p, &handle);
965 if (!error) {
966 mark_unsafe_pages(p);
967 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
968 if (pblist)
969 copy_page_backup_list(pblist, p);
970 free_pagedir(p);
971 if (!pblist)
972 error = -ENOMEM;
973
974 /* Allocate memory for the image and read the data from swap */
975 if (!error)
976 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
977 if (!error) {
978 release_eaten_pages();
979 error = load_image_data(pblist, &handle, nr_pages);
980 }
981 if (!error)
982 *pblist_ptr = pblist;
983 }
984 release_swap_map_reader(&handle);
946 985
986 blkdev_put(resume_bdev);
987
988 if (!error)
989 pr_debug("swsusp: Reading resume file was successful\n");
990 else
991 pr_debug("swsusp: Error %d resuming\n", error);
947 return error; 992 return error;
948} 993}
949 994
950/** 995/**
951 * swsusp_check - Check for saved image in swap 996 * swsusp_check - Check for swsusp signature in the resume device
952 */ 997 */
953 998
954int swsusp_check(void) 999int swsusp_check(void)
@@ -958,40 +1003,27 @@ int swsusp_check(void)
958 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 1003 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
959 if (!IS_ERR(resume_bdev)) { 1004 if (!IS_ERR(resume_bdev)) {
960 set_blocksize(resume_bdev, PAGE_SIZE); 1005 set_blocksize(resume_bdev, PAGE_SIZE);
961 error = check_suspend_image(); 1006 memset(&swsusp_header, 0, sizeof(swsusp_header));
1007 if ((error = bio_read_page(0, &swsusp_header)))
1008 return error;
1009 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1010 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1011 /* Reset swap signature now */
1012 error = bio_write_page(0, &swsusp_header);
1013 } else {
1014 return -EINVAL;
1015 }
962 if (error) 1016 if (error)
963 blkdev_put(resume_bdev); 1017 blkdev_put(resume_bdev);
964 } else 1018 else
1019 pr_debug("swsusp: Signature found, resuming\n");
1020 } else {
965 error = PTR_ERR(resume_bdev); 1021 error = PTR_ERR(resume_bdev);
966
967 if (!error)
968 pr_debug("swsusp: resume file found\n");
969 else
970 pr_debug("swsusp: Error %d check for resume file\n", error);
971 return error;
972}
973
974/**
975 * swsusp_read - Read saved image from swap.
976 */
977
978int swsusp_read(void)
979{
980 int error;
981
982 if (IS_ERR(resume_bdev)) {
983 pr_debug("swsusp: block device not initialised\n");
984 return PTR_ERR(resume_bdev);
985 } 1022 }
986 1023
987 error = read_suspend_image(); 1024 if (error)
988 blkdev_put(resume_bdev); 1025 pr_debug("swsusp: Error %d check for resume file\n", error);
989 memset(key_iv, 0, MAXKEY+MAXIV);
990 1026
991 if (!error)
992 pr_debug("swsusp: Reading resume file was successful\n");
993 else
994 pr_debug("swsusp: Error %d resuming\n", error);
995 return error; 1027 return error;
996} 1028}
997 1029
diff --git a/kernel/printk.c b/kernel/printk.c
index 5287be83e3e7..13ced0f7828f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -11,7 +11,7 @@
11 * Ted Ts'o, 2/11/93. 11 * Ted Ts'o, 2/11/93.
12 * Modified for sysctl support, 1/8/97, Chris Horn. 12 * Modified for sysctl support, 1/8/97, Chris Horn.
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfreds@colorfullife.com 14 * manfred@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au>
17 */ 17 */
@@ -569,7 +569,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
569 p[1] <= '7' && p[2] == '>') { 569 p[1] <= '7' && p[2] == '>') {
570 loglev_char = p[1]; 570 loglev_char = p[1];
571 p += 3; 571 p += 3;
572 printed_len += 3; 572 printed_len -= 3;
573 } else { 573 } else {
574 loglev_char = default_message_loglevel 574 loglev_char = default_message_loglevel
575 + '0'; 575 + '0';
@@ -584,7 +584,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
584 584
585 for (tp = tbuf; tp < tbuf + tlen; tp++) 585 for (tp = tbuf; tp < tbuf + tlen; tp++)
586 emit_log_char(*tp); 586 emit_log_char(*tp);
587 printed_len += tlen - 3; 587 printed_len += tlen;
588 } else { 588 } else {
589 if (p[0] != '<' || p[1] < '0' || 589 if (p[0] != '<' || p[1] < '0' ||
590 p[1] > '7' || p[2] != '>') { 590 p[1] > '7' || p[2] != '>') {
@@ -592,8 +592,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
592 emit_log_char(default_message_loglevel 592 emit_log_char(default_message_loglevel
593 + '0'); 593 + '0');
594 emit_log_char('>'); 594 emit_log_char('>');
595 printed_len += 3;
595 } 596 }
596 printed_len += 3;
597 } 597 }
598 log_level_unknown = 0; 598 log_level_unknown = 0;
599 if (!*p) 599 if (!*p)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 656476eedb1b..5f33cdb6fff5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -7,6 +7,7 @@
7 * to continually duplicate across every architecture. 7 * to continually duplicate across every architecture.
8 */ 8 */
9 9
10#include <linux/capability.h>
10#include <linux/module.h> 11#include <linux/module.h>
11#include <linux/sched.h> 12#include <linux/sched.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
@@ -408,54 +409,62 @@ int ptrace_request(struct task_struct *child, long request,
408 return ret; 409 return ret;
409} 410}
410 411
411#ifndef __ARCH_SYS_PTRACE 412/**
412static int ptrace_get_task_struct(long request, long pid, 413 * ptrace_traceme -- helper for PTRACE_TRACEME
413 struct task_struct **childp) 414 *
415 * Performs checks and sets PT_PTRACED.
416 * Should be used by all ptrace implementations for PTRACE_TRACEME.
417 */
418int ptrace_traceme(void)
414{ 419{
415 struct task_struct *child;
416 int ret; 420 int ret;
417 421
418 /* 422 /*
419 * Callers use child == NULL as an indication to exit early even 423 * Are we already being traced?
420 * when the return value is 0, so make sure it is non-NULL here. 424 */
425 if (current->ptrace & PT_PTRACED)
426 return -EPERM;
427 ret = security_ptrace(current->parent, current);
428 if (ret)
429 return -EPERM;
430 /*
431 * Set the ptrace bit in the process ptrace flags.
421 */ 432 */
422 *childp = NULL; 433 current->ptrace |= PT_PTRACED;
434 return 0;
435}
423 436
424 if (request == PTRACE_TRACEME) { 437/**
425 /* 438 * ptrace_get_task_struct -- grab a task struct reference for ptrace
426 * Are we already being traced? 439 * @pid: process id to grab a task_struct reference of
427 */ 440 *
428 if (current->ptrace & PT_PTRACED) 441 * This function is a helper for ptrace implementations. It checks
429 return -EPERM; 442 * permissions and then grabs a task struct for use of the actual
430 ret = security_ptrace(current->parent, current); 443 * ptrace implementation.
431 if (ret) 444 *
432 return -EPERM; 445 * Returns the task_struct for @pid or an ERR_PTR() on failure.
433 /* 446 */
434 * Set the ptrace bit in the process ptrace flags. 447struct task_struct *ptrace_get_task_struct(pid_t pid)
435 */ 448{
436 current->ptrace |= PT_PTRACED; 449 struct task_struct *child;
437 return 0;
438 }
439 450
440 /* 451 /*
441 * You may not mess with init 452 * Tracing init is not allowed.
442 */ 453 */
443 if (pid == 1) 454 if (pid == 1)
444 return -EPERM; 455 return ERR_PTR(-EPERM);
445 456
446 ret = -ESRCH;
447 read_lock(&tasklist_lock); 457 read_lock(&tasklist_lock);
448 child = find_task_by_pid(pid); 458 child = find_task_by_pid(pid);
449 if (child) 459 if (child)
450 get_task_struct(child); 460 get_task_struct(child);
451 read_unlock(&tasklist_lock); 461 read_unlock(&tasklist_lock);
452 if (!child) 462 if (!child)
453 return -ESRCH; 463 return ERR_PTR(-ESRCH);
454 464 return child;
455 *childp = child;
456 return 0;
457} 465}
458 466
467#ifndef __ARCH_SYS_PTRACE
459asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 468asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
460{ 469{
461 struct task_struct *child; 470 struct task_struct *child;
@@ -465,9 +474,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
465 * This lock_kernel fixes a subtle race with suid exec 474 * This lock_kernel fixes a subtle race with suid exec
466 */ 475 */
467 lock_kernel(); 476 lock_kernel();
468 ret = ptrace_get_task_struct(request, pid, &child); 477 if (request == PTRACE_TRACEME) {
469 if (!child) 478 ret = ptrace_traceme();
470 goto out; 479 goto out;
480 }
481
482 child = ptrace_get_task_struct(pid);
483 if (IS_ERR(child)) {
484 ret = PTR_ERR(child);
485 goto out;
486 }
471 487
472 if (request == PTRACE_ATTACH) { 488 if (request == PTRACE_ATTACH) {
473 ret = ptrace_attach(child); 489 ret = ptrace_attach(child);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c4d159a21e04..0cf8146bd585 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -35,6 +35,7 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/spinlock.h> 36#include <linux/spinlock.h>
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/rcupdate.h>
38#include <linux/interrupt.h> 39#include <linux/interrupt.h>
39#include <linux/sched.h> 40#include <linux/sched.h>
40#include <asm/atomic.h> 41#include <asm/atomic.h>
@@ -45,26 +46,21 @@
45#include <linux/percpu.h> 46#include <linux/percpu.h>
46#include <linux/notifier.h> 47#include <linux/notifier.h>
47#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
48#include <linux/rcuref.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50 50
51/* Definition for rcupdate control block. */ 51/* Definition for rcupdate control block. */
52struct rcu_ctrlblk rcu_ctrlblk = 52struct rcu_ctrlblk rcu_ctrlblk = {
53 { .cur = -300, .completed = -300 }; 53 .cur = -300,
54struct rcu_ctrlblk rcu_bh_ctrlblk = 54 .completed = -300,
55 { .cur = -300, .completed = -300 }; 55 .lock = SPIN_LOCK_UNLOCKED,
56 56 .cpumask = CPU_MASK_NONE,
57/* Bookkeeping of the progress of the grace period */ 57};
58struct rcu_state { 58struct rcu_ctrlblk rcu_bh_ctrlblk = {
59 spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ 59 .cur = -300,
60 cpumask_t cpumask; /* CPUs that need to switch in order */ 60 .completed = -300,
61 /* for current batch to proceed. */ 61 .lock = SPIN_LOCK_UNLOCKED,
62 .cpumask = CPU_MASK_NONE,
62}; 63};
63
64static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
65 {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
66static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
67 {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
68 64
69DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 65DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
70DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; 66DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
@@ -73,19 +69,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
73static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 69static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
74static int maxbatch = 10000; 70static int maxbatch = 10000;
75 71
76#ifndef __HAVE_ARCH_CMPXCHG
77/*
78 * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
79 * 32 bit atomic_t implementations, and a hash function similar to that
80 * for our refcounting needs.
81 * Can't help multiprocessors which donot have cmpxchg :(
82 */
83
84spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
85 [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
86};
87#endif
88
89/** 72/**
90 * call_rcu - Queue an RCU callback for invocation after a grace period. 73 * call_rcu - Queue an RCU callback for invocation after a grace period.
91 * @head: structure to be used for queueing the RCU updates. 74 * @head: structure to be used for queueing the RCU updates.
@@ -116,6 +99,10 @@ void fastcall call_rcu(struct rcu_head *head,
116 local_irq_restore(flags); 99 local_irq_restore(flags);
117} 100}
118 101
102static atomic_t rcu_barrier_cpu_count;
103static struct semaphore rcu_barrier_sema;
104static struct completion rcu_barrier_completion;
105
119/** 106/**
120 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. 107 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
121 * @head: structure to be used for queueing the RCU updates. 108 * @head: structure to be used for queueing the RCU updates.
@@ -162,6 +149,42 @@ long rcu_batches_completed(void)
162 return rcu_ctrlblk.completed; 149 return rcu_ctrlblk.completed;
163} 150}
164 151
152static void rcu_barrier_callback(struct rcu_head *notused)
153{
154 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
155 complete(&rcu_barrier_completion);
156}
157
158/*
159 * Called with preemption disabled, and from cross-cpu IRQ context.
160 */
161static void rcu_barrier_func(void *notused)
162{
163 int cpu = smp_processor_id();
164 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
165 struct rcu_head *head;
166
167 head = &rdp->barrier;
168 atomic_inc(&rcu_barrier_cpu_count);
169 call_rcu(head, rcu_barrier_callback);
170}
171
172/**
173 * rcu_barrier - Wait until all the in-flight RCUs are complete.
174 */
175void rcu_barrier(void)
176{
177 BUG_ON(in_interrupt());
178 /* Take cpucontrol semaphore to protect against CPU hotplug */
179 down(&rcu_barrier_sema);
180 init_completion(&rcu_barrier_completion);
181 atomic_set(&rcu_barrier_cpu_count, 0);
182 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
183 wait_for_completion(&rcu_barrier_completion);
184 up(&rcu_barrier_sema);
185}
186EXPORT_SYMBOL_GPL(rcu_barrier);
187
165/* 188/*
166 * Invoke the completed RCU callbacks. They are expected to be in 189 * Invoke the completed RCU callbacks. They are expected to be in
167 * a per-cpu list. 190 * a per-cpu list.
@@ -193,13 +216,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
193 * This is done by rcu_start_batch. The start is not broadcasted to 216 * This is done by rcu_start_batch. The start is not broadcasted to
194 * all cpus, they must pick this up by comparing rcp->cur with 217 * all cpus, they must pick this up by comparing rcp->cur with
195 * rdp->quiescbatch. All cpus are recorded in the 218 * rdp->quiescbatch. All cpus are recorded in the
196 * rcu_state.cpumask bitmap. 219 * rcu_ctrlblk.cpumask bitmap.
197 * - All cpus must go through a quiescent state. 220 * - All cpus must go through a quiescent state.
198 * Since the start of the grace period is not broadcasted, at least two 221 * Since the start of the grace period is not broadcasted, at least two
199 * calls to rcu_check_quiescent_state are required: 222 * calls to rcu_check_quiescent_state are required:
200 * The first call just notices that a new grace period is running. The 223 * The first call just notices that a new grace period is running. The
201 * following calls check if there was a quiescent state since the beginning 224 * following calls check if there was a quiescent state since the beginning
202 * of the grace period. If so, it updates rcu_state.cpumask. If 225 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
203 * the bitmap is empty, then the grace period is completed. 226 * the bitmap is empty, then the grace period is completed.
204 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 227 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
205 * period (if necessary). 228 * period (if necessary).
@@ -207,25 +230,29 @@ static void rcu_do_batch(struct rcu_data *rdp)
207/* 230/*
208 * Register a new batch of callbacks, and start it up if there is currently no 231 * Register a new batch of callbacks, and start it up if there is currently no
209 * active batch and the batch to be registered has not already occurred. 232 * active batch and the batch to be registered has not already occurred.
210 * Caller must hold rcu_state.lock. 233 * Caller must hold rcu_ctrlblk.lock.
211 */ 234 */
212static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, 235static void rcu_start_batch(struct rcu_ctrlblk *rcp)
213 int next_pending)
214{ 236{
215 if (next_pending)
216 rcp->next_pending = 1;
217
218 if (rcp->next_pending && 237 if (rcp->next_pending &&
219 rcp->completed == rcp->cur) { 238 rcp->completed == rcp->cur) {
220 /* Can't change, since spin lock held. */
221 cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
222
223 rcp->next_pending = 0; 239 rcp->next_pending = 0;
224 /* next_pending == 0 must be visible in __rcu_process_callbacks() 240 /*
225 * before it can see new value of cur. 241 * next_pending == 0 must be visible in
242 * __rcu_process_callbacks() before it can see new value of cur.
226 */ 243 */
227 smp_wmb(); 244 smp_wmb();
228 rcp->cur++; 245 rcp->cur++;
246
247 /*
248 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
249 * Barrier Otherwise it can cause tickless idle CPUs to be
250 * included in rcp->cpumask, which will extend graceperiods
251 * unnecessarily.
252 */
253 smp_mb();
254 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
255
229 } 256 }
230} 257}
231 258
@@ -234,13 +261,13 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp,
234 * Clear it from the cpu mask and complete the grace period if it was the last 261 * Clear it from the cpu mask and complete the grace period if it was the last
235 * cpu. Start another grace period if someone has further entries pending 262 * cpu. Start another grace period if someone has further entries pending
236 */ 263 */
237static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) 264static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
238{ 265{
239 cpu_clear(cpu, rsp->cpumask); 266 cpu_clear(cpu, rcp->cpumask);
240 if (cpus_empty(rsp->cpumask)) { 267 if (cpus_empty(rcp->cpumask)) {
241 /* batch completed ! */ 268 /* batch completed ! */
242 rcp->completed = rcp->cur; 269 rcp->completed = rcp->cur;
243 rcu_start_batch(rcp, rsp, 0); 270 rcu_start_batch(rcp);
244 } 271 }
245} 272}
246 273
@@ -250,7 +277,7 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
250 * quiescent cycle, then indicate that it has done so. 277 * quiescent cycle, then indicate that it has done so.
251 */ 278 */
252static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, 279static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
253 struct rcu_state *rsp, struct rcu_data *rdp) 280 struct rcu_data *rdp)
254{ 281{
255 if (rdp->quiescbatch != rcp->cur) { 282 if (rdp->quiescbatch != rcp->cur) {
256 /* start new grace period: */ 283 /* start new grace period: */
@@ -275,15 +302,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
275 return; 302 return;
276 rdp->qs_pending = 0; 303 rdp->qs_pending = 0;
277 304
278 spin_lock(&rsp->lock); 305 spin_lock(&rcp->lock);
279 /* 306 /*
280 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync 307 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
281 * during cpu startup. Ignore the quiescent state. 308 * during cpu startup. Ignore the quiescent state.
282 */ 309 */
283 if (likely(rdp->quiescbatch == rcp->cur)) 310 if (likely(rdp->quiescbatch == rcp->cur))
284 cpu_quiet(rdp->cpu, rcp, rsp); 311 cpu_quiet(rdp->cpu, rcp);
285 312
286 spin_unlock(&rsp->lock); 313 spin_unlock(&rcp->lock);
287} 314}
288 315
289 316
@@ -304,28 +331,29 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
304} 331}
305 332
306static void __rcu_offline_cpu(struct rcu_data *this_rdp, 333static void __rcu_offline_cpu(struct rcu_data *this_rdp,
307 struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) 334 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
308{ 335{
309 /* if the cpu going offline owns the grace period 336 /* if the cpu going offline owns the grace period
310 * we can block indefinitely waiting for it, so flush 337 * we can block indefinitely waiting for it, so flush
311 * it here 338 * it here
312 */ 339 */
313 spin_lock_bh(&rsp->lock); 340 spin_lock_bh(&rcp->lock);
314 if (rcp->cur != rcp->completed) 341 if (rcp->cur != rcp->completed)
315 cpu_quiet(rdp->cpu, rcp, rsp); 342 cpu_quiet(rdp->cpu, rcp);
316 spin_unlock_bh(&rsp->lock); 343 spin_unlock_bh(&rcp->lock);
317 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 344 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
318 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); 345 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
319 346 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
320} 347}
348
321static void rcu_offline_cpu(int cpu) 349static void rcu_offline_cpu(int cpu)
322{ 350{
323 struct rcu_data *this_rdp = &get_cpu_var(rcu_data); 351 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
324 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); 352 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
325 353
326 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, 354 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
327 &per_cpu(rcu_data, cpu)); 355 &per_cpu(rcu_data, cpu));
328 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state, 356 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
329 &per_cpu(rcu_bh_data, cpu)); 357 &per_cpu(rcu_bh_data, cpu));
330 put_cpu_var(rcu_data); 358 put_cpu_var(rcu_data);
331 put_cpu_var(rcu_bh_data); 359 put_cpu_var(rcu_bh_data);
@@ -344,7 +372,7 @@ static void rcu_offline_cpu(int cpu)
344 * This does the RCU processing work from tasklet context. 372 * This does the RCU processing work from tasklet context.
345 */ 373 */
346static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, 374static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
347 struct rcu_state *rsp, struct rcu_data *rdp) 375 struct rcu_data *rdp)
348{ 376{
349 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { 377 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
350 *rdp->donetail = rdp->curlist; 378 *rdp->donetail = rdp->curlist;
@@ -374,24 +402,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
374 402
375 if (!rcp->next_pending) { 403 if (!rcp->next_pending) {
376 /* and start it/schedule start if it's a new batch */ 404 /* and start it/schedule start if it's a new batch */
377 spin_lock(&rsp->lock); 405 spin_lock(&rcp->lock);
378 rcu_start_batch(rcp, rsp, 1); 406 rcp->next_pending = 1;
379 spin_unlock(&rsp->lock); 407 rcu_start_batch(rcp);
408 spin_unlock(&rcp->lock);
380 } 409 }
381 } else { 410 } else {
382 local_irq_enable(); 411 local_irq_enable();
383 } 412 }
384 rcu_check_quiescent_state(rcp, rsp, rdp); 413 rcu_check_quiescent_state(rcp, rdp);
385 if (rdp->donelist) 414 if (rdp->donelist)
386 rcu_do_batch(rdp); 415 rcu_do_batch(rdp);
387} 416}
388 417
389static void rcu_process_callbacks(unsigned long unused) 418static void rcu_process_callbacks(unsigned long unused)
390{ 419{
391 __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, 420 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
392 &__get_cpu_var(rcu_data)); 421 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
393 __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state, 422}
394 &__get_cpu_var(rcu_bh_data)); 423
424static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
425{
426 /* This cpu has pending rcu entries and the grace period
427 * for them has completed.
428 */
429 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
430 return 1;
431
432 /* This cpu has no pending entries, but there are new entries */
433 if (!rdp->curlist && rdp->nxtlist)
434 return 1;
435
436 /* This cpu has finished callbacks to invoke */
437 if (rdp->donelist)
438 return 1;
439
440 /* The rcu core waits for a quiescent state from the cpu */
441 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
442 return 1;
443
444 /* nothing to do */
445 return 0;
446}
447
448int rcu_pending(int cpu)
449{
450 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
451 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
395} 452}
396 453
397void rcu_check_callbacks(int cpu, int user) 454void rcu_check_callbacks(int cpu, int user)
@@ -457,6 +514,7 @@ static struct notifier_block __devinitdata rcu_nb = {
457 */ 514 */
458void __init rcu_init(void) 515void __init rcu_init(void)
459{ 516{
517 sema_init(&rcu_barrier_sema, 1);
460 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 518 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
461 (void *)(long)smp_processor_id()); 519 (void *)(long)smp_processor_id());
462 /* Register notifier for non-boot CPUs */ 520 /* Register notifier for non-boot CPUs */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 88c28d476550..773219907dd8 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,7 +39,6 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/rcuref.h>
43#include <linux/cpu.h> 42#include <linux/cpu.h>
44#include <linux/random.h> 43#include <linux/random.h>
45#include <linux/delay.h> 44#include <linux/delay.h>
@@ -49,9 +48,11 @@
49MODULE_LICENSE("GPL"); 48MODULE_LICENSE("GPL");
50 49
51static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ 50static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */
52static int stat_interval = 0; /* Interval between stats, in seconds. */ 51static int stat_interval; /* Interval between stats, in seconds. */
53 /* Defaults to "only at end of test". */ 52 /* Defaults to "only at end of test". */
54static int verbose = 0; /* Print more debug info. */ 53static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
55 56
56MODULE_PARM(nreaders, "i"); 57MODULE_PARM(nreaders, "i");
57MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -59,6 +60,10 @@ MODULE_PARM(stat_interval, "i");
59MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 60MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
60MODULE_PARM(verbose, "i"); 61MODULE_PARM(verbose, "i");
61MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 62MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
63MODULE_PARM(test_no_idle_hz, "i");
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65MODULE_PARM(shuffle_interval, "i");
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
62#define TORTURE_FLAG "rcutorture: " 67#define TORTURE_FLAG "rcutorture: "
63#define PRINTK_STRING(s) \ 68#define PRINTK_STRING(s) \
64 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 69 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
@@ -73,6 +78,7 @@ static int nrealreaders;
73static struct task_struct *writer_task; 78static struct task_struct *writer_task;
74static struct task_struct **reader_tasks; 79static struct task_struct **reader_tasks;
75static struct task_struct *stats_task; 80static struct task_struct *stats_task;
81static struct task_struct *shuffler_task;
76 82
77#define RCU_TORTURE_PIPE_LEN 10 83#define RCU_TORTURE_PIPE_LEN 10
78 84
@@ -103,7 +109,7 @@ atomic_t n_rcu_torture_error;
103/* 109/*
104 * Allocate an element from the rcu_tortures pool. 110 * Allocate an element from the rcu_tortures pool.
105 */ 111 */
106struct rcu_torture * 112static struct rcu_torture *
107rcu_torture_alloc(void) 113rcu_torture_alloc(void)
108{ 114{
109 struct list_head *p; 115 struct list_head *p;
@@ -376,12 +382,77 @@ rcu_torture_stats(void *arg)
376 return 0; 382 return 0;
377} 383}
378 384
385static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
386
387/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
388 * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
389 */
390void rcu_torture_shuffle_tasks(void)
391{
392 cpumask_t tmp_mask = CPU_MASK_ALL;
393 int i;
394
395 lock_cpu_hotplug();
396
397 /* No point in shuffling if there is only one online CPU (ex: UP) */
398 if (num_online_cpus() == 1) {
399 unlock_cpu_hotplug();
400 return;
401 }
402
403 if (rcu_idle_cpu != -1)
404 cpu_clear(rcu_idle_cpu, tmp_mask);
405
406 set_cpus_allowed(current, tmp_mask);
407
408 if (reader_tasks != NULL) {
409 for (i = 0; i < nrealreaders; i++)
410 if (reader_tasks[i])
411 set_cpus_allowed(reader_tasks[i], tmp_mask);
412 }
413
414 if (writer_task)
415 set_cpus_allowed(writer_task, tmp_mask);
416
417 if (stats_task)
418 set_cpus_allowed(stats_task, tmp_mask);
419
420 if (rcu_idle_cpu == -1)
421 rcu_idle_cpu = num_online_cpus() - 1;
422 else
423 rcu_idle_cpu--;
424
425 unlock_cpu_hotplug();
426}
427
428/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
429 * system to become idle at a time and cut off its timer ticks. This is meant
430 * to test the support for such tickless idle CPU in RCU.
431 */
432static int
433rcu_torture_shuffle(void *arg)
434{
435 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
436 do {
437 schedule_timeout_interruptible(shuffle_interval * HZ);
438 rcu_torture_shuffle_tasks();
439 } while (!kthread_should_stop());
440 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
441 return 0;
442}
443
379static void 444static void
380rcu_torture_cleanup(void) 445rcu_torture_cleanup(void)
381{ 446{
382 int i; 447 int i;
383 448
384 fullstop = 1; 449 fullstop = 1;
450 if (shuffler_task != NULL) {
451 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
452 kthread_stop(shuffler_task);
453 }
454 shuffler_task = NULL;
455
385 if (writer_task != NULL) { 456 if (writer_task != NULL) {
386 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 457 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
387 kthread_stop(writer_task); 458 kthread_stop(writer_task);
@@ -409,9 +480,8 @@ rcu_torture_cleanup(void)
409 stats_task = NULL; 480 stats_task = NULL;
410 481
411 /* Wait for all RCU callbacks to fire. */ 482 /* Wait for all RCU callbacks to fire. */
483 rcu_barrier();
412 484
413 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
414 synchronize_rcu();
415 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 485 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
416 printk(KERN_ALERT TORTURE_FLAG 486 printk(KERN_ALERT TORTURE_FLAG
417 "--- End of test: %s\n", 487 "--- End of test: %s\n",
@@ -431,9 +501,11 @@ rcu_torture_init(void)
431 nrealreaders = nreaders; 501 nrealreaders = nreaders;
432 else 502 else
433 nrealreaders = 2 * num_online_cpus(); 503 nrealreaders = 2 * num_online_cpus();
434 printk(KERN_ALERT TORTURE_FLAG 504 printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d "
435 "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", 505 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
436 nrealreaders, stat_interval, verbose); 506 "shuffle_interval = %d\n",
507 nrealreaders, stat_interval, verbose, test_no_idle_hz,
508 shuffle_interval);
437 fullstop = 0; 509 fullstop = 0;
438 510
439 /* Set up the freelist. */ 511 /* Set up the freelist. */
@@ -503,6 +575,18 @@ rcu_torture_init(void)
503 goto unwind; 575 goto unwind;
504 } 576 }
505 } 577 }
578 if (test_no_idle_hz) {
579 rcu_idle_cpu = num_online_cpus() - 1;
580 /* Create the shuffler thread */
581 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
582 "rcu_torture_shuffle");
583 if (IS_ERR(shuffler_task)) {
584 firsterr = PTR_ERR(shuffler_task);
585 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
586 shuffler_task = NULL;
587 goto unwind;
588 }
589 }
506 return 0; 590 return 0;
507 591
508unwind: 592unwind:
diff --git a/kernel/resource.c b/kernel/resource.c
index 92285d822de6..e3080fcc66a3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -464,7 +464,7 @@ struct resource * __request_region(struct resource *parent, unsigned long start,
464 464
465EXPORT_SYMBOL(__request_region); 465EXPORT_SYMBOL(__request_region);
466 466
467int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n) 467int __check_region(struct resource *parent, unsigned long start, unsigned long n)
468{ 468{
469 struct resource * res; 469 struct resource * res;
470 470
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f46c94cc29e..3ee2ae45125f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -27,12 +27,14 @@
27#include <linux/smp_lock.h> 27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 28#include <asm/mmu_context.h>
29#include <linux/interrupt.h> 29#include <linux/interrupt.h>
30#include <linux/capability.h>
30#include <linux/completion.h> 31#include <linux/completion.h>
31#include <linux/kernel_stat.h> 32#include <linux/kernel_stat.h>
32#include <linux/security.h> 33#include <linux/security.h>
33#include <linux/notifier.h> 34#include <linux/notifier.h>
34#include <linux/profile.h> 35#include <linux/profile.h>
35#include <linux/suspend.h> 36#include <linux/suspend.h>
37#include <linux/vmalloc.h>
36#include <linux/blkdev.h> 38#include <linux/blkdev.h>
37#include <linux/delay.h> 39#include <linux/delay.h>
38#include <linux/smp.h> 40#include <linux/smp.h>
@@ -176,6 +178,13 @@ static unsigned int task_timeslice(task_t *p)
176#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 178#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
177 < (long long) (sd)->cache_hot_time) 179 < (long long) (sd)->cache_hot_time)
178 180
181void __put_task_struct_cb(struct rcu_head *rhp)
182{
183 __put_task_struct(container_of(rhp, struct task_struct, rcu));
184}
185
186EXPORT_SYMBOL_GPL(__put_task_struct_cb);
187
179/* 188/*
180 * These are the runqueue data structures: 189 * These are the runqueue data structures:
181 */ 190 */
@@ -512,7 +521,7 @@ static inline void sched_info_dequeued(task_t *t)
512 * long it was waiting to run. We also note when it began so that we 521 * long it was waiting to run. We also note when it began so that we
513 * can keep stats on how long its timeslice is. 522 * can keep stats on how long its timeslice is.
514 */ 523 */
515static inline void sched_info_arrive(task_t *t) 524static void sched_info_arrive(task_t *t)
516{ 525{
517 unsigned long now = jiffies, diff = 0; 526 unsigned long now = jiffies, diff = 0;
518 struct runqueue *rq = task_rq(t); 527 struct runqueue *rq = task_rq(t);
@@ -739,10 +748,14 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
739 unsigned long long __sleep_time = now - p->timestamp; 748 unsigned long long __sleep_time = now - p->timestamp;
740 unsigned long sleep_time; 749 unsigned long sleep_time;
741 750
742 if (__sleep_time > NS_MAX_SLEEP_AVG) 751 if (unlikely(p->policy == SCHED_BATCH))
743 sleep_time = NS_MAX_SLEEP_AVG; 752 sleep_time = 0;
744 else 753 else {
745 sleep_time = (unsigned long)__sleep_time; 754 if (__sleep_time > NS_MAX_SLEEP_AVG)
755 sleep_time = NS_MAX_SLEEP_AVG;
756 else
757 sleep_time = (unsigned long)__sleep_time;
758 }
746 759
747 if (likely(sleep_time > 0)) { 760 if (likely(sleep_time > 0)) {
748 /* 761 /*
@@ -994,7 +1007,7 @@ void kick_process(task_t *p)
994 * We want to under-estimate the load of migration sources, to 1007 * We want to under-estimate the load of migration sources, to
995 * balance conservatively. 1008 * balance conservatively.
996 */ 1009 */
997static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) 1010static unsigned long __source_load(int cpu, int type, enum idle_type idle)
998{ 1011{
999 runqueue_t *rq = cpu_rq(cpu); 1012 runqueue_t *rq = cpu_rq(cpu);
1000 unsigned long running = rq->nr_running; 1013 unsigned long running = rq->nr_running;
@@ -1281,6 +1294,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1281 } 1294 }
1282 } 1295 }
1283 1296
1297 if (p->last_waker_cpu != this_cpu)
1298 goto out_set_cpu;
1299
1284 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1300 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1285 goto out_set_cpu; 1301 goto out_set_cpu;
1286 1302
@@ -1351,6 +1367,8 @@ out_set_cpu:
1351 cpu = task_cpu(p); 1367 cpu = task_cpu(p);
1352 } 1368 }
1353 1369
1370 p->last_waker_cpu = this_cpu;
1371
1354out_activate: 1372out_activate:
1355#endif /* CONFIG_SMP */ 1373#endif /* CONFIG_SMP */
1356 if (old_state == TASK_UNINTERRUPTIBLE) { 1374 if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1432,9 +1450,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1432#ifdef CONFIG_SCHEDSTATS 1450#ifdef CONFIG_SCHEDSTATS
1433 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1451 memset(&p->sched_info, 0, sizeof(p->sched_info));
1434#endif 1452#endif
1435#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1453#if defined(CONFIG_SMP)
1454 p->last_waker_cpu = cpu;
1455#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
1436 p->oncpu = 0; 1456 p->oncpu = 0;
1437#endif 1457#endif
1458#endif
1438#ifdef CONFIG_PREEMPT 1459#ifdef CONFIG_PREEMPT
1439 /* Want to start with kernel preemption disabled. */ 1460 /* Want to start with kernel preemption disabled. */
1440 task_thread_info(p)->preempt_count = 1; 1461 task_thread_info(p)->preempt_count = 1;
@@ -1849,7 +1870,7 @@ void sched_exec(void)
1849 * pull_task - move a task from a remote runqueue to the local runqueue. 1870 * pull_task - move a task from a remote runqueue to the local runqueue.
1850 * Both runqueues must be locked. 1871 * Both runqueues must be locked.
1851 */ 1872 */
1852static inline 1873static
1853void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 1874void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1854 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1875 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1855{ 1876{
@@ -1871,7 +1892,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1871/* 1892/*
1872 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 1893 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1873 */ 1894 */
1874static inline 1895static
1875int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1896int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1876 struct sched_domain *sd, enum idle_type idle, 1897 struct sched_domain *sd, enum idle_type idle,
1877 int *all_pinned) 1898 int *all_pinned)
@@ -2357,7 +2378,7 @@ out_balanced:
2357 * idle_balance is called by schedule() if this_cpu is about to become 2378 * idle_balance is called by schedule() if this_cpu is about to become
2358 * idle. Attempts to pull tasks from other CPUs. 2379 * idle. Attempts to pull tasks from other CPUs.
2359 */ 2380 */
2360static inline void idle_balance(int this_cpu, runqueue_t *this_rq) 2381static void idle_balance(int this_cpu, runqueue_t *this_rq)
2361{ 2382{
2362 struct sched_domain *sd; 2383 struct sched_domain *sd;
2363 2384
@@ -2741,7 +2762,7 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
2741 resched_task(rq->idle); 2762 resched_task(rq->idle);
2742} 2763}
2743 2764
2744static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2765static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2745{ 2766{
2746 struct sched_domain *tmp, *sd = NULL; 2767 struct sched_domain *tmp, *sd = NULL;
2747 cpumask_t sibling_map; 2768 cpumask_t sibling_map;
@@ -2795,7 +2816,7 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2795 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 2816 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2796} 2817}
2797 2818
2798static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2819static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2799{ 2820{
2800 struct sched_domain *tmp, *sd = NULL; 2821 struct sched_domain *tmp, *sd = NULL;
2801 cpumask_t sibling_map; 2822 cpumask_t sibling_map;
@@ -3543,7 +3564,7 @@ void set_user_nice(task_t *p, long nice)
3543 * The RT priorities are set via sched_setscheduler(), but we still 3564 * The RT priorities are set via sched_setscheduler(), but we still
3544 * allow the 'normal' nice value to be set - but as expected 3565 * allow the 'normal' nice value to be set - but as expected
3545 * it wont have any effect on scheduling until the task is 3566 * it wont have any effect on scheduling until the task is
3546 * not SCHED_NORMAL: 3567 * not SCHED_NORMAL/SCHED_BATCH:
3547 */ 3568 */
3548 if (rt_task(p)) { 3569 if (rt_task(p)) {
3549 p->static_prio = NICE_TO_PRIO(nice); 3570 p->static_prio = NICE_TO_PRIO(nice);
@@ -3689,10 +3710,16 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3689 BUG_ON(p->array); 3710 BUG_ON(p->array);
3690 p->policy = policy; 3711 p->policy = policy;
3691 p->rt_priority = prio; 3712 p->rt_priority = prio;
3692 if (policy != SCHED_NORMAL) 3713 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
3693 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3714 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
3694 else 3715 } else {
3695 p->prio = p->static_prio; 3716 p->prio = p->static_prio;
3717 /*
3718 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3719 */
3720 if (policy == SCHED_BATCH)
3721 p->sleep_avg = 0;
3722 }
3696} 3723}
3697 3724
3698/** 3725/**
@@ -3716,29 +3743,35 @@ recheck:
3716 if (policy < 0) 3743 if (policy < 0)
3717 policy = oldpolicy = p->policy; 3744 policy = oldpolicy = p->policy;
3718 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3745 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3719 policy != SCHED_NORMAL) 3746 policy != SCHED_NORMAL && policy != SCHED_BATCH)
3720 return -EINVAL; 3747 return -EINVAL;
3721 /* 3748 /*
3722 * Valid priorities for SCHED_FIFO and SCHED_RR are 3749 * Valid priorities for SCHED_FIFO and SCHED_RR are
3723 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3750 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
3751 * SCHED_BATCH is 0.
3724 */ 3752 */
3725 if (param->sched_priority < 0 || 3753 if (param->sched_priority < 0 ||
3726 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3754 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3727 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3755 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3728 return -EINVAL; 3756 return -EINVAL;
3729 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3757 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
3758 != (param->sched_priority == 0))
3730 return -EINVAL; 3759 return -EINVAL;
3731 3760
3732 /* 3761 /*
3733 * Allow unprivileged RT tasks to decrease priority: 3762 * Allow unprivileged RT tasks to decrease priority:
3734 */ 3763 */
3735 if (!capable(CAP_SYS_NICE)) { 3764 if (!capable(CAP_SYS_NICE)) {
3736 /* can't change policy */ 3765 /*
3737 if (policy != p->policy && 3766 * can't change policy, except between SCHED_NORMAL
3738 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3767 * and SCHED_BATCH:
3768 */
3769 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
3770 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
3771 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3739 return -EPERM; 3772 return -EPERM;
3740 /* can't increase priority */ 3773 /* can't increase priority */
3741 if (policy != SCHED_NORMAL && 3774 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
3742 param->sched_priority > p->rt_priority && 3775 param->sched_priority > p->rt_priority &&
3743 param->sched_priority > 3776 param->sched_priority >
3744 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3777 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
@@ -3817,6 +3850,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3817asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 3850asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3818 struct sched_param __user *param) 3851 struct sched_param __user *param)
3819{ 3852{
3853 /* negative values for policy are not valid */
3854 if (policy < 0)
3855 return -EINVAL;
3856
3820 return do_sched_setscheduler(pid, policy, param); 3857 return do_sched_setscheduler(pid, policy, param);
3821} 3858}
3822 3859
@@ -3972,12 +4009,12 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3972 * method, such as ACPI for e.g. 4009 * method, such as ACPI for e.g.
3973 */ 4010 */
3974 4011
3975cpumask_t cpu_present_map; 4012cpumask_t cpu_present_map __read_mostly;
3976EXPORT_SYMBOL(cpu_present_map); 4013EXPORT_SYMBOL(cpu_present_map);
3977 4014
3978#ifndef CONFIG_SMP 4015#ifndef CONFIG_SMP
3979cpumask_t cpu_online_map = CPU_MASK_ALL; 4016cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
3980cpumask_t cpu_possible_map = CPU_MASK_ALL; 4017cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
3981#endif 4018#endif
3982 4019
3983long sched_getaffinity(pid_t pid, cpumask_t *mask) 4020long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4216,6 +4253,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4216 ret = MAX_USER_RT_PRIO-1; 4253 ret = MAX_USER_RT_PRIO-1;
4217 break; 4254 break;
4218 case SCHED_NORMAL: 4255 case SCHED_NORMAL:
4256 case SCHED_BATCH:
4219 ret = 0; 4257 ret = 0;
4220 break; 4258 break;
4221 } 4259 }
@@ -4239,6 +4277,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4239 ret = 1; 4277 ret = 1;
4240 break; 4278 break;
4241 case SCHED_NORMAL: 4279 case SCHED_NORMAL:
4280 case SCHED_BATCH:
4242 ret = 0; 4281 ret = 0;
4243 } 4282 }
4244 return ret; 4283 return ret;
@@ -4379,6 +4418,7 @@ void show_state(void)
4379 } while_each_thread(g, p); 4418 } while_each_thread(g, p);
4380 4419
4381 read_unlock(&tasklist_lock); 4420 read_unlock(&tasklist_lock);
4421 mutex_debug_show_all_locks();
4382} 4422}
4383 4423
4384/** 4424/**
@@ -5073,7 +5113,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5073 5113
5074#define SD_NODES_PER_DOMAIN 16 5114#define SD_NODES_PER_DOMAIN 16
5075 5115
5116/*
5117 * Self-tuning task migration cost measurement between source and target CPUs.
5118 *
5119 * This is done by measuring the cost of manipulating buffers of varying
5120 * sizes. For a given buffer-size here are the steps that are taken:
5121 *
5122 * 1) the source CPU reads+dirties a shared buffer
5123 * 2) the target CPU reads+dirties the same shared buffer
5124 *
5125 * We measure how long they take, in the following 4 scenarios:
5126 *
5127 * - source: CPU1, target: CPU2 | cost1
5128 * - source: CPU2, target: CPU1 | cost2
5129 * - source: CPU1, target: CPU1 | cost3
5130 * - source: CPU2, target: CPU2 | cost4
5131 *
5132 * We then calculate the (cost1+cost2)-(cost3+cost4) difference - this is
5133 * the cost of migration.
5134 *
5135 * We then start off from a small buffer-size and iterate up to larger
5136 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5137 * doing a maximum search for the cost. (The maximum cost for a migration
5138 * normally occurs when the working set size is around the effective cache
5139 * size.)
5140 */
5141#define SEARCH_SCOPE 2
5142#define MIN_CACHE_SIZE (64*1024U)
5143#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5144#define ITERATIONS 2
5145#define SIZE_THRESH 130
5146#define COST_THRESH 130
5147
5148/*
5149 * The migration cost is a function of 'domain distance'. Domain
5150 * distance is the number of steps a CPU has to iterate down its
5151 * domain tree to share a domain with the other CPU. The farther
5152 * two CPUs are from each other, the larger the distance gets.
5153 *
5154 * Note that we use the distance only to cache measurement results,
5155 * the distance value is not used numerically otherwise. When two
5156 * CPUs have the same distance it is assumed that the migration
5157 * cost is the same. (this is a simplification but quite practical)
5158 */
5159#define MAX_DOMAIN_DISTANCE 32
5160
5161static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5162 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
5163
5164/*
5165 * Allow override of migration cost - in units of microseconds.
5166 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5167 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5168 */
5169static int __init migration_cost_setup(char *str)
5170{
5171 int ints[MAX_DOMAIN_DISTANCE+1], i;
5172
5173 str = get_options(str, ARRAY_SIZE(ints), ints);
5174
5175 printk("#ints: %d\n", ints[0]);
5176 for (i = 1; i <= ints[0]; i++) {
5177 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5178 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5179 }
5180 return 1;
5181}
5182
5183__setup ("migration_cost=", migration_cost_setup);
5184
5185/*
5186 * Global multiplier (divisor) for migration-cutoff values,
5187 * in percentiles. E.g. use a value of 150 to get 1.5 times
5188 * longer cache-hot cutoff times.
5189 *
5190 * (We scale it from 100 to 128 to long long handling easier.)
5191 */
5192
5193#define MIGRATION_FACTOR_SCALE 128
5194
5195static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5196
5197static int __init setup_migration_factor(char *str)
5198{
5199 get_option(&str, &migration_factor);
5200 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5201 return 1;
5202}
5203
5204__setup("migration_factor=", setup_migration_factor);
5205
5206/*
5207 * Estimated distance of two CPUs, measured via the number of domains
5208 * we have to pass for the two CPUs to be in the same span:
5209 */
5210static unsigned long domain_distance(int cpu1, int cpu2)
5211{
5212 unsigned long distance = 0;
5213 struct sched_domain *sd;
5214
5215 for_each_domain(cpu1, sd) {
5216 WARN_ON(!cpu_isset(cpu1, sd->span));
5217 if (cpu_isset(cpu2, sd->span))
5218 return distance;
5219 distance++;
5220 }
5221 if (distance >= MAX_DOMAIN_DISTANCE) {
5222 WARN_ON(1);
5223 distance = MAX_DOMAIN_DISTANCE-1;
5224 }
5225
5226 return distance;
5227}
5228
5229static unsigned int migration_debug;
5230
5231static int __init setup_migration_debug(char *str)
5232{
5233 get_option(&str, &migration_debug);
5234 return 1;
5235}
5236
5237__setup("migration_debug=", setup_migration_debug);
5238
5239/*
5240 * Maximum cache-size that the scheduler should try to measure.
5241 * Architectures with larger caches should tune this up during
5242 * bootup. Gets used in the domain-setup code (i.e. during SMP
5243 * bootup).
5244 */
5245unsigned int max_cache_size;
5246
5247static int __init setup_max_cache_size(char *str)
5248{
5249 get_option(&str, &max_cache_size);
5250 return 1;
5251}
5252
5253__setup("max_cache_size=", setup_max_cache_size);
5254
5255/*
5256 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5257 * is the operation that is timed, so we try to generate unpredictable
5258 * cachemisses that still end up filling the L2 cache:
5259 */
5260static void touch_cache(void *__cache, unsigned long __size)
5261{
5262 unsigned long size = __size/sizeof(long), chunk1 = size/3,
5263 chunk2 = 2*size/3;
5264 unsigned long *cache = __cache;
5265 int i;
5266
5267 for (i = 0; i < size/6; i += 8) {
5268 switch (i % 6) {
5269 case 0: cache[i]++;
5270 case 1: cache[size-1-i]++;
5271 case 2: cache[chunk1-i]++;
5272 case 3: cache[chunk1+i]++;
5273 case 4: cache[chunk2-i]++;
5274 case 5: cache[chunk2+i]++;
5275 }
5276 }
5277}
5278
5279/*
5280 * Measure the cache-cost of one task migration. Returns in units of nsec.
5281 */
5282static unsigned long long measure_one(void *cache, unsigned long size,
5283 int source, int target)
5284{
5285 cpumask_t mask, saved_mask;
5286 unsigned long long t0, t1, t2, t3, cost;
5287
5288 saved_mask = current->cpus_allowed;
5289
5290 /*
5291 * Flush source caches to RAM and invalidate them:
5292 */
5293 sched_cacheflush();
5294
5295 /*
5296 * Migrate to the source CPU:
5297 */
5298 mask = cpumask_of_cpu(source);
5299 set_cpus_allowed(current, mask);
5300 WARN_ON(smp_processor_id() != source);
5301
5302 /*
5303 * Dirty the working set:
5304 */
5305 t0 = sched_clock();
5306 touch_cache(cache, size);
5307 t1 = sched_clock();
5308
5309 /*
5310 * Migrate to the target CPU, dirty the L2 cache and access
5311 * the shared buffer. (which represents the working set
5312 * of a migrated task.)
5313 */
5314 mask = cpumask_of_cpu(target);
5315 set_cpus_allowed(current, mask);
5316 WARN_ON(smp_processor_id() != target);
5317
5318 t2 = sched_clock();
5319 touch_cache(cache, size);
5320 t3 = sched_clock();
5321
5322 cost = t1-t0 + t3-t2;
5323
5324 if (migration_debug >= 2)
5325 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
5326 source, target, t1-t0, t1-t0, t3-t2, cost);
5327 /*
5328 * Flush target caches to RAM and invalidate them:
5329 */
5330 sched_cacheflush();
5331
5332 set_cpus_allowed(current, saved_mask);
5333
5334 return cost;
5335}
5336
5337/*
5338 * Measure a series of task migrations between cpu1 and cpu2 and
5339 * return the average cross-CPU cost minus the average same-CPU cost.
5340 * This code runs early during bootup, so the system is 'undisturbed'.
5341 *
5342 * The algorithm in essence auto-detects the relevant cache-size,
5343 * so it will properly detect different cachesizes for different
5344 * cache-hierarchies, depending on how the CPUs are connected.
5345 *
5346 * Architectures can prime the upper limit of the search range via
5347 * max_cache_size, otherwise the search range defaults to 20MB...64K.
5348 */
5349static unsigned long long
5350measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5351{
5352 unsigned long long cost1, cost2;
5353 int i;
5354
5355 /*
5356 * Measure the migration cost of 'size' bytes, over an
5357 * average of ITERATIONS runs in each direction:
5358 *
5359 * (We shrink the cache size by i*1024 bytes in run i
5360 * to compensate size/alignment related artifacts.
5361 * We also subtract the cost of the operation done on
5362 * the same CPU.)
5363 */
5364 cost1 = 0;
5365
5366 /*
5367 * dry run, to make sure we start off cache-cold on cpu1,
5368 * and to get any vmalloc pagefaults in advance:
5369 */
5370 measure_one(cache, size, cpu1, cpu2);
5371 for (i = 0; i < ITERATIONS; i++)
5372 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
5373
5374 measure_one(cache, size, cpu2, cpu1); /* dry run for the reverse direction */
5375 for (i = 0; i < ITERATIONS; i++)
5376 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
5377
5378 /*
5379 * (We measure the non-migrating [cached] cost on both
5380 * cpu1 and cpu2, to handle CPUs with different speeds)
5381 */
5382 cost2 = 0;
5383
5384 measure_one(cache, size, cpu1, cpu1);
5385 for (i = 0; i < ITERATIONS; i++)
5386 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
5387
5388 measure_one(cache, size, cpu2, cpu2);
5389 for (i = 0; i < ITERATIONS; i++)
5390 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
5391
5392 /*
5393 * Get the per-iteration migration cost:
5394 */
5395 do_div(cost1, 2*ITERATIONS); /* cost1 holds the sum over both migration directions */
5396 do_div(cost2, 2*ITERATIONS); /* cost2 holds the sum over both same-CPU runs */
5397
5398 return cost1 - cost2; /* unsigned: wraps if cost2 > cost1; measure_migration_cost() stores it in a signed long long */
5399}
5400
5401static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5402{
5403 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
5404 unsigned int max_size, size, size_found = 0;
5405 long long cost = 0, prev_cost;
5406 void *cache;
5407
5408 /*
5409 * Search upwards from 'size' to 'max_size' for the working-set size with
5410 * the highest migration cost - the relevant cachesize lies inbetween.
5411 */
5412 if (max_cache_size) {
5413 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
5414 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
5415 } else {
5416 /*
5417 * Since we have no estimation about the relevant
5418 * search range, scan the whole default range:
5419 */
5420 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
5421 size = MIN_CACHE_SIZE;
5422 }
5423
5424 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
5425 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
5426 return 0;
5427 }
5428
5429 /*
5430 * Allocate the working set:
5431 */
5432 cache = vmalloc(max_size);
5433 if (!cache) {
5434 printk("could not vmalloc %u bytes for cache!\n", max_size);
5435 return 1000000; // return 1 msec on very small boxen
5436 }
5437
5438 while (size <= max_size) {
5439 prev_cost = cost;
5440 cost = measure_cost(cpu1, cpu2, cache, size);
5441
5442 /*
5443 * Update the max:
5444 */
5445 if (cost > 0) {
5446 if (max_cost < cost) {
5447 max_cost = cost;
5448 size_found = size;
5449 }
5450 }
5451 /*
5452 * Calculate average fluctuation, we use this to prevent
5453 * noise from triggering an early break out of the loop:
5454 */
5455 fluct = abs(cost - prev_cost);
5456 avg_fluct = (avg_fluct + fluct)/2;
5457
5458 if (migration_debug)
5459 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
5460 cpu1, cpu2, size,
5461 (long)cost / 1000000,
5462 ((long)cost / 100000) % 10,
5463 (long)max_cost / 1000000,
5464 ((long)max_cost / 100000) % 10,
5465 domain_distance(cpu1, cpu2),
5466 cost, avg_fluct);
5467
5468 /*
5469 * If we iterated at least 20% past the previous maximum,
5470 * and the cost has dropped by more than 20% already,
5471 * (taking fluctuations into account) then we assume to have
5472 * found the maximum and break out early (64-bit math: size*100 cannot wrap):
5473 */
5474 if (size_found && (size*100ULL > (unsigned long long)size_found*SIZE_THRESH))
5475 if (cost+avg_fluct <= 0 ||
5476 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
5477
5478 if (migration_debug)
5479 printk("-> found max.\n");
5480 break;
5481 }
5482 /*
5483 * Increase the cachesize in 5% steps:
5484 */
5485 size = size * 20 / 19;
5486 }
5487
5488 if (migration_debug)
5489 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
5490 cpu1, cpu2, size_found, max_cost);
5491
5492 vfree(cache);
5493
5494 /*
5495 * A task is considered 'cache cold' if at least 2 times
5496 * the worst-case cost of migration (scaled by migration_factor) has passed.
5497 *
5498 * (this limit is only listened to if the load-balancing
5499 * situation is 'nice' - if there is a large imbalance we
5500 * ignore it for the sake of CPU utilization and
5501 * processing fairness.)
5502 */
5503 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
5504}
5505
5506static void calibrate_migration_costs(const cpumask_t *cpu_map)
5507{
5508 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
5509 unsigned long j0, j1, distance, max_distance = 0;
5510 struct sched_domain *sd;
5511
5512 j0 = jiffies; /* start timestamp for the debug timing printed below */
5513
5514 /*
5515 * First pass - measure the migration cost once per domain distance:
5516 */
5517 for_each_cpu_mask(cpu1, *cpu_map) {
5518 for_each_cpu_mask(cpu2, *cpu_map) {
5519 if (cpu1 == cpu2)
5520 continue;
5521 distance = domain_distance(cpu1, cpu2);
5522 max_distance = max(max_distance, distance);
5523 /*
5524 * No result cached yet? (-1LL marks an unmeasured distance)
5525 */
5526 if (migration_cost[distance] == -1LL)
5527 migration_cost[distance] =
5528 measure_migration_cost(cpu1, cpu2);
5529 }
5530 }
5531 /*
5532 * Second pass - update the sched domain hierarchy with the new
5533 * cache-hot-time estimations (domain level n uses migration_cost[n]):
5534 */
5535 for_each_cpu_mask(cpu, *cpu_map) {
5536 distance = 0;
5537 for_each_domain(cpu, sd) {
5538 sd->cache_hot_time = migration_cost[distance]; /* NOTE(review): index is not bounds-checked here - confirm the array covers every domain level */
5539 distance++;
5540 }
5541 }
5542 /*
5543 * Print the results; the migration_cost= line is printed unconditionally:
5544 */
5545 if (migration_debug)
5546 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
5547 max_cache_size,
5548#ifdef CONFIG_X86
5549 cpu_khz/1000
5550#else
5551 -1
5552#endif
5553 );
5554 printk("migration_cost=");
5555 for (distance = 0; distance <= max_distance; distance++) {
5556 if (distance)
5557 printk(",");
5558 printk("%ld", (long)migration_cost[distance] / 1000); /* NOTE(review): the cast to long happens before the division - confirm no truncation where long is 32 bits */
5559 }
5560 printk("\n");
5561 j1 = jiffies;
5562 if (migration_debug)
5563 printk("migration: %ld seconds\n", (j1-j0)/HZ);
5564
5565 /*
5566 * Move back to the original CPU. NUMA-Q gets confused
5567 * if we migrate to another quad during bootup.
5568 */
5569 if (raw_smp_processor_id() != orig_cpu) {
5570 cpumask_t mask = cpumask_of_cpu(orig_cpu),
5571 saved_mask = current->cpus_allowed;
5572
5573 set_cpus_allowed(current, mask); /* restrict to orig_cpu only, to get moved back there */
5574 set_cpus_allowed(current, saved_mask); /* then restore the previous affinity mask */
5575 }
5576}
5577
5076#ifdef CONFIG_NUMA 5578#ifdef CONFIG_NUMA
5579
5077/** 5580/**
5078 * find_next_best_node - find the next node to include in a sched_domain 5581 * find_next_best_node - find the next node to include in a sched_domain
5079 * @node: node whose sched_domain we're building 5582 * @node: node whose sched_domain we're building
@@ -5439,6 +5942,10 @@ next_sg:
5439#endif 5942#endif
5440 cpu_attach_domain(sd, i); 5943 cpu_attach_domain(sd, i);
5441 } 5944 }
5945 /*
5946 * Tune cache-hot values:
5947 */
5948 calibrate_migration_costs(cpu_map);
5442} 5949}
5443/* 5950/*
5444 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5951 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
@@ -5505,7 +6012,7 @@ next_sg:
5505 * Detach sched domains from a group of cpus specified in cpu_map 6012 * Detach sched domains from a group of cpus specified in cpu_map
5506 * These cpus will now be attached to the NULL domain 6013 * These cpus will now be attached to the NULL domain
5507 */ 6014 */
5508static inline void detach_destroy_domains(const cpumask_t *cpu_map) 6015static void detach_destroy_domains(const cpumask_t *cpu_map)
5509{ 6016{
5510 int i; 6017 int i;
5511 6018
diff --git a/kernel/signal.c b/kernel/signal.c
index d7611f189ef7..d3efafd8109a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/audit.h> 27#include <linux/audit.h>
28#include <linux/capability.h>
28#include <asm/param.h> 29#include <asm/param.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/unistd.h> 31#include <asm/unistd.h>
@@ -329,13 +330,20 @@ void __exit_sighand(struct task_struct *tsk)
329 /* Ok, we're done with the signal handlers */ 330 /* Ok, we're done with the signal handlers */
330 tsk->sighand = NULL; 331 tsk->sighand = NULL;
331 if (atomic_dec_and_test(&sighand->count)) 332 if (atomic_dec_and_test(&sighand->count))
332 kmem_cache_free(sighand_cachep, sighand); 333 sighand_free(sighand);
333} 334}
334 335
335void exit_sighand(struct task_struct *tsk) 336void exit_sighand(struct task_struct *tsk)
336{ 337{
337 write_lock_irq(&tasklist_lock); 338 write_lock_irq(&tasklist_lock);
338 __exit_sighand(tsk); 339 rcu_read_lock();
340 if (tsk->sighand != NULL) {
341 struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
342 spin_lock(&sighand->siglock);
343 __exit_sighand(tsk);
344 spin_unlock(&sighand->siglock);
345 }
346 rcu_read_unlock();
339 write_unlock_irq(&tasklist_lock); 347 write_unlock_irq(&tasklist_lock);
340} 348}
341 349
@@ -345,19 +353,20 @@ void exit_sighand(struct task_struct *tsk)
345void __exit_signal(struct task_struct *tsk) 353void __exit_signal(struct task_struct *tsk)
346{ 354{
347 struct signal_struct * sig = tsk->signal; 355 struct signal_struct * sig = tsk->signal;
348 struct sighand_struct * sighand = tsk->sighand; 356 struct sighand_struct * sighand;
349 357
350 if (!sig) 358 if (!sig)
351 BUG(); 359 BUG();
352 if (!atomic_read(&sig->count)) 360 if (!atomic_read(&sig->count))
353 BUG(); 361 BUG();
362 rcu_read_lock();
363 sighand = rcu_dereference(tsk->sighand);
354 spin_lock(&sighand->siglock); 364 spin_lock(&sighand->siglock);
355 posix_cpu_timers_exit(tsk); 365 posix_cpu_timers_exit(tsk);
356 if (atomic_dec_and_test(&sig->count)) { 366 if (atomic_dec_and_test(&sig->count)) {
357 posix_cpu_timers_exit_group(tsk); 367 posix_cpu_timers_exit_group(tsk);
358 if (tsk == sig->curr_target)
359 sig->curr_target = next_thread(tsk);
360 tsk->signal = NULL; 368 tsk->signal = NULL;
369 __exit_sighand(tsk);
361 spin_unlock(&sighand->siglock); 370 spin_unlock(&sighand->siglock);
362 flush_sigqueue(&sig->shared_pending); 371 flush_sigqueue(&sig->shared_pending);
363 } else { 372 } else {
@@ -389,9 +398,11 @@ void __exit_signal(struct task_struct *tsk)
389 sig->nvcsw += tsk->nvcsw; 398 sig->nvcsw += tsk->nvcsw;
390 sig->nivcsw += tsk->nivcsw; 399 sig->nivcsw += tsk->nivcsw;
391 sig->sched_time += tsk->sched_time; 400 sig->sched_time += tsk->sched_time;
401 __exit_sighand(tsk);
392 spin_unlock(&sighand->siglock); 402 spin_unlock(&sighand->siglock);
393 sig = NULL; /* Marker for below. */ 403 sig = NULL; /* Marker for below. */
394 } 404 }
405 rcu_read_unlock();
395 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 406 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
396 flush_sigqueue(&tsk->pending); 407 flush_sigqueue(&tsk->pending);
397 if (sig) { 408 if (sig) {
@@ -465,7 +476,7 @@ unblock_all_signals(void)
465 spin_unlock_irqrestore(&current->sighand->siglock, flags); 476 spin_unlock_irqrestore(&current->sighand->siglock, flags);
466} 477}
467 478
468static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info) 479static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
469{ 480{
470 struct sigqueue *q, *first = NULL; 481 struct sigqueue *q, *first = NULL;
471 int still_pending = 0; 482 int still_pending = 0;
@@ -613,6 +624,33 @@ void signal_wake_up(struct task_struct *t, int resume)
613 * Returns 1 if any signals were found. 624 * Returns 1 if any signals were found.
614 * 625 *
615 * All callers must be holding the siglock. 626 * All callers must be holding the siglock.
627 *
628 * This version takes a sigset mask and looks at all signals,
629 * not just those in the first mask word.
630 */
631static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
632{
633 struct sigqueue *q, *n;
634 sigset_t m;
635
636 sigandsets(&m, mask, &s->signal); /* m: presumably the signals of 'mask' that are pending in 's' - confirm against sigandsets() */
637 if (sigisemptyset(&m))
638 return 0; /* none of the requested signals was found */
639
640 signandsets(&s->signal, &s->signal, mask); /* presumably clears the 'mask' bits from the pending set - confirm against signandsets() */
641 list_for_each_entry_safe(q, n, &s->list, list) { /* _safe variant: entries are unlinked while iterating */
642 if (sigismember(mask, q->info.si_signo)) {
643 list_del_init(&q->list);
644 __sigqueue_free(q);
645 }
646 }
647 return 1; /* at least one requested signal was pending */
648}
649/*
650 * Remove signals in mask from the pending set and queue.
651 * Returns 1 if any signals were found.
652 *
653 * All callers must be holding the siglock.
616 */ 654 */
617static int rm_from_queue(unsigned long mask, struct sigpending *s) 655static int rm_from_queue(unsigned long mask, struct sigpending *s)
618{ 656{
@@ -1080,18 +1118,29 @@ void zap_other_threads(struct task_struct *p)
1080} 1118}
1081 1119
1082/* 1120/*
1083 * Must be called with the tasklist_lock held for reading! 1121 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
1084 */ 1122 */
1085int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1123int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1086{ 1124{
1087 unsigned long flags; 1125 unsigned long flags;
1126 struct sighand_struct *sp;
1088 int ret; 1127 int ret;
1089 1128
1129retry:
1090 ret = check_kill_permission(sig, info, p); 1130 ret = check_kill_permission(sig, info, p);
1091 if (!ret && sig && p->sighand) { 1131 if (!ret && sig && (sp = rcu_dereference(p->sighand))) {
1092 spin_lock_irqsave(&p->sighand->siglock, flags); 1132 spin_lock_irqsave(&sp->siglock, flags);
1133 if (p->sighand != sp) {
1134 spin_unlock_irqrestore(&sp->siglock, flags);
1135 goto retry;
1136 }
1137 if ((atomic_read(&sp->count) == 0) ||
1138 (atomic_read(&p->usage) == 0)) {
1139 spin_unlock_irqrestore(&sp->siglock, flags);
1140 return -ESRCH;
1141 }
1093 ret = __group_send_sig_info(sig, info, p); 1142 ret = __group_send_sig_info(sig, info, p);
1094 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1143 spin_unlock_irqrestore(&sp->siglock, flags);
1095 } 1144 }
1096 1145
1097 return ret; 1146 return ret;
@@ -1136,14 +1185,21 @@ int
1136kill_proc_info(int sig, struct siginfo *info, pid_t pid) 1185kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1137{ 1186{
1138 int error; 1187 int error;
1188 int acquired_tasklist_lock = 0;
1139 struct task_struct *p; 1189 struct task_struct *p;
1140 1190
1141 read_lock(&tasklist_lock); 1191 rcu_read_lock();
1192 if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) {
1193 read_lock(&tasklist_lock);
1194 acquired_tasklist_lock = 1;
1195 }
1142 p = find_task_by_pid(pid); 1196 p = find_task_by_pid(pid);
1143 error = -ESRCH; 1197 error = -ESRCH;
1144 if (p) 1198 if (p)
1145 error = group_send_sig_info(sig, info, p); 1199 error = group_send_sig_info(sig, info, p);
1146 read_unlock(&tasklist_lock); 1200 if (unlikely(acquired_tasklist_lock))
1201 read_unlock(&tasklist_lock);
1202 rcu_read_unlock();
1147 return error; 1203 return error;
1148} 1204}
1149 1205
@@ -1163,8 +1219,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1163 ret = -ESRCH; 1219 ret = -ESRCH;
1164 goto out_unlock; 1220 goto out_unlock;
1165 } 1221 }
1166 if ((!info || ((unsigned long)info != 1 && 1222 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
1167 (unsigned long)info != 2 && SI_FROMUSER(info)))
1168 && (euid != p->suid) && (euid != p->uid) 1223 && (euid != p->suid) && (euid != p->uid)
1169 && (uid != p->suid) && (uid != p->uid)) { 1224 && (uid != p->suid) && (uid != p->uid)) {
1170 ret = -EPERM; 1225 ret = -EPERM;
@@ -1355,16 +1410,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1355{ 1410{
1356 unsigned long flags; 1411 unsigned long flags;
1357 int ret = 0; 1412 int ret = 0;
1413 struct sighand_struct *sh;
1358 1414
1359 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1415 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1360 read_lock(&tasklist_lock); 1416
1417 /*
1418 * The rcu based delayed sighand destroy makes it possible to
1419 * run this without tasklist lock held. The task struct itself
1420 * cannot go away as create_timer did get_task_struct().
1421 *
1422 * We return -1, when the task is marked exiting, so
1423 * posix_timer_event can redirect it to the group leader
1424 */
1425 rcu_read_lock();
1361 1426
1362 if (unlikely(p->flags & PF_EXITING)) { 1427 if (unlikely(p->flags & PF_EXITING)) {
1363 ret = -1; 1428 ret = -1;
1364 goto out_err; 1429 goto out_err;
1365 } 1430 }
1366 1431
1367 spin_lock_irqsave(&p->sighand->siglock, flags); 1432retry:
1433 sh = rcu_dereference(p->sighand);
1434
1435 spin_lock_irqsave(&sh->siglock, flags);
1436 if (p->sighand != sh) {
1437 /* We raced with exec() in a multithreaded process... */
1438 spin_unlock_irqrestore(&sh->siglock, flags);
1439 goto retry;
1440 }
1441
1442 /*
1443 * We do the check here again to handle the following scenario:
1444 *
1445 * CPU 0 CPU 1
1446 * send_sigqueue
1447 * check PF_EXITING
1448 * interrupt exit code running
1449 * __exit_signal
1450 * lock sighand->siglock
1451 * unlock sighand->siglock
1452 * lock sh->siglock
1453 * add(tsk->pending) flush_sigqueue(tsk->pending)
1454 *
1455 */
1456
1457 if (unlikely(p->flags & PF_EXITING)) {
1458 ret = -1;
1459 goto out;
1460 }
1368 1461
1369 if (unlikely(!list_empty(&q->list))) { 1462 if (unlikely(!list_empty(&q->list))) {
1370 /* 1463 /*
@@ -1388,9 +1481,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1388 signal_wake_up(p, sig == SIGKILL); 1481 signal_wake_up(p, sig == SIGKILL);
1389 1482
1390out: 1483out:
1391 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1484 spin_unlock_irqrestore(&sh->siglock, flags);
1392out_err: 1485out_err:
1393 read_unlock(&tasklist_lock); 1486 rcu_read_unlock();
1394 1487
1395 return ret; 1488 return ret;
1396} 1489}
@@ -1402,7 +1495,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1402 int ret = 0; 1495 int ret = 0;
1403 1496
1404 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1497 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1498
1405 read_lock(&tasklist_lock); 1499 read_lock(&tasklist_lock);
1500 /* Since it_lock is held, p->sighand cannot be NULL. */
1406 spin_lock_irqsave(&p->sighand->siglock, flags); 1501 spin_lock_irqsave(&p->sighand->siglock, flags);
1407 handle_stop_signal(sig, p); 1502 handle_stop_signal(sig, p);
1408 1503
@@ -1436,7 +1531,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1436out: 1531out:
1437 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1532 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1438 read_unlock(&tasklist_lock); 1533 read_unlock(&tasklist_lock);
1439 return(ret); 1534 return ret;
1440} 1535}
1441 1536
1442/* 1537/*
@@ -1786,7 +1881,7 @@ do_signal_stop(int signr)
1786 * We return zero if we still hold the siglock and should look 1881 * We return zero if we still hold the siglock and should look
1787 * for another signal without checking group_stop_count again. 1882 * for another signal without checking group_stop_count again.
1788 */ 1883 */
1789static inline int handle_group_stop(void) 1884static int handle_group_stop(void)
1790{ 1885{
1791 int stop_count; 1886 int stop_count;
1792 1887
@@ -2338,6 +2433,7 @@ int
2338do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) 2433do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2339{ 2434{
2340 struct k_sigaction *k; 2435 struct k_sigaction *k;
2436 sigset_t mask;
2341 2437
2342 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 2438 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
2343 return -EINVAL; 2439 return -EINVAL;
@@ -2385,9 +2481,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2385 *k = *act; 2481 *k = *act;
2386 sigdelsetmask(&k->sa.sa_mask, 2482 sigdelsetmask(&k->sa.sa_mask,
2387 sigmask(SIGKILL) | sigmask(SIGSTOP)); 2483 sigmask(SIGKILL) | sigmask(SIGSTOP));
2388 rm_from_queue(sigmask(sig), &t->signal->shared_pending); 2484 sigemptyset(&mask);
2485 sigaddset(&mask, sig);
2486 rm_from_queue_full(&mask, &t->signal->shared_pending);
2389 do { 2487 do {
2390 rm_from_queue(sigmask(sig), &t->pending); 2488 rm_from_queue_full(&mask, &t->pending);
2391 recalc_sigpending_tsk(t); 2489 recalc_sigpending_tsk(t);
2392 t = next_thread(t); 2490 t = next_thread(t);
2393 } while (t != current); 2491 } while (t != current);
@@ -2623,6 +2721,32 @@ sys_pause(void)
2623 2721
2624#endif 2722#endif
2625 2723
2724#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
2725asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2726{
2727 sigset_t newset;
2728
2729 /* XXX: Don't preclude handling different sized sigset_t's. */
2730 if (sigsetsize != sizeof(sigset_t))
2731 return -EINVAL;
2732
2733 if (copy_from_user(&newset, unewset, sizeof(newset)))
2734 return -EFAULT;
2735 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); /* SIGKILL and SIGSTOP are removed from the new blocked set */
2736
2737 spin_lock_irq(&current->sighand->siglock);
2738 current->saved_sigmask = current->blocked; /* keep the old mask; presumably restored later via TIF_RESTORE_SIGMASK - confirm */
2739 current->blocked = newset;
2740 recalc_sigpending();
2741 spin_unlock_irq(&current->sighand->siglock);
2742
2743 current->state = TASK_INTERRUPTIBLE; /* sleep interruptibly until woken up */
2744 schedule();
2745 set_thread_flag(TIF_RESTORE_SIGMASK);
2746 return -ERESTARTNOHAND;
2747}
2748#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
2749
2626void __init signals_init(void) 2750void __init signals_init(void)
2627{ 2751{
2628 sigqueue_cachep = 2752 sigqueue_cachep =
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b3d4dc858e35..dcfb5d731466 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -87,13 +87,9 @@ static int stop_machine(void)
87{ 87{
88 int i, ret = 0; 88 int i, ret = 0;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 mm_segment_t old_fs = get_fs();
91 90
92 /* One high-prio thread per cpu. We'll do this one. */ 91 /* One high-prio thread per cpu. We'll do this one. */
93 set_fs(KERNEL_DS); 92 sched_setscheduler(current, SCHED_FIFO, &param);
94 sys_sched_setscheduler(current->pid, SCHED_FIFO,
95 (struct sched_param __user *)&param);
96 set_fs(old_fs);
97 93
98 atomic_set(&stopmachine_thread_ack, 0); 94 atomic_set(&stopmachine_thread_ack, 0);
99 stopmachine_num_threads = 0; 95 stopmachine_num_threads = 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index bce933ebb29f..0929c698affc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -19,6 +19,7 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <linux/capability.h>
22#include <linux/device.h> 23#include <linux/device.h>
23#include <linux/key.h> 24#include <linux/key.h>
24#include <linux/times.h> 25#include <linux/times.h>
@@ -32,6 +33,7 @@
32 33
33#include <linux/compat.h> 34#include <linux/compat.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/kprobes.h>
35 37
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37#include <asm/io.h> 39#include <asm/io.h>
@@ -168,7 +170,7 @@ EXPORT_SYMBOL(notifier_chain_unregister);
168 * of the last notifier function called. 170 * of the last notifier function called.
169 */ 171 */
170 172
171int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 173int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
172{ 174{
173 int ret=NOTIFY_DONE; 175 int ret=NOTIFY_DONE;
174 struct notifier_block *nb = *n; 176 struct notifier_block *nb = *n;
@@ -222,6 +224,18 @@ int unregister_reboot_notifier(struct notifier_block * nb)
222 224
223EXPORT_SYMBOL(unregister_reboot_notifier); 225EXPORT_SYMBOL(unregister_reboot_notifier);
224 226
227#ifndef CONFIG_SECURITY /* this definition is only built when CONFIG_SECURITY is not set */
228int capable(int cap)
229{
230 if (cap_raised(current->cap_effective, cap)) { /* is 'cap' in the current task's effective set? */
231 current->flags |= PF_SUPERPRIV; /* presumably records that a privilege was used - confirm PF_SUPERPRIV semantics */
232 return 1;
233 }
234 return 0; /* capability not held */
235}
236EXPORT_SYMBOL(capable);
237#endif
238
225static int set_one_prio(struct task_struct *p, int niceval, int error) 239static int set_one_prio(struct task_struct *p, int niceval, int error)
226{ 240{
227 int no_nice; 241 int no_nice;
@@ -426,23 +440,25 @@ void kernel_kexec(void)
426} 440}
427EXPORT_SYMBOL_GPL(kernel_kexec); 441EXPORT_SYMBOL_GPL(kernel_kexec);
428 442
443void kernel_shutdown_prepare(enum system_states state) /* common preparation for kernel_halt() and kernel_power_off() */
444{
445 notifier_call_chain(&reboot_notifier_list,
446 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); /* any state other than SYSTEM_HALT is announced as SYS_POWER_OFF */
447 system_state = state;
448 device_shutdown();
449}
429/** 450/**
430 * kernel_halt - halt the system 451 * kernel_halt - halt the system
431 * 452 *
432 * Shutdown everything and perform a clean system halt. 453 * Shutdown everything and perform a clean system halt.
433 */ 454 */
434void kernel_halt_prepare(void)
435{
436 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
437 system_state = SYSTEM_HALT;
438 device_shutdown();
439}
440void kernel_halt(void) 455void kernel_halt(void)
441{ 456{
442 kernel_halt_prepare(); 457 kernel_shutdown_prepare(SYSTEM_HALT);
443 printk(KERN_EMERG "System halted.\n"); 458 printk(KERN_EMERG "System halted.\n");
444 machine_halt(); 459 machine_halt();
445} 460}
461
446EXPORT_SYMBOL_GPL(kernel_halt); 462EXPORT_SYMBOL_GPL(kernel_halt);
447 463
448/** 464/**
@@ -450,20 +466,13 @@ EXPORT_SYMBOL_GPL(kernel_halt);
450 * 466 *
451 * Shutdown everything and perform a clean system power_off. 467 * Shutdown everything and perform a clean system power_off.
452 */ 468 */
453void kernel_power_off_prepare(void)
454{
455 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
456 system_state = SYSTEM_POWER_OFF;
457 device_shutdown();
458}
459void kernel_power_off(void) 469void kernel_power_off(void)
460{ 470{
461 kernel_power_off_prepare(); 471 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
462 printk(KERN_EMERG "Power down.\n"); 472 printk(KERN_EMERG "Power down.\n");
463 machine_power_off(); 473 machine_power_off();
464} 474}
465EXPORT_SYMBOL_GPL(kernel_power_off); 475EXPORT_SYMBOL_GPL(kernel_power_off);
466
467/* 476/*
468 * Reboot system call: for obvious reasons only root may call it, 477 * Reboot system call: for obvious reasons only root may call it,
469 * and even root needs to set up some magic numbers in the registers 478 * and even root needs to set up some magic numbers in the registers
@@ -488,6 +497,12 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
488 magic2 != LINUX_REBOOT_MAGIC2C)) 497 magic2 != LINUX_REBOOT_MAGIC2C))
489 return -EINVAL; 498 return -EINVAL;
490 499
500 /* Instead of trying to make the power_off code look like
501 * halt when pm_power_off is not set do it the easy way.
502 */
503 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
504 cmd = LINUX_REBOOT_CMD_HALT;
505
491 lock_kernel(); 506 lock_kernel();
492 switch (cmd) { 507 switch (cmd) {
493 case LINUX_REBOOT_CMD_RESTART: 508 case LINUX_REBOOT_CMD_RESTART:
@@ -1083,10 +1098,11 @@ asmlinkage long sys_times(struct tms __user * tbuf)
1083asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) 1098asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1084{ 1099{
1085 struct task_struct *p; 1100 struct task_struct *p;
1101 struct task_struct *group_leader = current->group_leader;
1086 int err = -EINVAL; 1102 int err = -EINVAL;
1087 1103
1088 if (!pid) 1104 if (!pid)
1089 pid = current->pid; 1105 pid = group_leader->pid;
1090 if (!pgid) 1106 if (!pgid)
1091 pgid = pid; 1107 pgid = pid;
1092 if (pgid < 0) 1108 if (pgid < 0)
@@ -1106,16 +1122,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1106 if (!thread_group_leader(p)) 1122 if (!thread_group_leader(p))
1107 goto out; 1123 goto out;
1108 1124
1109 if (p->parent == current || p->real_parent == current) { 1125 if (p->real_parent == group_leader) {
1110 err = -EPERM; 1126 err = -EPERM;
1111 if (p->signal->session != current->signal->session) 1127 if (p->signal->session != group_leader->signal->session)
1112 goto out; 1128 goto out;
1113 err = -EACCES; 1129 err = -EACCES;
1114 if (p->did_exec) 1130 if (p->did_exec)
1115 goto out; 1131 goto out;
1116 } else { 1132 } else {
1117 err = -ESRCH; 1133 err = -ESRCH;
1118 if (p != current) 1134 if (p != group_leader)
1119 goto out; 1135 goto out;
1120 } 1136 }
1121 1137
@@ -1127,7 +1143,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1127 struct task_struct *p; 1143 struct task_struct *p;
1128 1144
1129 do_each_task_pid(pgid, PIDTYPE_PGID, p) { 1145 do_each_task_pid(pgid, PIDTYPE_PGID, p) {
1130 if (p->signal->session == current->signal->session) 1146 if (p->signal->session == group_leader->signal->session)
1131 goto ok_pgid; 1147 goto ok_pgid;
1132 } while_each_task_pid(pgid, PIDTYPE_PGID, p); 1148 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
1133 goto out; 1149 goto out;
@@ -1207,24 +1223,22 @@ asmlinkage long sys_getsid(pid_t pid)
1207 1223
1208asmlinkage long sys_setsid(void) 1224asmlinkage long sys_setsid(void)
1209{ 1225{
1226 struct task_struct *group_leader = current->group_leader;
1210 struct pid *pid; 1227 struct pid *pid;
1211 int err = -EPERM; 1228 int err = -EPERM;
1212 1229
1213 if (!thread_group_leader(current))
1214 return -EINVAL;
1215
1216 down(&tty_sem); 1230 down(&tty_sem);
1217 write_lock_irq(&tasklist_lock); 1231 write_lock_irq(&tasklist_lock);
1218 1232
1219 pid = find_pid(PIDTYPE_PGID, current->pid); 1233 pid = find_pid(PIDTYPE_PGID, group_leader->pid);
1220 if (pid) 1234 if (pid)
1221 goto out; 1235 goto out;
1222 1236
1223 current->signal->leader = 1; 1237 group_leader->signal->leader = 1;
1224 __set_special_pids(current->pid, current->pid); 1238 __set_special_pids(group_leader->pid, group_leader->pid);
1225 current->signal->tty = NULL; 1239 group_leader->signal->tty = NULL;
1226 current->signal->tty_old_pgrp = 0; 1240 group_leader->signal->tty_old_pgrp = 0;
1227 err = process_group(current); 1241 err = process_group(group_leader);
1228out: 1242out:
1229 write_unlock_irq(&tasklist_lock); 1243 write_unlock_irq(&tasklist_lock);
1230 up(&tty_sem); 1244 up(&tty_sem);
@@ -1686,7 +1700,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1686 if (unlikely(!p->signal)) 1700 if (unlikely(!p->signal))
1687 return; 1701 return;
1688 1702
1703 utime = stime = cputime_zero;
1704
1689 switch (who) { 1705 switch (who) {
1706 case RUSAGE_BOTH:
1690 case RUSAGE_CHILDREN: 1707 case RUSAGE_CHILDREN:
1691 spin_lock_irqsave(&p->sighand->siglock, flags); 1708 spin_lock_irqsave(&p->sighand->siglock, flags);
1692 utime = p->signal->cutime; 1709 utime = p->signal->cutime;
@@ -1696,22 +1713,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1696 r->ru_minflt = p->signal->cmin_flt; 1713 r->ru_minflt = p->signal->cmin_flt;
1697 r->ru_majflt = p->signal->cmaj_flt; 1714 r->ru_majflt = p->signal->cmaj_flt;
1698 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1715 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1699 cputime_to_timeval(utime, &r->ru_utime); 1716
1700 cputime_to_timeval(stime, &r->ru_stime); 1717 if (who == RUSAGE_CHILDREN)
1701 break; 1718 break;
1719
1702 case RUSAGE_SELF: 1720 case RUSAGE_SELF:
1703 spin_lock_irqsave(&p->sighand->siglock, flags);
1704 utime = stime = cputime_zero;
1705 goto sum_group;
1706 case RUSAGE_BOTH:
1707 spin_lock_irqsave(&p->sighand->siglock, flags);
1708 utime = p->signal->cutime;
1709 stime = p->signal->cstime;
1710 r->ru_nvcsw = p->signal->cnvcsw;
1711 r->ru_nivcsw = p->signal->cnivcsw;
1712 r->ru_minflt = p->signal->cmin_flt;
1713 r->ru_majflt = p->signal->cmaj_flt;
1714 sum_group:
1715 utime = cputime_add(utime, p->signal->utime); 1721 utime = cputime_add(utime, p->signal->utime);
1716 stime = cputime_add(stime, p->signal->stime); 1722 stime = cputime_add(stime, p->signal->stime);
1717 r->ru_nvcsw += p->signal->nvcsw; 1723 r->ru_nvcsw += p->signal->nvcsw;
@@ -1728,13 +1734,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1728 r->ru_majflt += t->maj_flt; 1734 r->ru_majflt += t->maj_flt;
1729 t = next_thread(t); 1735 t = next_thread(t);
1730 } while (t != p); 1736 } while (t != p);
1731 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1732 cputime_to_timeval(utime, &r->ru_utime);
1733 cputime_to_timeval(stime, &r->ru_stime);
1734 break; 1737 break;
1738
1735 default: 1739 default:
1736 BUG(); 1740 BUG();
1737 } 1741 }
1742
1743 cputime_to_timeval(utime, &r->ru_utime);
1744 cputime_to_timeval(stime, &r->ru_stime);
1738} 1745}
1739 1746
1740int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1747int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370e2efa..17313b99e53d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -82,6 +82,28 @@ cond_syscall(compat_sys_socketcall);
82cond_syscall(sys_inotify_init); 82cond_syscall(sys_inotify_init);
83cond_syscall(sys_inotify_add_watch); 83cond_syscall(sys_inotify_add_watch);
84cond_syscall(sys_inotify_rm_watch); 84cond_syscall(sys_inotify_rm_watch);
85cond_syscall(sys_migrate_pages);
86cond_syscall(sys_chown16);
87cond_syscall(sys_fchown16);
88cond_syscall(sys_getegid16);
89cond_syscall(sys_geteuid16);
90cond_syscall(sys_getgid16);
91cond_syscall(sys_getgroups16);
92cond_syscall(sys_getresgid16);
93cond_syscall(sys_getresuid16);
94cond_syscall(sys_getuid16);
95cond_syscall(sys_lchown16);
96cond_syscall(sys_setfsgid16);
97cond_syscall(sys_setfsuid16);
98cond_syscall(sys_setgid16);
99cond_syscall(sys_setgroups16);
100cond_syscall(sys_setregid16);
101cond_syscall(sys_setresgid16);
102cond_syscall(sys_setresuid16);
103cond_syscall(sys_setreuid16);
104cond_syscall(sys_setuid16);
105cond_syscall(sys_vm86old);
106cond_syscall(sys_vm86);
85 107
86/* arch-specific weak syscall entries */ 108/* arch-specific weak syscall entries */
87cond_syscall(sys_pciconfig_read); 109cond_syscall(sys_pciconfig_read);
@@ -90,3 +112,5 @@ cond_syscall(sys_pciconfig_iobase);
90cond_syscall(sys32_ipc); 112cond_syscall(sys32_ipc);
91cond_syscall(sys32_sysctl); 113cond_syscall(sys32_sysctl);
92cond_syscall(ppc_rtas); 114cond_syscall(ppc_rtas);
115cond_syscall(sys_spu_run);
116cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9990e10192e8..cb99a42f8b37 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -25,12 +25,14 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
28#include <linux/capability.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
29#include <linux/utsname.h> 30#include <linux/utsname.h>
30#include <linux/capability.h> 31#include <linux/capability.h>
31#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
32#include <linux/init.h> 33#include <linux/init.h>
33#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/kobject.h>
34#include <linux/net.h> 36#include <linux/net.h>
35#include <linux/sysrq.h> 37#include <linux/sysrq.h>
36#include <linux/highuid.h> 38#include <linux/highuid.h>
@@ -67,6 +69,8 @@ extern int min_free_kbytes;
67extern int printk_ratelimit_jiffies; 69extern int printk_ratelimit_jiffies;
68extern int printk_ratelimit_burst; 70extern int printk_ratelimit_burst;
69extern int pid_max_min, pid_max_max; 71extern int pid_max_min, pid_max_max;
72extern int sysctl_drop_caches;
73extern int percpu_pagelist_fraction;
70 74
71#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 75#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
72int unknown_nmi_panic; 76int unknown_nmi_panic;
@@ -77,15 +81,13 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
77/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
78static int maxolduid = 65535; 82static int maxolduid = 65535;
79static int minolduid; 83static int minolduid;
84static int min_percpu_pagelist_fract = 8;
80 85
81static int ngroups_max = NGROUPS_MAX; 86static int ngroups_max = NGROUPS_MAX;
82 87
83#ifdef CONFIG_KMOD 88#ifdef CONFIG_KMOD
84extern char modprobe_path[]; 89extern char modprobe_path[];
85#endif 90#endif
86#ifdef CONFIG_HOTPLUG
87extern char hotplug_path[];
88#endif
89#ifdef CONFIG_CHR_DEV_SG 91#ifdef CONFIG_CHR_DEV_SG
90extern int sg_big_buff; 92extern int sg_big_buff;
91#endif 93#endif
@@ -110,7 +112,7 @@ extern int pwrsw_enabled;
110extern int unaligned_enabled; 112extern int unaligned_enabled;
111#endif 113#endif
112 114
113#ifdef CONFIG_ARCH_S390 115#ifdef CONFIG_S390
114#ifdef CONFIG_MATHEMU 116#ifdef CONFIG_MATHEMU
115extern int sysctl_ieee_emulation_warnings; 117extern int sysctl_ieee_emulation_warnings;
116#endif 118#endif
@@ -397,8 +399,8 @@ static ctl_table kern_table[] = {
397 { 399 {
398 .ctl_name = KERN_HOTPLUG, 400 .ctl_name = KERN_HOTPLUG,
399 .procname = "hotplug", 401 .procname = "hotplug",
400 .data = &hotplug_path, 402 .data = &uevent_helper,
401 .maxlen = HOTPLUG_PATH_LEN, 403 .maxlen = UEVENT_HELPER_PATH_LEN,
402 .mode = 0644, 404 .mode = 0644,
403 .proc_handler = &proc_dostring, 405 .proc_handler = &proc_dostring,
404 .strategy = &sysctl_string, 406 .strategy = &sysctl_string,
@@ -544,7 +546,7 @@ static ctl_table kern_table[] = {
544 .extra1 = &minolduid, 546 .extra1 = &minolduid,
545 .extra2 = &maxolduid, 547 .extra2 = &maxolduid,
546 }, 548 },
547#ifdef CONFIG_ARCH_S390 549#ifdef CONFIG_S390
548#ifdef CONFIG_MATHEMU 550#ifdef CONFIG_MATHEMU
549 { 551 {
550 .ctl_name = KERN_IEEE_EMULATION_WARNINGS, 552 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
@@ -646,7 +648,7 @@ static ctl_table kern_table[] = {
646 .mode = 0644, 648 .mode = 0644,
647 .proc_handler = &proc_dointvec, 649 .proc_handler = &proc_dointvec,
648 }, 650 },
649#if defined(CONFIG_ARCH_S390) 651#if defined(CONFIG_S390) && defined(CONFIG_SMP)
650 { 652 {
651 .ctl_name = KERN_SPIN_RETRY, 653 .ctl_name = KERN_SPIN_RETRY,
652 .procname = "spin_retry", 654 .procname = "spin_retry",
@@ -777,6 +779,15 @@ static ctl_table vm_table[] = {
777 .strategy = &sysctl_intvec, 779 .strategy = &sysctl_intvec,
778 }, 780 },
779 { 781 {
782 .ctl_name = VM_DROP_PAGECACHE,
783 .procname = "drop_caches",
784 .data = &sysctl_drop_caches,
785 .maxlen = sizeof(int),
786 .mode = 0644,
787 .proc_handler = drop_caches_sysctl_handler,
788 .strategy = &sysctl_intvec,
789 },
790 {
780 .ctl_name = VM_MIN_FREE_KBYTES, 791 .ctl_name = VM_MIN_FREE_KBYTES,
781 .procname = "min_free_kbytes", 792 .procname = "min_free_kbytes",
782 .data = &min_free_kbytes, 793 .data = &min_free_kbytes,
@@ -786,6 +797,16 @@ static ctl_table vm_table[] = {
786 .strategy = &sysctl_intvec, 797 .strategy = &sysctl_intvec,
787 .extra1 = &zero, 798 .extra1 = &zero,
788 }, 799 },
800 {
801 .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
802 .procname = "percpu_pagelist_fraction",
803 .data = &percpu_pagelist_fraction,
804 .maxlen = sizeof(percpu_pagelist_fraction),
805 .mode = 0644,
806 .proc_handler = &percpu_pagelist_fraction_sysctl_handler,
807 .strategy = &sysctl_intvec,
808 .extra1 = &min_percpu_pagelist_fract,
809 },
789#ifdef CONFIG_MMU 810#ifdef CONFIG_MMU
790 { 811 {
791 .ctl_name = VM_MAX_MAP_COUNT, 812 .ctl_name = VM_MAX_MAP_COUNT,
@@ -849,6 +870,17 @@ static ctl_table vm_table[] = {
849 .strategy = &sysctl_jiffies, 870 .strategy = &sysctl_jiffies,
850 }, 871 },
851#endif 872#endif
873#ifdef CONFIG_NUMA
874 {
875 .ctl_name = VM_ZONE_RECLAIM_MODE,
876 .procname = "zone_reclaim_mode",
877 .data = &zone_reclaim_mode,
878 .maxlen = sizeof(zone_reclaim_mode),
879 .mode = 0644,
880 .proc_handler = &proc_dointvec,
881 .strategy = &zero,
882 },
883#endif
852 { .ctl_name = 0 } 884 { .ctl_name = 0 }
853}; 885};
854 886
@@ -2192,29 +2224,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2192 void __user *oldval, size_t __user *oldlenp, 2224 void __user *oldval, size_t __user *oldlenp,
2193 void __user *newval, size_t newlen, void **context) 2225 void __user *newval, size_t newlen, void **context)
2194{ 2226{
2195 size_t l, len;
2196
2197 if (!table->data || !table->maxlen) 2227 if (!table->data || !table->maxlen)
2198 return -ENOTDIR; 2228 return -ENOTDIR;
2199 2229
2200 if (oldval && oldlenp) { 2230 if (oldval && oldlenp) {
2201 if (get_user(len, oldlenp)) 2231 size_t bufsize;
2232 if (get_user(bufsize, oldlenp))
2202 return -EFAULT; 2233 return -EFAULT;
2203 if (len) { 2234 if (bufsize) {
2204 l = strlen(table->data); 2235 size_t len = strlen(table->data), copied;
2205 if (len > l) len = l; 2236
2206 if (len >= table->maxlen) 2237 /* This shouldn't trigger for a well-formed sysctl */
2238 if (len > table->maxlen)
2207 len = table->maxlen; 2239 len = table->maxlen;
2208 if(copy_to_user(oldval, table->data, len)) 2240
2209 return -EFAULT; 2241 /* Copy up to a max of bufsize-1 bytes of the string */
2210 if(put_user(0, ((char __user *) oldval) + len)) 2242 copied = (len >= bufsize) ? bufsize - 1 : len;
2243
2244 if (copy_to_user(oldval, table->data, copied) ||
2245 put_user(0, (char __user *)(oldval + copied)))
2211 return -EFAULT; 2246 return -EFAULT;
2212 if(put_user(len, oldlenp)) 2247 if (put_user(len, oldlenp))
2213 return -EFAULT; 2248 return -EFAULT;
2214 } 2249 }
2215 } 2250 }
2216 if (newval && newlen) { 2251 if (newval && newlen) {
2217 len = newlen; 2252 size_t len = newlen;
2218 if (len > table->maxlen) 2253 if (len > table->maxlen)
2219 len = table->maxlen; 2254 len = table->maxlen;
2220 if(copy_from_user(table->data, newval, len)) 2255 if(copy_from_user(table->data, newval, len))
@@ -2223,7 +2258,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2223 len--; 2258 len--;
2224 ((char *) table->data)[len] = 0; 2259 ((char *) table->data)[len] = 0;
2225 } 2260 }
2226 return 0; 2261 return 1;
2227} 2262}
2228 2263
2229/* 2264/*
diff --git a/kernel/time.c b/kernel/time.c
index 245d595a13cb..7477b1d2079e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h>
32#include <linux/errno.h> 33#include <linux/errno.h>
33#include <linux/smp_lock.h> 34#include <linux/smp_lock.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
@@ -154,6 +155,9 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
154 static int firsttime = 1; 155 static int firsttime = 1;
155 int error = 0; 156 int error = 0;
156 157
158 if (!timespec_valid(tv))
159 return -EINVAL;
160
157 error = security_settime(tv, tz); 161 error = security_settime(tv, tz);
158 if (error) 162 if (error)
159 return error; 163 return error;
@@ -561,6 +565,108 @@ void getnstimeofday(struct timespec *tv)
561EXPORT_SYMBOL_GPL(getnstimeofday); 565EXPORT_SYMBOL_GPL(getnstimeofday);
562#endif 566#endif
563 567
568/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
569 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
570 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
571 *
572 * [For the Julian calendar (which was used in Russia before 1917,
573 * Britain & colonies before 1752, anywhere else before 1582,
574 * and is still in use by some communities) leave out the
575 * -year/100+year/400 terms, and add 10.]
576 *
577 * This algorithm was first published by Gauss (I think).
578 *
579 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
580 * machines where long is 32-bit! (However, as time_t is signed, we
581 * will already get problems at other places on 2038-01-19 03:14:08)
582 */
583unsigned long
584mktime(const unsigned int year0, const unsigned int mon0,
585 const unsigned int day, const unsigned int hour,
586 const unsigned int min, const unsigned int sec)
587{
588 unsigned int mon = mon0, year = year0;
589
590 /* 1..12 -> 11,12,1..10 */
591 if (0 >= (int) (mon -= 2)) {
592 mon += 12; /* Puts Feb last since it has leap day */
593 year -= 1;
594 }
595
596 return ((((unsigned long)
597 (year/4 - year/100 + year/400 + 367*mon/12 + day) +
598 year*365 - 719499
599 )*24 + hour /* now have hours */
600 )*60 + min /* now have minutes */
601 )*60 + sec; /* finally seconds */
602}
603
604EXPORT_SYMBOL(mktime);
605
606/**
607 * set_normalized_timespec - set timespec sec and nsec parts and normalize
608 *
609 * @ts: pointer to timespec variable to be set
610 * @sec: seconds to set
611 * @nsec: nanoseconds to set
612 *
613 * Set seconds and nanoseconds field of a timespec variable and
614 * normalize to the timespec storage format
615 *
616 * Note: The tv_nsec part is always in the range of
617 * 0 <= tv_nsec < NSEC_PER_SEC
618 * For negative values only the tv_sec field is negative !
619 */
620void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
621{
622 while (nsec >= NSEC_PER_SEC) {
623 nsec -= NSEC_PER_SEC;
624 ++sec;
625 }
626 while (nsec < 0) {
627 nsec += NSEC_PER_SEC;
628 --sec;
629 }
630 ts->tv_sec = sec;
631 ts->tv_nsec = nsec;
632}
633
634/**
635 * ns_to_timespec - Convert nanoseconds to timespec
636 * @nsec: the nanoseconds value to be converted
637 *
638 * Returns the timespec representation of the nsec parameter.
639 */
640inline struct timespec ns_to_timespec(const nsec_t nsec)
641{
642 struct timespec ts;
643
644 if (nsec)
645 ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC,
646 &ts.tv_nsec);
647 else
648 ts.tv_sec = ts.tv_nsec = 0;
649
650 return ts;
651}
652
653/**
654 * ns_to_timeval - Convert nanoseconds to timeval
655 * @nsec: the nanoseconds value to be converted
656 *
657 * Returns the timeval representation of the nsec parameter.
658 */
659struct timeval ns_to_timeval(const nsec_t nsec)
660{
661 struct timespec ts = ns_to_timespec(nsec);
662 struct timeval tv;
663
664 tv.tv_sec = ts.tv_sec;
665 tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
666
667 return tv;
668}
669
564#if (BITS_PER_LONG < 64) 670#if (BITS_PER_LONG < 64)
565u64 get_jiffies_64(void) 671u64 get_jiffies_64(void)
566{ 672{
diff --git a/kernel/timer.c b/kernel/timer.c
index fd74268d8663..4f1cb0ab5251 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -33,6 +33,7 @@
33#include <linux/posix-timers.h> 33#include <linux/posix-timers.h>
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/delay.h>
36 37
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38#include <asm/unistd.h> 39#include <asm/unistd.h>
@@ -857,6 +858,7 @@ static void run_timer_softirq(struct softirq_action *h)
857{ 858{
858 tvec_base_t *base = &__get_cpu_var(tvec_bases); 859 tvec_base_t *base = &__get_cpu_var(tvec_bases);
859 860
861 hrtimer_run_queues();
860 if (time_after_eq(jiffies, base->timer_jiffies)) 862 if (time_after_eq(jiffies, base->timer_jiffies))
861 __run_timers(base); 863 __run_timers(base);
862} 864}
@@ -1118,62 +1120,6 @@ asmlinkage long sys_gettid(void)
1118 return current->pid; 1120 return current->pid;
1119} 1121}
1120 1122
1121static long __sched nanosleep_restart(struct restart_block *restart)
1122{
1123 unsigned long expire = restart->arg0, now = jiffies;
1124 struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
1125 long ret;
1126
1127 /* Did it expire while we handled signals? */
1128 if (!time_after(expire, now))
1129 return 0;
1130
1131 expire = schedule_timeout_interruptible(expire - now);
1132
1133 ret = 0;
1134 if (expire) {
1135 struct timespec t;
1136 jiffies_to_timespec(expire, &t);
1137
1138 ret = -ERESTART_RESTARTBLOCK;
1139 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1140 ret = -EFAULT;
1141 /* The 'restart' block is already filled in */
1142 }
1143 return ret;
1144}
1145
1146asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1147{
1148 struct timespec t;
1149 unsigned long expire;
1150 long ret;
1151
1152 if (copy_from_user(&t, rqtp, sizeof(t)))
1153 return -EFAULT;
1154
1155 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
1156 return -EINVAL;
1157
1158 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1159 expire = schedule_timeout_interruptible(expire);
1160
1161 ret = 0;
1162 if (expire) {
1163 struct restart_block *restart;
1164 jiffies_to_timespec(expire, &t);
1165 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1166 return -EFAULT;
1167
1168 restart = &current_thread_info()->restart_block;
1169 restart->fn = nanosleep_restart;
1170 restart->arg0 = jiffies + expire;
1171 restart->arg1 = (unsigned long) rmtp;
1172 ret = -ERESTART_RESTARTBLOCK;
1173 }
1174 return ret;
1175}
1176
1177/* 1123/*
1178 * sys_sysinfo - fill in sysinfo struct 1124 * sys_sysinfo - fill in sysinfo struct
1179 */ 1125 */
diff --git a/kernel/uid16.c b/kernel/uid16.c
index f669941e8b26..aa25605027c8 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -10,6 +10,7 @@
10#include <linux/notifier.h> 10#include <linux/notifier.h>
11#include <linux/reboot.h> 11#include <linux/reboot.h>
12#include <linux/prctl.h> 12#include <linux/prctl.h>
13#include <linux/capability.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/highuid.h> 15#include <linux/highuid.h>
15#include <linux/security.h> 16#include <linux/security.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2bd5aee1c736..b052e2c4c710 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,7 +29,8 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30 30
31/* 31/*
32 * The per-CPU workqueue (if single thread, we always use cpu 0's). 32 * The per-CPU workqueue (if single thread, we always use the first
33 * possible cpu).
33 * 34 *
34 * The sequence counters are for flush_scheduled_work(). It wants to wait 35 * The sequence counters are for flush_scheduled_work(). It wants to wait
35 * until all currently-scheduled works are completed, but it doesn't 36 * until all currently-scheduled works are completed, but it doesn't
@@ -69,6 +70,8 @@ struct workqueue_struct {
69static DEFINE_SPINLOCK(workqueue_lock); 70static DEFINE_SPINLOCK(workqueue_lock);
70static LIST_HEAD(workqueues); 71static LIST_HEAD(workqueues);
71 72
73static int singlethread_cpu;
74
72/* If it's single threaded, it isn't in the list of workqueues. */ 75/* If it's single threaded, it isn't in the list of workqueues. */
73static inline int is_single_threaded(struct workqueue_struct *wq) 76static inline int is_single_threaded(struct workqueue_struct *wq)
74{ 77{
@@ -102,7 +105,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
102 105
103 if (!test_and_set_bit(0, &work->pending)) { 106 if (!test_and_set_bit(0, &work->pending)) {
104 if (unlikely(is_single_threaded(wq))) 107 if (unlikely(is_single_threaded(wq)))
105 cpu = any_online_cpu(cpu_online_map); 108 cpu = singlethread_cpu;
106 BUG_ON(!list_empty(&work->entry)); 109 BUG_ON(!list_empty(&work->entry));
107 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 110 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
108 ret = 1; 111 ret = 1;
@@ -118,7 +121,7 @@ static void delayed_work_timer_fn(unsigned long __data)
118 int cpu = smp_processor_id(); 121 int cpu = smp_processor_id();
119 122
120 if (unlikely(is_single_threaded(wq))) 123 if (unlikely(is_single_threaded(wq)))
121 cpu = any_online_cpu(cpu_online_map); 124 cpu = singlethread_cpu;
122 125
123 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 126 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
124} 127}
@@ -144,7 +147,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
144 return ret; 147 return ret;
145} 148}
146 149
147static inline void run_workqueue(struct cpu_workqueue_struct *cwq) 150static void run_workqueue(struct cpu_workqueue_struct *cwq)
148{ 151{
149 unsigned long flags; 152 unsigned long flags;
150 153
@@ -267,7 +270,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
267 270
268 if (is_single_threaded(wq)) { 271 if (is_single_threaded(wq)) {
269 /* Always use first cpu's area. */ 272 /* Always use first cpu's area. */
270 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); 273 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
271 } else { 274 } else {
272 int cpu; 275 int cpu;
273 276
@@ -315,12 +318,17 @@ struct workqueue_struct *__create_workqueue(const char *name,
315 return NULL; 318 return NULL;
316 319
317 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 320 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
321 if (!wq->cpu_wq) {
322 kfree(wq);
323 return NULL;
324 }
325
318 wq->name = name; 326 wq->name = name;
319 /* We don't need the distraction of CPUs appearing and vanishing. */ 327 /* We don't need the distraction of CPUs appearing and vanishing. */
320 lock_cpu_hotplug(); 328 lock_cpu_hotplug();
321 if (singlethread) { 329 if (singlethread) {
322 INIT_LIST_HEAD(&wq->list); 330 INIT_LIST_HEAD(&wq->list);
323 p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); 331 p = create_workqueue_thread(wq, singlethread_cpu);
324 if (!p) 332 if (!p)
325 destroy = 1; 333 destroy = 1;
326 else 334 else
@@ -374,7 +382,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
374 /* We don't need the distraction of CPUs appearing and vanishing. */ 382 /* We don't need the distraction of CPUs appearing and vanishing. */
375 lock_cpu_hotplug(); 383 lock_cpu_hotplug();
376 if (is_single_threaded(wq)) 384 if (is_single_threaded(wq))
377 cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); 385 cleanup_workqueue_thread(wq, singlethread_cpu);
378 else { 386 else {
379 for_each_online_cpu(cpu) 387 for_each_online_cpu(cpu)
380 cleanup_workqueue_thread(wq, cpu); 388 cleanup_workqueue_thread(wq, cpu);
@@ -419,6 +427,25 @@ int schedule_delayed_work_on(int cpu,
419 return ret; 427 return ret;
420} 428}
421 429
430int schedule_on_each_cpu(void (*func) (void *info), void *info)
431{
432 int cpu;
433 struct work_struct *work;
434
435 work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
436
437 if (!work)
438 return -ENOMEM;
439 for_each_online_cpu(cpu) {
440 INIT_WORK(work + cpu, func, info);
441 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
442 work + cpu);
443 }
444 flush_workqueue(keventd_wq);
445 kfree(work);
446 return 0;
447}
448
422void flush_scheduled_work(void) 449void flush_scheduled_work(void)
423{ 450{
424 flush_workqueue(keventd_wq); 451 flush_workqueue(keventd_wq);
@@ -543,6 +570,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
543 570
544void init_workqueues(void) 571void init_workqueues(void)
545{ 572{
573 singlethread_cpu = first_cpu(cpu_possible_map);
546 hotcpu_notifier(workqueue_cpu_callback, 0); 574 hotcpu_notifier(workqueue_cpu_callback, 0);
547 keventd_wq = create_workqueue("events"); 575 keventd_wq = create_workqueue("events");
548 BUG_ON(!keventd_wq); 576 BUG_ON(!keventd_wq);