Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore          |    5
-rw-r--r--  kernel/Makefile            |    8
-rw-r--r--  kernel/acct.c              |   17
-rw-r--r--  kernel/audit.c             |    8
-rw-r--r--  kernel/auditsc.c           |    2
-rw-r--r--  kernel/capability.c        |    1
-rw-r--r--  kernel/compat.c            |   48
-rw-r--r--  kernel/configs.c           |    2
-rw-r--r--  kernel/cpu.c               |   83
-rw-r--r--  kernel/cpuset.c            |  587
-rw-r--r--  kernel/crash_dump.c        |   61
-rw-r--r--  kernel/exit.c              |   65
-rw-r--r--  kernel/fork.c              |   63
-rw-r--r--  kernel/futex.c             |   28
-rw-r--r--  kernel/hrtimer.c           |  826
-rw-r--r--  kernel/irq/manage.c        |   17
-rw-r--r--  kernel/irq/proc.c          |    6
-rw-r--r--  kernel/itimer.c            |  106
-rw-r--r--  kernel/kexec.c             |   21
-rw-r--r--  kernel/kprobes.c           |  157
-rw-r--r--  kernel/ksysfs.c            |   37
-rw-r--r--  kernel/module.c            |   60
-rw-r--r--  kernel/mutex-debug.c       |  462
-rw-r--r--  kernel/mutex-debug.h       |  134
-rw-r--r--  kernel/mutex.c             |  315
-rw-r--r--  kernel/mutex.h             |   35
-rw-r--r--  kernel/panic.c             |    4
-rw-r--r--  kernel/params.c            |    2
-rw-r--r--  kernel/pid.c               |   22
-rw-r--r--  kernel/posix-cpu-timers.c  |   78
-rw-r--r--  kernel/posix-timers.c      |  899
-rw-r--r--  kernel/power/Kconfig       |   11
-rw-r--r--  kernel/power/Makefile      |    3
-rw-r--r--  kernel/power/disk.c        |   92
-rw-r--r--  kernel/power/main.c        |   21
-rw-r--r--  kernel/power/pm.c          |    1
-rw-r--r--  kernel/power/power.h       |   24
-rw-r--r--  kernel/power/snapshot.c    |   89
-rw-r--r--  kernel/power/swsusp.c      | 1020
-rw-r--r--  kernel/printk.c            |   26
-rw-r--r--  kernel/ptrace.c            |   83
-rw-r--r--  kernel/rcupdate.c          |  190
-rw-r--r--  kernel/rcutorture.c        |  136
-rw-r--r--  kernel/resource.c          |    2
-rw-r--r--  kernel/sched.c             |  571
-rw-r--r--  kernel/signal.c            |  179
-rw-r--r--  kernel/stop_machine.c      |   12
-rw-r--r--  kernel/sys.c               |   78
-rw-r--r--  kernel/sys_ni.c            |   24
-rw-r--r--  kernel/sysctl.c            |   77
-rw-r--r--  kernel/time.c              |  106
-rw-r--r--  kernel/timer.c             |   58
-rw-r--r--  kernel/uid16.c             |    1
-rw-r--r--  kernel/workqueue.c         |   44
54 files changed, 4918 insertions, 2089 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
new file mode 100644
index 000000000000..f2ab70073bd4
--- /dev/null
+++ b/kernel/.gitignore
@@ -0,0 +1,5 @@
+#
+# Generated files
+#
+config_data.h
+config_data.gz
diff --git a/kernel/Makefile b/kernel/Makefile
index 4f5a1453093a..4ae0fbde815d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,15 +6,18 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
     exit.o itimer.o time.o softirq.o resource.o \
     sysctl.o capability.o ptrace.o timer.o user.o \
     signal.o sys.o kmod.o workqueue.o pid.o \
-    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
-    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
+    rcupdate.o extable.o params.o posix-timers.o \
+    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+    hrtimer.o

+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
@@ -29,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o

diff --git a/kernel/acct.c b/kernel/acct.c
index 6312d6bd43e3..065d8b4e51ef 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -47,6 +47,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/acct.h>
+#include <linux/capability.h>
 #include <linux/file.h>
 #include <linux/tty.h>
 #include <linux/security.h>
@@ -427,6 +428,7 @@ static void do_acct_process(long exitcode, struct file *file)
     u64 elapsed;
     u64 run_time;
     struct timespec uptime;
+    unsigned long jiffies;

     /*
      * First check to see if there is enough free_space to continue
@@ -467,12 +469,12 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
     do_div(elapsed, AHZ);
     ac.ac_btime = xtime.tv_sec - elapsed;
-    ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
-            current->signal->utime +
-            current->group_leader->utime));
-    ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
-            current->signal->stime +
-            current->group_leader->stime));
+    jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+                current->signal->utime));
+    ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
+    jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+                current->signal->stime));
+    ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
     /* we really need to bite the bullet and change layout */
     ac.ac_uid = current->uid;
     ac.ac_gid = current->gid;
@@ -580,7 +582,8 @@ void acct_process(long exitcode)
 void acct_update_integrals(struct task_struct *tsk)
 {
     if (likely(tsk->mm)) {
-        long delta = tsk->stime - tsk->acct_stimexpd;
+        long delta =
+            cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;

         if (delta == 0)
             return;
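The acct.c change above stops mixing units: per-task times are kept as cputime_t, so they are first normalized with cputime_to_jiffies() and only then scaled to the AHZ accounting rate. A stand-alone sketch of that scaling step, assuming HZ=1000 and AHZ=100; the constants and helper below are illustrative, not the kernel's definitions:

    /* Illustrative only -- shows the jiffies -> AHZ scaling, not kernel code. */
    #include <stdio.h>

    #define HZ  1000    /* assumed jiffies per second */
    #define AHZ 100     /* assumed accounting ticks per second */

    static unsigned long jiffies_to_ahz(unsigned long j)
    {
        return j / (HZ / AHZ);  /* 10 jiffies per accounting tick here */
    }

    int main(void)
    {
        unsigned long utime_jiffies = 2500; /* 2.5 s of user time */

        printf("%lu jiffies -> %lu AHZ ticks\n",
               utime_jiffies, jiffies_to_ahz(utime_jiffies));
        return 0;
    }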
diff --git a/kernel/audit.c b/kernel/audit.c
index 0c56320d38dc..0a813d2883e5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -42,8 +42,8 @@
  */

 #include <linux/init.h>
-#include <asm/atomic.h>
 #include <asm/types.h>
+#include <asm/atomic.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/err.h>
@@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid)
     return old;
 }

-int kauditd_thread(void *dummy)
+static int kauditd_thread(void *dummy)
 {
     struct sk_buff *skb;

@@ -291,8 +291,10 @@ int kauditd_thread(void *dummy)
         set_current_state(TASK_INTERRUPTIBLE);
         add_wait_queue(&kauditd_wait, &wait);

-        if (!skb_queue_len(&audit_skb_queue))
+        if (!skb_queue_len(&audit_skb_queue)) {
+            try_to_freeze();
             schedule();
+        }

         __set_current_state(TASK_RUNNING);
         remove_wait_queue(&kauditd_wait, &wait);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d8a68509e729..685c25175d96 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -30,8 +30,8 @@
  */

 #include <linux/init.h>
-#include <asm/atomic.h>
 #include <asm/types.h>
+#include <asm/atomic.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mount.h>
diff --git a/kernel/capability.c b/kernel/capability.c
index 8986a37a67ea..bfa3c92e16f2 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
  * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
  */

+#include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/security.h>
diff --git a/kernel/compat.c b/kernel/compat.c
index 102296e21ea8..1867290c37e3 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -514,6 +514,24 @@ static int put_compat_itimerspec(struct compat_itimerspec __user *dst,
     return 0;
 }

+long compat_sys_timer_create(clockid_t which_clock,
+            struct compat_sigevent __user *timer_event_spec,
+            timer_t __user *created_timer_id)
+{
+    struct sigevent __user *event = NULL;
+
+    if (timer_event_spec) {
+        struct sigevent kevent;
+
+        event = compat_alloc_user_space(sizeof(*event));
+        if (get_compat_sigevent(&kevent, timer_event_spec) ||
+            copy_to_user(event, &kevent, sizeof(*event)))
+            return -EFAULT;
+    }
+
+    return sys_timer_create(which_clock, event, created_timer_id);
+}
+
 long compat_sys_timer_settime(timer_t timer_id, int flags,
               struct compat_itimerspec __user *new,
               struct compat_itimerspec __user *old)
@@ -649,8 +667,6 @@ int get_compat_sigevent(struct sigevent *event,
         ? -EFAULT : 0;
 }

-/* timer_create is architecture specific because it needs sigevent conversion */
-
 long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
                unsigned long bitmap_size)
 {
@@ -855,3 +871,31 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
 }

 #endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+
+#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
+asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
+{
+    sigset_t newset;
+    compat_sigset_t newset32;
+
+    /* XXX: Don't preclude handling different sized sigset_t's. */
+    if (sigsetsize != sizeof(sigset_t))
+        return -EINVAL;
+
+    if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+        return -EFAULT;
+    sigset_from_compat(&newset, &newset32);
+    sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+    spin_lock_irq(&current->sighand->siglock);
+    current->saved_sigmask = current->blocked;
+    current->blocked = newset;
+    recalc_sigpending();
+    spin_unlock_irq(&current->sighand->siglock);
+
+    current->state = TASK_INTERRUPTIBLE;
+    schedule();
+    set_thread_flag(TIF_RESTORE_SIGMASK);
+    return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
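compat_sys_timer_create() above follows the usual compat pattern: read the 32-bit sigevent from user space, convert it, park the native-layout copy back in user space with compat_alloc_user_space(), and hand that to the native syscall. A user-space analogue of the widening step, with invented structure and field names; only the shape of the conversion mirrors the kernel code:

    /* Illustrative analogue of a compat -> native structure conversion. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct compat_event {       /* layout a 32-bit caller would use */
        int32_t value;
        int32_t signo;
        int32_t notify;
    };

    struct native_event {       /* layout the native code expects */
        int64_t value;
        int32_t signo;
        int32_t notify;
    };

    static void event_from_compat(struct native_event *dst,
                                  const struct compat_event *src)
    {
        memset(dst, 0, sizeof(*dst));
        dst->value  = src->value;   /* widen 32 -> 64 bits */
        dst->signo  = src->signo;
        dst->notify = src->notify;
    }

    int main(void)
    {
        struct compat_event c = { 42, 10, 0 };
        struct native_event n;

        event_from_compat(&n, &c);
        printf("native: value=%lld signo=%d notify=%d\n",
               (long long)n.value, n.signo, n.notify);
        return 0;
    }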
diff --git a/kernel/configs.c b/kernel/configs.c
index 986f7af31e0a..009e1ebdcb88 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -3,7 +3,7 @@
  * Echo the kernel .config file used to build the kernel
  *
  * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
- * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
+ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
  * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
  * Copyright (C) 2002 Hewlett-Packard Company
  *
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d61ba88f34e5..e882c6babf41 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,47 +16,76 @@
 #include <asm/semaphore.h>

 /* This protects CPUs going up and down... */
-DECLARE_MUTEX(cpucontrol);
-EXPORT_SYMBOL_GPL(cpucontrol);
+static DECLARE_MUTEX(cpucontrol);

 static struct notifier_block *cpu_chain;

-/*
- * Used to check by callers if they need to acquire the cpucontrol
- * or not to protect a cpu from being removed. Its sometimes required to
- * call these functions both for normal operations, and in response to
- * a cpu being added/removed. If the context of the call is in the same
- * thread context as a CPU hotplug thread, we dont need to take the lock
- * since its already protected
- * check drivers/cpufreq/cpufreq.c for its usage - Ashok Raj
- */
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct *lock_cpu_hotplug_owner;
+static int lock_cpu_hotplug_depth;

-int current_in_cpu_hotplug(void)
+static int __lock_cpu_hotplug(int interruptible)
 {
-    return (current->flags & PF_HOTPLUG_CPU);
+    int ret = 0;
+
+    if (lock_cpu_hotplug_owner != current) {
+        if (interruptible)
+            ret = down_interruptible(&cpucontrol);
+        else
+            down(&cpucontrol);
+    }
+
+    /*
+     * Set only if we succeed in locking
+     */
+    if (!ret) {
+        lock_cpu_hotplug_depth++;
+        lock_cpu_hotplug_owner = current;
+    }
+
+    return ret;
 }

-EXPORT_SYMBOL_GPL(current_in_cpu_hotplug);
+void lock_cpu_hotplug(void)
+{
+    __lock_cpu_hotplug(0);
+}
+EXPORT_SYMBOL_GPL(lock_cpu_hotplug);

+void unlock_cpu_hotplug(void)
+{
+    if (--lock_cpu_hotplug_depth == 0) {
+        lock_cpu_hotplug_owner = NULL;
+        up(&cpucontrol);
+    }
+}
+EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
+
+int lock_cpu_hotplug_interruptible(void)
+{
+    return __lock_cpu_hotplug(1);
+}
+EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
+#endif  /* CONFIG_HOTPLUG_CPU */

 /* Need to know about CPUs going up/down? */
 int register_cpu_notifier(struct notifier_block *nb)
 {
     int ret;

-    if ((ret = down_interruptible(&cpucontrol)) != 0)
+    if ((ret = lock_cpu_hotplug_interruptible()) != 0)
         return ret;
     ret = notifier_chain_register(&cpu_chain, nb);
-    up(&cpucontrol);
+    unlock_cpu_hotplug();
     return ret;
 }
 EXPORT_SYMBOL(register_cpu_notifier);

 void unregister_cpu_notifier(struct notifier_block *nb)
 {
-    down(&cpucontrol);
+    lock_cpu_hotplug();
     notifier_chain_unregister(&cpu_chain, nb);
-    up(&cpucontrol);
+    unlock_cpu_hotplug();
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);

@@ -112,13 +141,6 @@ int cpu_down(unsigned int cpu)
         goto out;
     }

-    /*
-     * Leave a trace in current->flags indicating we are already in
-     * process of performing CPU hotplug. Callers can check if cpucontrol
-     * is already acquired by current thread, and if so not cause
-     * a dead lock by not acquiring the lock
-     */
-    current->flags |= PF_HOTPLUG_CPU;
     err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
                         (void *)(long)cpu);
     if (err == NOTIFY_BAD) {
@@ -171,7 +193,6 @@ out_thread:
 out_allowed:
     set_cpus_allowed(current, old_allowed);
 out:
-    current->flags &= ~PF_HOTPLUG_CPU;
     unlock_cpu_hotplug();
     return err;
 }
@@ -182,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
     int ret;
     void *hcpu = (void *)(long)cpu;

-    if ((ret = down_interruptible(&cpucontrol)) != 0)
+    if ((ret = lock_cpu_hotplug_interruptible()) != 0)
         return ret;

     if (cpu_online(cpu) || !cpu_present(cpu)) {
@@ -190,11 +211,6 @@ int __devinit cpu_up(unsigned int cpu)
         goto out;
     }

-    /*
-     * Leave a trace in current->flags indicating we are already in
-     * process of performing CPU hotplug.
-     */
-    current->flags |= PF_HOTPLUG_CPU;
     ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
     if (ret == NOTIFY_BAD) {
         printk("%s: attempt to bring up CPU %u failed\n",
@@ -217,7 +233,6 @@ out_notify:
     if (ret != 0)
         notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
 out:
-    current->flags &= ~PF_HOTPLUG_CPU;
-    up(&cpucontrol);
+    unlock_cpu_hotplug();
     return ret;
 }
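The cpu.c rework above replaces the PF_HOTPLUG_CPU task flag with a lock that remembers its owner and nesting depth, so code called from inside a hotplug operation can take lock_cpu_hotplug() again without deadlocking. A minimal user-space sketch of the same owner/depth idea using POSIX threads; the names are illustrative, and it carries the same caveat as the kernel version that only the current owner updates the bookkeeping fields:

    /* Sketch of a reentrant lock built from owner + depth, not kernel code. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_t owner;     /* meaningful only while depth > 0 */
    static int depth;           /* nesting depth of the owning thread */

    static void hotplug_lock(void)
    {
        /* Take the real mutex only on the outermost acquisition. */
        if (depth == 0 || !pthread_equal(owner, pthread_self()))
            pthread_mutex_lock(&lock);
        owner = pthread_self();
        depth++;
    }

    static void hotplug_unlock(void)
    {
        if (--depth == 0)
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        hotplug_lock();
        hotplug_lock();     /* nested call by the same thread does not block */
        hotplug_unlock();
        hotplug_unlock();
        printf("depth back to %d\n", depth);
        return 0;
    }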
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 5a737ed9dac7..fe2f71f92ae0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -39,6 +39,7 @@
39#include <linux/namei.h> 39#include <linux/namei.h>
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/proc_fs.h> 41#include <linux/proc_fs.h>
42#include <linux/rcupdate.h>
42#include <linux/sched.h> 43#include <linux/sched.h>
43#include <linux/seq_file.h> 44#include <linux/seq_file.h>
44#include <linux/slab.h> 45#include <linux/slab.h>
@@ -54,7 +55,23 @@
54#include <asm/atomic.h> 55#include <asm/atomic.h>
55#include <asm/semaphore.h> 56#include <asm/semaphore.h>
56 57
57#define CPUSET_SUPER_MAGIC 0x27e0eb 58#define CPUSET_SUPER_MAGIC 0x27e0eb
59
60/*
61 * Tracks how many cpusets are currently defined in system.
62 * When there is only one cpuset (the root cpuset) we can
63 * short circuit some hooks.
64 */
65int number_of_cpusets __read_mostly;
66
67/* See "Frequency meter" comments, below. */
68
69struct fmeter {
70 int cnt; /* unprocessed events count */
71 int val; /* most recent output value */
72 time_t time; /* clock (secs) when val computed */
73 spinlock_t lock; /* guards read or write of above */
74};
58 75
59struct cpuset { 76struct cpuset {
60 unsigned long flags; /* "unsigned long" so bitops work */ 77 unsigned long flags; /* "unsigned long" so bitops work */
@@ -80,13 +97,16 @@ struct cpuset {
80 * Copy of global cpuset_mems_generation as of the most 97 * Copy of global cpuset_mems_generation as of the most
81 * recent time this cpuset changed its mems_allowed. 98 * recent time this cpuset changed its mems_allowed.
82 */ 99 */
83 int mems_generation; 100 int mems_generation;
101
102 struct fmeter fmeter; /* memory_pressure filter */
84}; 103};
85 104
86/* bits in struct cpuset flags field */ 105/* bits in struct cpuset flags field */
87typedef enum { 106typedef enum {
88 CS_CPU_EXCLUSIVE, 107 CS_CPU_EXCLUSIVE,
89 CS_MEM_EXCLUSIVE, 108 CS_MEM_EXCLUSIVE,
109 CS_MEMORY_MIGRATE,
90 CS_REMOVED, 110 CS_REMOVED,
91 CS_NOTIFY_ON_RELEASE 111 CS_NOTIFY_ON_RELEASE
92} cpuset_flagbits_t; 112} cpuset_flagbits_t;
@@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs)
112 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 132 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
113} 133}
114 134
135static inline int is_memory_migrate(const struct cpuset *cs)
136{
137 return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
138}
139
115/* 140/*
116 * Increment this atomic integer everytime any cpuset changes its 141 * Increment this atomic integer everytime any cpuset changes its
117 * mems_allowed value. Users of cpusets can track this generation 142 * mems_allowed value. Users of cpusets can track this generation
@@ -137,13 +162,10 @@ static struct cpuset top_cpuset = {
137 .count = ATOMIC_INIT(0), 162 .count = ATOMIC_INIT(0),
138 .sibling = LIST_HEAD_INIT(top_cpuset.sibling), 163 .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
139 .children = LIST_HEAD_INIT(top_cpuset.children), 164 .children = LIST_HEAD_INIT(top_cpuset.children),
140 .parent = NULL,
141 .dentry = NULL,
142 .mems_generation = 0,
143}; 165};
144 166
145static struct vfsmount *cpuset_mount; 167static struct vfsmount *cpuset_mount;
146static struct super_block *cpuset_sb = NULL; 168static struct super_block *cpuset_sb;
147 169
148/* 170/*
149 * We have two global cpuset semaphores below. They can nest. 171 * We have two global cpuset semaphores below. They can nest.
@@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL;
227 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock 249 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
228 * (task->alloc_lock) already in the task_struct routinely used for 250 * (task->alloc_lock) already in the task_struct routinely used for
229 * such matters. 251 * such matters.
252 *
253 * P.S. One more locking exception. RCU is used to guard the
254 * update of a tasks cpuset pointer by attach_task() and the
255 * access of task->cpuset->mems_generation via that pointer in
256 * the routine cpuset_update_task_memory_state().
230 */ 257 */
231 258
232static DECLARE_MUTEX(manage_sem); 259static DECLARE_MUTEX(manage_sem);
@@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
304 spin_lock(&dcache_lock); 331 spin_lock(&dcache_lock);
305 node = dentry->d_subdirs.next; 332 node = dentry->d_subdirs.next;
306 while (node != &dentry->d_subdirs) { 333 while (node != &dentry->d_subdirs) {
307 struct dentry *d = list_entry(node, struct dentry, d_child); 334 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
308 list_del_init(node); 335 list_del_init(node);
309 if (d->d_inode) { 336 if (d->d_inode) {
310 d = dget_locked(d); 337 d = dget_locked(d);
@@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
316 } 343 }
317 node = dentry->d_subdirs.next; 344 node = dentry->d_subdirs.next;
318 } 345 }
319 list_del_init(&dentry->d_child); 346 list_del_init(&dentry->d_u.d_child);
320 spin_unlock(&dcache_lock); 347 spin_unlock(&dcache_lock);
321 remove_dir(dentry); 348 remove_dir(dentry);
322} 349}
@@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
570 BUG_ON(!nodes_intersects(*pmask, node_online_map)); 597 BUG_ON(!nodes_intersects(*pmask, node_online_map));
571} 598}
572 599
573/* 600/**
574 * Refresh current tasks mems_allowed and mems_generation from current 601 * cpuset_update_task_memory_state - update task memory placement
575 * tasks cpuset. 602 *
603 * If the current tasks cpusets mems_allowed changed behind our
604 * backs, update current->mems_allowed, mems_generation and task NUMA
605 * mempolicy to the new value.
576 * 606 *
577 * Call without callback_sem or task_lock() held. May be called with 607 * Task mempolicy is updated by rebinding it relative to the
578 * or without manage_sem held. Will acquire task_lock() and might 608 * current->cpuset if a task has its memory placement changed.
579 * acquire callback_sem during call. 609 * Do not call this routine if in_interrupt().
580 * 610 *
581 * The task_lock() is required to dereference current->cpuset safely. 611 * Call without callback_sem or task_lock() held. May be called
582 * Without it, we could pick up the pointer value of current->cpuset 612 * with or without manage_sem held. Doesn't need task_lock to guard
583 * in one instruction, and then attach_task could give us a different 613 * against another task changing a non-NULL cpuset pointer to NULL,
584 * cpuset, and then the cpuset we had could be removed and freed, 614 * as that is only done by a task on itself, and if the current task
585 * and then on our next instruction, we could dereference a no longer 615 * is here, it is not simultaneously in the exit code NULL'ing its
586 * valid cpuset pointer to get its mems_generation field. 616 * cpuset pointer. This routine also might acquire callback_sem and
617 * current->mm->mmap_sem during call.
618 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock
620 * to guard the current->cpuset derefence, because it is guarded
621 * from concurrent freeing of current->cpuset by attach_task(),
622 * using RCU.
623 *
624 * The rcu_dereference() is technically probably not needed,
625 * as I don't actually mind if I see a new cpuset pointer but
626 * an old value of mems_generation. However this really only
627 * matters on alpha systems using cpusets heavily. If I dropped
628 * that rcu_dereference(), it would save them a memory barrier.
629 * For all other arch's, rcu_dereference is a no-op anyway, and for
630 * alpha systems not using cpusets, another planned optimization,
631 * avoiding the rcu critical section for tasks in the root cpuset
632 * which is statically allocated, so can't vanish, will make this
633 * irrelevant. Better to use RCU as intended, than to engage in
634 * some cute trick to save a memory barrier that is impossible to
635 * test, for alpha systems using cpusets heavily, which might not
636 * even exist.
587 * 637 *
588 * This routine is needed to update the per-task mems_allowed data, 638 * This routine is needed to update the per-task mems_allowed data,
589 * within the tasks context, when it is trying to allocate memory 639 * within the tasks context, when it is trying to allocate memory
@@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
591 * task has been modifying its cpuset. 641 * task has been modifying its cpuset.
592 */ 642 */
593 643
594static void refresh_mems(void) 644void cpuset_update_task_memory_state()
595{ 645{
596 int my_cpusets_mem_gen; 646 int my_cpusets_mem_gen;
647 struct task_struct *tsk = current;
648 struct cpuset *cs;
597 649
598 task_lock(current); 650 if (tsk->cpuset == &top_cpuset) {
599 my_cpusets_mem_gen = current->cpuset->mems_generation; 651 /* Don't need rcu for top_cpuset. It's never freed. */
600 task_unlock(current); 652 my_cpusets_mem_gen = top_cpuset.mems_generation;
601 653 } else {
602 if (current->cpuset_mems_generation != my_cpusets_mem_gen) { 654 rcu_read_lock();
603 struct cpuset *cs; 655 cs = rcu_dereference(tsk->cpuset);
604 nodemask_t oldmem = current->mems_allowed; 656 my_cpusets_mem_gen = cs->mems_generation;
657 rcu_read_unlock();
658 }
605 659
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
606 down(&callback_sem); 661 down(&callback_sem);
607 task_lock(current); 662 task_lock(tsk);
608 cs = current->cpuset; 663 cs = tsk->cpuset; /* Maybe changed when task not locked */
609 guarantee_online_mems(cs, &current->mems_allowed); 664 guarantee_online_mems(cs, &tsk->mems_allowed);
610 current->cpuset_mems_generation = cs->mems_generation; 665 tsk->cpuset_mems_generation = cs->mems_generation;
611 task_unlock(current); 666 task_unlock(tsk);
612 up(&callback_sem); 667 up(&callback_sem);
613 if (!nodes_equal(oldmem, current->mems_allowed)) 668 mpol_rebind_task(tsk, &tsk->mems_allowed);
614 numa_policy_rebind(&oldmem, &current->mems_allowed);
615 } 669 }
616} 670}
617 671
@@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf)
766} 820}
767 821
768/* 822/*
823 * Handle user request to change the 'mems' memory placement
824 * of a cpuset. Needs to validate the request, update the
825 * cpusets mems_allowed and mems_generation, and for each
826 * task in the cpuset, rebind any vma mempolicies and if
827 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory.
829 *
769 * Call with manage_sem held. May take callback_sem during call. 830 * Call with manage_sem held. May take callback_sem during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 * their mempolicies to the cpusets new mems_allowed.
770 */ 834 */
771 835
772static int update_nodemask(struct cpuset *cs, char *buf) 836static int update_nodemask(struct cpuset *cs, char *buf)
773{ 837{
774 struct cpuset trialcs; 838 struct cpuset trialcs;
839 nodemask_t oldmem;
840 struct task_struct *g, *p;
841 struct mm_struct **mmarray;
842 int i, n, ntasks;
843 int migrate;
844 int fudge;
775 int retval; 845 int retval;
776 846
777 trialcs = *cs; 847 trialcs = *cs;
778 retval = nodelist_parse(buf, trialcs.mems_allowed); 848 retval = nodelist_parse(buf, trialcs.mems_allowed);
779 if (retval < 0) 849 if (retval < 0)
780 return retval; 850 goto done;
781 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); 851 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
782 if (nodes_empty(trialcs.mems_allowed)) 852 oldmem = cs->mems_allowed;
783 return -ENOSPC; 853 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
854 retval = 0; /* Too easy - nothing to do */
855 goto done;
856 }
857 if (nodes_empty(trialcs.mems_allowed)) {
858 retval = -ENOSPC;
859 goto done;
860 }
784 retval = validate_change(cs, &trialcs); 861 retval = validate_change(cs, &trialcs);
785 if (retval == 0) { 862 if (retval < 0)
786 down(&callback_sem); 863 goto done;
787 cs->mems_allowed = trialcs.mems_allowed; 864
788 atomic_inc(&cpuset_mems_generation); 865 down(&callback_sem);
789 cs->mems_generation = atomic_read(&cpuset_mems_generation); 866 cs->mems_allowed = trialcs.mems_allowed;
790 up(&callback_sem); 867 atomic_inc(&cpuset_mems_generation);
868 cs->mems_generation = atomic_read(&cpuset_mems_generation);
869 up(&callback_sem);
870
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872
873 fudge = 10; /* spare mmarray[] slots */
874 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
875 retval = -ENOMEM;
876
877 /*
878 * Allocate mmarray[] to hold mm reference for each task
879 * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
880 * tasklist_lock. We could use GFP_ATOMIC, but with a
881 * few more lines of code, we can retry until we get a big
882 * enough mmarray[] w/o using GFP_ATOMIC.
883 */
884 while (1) {
885 ntasks = atomic_read(&cs->count); /* guess */
886 ntasks += fudge;
887 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
888 if (!mmarray)
889 goto done;
890 write_lock_irq(&tasklist_lock); /* block fork */
891 if (atomic_read(&cs->count) <= ntasks)
892 break; /* got enough */
893 write_unlock_irq(&tasklist_lock); /* try again */
894 kfree(mmarray);
791 } 895 }
896
897 n = 0;
898
899 /* Load up mmarray[] with mm reference for each task in cpuset. */
900 do_each_thread(g, p) {
901 struct mm_struct *mm;
902
903 if (n >= ntasks) {
904 printk(KERN_WARNING
905 "Cpuset mempolicy rebind incomplete.\n");
906 continue;
907 }
908 if (p->cpuset != cs)
909 continue;
910 mm = get_task_mm(p);
911 if (!mm)
912 continue;
913 mmarray[n++] = mm;
914 } while_each_thread(g, p);
915 write_unlock_irq(&tasklist_lock);
916
917 /*
918 * Now that we've dropped the tasklist spinlock, we can
919 * rebind the vma mempolicies of each mm in mmarray[] to their
920 * new cpuset, and release that mm. The mpol_rebind_mm()
921 * call takes mmap_sem, which we couldn't take while holding
922 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes.
929 */
930 migrate = is_memory_migrate(cs);
931 for (i = 0; i < n; i++) {
932 struct mm_struct *mm = mmarray[i];
933
934 mpol_rebind_mm(mm, &cs->mems_allowed);
935 if (migrate) {
936 do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
937 MPOL_MF_MOVE_ALL);
938 }
939 mmput(mm);
940 }
941
942 /* We're done rebinding vma's to this cpusets new mems_allowed. */
943 kfree(mmarray);
944 set_cpuset_being_rebound(NULL);
945 retval = 0;
946done:
792 return retval; 947 return retval;
793} 948}
794 949
795/* 950/*
951 * Call with manage_sem held.
952 */
953
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
955{
956 if (simple_strtoul(buf, NULL, 10) != 0)
957 cpuset_memory_pressure_enabled = 1;
958 else
959 cpuset_memory_pressure_enabled = 0;
960 return 0;
961}
962
963/*
796 * update_flag - read a 0 or a 1 in a file and update associated flag 964 * update_flag - read a 0 or a 1 in a file and update associated flag
797 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 965 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
798 * CS_NOTIFY_ON_RELEASE) 966 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
799 * cs: the cpuset to update 967 * cs: the cpuset to update
800 * buf: the buffer where we read the 0 or 1 968 * buf: the buffer where we read the 0 or 1
801 * 969 *
@@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
834} 1002}
835 1003
836/* 1004/*
1005 * Frequency meter - How fast is some event occuring?
1006 *
1007 * These routines manage a digitally filtered, constant time based,
1008 * event frequency meter. There are four routines:
1009 * fmeter_init() - initialize a frequency meter.
1010 * fmeter_markevent() - called each time the event happens.
1011 * fmeter_getrate() - returns the recent rate of such events.
1012 * fmeter_update() - internal routine used to update fmeter.
1013 *
1014 * A common data structure is passed to each of these routines,
1015 * which is used to keep track of the state required to manage the
1016 * frequency meter and its digital filter.
1017 *
1018 * The filter works on the number of events marked per unit time.
1019 * The filter is single-pole low-pass recursive (IIR). The time unit
1020 * is 1 second. Arithmetic is done using 32-bit integers scaled to
1021 * simulate 3 decimal digits of precision (multiplied by 1000).
1022 *
1023 * With an FM_COEF of 933, and a time base of 1 second, the filter
1024 * has a half-life of 10 seconds, meaning that if the events quit
1025 * happening, then the rate returned from the fmeter_getrate()
1026 * will be cut in half each 10 seconds, until it converges to zero.
1027 *
1028 * It is not worth doing a real infinitely recursive filter. If more
1029 * than FM_MAXTICKS ticks have elapsed since the last filter event,
1030 * just compute FM_MAXTICKS ticks worth, by which point the level
1031 * will be stable.
1032 *
1033 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
1034 * arithmetic overflow in the fmeter_update() routine.
1035 *
1036 * Given the simple 32 bit integer arithmetic used, this meter works
1037 * best for reporting rates between one per millisecond (msec) and
1038 * one per 32 (approx) seconds. At constant rates faster than one
1039 * per msec it maxes out at values just under 1,000,000. At constant
1040 * rates between one per msec, and one per second it will stabilize
1041 * to a value N*1000, where N is the rate of events per second.
1042 * At constant rates between one per second and one per 32 seconds,
1043 * it will be choppy, moving up on the seconds that have an event,
1044 * and then decaying until the next event. At rates slower than
1045 * about one in 32 seconds, it decays all the way back to zero between
1046 * each event.
1047 */
1048
1049#define FM_COEF 933 /* coefficient for half-life of 10 secs */
1050#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
1051#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
1052#define FM_SCALE 1000 /* faux fixed point scale */
1053
1054/* Initialize a frequency meter */
1055static void fmeter_init(struct fmeter *fmp)
1056{
1057 fmp->cnt = 0;
1058 fmp->val = 0;
1059 fmp->time = 0;
1060 spin_lock_init(&fmp->lock);
1061}
1062
1063/* Internal meter update - process cnt events and update value */
1064static void fmeter_update(struct fmeter *fmp)
1065{
1066 time_t now = get_seconds();
1067 time_t ticks = now - fmp->time;
1068
1069 if (ticks == 0)
1070 return;
1071
1072 ticks = min(FM_MAXTICKS, ticks);
1073 while (ticks-- > 0)
1074 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1075 fmp->time = now;
1076
1077 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1078 fmp->cnt = 0;
1079}
1080
1081/* Process any previous ticks, then bump cnt by one (times scale). */
1082static void fmeter_markevent(struct fmeter *fmp)
1083{
1084 spin_lock(&fmp->lock);
1085 fmeter_update(fmp);
1086 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1087 spin_unlock(&fmp->lock);
1088}
1089
1090/* Process any previous ticks, then return current value. */
1091static int fmeter_getrate(struct fmeter *fmp)
1092{
1093 int val;
1094
1095 spin_lock(&fmp->lock);
1096 fmeter_update(fmp);
1097 val = fmp->val;
1098 spin_unlock(&fmp->lock);
1099 return val;
1100}
1101
1102/*
837 * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly 1103 * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly
838 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
839 * notified on release. 1105 * notified on release.
@@ -848,6 +1114,8 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
848 struct task_struct *tsk; 1114 struct task_struct *tsk;
849 struct cpuset *oldcs; 1115 struct cpuset *oldcs;
850 cpumask_t cpus; 1116 cpumask_t cpus;
1117 nodemask_t from, to;
1118 struct mm_struct *mm;
851 1119
852 if (sscanf(pidbuf, "%d", &pid) != 1) 1120 if (sscanf(pidbuf, "%d", &pid) != 1)
853 return -EIO; 1121 return -EIO;
@@ -887,14 +1155,27 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
887 return -ESRCH; 1155 return -ESRCH;
888 } 1156 }
889 atomic_inc(&cs->count); 1157 atomic_inc(&cs->count);
890 tsk->cpuset = cs; 1158 rcu_assign_pointer(tsk->cpuset, cs);
891 task_unlock(tsk); 1159 task_unlock(tsk);
892 1160
893 guarantee_online_cpus(cs, &cpus); 1161 guarantee_online_cpus(cs, &cpus);
894 set_cpus_allowed(tsk, cpus); 1162 set_cpus_allowed(tsk, cpus);
895 1163
1164 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed;
1166
896 up(&callback_sem); 1167 up(&callback_sem);
1168
1169 mm = get_task_mm(tsk);
1170 if (mm) {
1171 mpol_rebind_mm(mm, &to);
1172 mmput(mm);
1173 }
1174
1175 if (is_memory_migrate(cs))
1176 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
897 put_task_struct(tsk); 1177 put_task_struct(tsk);
1178 synchronize_rcu();
898 if (atomic_dec_and_test(&oldcs->count)) 1179 if (atomic_dec_and_test(&oldcs->count))
899 check_for_release(oldcs, ppathbuf); 1180 check_for_release(oldcs, ppathbuf);
900 return 0; 1181 return 0;
@@ -905,11 +1186,14 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
905typedef enum { 1186typedef enum {
906 FILE_ROOT, 1187 FILE_ROOT,
907 FILE_DIR, 1188 FILE_DIR,
1189 FILE_MEMORY_MIGRATE,
908 FILE_CPULIST, 1190 FILE_CPULIST,
909 FILE_MEMLIST, 1191 FILE_MEMLIST,
910 FILE_CPU_EXCLUSIVE, 1192 FILE_CPU_EXCLUSIVE,
911 FILE_MEM_EXCLUSIVE, 1193 FILE_MEM_EXCLUSIVE,
912 FILE_NOTIFY_ON_RELEASE, 1194 FILE_NOTIFY_ON_RELEASE,
1195 FILE_MEMORY_PRESSURE_ENABLED,
1196 FILE_MEMORY_PRESSURE,
913 FILE_TASKLIST, 1197 FILE_TASKLIST,
914} cpuset_filetype_t; 1198} cpuset_filetype_t;
915 1199
@@ -960,6 +1244,15 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
960 case FILE_NOTIFY_ON_RELEASE: 1244 case FILE_NOTIFY_ON_RELEASE:
961 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); 1245 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
962 break; 1246 break;
1247 case FILE_MEMORY_MIGRATE:
1248 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
1249 break;
1250 case FILE_MEMORY_PRESSURE_ENABLED:
1251 retval = update_memory_pressure_enabled(cs, buffer);
1252 break;
1253 case FILE_MEMORY_PRESSURE:
1254 retval = -EACCES;
1255 break;
963 case FILE_TASKLIST: 1256 case FILE_TASKLIST:
964 retval = attach_task(cs, buffer, &pathbuf); 1257 retval = attach_task(cs, buffer, &pathbuf);
965 break; 1258 break;
@@ -1060,6 +1353,15 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1060 case FILE_NOTIFY_ON_RELEASE: 1353 case FILE_NOTIFY_ON_RELEASE:
1061 *s++ = notify_on_release(cs) ? '1' : '0'; 1354 *s++ = notify_on_release(cs) ? '1' : '0';
1062 break; 1355 break;
1356 case FILE_MEMORY_MIGRATE:
1357 *s++ = is_memory_migrate(cs) ? '1' : '0';
1358 break;
1359 case FILE_MEMORY_PRESSURE_ENABLED:
1360 *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1361 break;
1362 case FILE_MEMORY_PRESSURE:
1363 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1364 break;
1063 default: 1365 default:
1064 retval = -EINVAL; 1366 retval = -EINVAL;
1065 goto out; 1367 goto out;
@@ -1178,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
1178 1480
1179/* 1481/*
1180 * cpuset_create_dir - create a directory for an object. 1482 * cpuset_create_dir - create a directory for an object.
1181 * cs: the cpuset we create the directory for. 1483 * cs: the cpuset we create the directory for.
1182 * It must have a valid ->parent field 1484 * It must have a valid ->parent field
1183 * And we are going to fill its ->dentry field. 1485 * And we are going to fill its ->dentry field.
1184 * name: The name to give to the cpuset directory. Will be copied. 1486 * name: The name to give to the cpuset directory. Will be copied.
@@ -1211,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1211 struct dentry *dentry; 1513 struct dentry *dentry;
1212 int error; 1514 int error;
1213 1515
1214 down(&dir->d_inode->i_sem); 1516 mutex_lock(&dir->d_inode->i_mutex);
1215 dentry = cpuset_get_dentry(dir, cft->name); 1517 dentry = cpuset_get_dentry(dir, cft->name);
1216 if (!IS_ERR(dentry)) { 1518 if (!IS_ERR(dentry)) {
1217 error = cpuset_create_file(dentry, 0644 | S_IFREG); 1519 error = cpuset_create_file(dentry, 0644 | S_IFREG);
@@ -1220,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1220 dput(dentry); 1522 dput(dentry);
1221 } else 1523 } else
1222 error = PTR_ERR(dentry); 1524 error = PTR_ERR(dentry);
1223 up(&dir->d_inode->i_sem); 1525 mutex_unlock(&dir->d_inode->i_mutex);
1224 return error; 1526 return error;
1225} 1527}
1226 1528
@@ -1252,7 +1554,7 @@ struct ctr_struct {
1252 * when reading out p->cpuset, as we don't really care if it changes 1554 * when reading out p->cpuset, as we don't really care if it changes
1253 * on the next cycle, and we are not going to try to dereference it. 1555 * on the next cycle, and we are not going to try to dereference it.
1254 */ 1556 */
1255static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) 1557static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1256{ 1558{
1257 int n = 0; 1559 int n = 0;
1258 struct task_struct *g, *p; 1560 struct task_struct *g, *p;
@@ -1408,6 +1710,21 @@ static struct cftype cft_notify_on_release = {
1408 .private = FILE_NOTIFY_ON_RELEASE, 1710 .private = FILE_NOTIFY_ON_RELEASE,
1409}; 1711};
1410 1712
1713static struct cftype cft_memory_migrate = {
1714 .name = "memory_migrate",
1715 .private = FILE_MEMORY_MIGRATE,
1716};
1717
1718static struct cftype cft_memory_pressure_enabled = {
1719 .name = "memory_pressure_enabled",
1720 .private = FILE_MEMORY_PRESSURE_ENABLED,
1721};
1722
1723static struct cftype cft_memory_pressure = {
1724 .name = "memory_pressure",
1725 .private = FILE_MEMORY_PRESSURE,
1726};
1727
1411static int cpuset_populate_dir(struct dentry *cs_dentry) 1728static int cpuset_populate_dir(struct dentry *cs_dentry)
1412{ 1729{
1413 int err; 1730 int err;
@@ -1422,6 +1739,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1422 return err; 1739 return err;
1423 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) 1740 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
1424 return err; 1741 return err;
1742 if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
1743 return err;
1744 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
1745 return err;
1425 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1746 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1426 return err; 1747 return err;
1427 return 0; 1748 return 0;
@@ -1446,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1446 return -ENOMEM; 1767 return -ENOMEM;
1447 1768
1448 down(&manage_sem); 1769 down(&manage_sem);
1449 refresh_mems(); 1770 cpuset_update_task_memory_state();
1450 cs->flags = 0; 1771 cs->flags = 0;
1451 if (notify_on_release(parent)) 1772 if (notify_on_release(parent))
1452 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1773 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1457,11 +1778,13 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1457 INIT_LIST_HEAD(&cs->children); 1778 INIT_LIST_HEAD(&cs->children);
1458 atomic_inc(&cpuset_mems_generation); 1779 atomic_inc(&cpuset_mems_generation);
1459 cs->mems_generation = atomic_read(&cpuset_mems_generation); 1780 cs->mems_generation = atomic_read(&cpuset_mems_generation);
1781 fmeter_init(&cs->fmeter);
1460 1782
1461 cs->parent = parent; 1783 cs->parent = parent;
1462 1784
1463 down(&callback_sem); 1785 down(&callback_sem);
1464 list_add(&cs->sibling, &cs->parent->children); 1786 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++;
1465 up(&callback_sem); 1788 up(&callback_sem);
1466 1789
1467 err = cpuset_create_dir(cs, name, mode); 1790 err = cpuset_create_dir(cs, name, mode);
@@ -1470,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1470 1793
1471 /* 1794 /*
1472 * Release manage_sem before cpuset_populate_dir() because it 1795 * Release manage_sem before cpuset_populate_dir() because it
1473 * will down() this new directory's i_sem and if we race with 1796 * will down() this new directory's i_mutex and if we race with
1474 * another mkdir, we might deadlock. 1797 * another mkdir, we might deadlock.
1475 */ 1798 */
1476 up(&manage_sem); 1799 up(&manage_sem);
@@ -1489,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1489{ 1812{
1490 struct cpuset *c_parent = dentry->d_parent->d_fsdata; 1813 struct cpuset *c_parent = dentry->d_parent->d_fsdata;
1491 1814
1492 /* the vfs holds inode->i_sem already */ 1815 /* the vfs holds inode->i_mutex already */
1493 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); 1816 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
1494} 1817}
1495 1818
@@ -1500,10 +1823,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1500 struct cpuset *parent; 1823 struct cpuset *parent;
1501 char *pathbuf = NULL; 1824 char *pathbuf = NULL;
1502 1825
1503 /* the vfs holds both inode->i_sem already */ 1826 /* the vfs holds both inode->i_mutex already */
1504 1827
1505 down(&manage_sem); 1828 down(&manage_sem);
1506 refresh_mems(); 1829 cpuset_update_task_memory_state();
1507 if (atomic_read(&cs->count) > 0) { 1830 if (atomic_read(&cs->count) > 0) {
1508 up(&manage_sem); 1831 up(&manage_sem);
1509 return -EBUSY; 1832 return -EBUSY;
@@ -1524,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1524 spin_unlock(&d->d_lock); 1847 spin_unlock(&d->d_lock);
1525 cpuset_d_remove_dir(d); 1848 cpuset_d_remove_dir(d);
1526 dput(d); 1849 dput(d);
1850 number_of_cpusets--;
1527 up(&callback_sem); 1851 up(&callback_sem);
1528 if (list_empty(&parent->children)) 1852 if (list_empty(&parent->children))
1529 check_for_release(parent, &pathbuf); 1853 check_for_release(parent, &pathbuf);
@@ -1532,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1532 return 0; 1856 return 0;
1533} 1857}
1534 1858
1859/*
1860 * cpuset_init_early - just enough so that the calls to
1861 * cpuset_update_task_memory_state() in early init code
1862 * are harmless.
1863 */
1864
1865int __init cpuset_init_early(void)
1866{
1867 struct task_struct *tsk = current;
1868
1869 tsk->cpuset = &top_cpuset;
1870 tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
1871 return 0;
1872}
1873
1535/** 1874/**
1536 * cpuset_init - initialize cpusets at system boot 1875 * cpuset_init - initialize cpusets at system boot
1537 * 1876 *
@@ -1546,6 +1885,7 @@ int __init cpuset_init(void)
1546 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1885 top_cpuset.cpus_allowed = CPU_MASK_ALL;
1547 top_cpuset.mems_allowed = NODE_MASK_ALL; 1886 top_cpuset.mems_allowed = NODE_MASK_ALL;
1548 1887
1888 fmeter_init(&top_cpuset.fmeter);
1549 atomic_inc(&cpuset_mems_generation); 1889 atomic_inc(&cpuset_mems_generation);
1550 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); 1890 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
1551 1891
@@ -1566,7 +1906,11 @@ int __init cpuset_init(void)
1566 root->d_inode->i_nlink++; 1906 root->d_inode->i_nlink++;
1567 top_cpuset.dentry = root; 1907 top_cpuset.dentry = root;
1568 root->d_inode->i_op = &cpuset_dir_inode_operations; 1908 root->d_inode->i_op = &cpuset_dir_inode_operations;
1909 number_of_cpusets = 1;
1569 err = cpuset_populate_dir(root); 1910 err = cpuset_populate_dir(root);
1911 /* memory_pressure_enabled is in root cpuset only */
1912 if (err == 0)
1913 err = cpuset_add_file(root, &cft_memory_pressure_enabled);
1570out: 1914out:
1571 return err; 1915 return err;
1572} 1916}
@@ -1632,15 +1976,13 @@ void cpuset_fork(struct task_struct *child)
1632 * 1976 *
1633 * We don't need to task_lock() this reference to tsk->cpuset, 1977 * We don't need to task_lock() this reference to tsk->cpuset,
1634 * because tsk is already marked PF_EXITING, so attach_task() won't 1978 * because tsk is already marked PF_EXITING, so attach_task() won't
1635 * mess with it. 1979 * mess with it, or task is a failed fork, never visible to attach_task.
1636 **/ 1980 **/
1637 1981
1638void cpuset_exit(struct task_struct *tsk) 1982void cpuset_exit(struct task_struct *tsk)
1639{ 1983{
1640 struct cpuset *cs; 1984 struct cpuset *cs;
1641 1985
1642 BUG_ON(!(tsk->flags & PF_EXITING));
1643
1644 cs = tsk->cpuset; 1986 cs = tsk->cpuset;
1645 tsk->cpuset = NULL; 1987 tsk->cpuset = NULL;
1646 1988
@@ -1667,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk)
1667 * tasks cpuset. 2009 * tasks cpuset.
1668 **/ 2010 **/
1669 2011
1670cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) 2012cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
1671{ 2013{
1672 cpumask_t mask; 2014 cpumask_t mask;
1673 2015
1674 down(&callback_sem); 2016 down(&callback_sem);
1675 task_lock((struct task_struct *)tsk); 2017 task_lock(tsk);
1676 guarantee_online_cpus(tsk->cpuset, &mask); 2018 guarantee_online_cpus(tsk->cpuset, &mask);
1677 task_unlock((struct task_struct *)tsk); 2019 task_unlock(tsk);
1678 up(&callback_sem); 2020 up(&callback_sem);
1679 2021
1680 return mask; 2022 return mask;
@@ -1686,43 +2028,26 @@ void cpuset_init_current_mems_allowed(void)
1686} 2028}
1687 2029
1688/** 2030/**
1689 * cpuset_update_current_mems_allowed - update mems parameters to new values 2031 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
1690 * 2032 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
1691 * If the current tasks cpusets mems_allowed changed behind our backs,
1692 * update current->mems_allowed and mems_generation to the new value.
1693 * Do not call this routine if in_interrupt().
1694 * 2033 *
1695 * Call without callback_sem or task_lock() held. May be called 2034 * Description: Returns the nodemask_t mems_allowed of the cpuset
1696 * with or without manage_sem held. Unless exiting, it will acquire 2035 * attached to the specified @tsk. Guaranteed to return some non-empty
1697 * task_lock(). Also might acquire callback_sem during call to 2036 * subset of node_online_map, even if this means going outside the
1698 * refresh_mems(). 2037 * tasks cpuset.
1699 */ 2038 **/
1700 2039
1701void cpuset_update_current_mems_allowed(void) 2040nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
1702{ 2041{
1703 struct cpuset *cs; 2042 nodemask_t mask;
1704 int need_to_refresh = 0;
1705 2043
1706 task_lock(current); 2044 down(&callback_sem);
1707 cs = current->cpuset; 2045 task_lock(tsk);
1708 if (!cs) 2046 guarantee_online_mems(tsk->cpuset, &mask);
1709 goto done; 2047 task_unlock(tsk);
1710 if (current->cpuset_mems_generation != cs->mems_generation) 2048 up(&callback_sem);
1711 need_to_refresh = 1;
1712done:
1713 task_unlock(current);
1714 if (need_to_refresh)
1715 refresh_mems();
1716}
1717 2049
1718/** 2050 return mask;
1719 * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
1720 * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
1721 */
1722void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
1723{
1724 bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
1725 MAX_NUMNODES);
1726} 2051}
1727 2052
1728/** 2053/**
@@ -1795,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1795 * GFP_USER - only nodes in current tasks mems allowed ok. 2120 * GFP_USER - only nodes in current tasks mems allowed ok.
1796 **/ 2121 **/
1797 2122
1798int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2123int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1799{ 2124{
1800 int node; /* node that zone z is on */ 2125 int node; /* node that zone z is on */
1801 const struct cpuset *cs; /* current cpuset ancestors */ 2126 const struct cpuset *cs; /* current cpuset ancestors */
@@ -1809,11 +2134,12 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1809 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2134 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
1810 return 0; 2135 return 0;
1811 2136
2137 if (current->flags & PF_EXITING) /* Let dying task have memory */
2138 return 1;
2139
1812 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2140 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1813 down(&callback_sem); 2141 down(&callback_sem);
1814 2142
1815 if (current->flags & PF_EXITING) /* Let dying task have memory */
1816 return 1;
1817 task_lock(current); 2143 task_lock(current);
1818 cs = nearest_exclusive_ancestor(current->cpuset); 2144 cs = nearest_exclusive_ancestor(current->cpuset);
1819 task_unlock(current); 2145 task_unlock(current);
@@ -1824,6 +2150,33 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1824} 2150}
1825 2151
1826/** 2152/**
2153 * cpuset_lock - lock out any changes to cpuset structures
2154 *
2155 * The out of memory (oom) code needs to lock down cpusets
2156 * from being changed while it scans the tasklist looking for a
2157 * task in an overlapping cpuset. Expose callback_sem via this
2158 * cpuset_lock() routine, so the oom code can lock it, before
2159 * locking the task list. The tasklist_lock is a spinlock, so
2160 * must be taken inside callback_sem.
2161 */
2162
2163void cpuset_lock(void)
2164{
2165 down(&callback_sem);
2166}
2167
2168/**
2169 * cpuset_unlock - release lock on cpuset changes
2170 *
2171 * Undo the lock taken in a previous cpuset_lock() call.
2172 */
2173
2174void cpuset_unlock(void)
2175{
2176 up(&callback_sem);
2177}
2178
2179/**
1827 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? 2180 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1828 * @p: pointer to task_struct of some other task. 2181 * @p: pointer to task_struct of some other task.
1829 * 2182 *
@@ -1832,7 +2185,7 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1832 * determine if task @p's memory usage might impact the memory 2185 * determine if task @p's memory usage might impact the memory
1833 * available to the current task. 2186 * available to the current task.
1834 * 2187 *
1835 * Acquires callback_sem - not suitable for calling from a fast path. 2188 * Call while holding callback_sem.
1836 **/ 2189 **/
1837 2190
1838int cpuset_excl_nodes_overlap(const struct task_struct *p) 2191int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1840,8 +2193,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1840 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 2193 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1841 int overlap = 0; /* do cpusets overlap? */ 2194 int overlap = 0; /* do cpusets overlap? */
1842 2195
1843 down(&callback_sem);
1844
1845 task_lock(current); 2196 task_lock(current);
1846 if (current->flags & PF_EXITING) { 2197 if (current->flags & PF_EXITING) {
1847 task_unlock(current); 2198 task_unlock(current);
@@ -1860,12 +2211,46 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1860 2211
1861 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 2212 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1862done: 2213done:
1863 up(&callback_sem);
1864
1865 return overlap; 2214 return overlap;
1866} 2215}
1867 2216
1868/* 2217/*
2218 * Collection of memory_pressure is suppressed unless
2219 * this flag is enabled by writing "1" to the special
2220 * cpuset file 'memory_pressure_enabled' in the root cpuset.
2221 */
2222
2223int cpuset_memory_pressure_enabled __read_mostly;
2224
2225/**
2226 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
2227 *
2228 * Keep a running average of the rate of synchronous (direct)
2229 * page reclaim efforts initiated by tasks in each cpuset.
2230 *
2231 * This represents the rate at which some task in the cpuset
2232 * ran low on memory on all nodes it was allowed to use, and
2233 * had to enter the kernel's page reclaim code in an effort to
2234 * create more free memory by tossing clean pages or swapping
2235 * or writing dirty pages.
2236 *
2237 * Display to user space in the per-cpuset read-only file
2238 * "memory_pressure". Value displayed is an integer
2239 * representing the recent rate of entry into the synchronous
2240 * (direct) page reclaim by any task attached to the cpuset.
2241 **/
2242
2243void __cpuset_memory_pressure_bump(void)
2244{
2245 struct cpuset *cs;
2246
2247 task_lock(current);
2248 cs = current->cpuset;
2249 fmeter_markevent(&cs->fmeter);
2250 task_unlock(current);
2251}
2252
2253/*
1869 * proc_cpuset_show() 2254 * proc_cpuset_show()
1870 * - Print tasks cpuset path into seq_file. 2255 * - Print tasks cpuset path into seq_file.
1871 * - Used for /proc/<pid>/cpuset. 2256 * - Used for /proc/<pid>/cpuset.
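The __cpuset_zone_allowed() hunk above moves the PF_EXITING test in front of down(&callback_sem); in the old placement a dying task returned 1 with the semaphore still held. A minimal userspace sketch of that pitfall, with hypothetical names and a pthread mutex standing in for callback_sem:

/* Userspace sketch of the "early return while holding the lock" pitfall
 * fixed in __cpuset_zone_allowed() above.  Names are hypothetical; a
 * pthread mutex stands in for callback_sem. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t callback_lock = PTHREAD_MUTEX_INITIALIZER;
static int task_exiting;        /* stand-in for current->flags & PF_EXITING */

/* Buggy shape (old code): returns with callback_lock still held. */
static int zone_allowed_buggy(void)
{
        pthread_mutex_lock(&callback_lock);
        if (task_exiting)
                return 1;       /* oops: the lock is never released */
        /* ... walk ancestor cpusets ... */
        pthread_mutex_unlock(&callback_lock);
        return 0;
}

/* Fixed shape (new code): decide the cheap special case before locking. */
static int zone_allowed_fixed(void)
{
        if (task_exiting)       /* let a dying task have memory */
                return 1;
        pthread_mutex_lock(&callback_lock);
        /* ... walk ancestor cpusets ... */
        pthread_mutex_unlock(&callback_lock);
        return 0;
}

int main(void)
{
        task_exiting = 1;
        printf("fixed path: %d\n", zone_allowed_fixed());
        /* zone_allowed_buggy() would return 1 here and leave the mutex
         * locked, blocking any later caller */
        (void)zone_allowed_buggy;
        return 0;
}

The new cpuset_lock()/cpuset_unlock() helpers follow the ordering spelled out in their comment: the OOM path takes callback_sem first and only then tasklist_lock, since a spinlock must never wrap a sleeping down().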
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
deleted file mode 100644
index 334c37f5218a..000000000000
--- a/kernel/crash_dump.c
+++ /dev/null
@@ -1,61 +0,0 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/smp_lock.h>
9#include <linux/errno.h>
10#include <linux/proc_fs.h>
11#include <linux/bootmem.h>
12#include <linux/highmem.h>
13#include <linux/crash_dump.h>
14
15#include <asm/io.h>
16#include <asm/uaccess.h>
17
18/* Stores the physical address of elf header of crash image. */
19unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
20
21/**
22 * copy_oldmem_page - copy one page from "oldmem"
23 * @pfn: page frame number to be copied
24 * @buf: target memory address for the copy; this can be in kernel address
25 * space or user address space (see @userbuf)
26 * @csize: number of bytes to copy
27 * @offset: offset in bytes into the page (based on pfn) to begin the copy
28 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
29 * otherwise @buf is in kernel address space, use memcpy().
30 *
31 * Copy a page from "oldmem". For this page, there is no pte mapped
32 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
33 */
34ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
35 size_t csize, unsigned long offset, int userbuf)
36{
37 void *page, *vaddr;
38
39 if (!csize)
40 return 0;
41
42 page = kmalloc(PAGE_SIZE, GFP_KERNEL);
43 if (!page)
44 return -ENOMEM;
45
46 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
47 copy_page(page, vaddr);
48 kunmap_atomic(vaddr, KM_PTE0);
49
50 if (userbuf) {
51 if (copy_to_user(buf, (page + offset), csize)) {
52 kfree(page);
53 return -EFAULT;
54 }
55 } else {
56 memcpy(buf, (page + offset), csize);
57 }
58
59 kfree(page);
60 return csize;
61}
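The deleted copy_oldmem_page() bounced every "oldmem" page through a freshly allocated kernel page, because the source frame is only transiently mapped via kmap_atomic_pfn(). A rough userspace analogue of that bounce-buffer shape, with hypothetical names and memcpy() standing in for both copy_page() and copy_to_user():

/* Userspace sketch of the bounce-buffer pattern the deleted
 * copy_oldmem_page() used: the source page is only transiently mapped, so
 * it is copied into a scratch page first and handed out from there. */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Pretend this maps one frame of the old kernel's memory. */
static const void *map_old_page(unsigned long pfn, unsigned char *backing)
{
        (void)pfn;
        return backing;
}

static long copy_old_page(unsigned long pfn, unsigned char *backing,
                          char *buf, size_t csize, unsigned long offset)
{
        unsigned char *page;
        const void *vaddr;

        if (!csize)
                return 0;
        if (offset + csize > PAGE_SIZE)
                return -1;

        page = malloc(PAGE_SIZE);
        if (!page)
                return -1;

        vaddr = map_old_page(pfn, backing);     /* kmap_atomic_pfn() analogue */
        memcpy(page, vaddr, PAGE_SIZE);         /* copy_page() analogue */
        /* the transient mapping could be dropped here; only the bounce
         * page is touched from now on */
        memcpy(buf, page + offset, csize);      /* or copy_to_user() for user buffers */
        free(page);
        return (long)csize;
}

int main(void)
{
        unsigned char old[PAGE_SIZE];
        char out[16];

        memset(old, 'A', sizeof(old));
        printf("copied %ld bytes\n",
               copy_old_page(0, old, out, sizeof(out), 128));
        return 0;
}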
diff --git a/kernel/exit.c b/kernel/exit.c
index 452a1d116178..93cee3671332 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -10,6 +10,7 @@
10#include <linux/interrupt.h> 10#include <linux/interrupt.h>
11#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/capability.h>
13#include <linux/completion.h> 14#include <linux/completion.h>
14#include <linux/personality.h> 15#include <linux/personality.h>
15#include <linux/tty.h> 16#include <linux/tty.h>
@@ -29,6 +30,7 @@
29#include <linux/syscalls.h> 30#include <linux/syscalls.h>
30#include <linux/signal.h> 31#include <linux/signal.h>
31#include <linux/cn_proc.h> 32#include <linux/cn_proc.h>
33#include <linux/mutex.h>
32 34
33#include <asm/uaccess.h> 35#include <asm/uaccess.h>
34#include <asm/unistd.h> 36#include <asm/unistd.h>
@@ -72,7 +74,6 @@ repeat:
72 __ptrace_unlink(p); 74 __ptrace_unlink(p);
73 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 75 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
74 __exit_signal(p); 76 __exit_signal(p);
75 __exit_sighand(p);
76 /* 77 /*
77 * Note that the fastpath in sys_times depends on __exit_signal having 78 * Note that the fastpath in sys_times depends on __exit_signal having
78 * updated the counters before a task is removed from the tasklist of 79 * updated the counters before a task is removed from the tasklist of
@@ -192,7 +193,7 @@ int is_orphaned_pgrp(int pgrp)
192 return retval; 193 return retval;
193} 194}
194 195
195static inline int has_stopped_jobs(int pgrp) 196static int has_stopped_jobs(int pgrp)
196{ 197{
197 int retval = 0; 198 int retval = 0;
198 struct task_struct *p; 199 struct task_struct *p;
@@ -229,7 +230,7 @@ static inline int has_stopped_jobs(int pgrp)
229 * 230 *
230 * NOTE that reparent_to_init() gives the caller full capabilities. 231 * NOTE that reparent_to_init() gives the caller full capabilities.
231 */ 232 */
232static inline void reparent_to_init(void) 233static void reparent_to_init(void)
233{ 234{
234 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
235 236
@@ -243,7 +244,9 @@ static inline void reparent_to_init(void)
243 /* Set the exit signal to SIGCHLD so we signal init on exit */ 244 /* Set the exit signal to SIGCHLD so we signal init on exit */
244 current->exit_signal = SIGCHLD; 245 current->exit_signal = SIGCHLD;
245 246
246 if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) 247 if ((current->policy == SCHED_NORMAL ||
248 current->policy == SCHED_BATCH)
249 && (task_nice(current) < 0))
247 set_user_nice(current, 0); 250 set_user_nice(current, 0);
248 /* cpus_allowed? */ 251 /* cpus_allowed? */
249 /* rt_priority? */ 252 /* rt_priority? */
@@ -258,7 +261,7 @@ static inline void reparent_to_init(void)
258 261
259void __set_special_pids(pid_t session, pid_t pgrp) 262void __set_special_pids(pid_t session, pid_t pgrp)
260{ 263{
261 struct task_struct *curr = current; 264 struct task_struct *curr = current->group_leader;
262 265
263 if (curr->signal->session != session) { 266 if (curr->signal->session != session) {
264 detach_pid(curr, PIDTYPE_SID); 267 detach_pid(curr, PIDTYPE_SID);
@@ -366,7 +369,7 @@ void daemonize(const char *name, ...)
366 369
367EXPORT_SYMBOL(daemonize); 370EXPORT_SYMBOL(daemonize);
368 371
369static inline void close_files(struct files_struct * files) 372static void close_files(struct files_struct * files)
370{ 373{
371 int i, j; 374 int i, j;
372 struct fdtable *fdt; 375 struct fdtable *fdt;
@@ -540,7 +543,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re
540 p->real_parent = reaper; 543 p->real_parent = reaper;
541} 544}
542 545
543static inline void reparent_thread(task_t *p, task_t *father, int traced) 546static void reparent_thread(task_t *p, task_t *father, int traced)
544{ 547{
545 /* We don't want people slaying init. */ 548 /* We don't want people slaying init. */
546 if (p->exit_signal != -1) 549 if (p->exit_signal != -1)
@@ -604,7 +607,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
604 * group, and if no such member exists, give it to 607 * group, and if no such member exists, give it to
605 * the global child reaper process (ie "init") 608 * the global child reaper process (ie "init")
606 */ 609 */
607static inline void forget_original_parent(struct task_struct * father, 610static void forget_original_parent(struct task_struct * father,
608 struct list_head *to_release) 611 struct list_head *to_release)
609{ 612{
610 struct task_struct *p, *reaper = father; 613 struct task_struct *p, *reaper = father;
@@ -842,7 +845,7 @@ fastcall NORET_TYPE void do_exit(long code)
842 } 845 }
843 group_dead = atomic_dec_and_test(&tsk->signal->live); 846 group_dead = atomic_dec_and_test(&tsk->signal->live);
844 if (group_dead) { 847 if (group_dead) {
845 del_timer_sync(&tsk->signal->real_timer); 848 hrtimer_cancel(&tsk->signal->real_timer);
846 exit_itimers(tsk->signal); 849 exit_itimers(tsk->signal);
847 acct_process(code); 850 acct_process(code);
848 } 851 }
@@ -859,7 +862,7 @@ fastcall NORET_TYPE void do_exit(long code)
859 if (group_dead && tsk->signal->leader) 862 if (group_dead && tsk->signal->leader)
860 disassociate_ctty(1); 863 disassociate_ctty(1);
861 864
862 module_put(tsk->thread_info->exec_domain->module); 865 module_put(task_thread_info(tsk)->exec_domain->module);
863 if (tsk->binfmt) 866 if (tsk->binfmt)
864 module_put(tsk->binfmt->module); 867 module_put(tsk->binfmt->module);
865 868
@@ -870,6 +873,10 @@ fastcall NORET_TYPE void do_exit(long code)
870 mpol_free(tsk->mempolicy); 873 mpol_free(tsk->mempolicy);
871 tsk->mempolicy = NULL; 874 tsk->mempolicy = NULL;
872#endif 875#endif
876 /*
877 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
878 */
879 mutex_debug_check_no_locks_held(tsk);
873 880
874 /* PF_DEAD causes final put_task_struct after we schedule. */ 881 /* PF_DEAD causes final put_task_struct after we schedule. */
875 preempt_disable(); 882 preempt_disable();
@@ -926,7 +933,6 @@ do_group_exit(int exit_code)
926 /* Another thread got here before we took the lock. */ 933 /* Another thread got here before we took the lock. */
927 exit_code = sig->group_exit_code; 934 exit_code = sig->group_exit_code;
928 else { 935 else {
929 sig->flags = SIGNAL_GROUP_EXIT;
930 sig->group_exit_code = exit_code; 936 sig->group_exit_code = exit_code;
931 zap_other_threads(current); 937 zap_other_threads(current);
932 } 938 }
@@ -1068,6 +1074,9 @@ static int wait_task_zombie(task_t *p, int noreap,
1068 } 1074 }
1069 1075
1070 if (likely(p->real_parent == p->parent) && likely(p->signal)) { 1076 if (likely(p->real_parent == p->parent) && likely(p->signal)) {
1077 struct signal_struct *psig;
1078 struct signal_struct *sig;
1079
1071 /* 1080 /*
1072 * The resource counters for the group leader are in its 1081 * The resource counters for the group leader are in its
1073 * own task_struct. Those for dead threads in the group 1082 * own task_struct. Those for dead threads in the group
@@ -1084,24 +1093,26 @@ static int wait_task_zombie(task_t *p, int noreap,
1084 * here reaping other children at the same time. 1093 * here reaping other children at the same time.
1085 */ 1094 */
1086 spin_lock_irq(&p->parent->sighand->siglock); 1095 spin_lock_irq(&p->parent->sighand->siglock);
1087 p->parent->signal->cutime = 1096 psig = p->parent->signal;
1088 cputime_add(p->parent->signal->cutime, 1097 sig = p->signal;
1098 psig->cutime =
1099 cputime_add(psig->cutime,
1089 cputime_add(p->utime, 1100 cputime_add(p->utime,
1090 cputime_add(p->signal->utime, 1101 cputime_add(sig->utime,
1091 p->signal->cutime))); 1102 sig->cutime)));
1092 p->parent->signal->cstime = 1103 psig->cstime =
1093 cputime_add(p->parent->signal->cstime, 1104 cputime_add(psig->cstime,
1094 cputime_add(p->stime, 1105 cputime_add(p->stime,
1095 cputime_add(p->signal->stime, 1106 cputime_add(sig->stime,
1096 p->signal->cstime))); 1107 sig->cstime)));
1097 p->parent->signal->cmin_flt += 1108 psig->cmin_flt +=
1098 p->min_flt + p->signal->min_flt + p->signal->cmin_flt; 1109 p->min_flt + sig->min_flt + sig->cmin_flt;
1099 p->parent->signal->cmaj_flt += 1110 psig->cmaj_flt +=
1100 p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt; 1111 p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1101 p->parent->signal->cnvcsw += 1112 psig->cnvcsw +=
1102 p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw; 1113 p->nvcsw + sig->nvcsw + sig->cnvcsw;
1103 p->parent->signal->cnivcsw += 1114 psig->cnivcsw +=
1104 p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw; 1115 p->nivcsw + sig->nivcsw + sig->cnivcsw;
1105 spin_unlock_irq(&p->parent->sighand->siglock); 1116 spin_unlock_irq(&p->parent->sighand->siglock);
1106 } 1117 }
1107 1118
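The wait_task_zombie() hunk is a readability change (psig and sig cache the two signal_struct pointers), but the accounting it performs is easy to misread: the parent absorbs the dead child's own counters plus whatever the child had already collected from its own reaped children. A minimal sketch with plain integers standing in for cputime_t and no siglock:

/* Sketch of the child-time roll-up done in wait_task_zombie(): the
 * parent's c* counters absorb the child's own counters plus the child's
 * c* counters.  Plain unsigned long long stands in for cputime_t. */
#include <stdio.h>

struct sig_times {
        unsigned long long utime, stime;        /* own time */
        unsigned long long cutime, cstime;      /* collected from reaped children */
};

static void reap_child(struct sig_times *psig, const struct sig_times *sig,
                       unsigned long long child_utime,
                       unsigned long long child_stime)
{
        psig->cutime += child_utime + sig->utime + sig->cutime;
        psig->cstime += child_stime + sig->stime + sig->cstime;
}

int main(void)
{
        struct sig_times parent = { 0 }, child = { 5, 2, 7, 3 };

        /* the child task itself spent 10 user / 4 system ticks */
        reap_child(&parent, &child, 10, 4);
        printf("cutime=%llu cstime=%llu\n", parent.cutime, parent.cstime);
        /* prints cutime=22 cstime=9 */
        return 0;
}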
diff --git a/kernel/fork.c b/kernel/fork.c
index 158710d22566..4ae8cfc1c89c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,7 @@
28#include <linux/binfmts.h> 28#include <linux/binfmts.h>
29#include <linux/mman.h> 29#include <linux/mman.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/capability.h>
31#include <linux/cpu.h> 32#include <linux/cpu.h>
32#include <linux/cpuset.h> 33#include <linux/cpuset.h>
33#include <linux/security.h> 34#include <linux/security.h>
@@ -171,10 +172,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
171 return NULL; 172 return NULL;
172 } 173 }
173 174
174 *ti = *orig->thread_info;
175 *tsk = *orig; 175 *tsk = *orig;
176 tsk->thread_info = ti; 176 tsk->thread_info = ti;
177 ti->task = tsk; 177 setup_thread_stack(tsk, orig);
178 178
179 /* One for us, one for whoever does the "release_task()" (usually parent) */ 179 /* One for us, one for whoever does the "release_task()" (usually parent) */
180 atomic_set(&tsk->usage,2); 180 atomic_set(&tsk->usage,2);
@@ -264,7 +264,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
264 rb_parent = &tmp->vm_rb; 264 rb_parent = &tmp->vm_rb;
265 265
266 mm->map_count++; 266 mm->map_count++;
267 retval = copy_page_range(mm, oldmm, tmp); 267 retval = copy_page_range(mm, oldmm, mpnt);
268 268
269 if (tmp->vm_ops && tmp->vm_ops->open) 269 if (tmp->vm_ops && tmp->vm_ops->open)
270 tmp->vm_ops->open(tmp); 270 tmp->vm_ops->open(tmp);
@@ -324,7 +324,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
324 spin_lock_init(&mm->page_table_lock); 324 spin_lock_init(&mm->page_table_lock);
325 rwlock_init(&mm->ioctx_list_lock); 325 rwlock_init(&mm->ioctx_list_lock);
326 mm->ioctx_list = NULL; 326 mm->ioctx_list = NULL;
327 mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
328 mm->free_area_cache = TASK_UNMAPPED_BASE; 327 mm->free_area_cache = TASK_UNMAPPED_BASE;
329 mm->cached_hole_size = ~0UL; 328 mm->cached_hole_size = ~0UL;
330 329
@@ -745,6 +744,14 @@ int unshare_files(void)
745 744
746EXPORT_SYMBOL(unshare_files); 745EXPORT_SYMBOL(unshare_files);
747 746
747void sighand_free_cb(struct rcu_head *rhp)
748{
749 struct sighand_struct *sp;
750
751 sp = container_of(rhp, struct sighand_struct, rcu);
752 kmem_cache_free(sighand_cachep, sp);
753}
754
748static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 755static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
749{ 756{
750 struct sighand_struct *sig; 757 struct sighand_struct *sig;
@@ -754,7 +761,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
754 return 0; 761 return 0;
755 } 762 }
756 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); 763 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
757 tsk->sighand = sig; 764 rcu_assign_pointer(tsk->sighand, sig);
758 if (!sig) 765 if (!sig)
759 return -ENOMEM; 766 return -ENOMEM;
760 spin_lock_init(&sig->siglock); 767 spin_lock_init(&sig->siglock);
@@ -795,19 +802,16 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
795 init_sigpending(&sig->shared_pending); 802 init_sigpending(&sig->shared_pending);
796 INIT_LIST_HEAD(&sig->posix_timers); 803 INIT_LIST_HEAD(&sig->posix_timers);
797 804
798 sig->it_real_value = sig->it_real_incr = 0; 805 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC);
806 sig->it_real_incr.tv64 = 0;
799 sig->real_timer.function = it_real_fn; 807 sig->real_timer.function = it_real_fn;
800 sig->real_timer.data = (unsigned long) tsk; 808 sig->real_timer.data = tsk;
801 init_timer(&sig->real_timer);
802 809
803 sig->it_virt_expires = cputime_zero; 810 sig->it_virt_expires = cputime_zero;
804 sig->it_virt_incr = cputime_zero; 811 sig->it_virt_incr = cputime_zero;
805 sig->it_prof_expires = cputime_zero; 812 sig->it_prof_expires = cputime_zero;
806 sig->it_prof_incr = cputime_zero; 813 sig->it_prof_incr = cputime_zero;
807 814
808 sig->tty = current->signal->tty;
809 sig->pgrp = process_group(current);
810 sig->session = current->signal->session;
811 sig->leader = 0; /* session leadership doesn't inherit */ 815 sig->leader = 0; /* session leadership doesn't inherit */
812 sig->tty_old_pgrp = 0; 816 sig->tty_old_pgrp = 0;
813 817
@@ -919,7 +923,7 @@ static task_t *copy_process(unsigned long clone_flags,
919 if (nr_threads >= max_threads) 923 if (nr_threads >= max_threads)
920 goto bad_fork_cleanup_count; 924 goto bad_fork_cleanup_count;
921 925
922 if (!try_module_get(p->thread_info->exec_domain->module)) 926 if (!try_module_get(task_thread_info(p)->exec_domain->module))
923 goto bad_fork_cleanup_count; 927 goto bad_fork_cleanup_count;
924 928
925 if (p->binfmt && !try_module_get(p->binfmt->module)) 929 if (p->binfmt && !try_module_get(p->binfmt->module))
@@ -966,15 +970,20 @@ static task_t *copy_process(unsigned long clone_flags,
966 p->io_context = NULL; 970 p->io_context = NULL;
967 p->io_wait = NULL; 971 p->io_wait = NULL;
968 p->audit_context = NULL; 972 p->audit_context = NULL;
973 cpuset_fork(p);
969#ifdef CONFIG_NUMA 974#ifdef CONFIG_NUMA
970 p->mempolicy = mpol_copy(p->mempolicy); 975 p->mempolicy = mpol_copy(p->mempolicy);
971 if (IS_ERR(p->mempolicy)) { 976 if (IS_ERR(p->mempolicy)) {
972 retval = PTR_ERR(p->mempolicy); 977 retval = PTR_ERR(p->mempolicy);
973 p->mempolicy = NULL; 978 p->mempolicy = NULL;
974 goto bad_fork_cleanup; 979 goto bad_fork_cleanup_cpuset;
975 } 980 }
976#endif 981#endif
977 982
983#ifdef CONFIG_DEBUG_MUTEXES
984 p->blocked_on = NULL; /* not blocked yet */
985#endif
986
978 p->tgid = p->pid; 987 p->tgid = p->pid;
979 if (clone_flags & CLONE_THREAD) 988 if (clone_flags & CLONE_THREAD)
980 p->tgid = current->tgid; 989 p->tgid = current->tgid;
@@ -1126,29 +1135,22 @@ static task_t *copy_process(unsigned long clone_flags,
1126 if (unlikely(p->ptrace & PT_PTRACED)) 1135 if (unlikely(p->ptrace & PT_PTRACED))
1127 __ptrace_link(p, current->parent); 1136 __ptrace_link(p, current->parent);
1128 1137
1129 cpuset_fork(p);
1130
1131 attach_pid(p, PIDTYPE_PID, p->pid); 1138 attach_pid(p, PIDTYPE_PID, p->pid);
1132 attach_pid(p, PIDTYPE_TGID, p->tgid); 1139 attach_pid(p, PIDTYPE_TGID, p->tgid);
1133 if (thread_group_leader(p)) { 1140 if (thread_group_leader(p)) {
1141 p->signal->tty = current->signal->tty;
1142 p->signal->pgrp = process_group(current);
1143 p->signal->session = current->signal->session;
1134 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1144 attach_pid(p, PIDTYPE_PGID, process_group(p));
1135 attach_pid(p, PIDTYPE_SID, p->signal->session); 1145 attach_pid(p, PIDTYPE_SID, p->signal->session);
1136 if (p->pid) 1146 if (p->pid)
1137 __get_cpu_var(process_counts)++; 1147 __get_cpu_var(process_counts)++;
1138 } 1148 }
1139 1149
1140 proc_fork_connector(p);
1141 if (!current->signal->tty && p->signal->tty)
1142 p->signal->tty = NULL;
1143
1144 nr_threads++; 1150 nr_threads++;
1145 total_forks++; 1151 total_forks++;
1146 write_unlock_irq(&tasklist_lock); 1152 write_unlock_irq(&tasklist_lock);
1147 retval = 0; 1153 proc_fork_connector(p);
1148
1149fork_out:
1150 if (retval)
1151 return ERR_PTR(retval);
1152 return p; 1154 return p;
1153 1155
1154bad_fork_cleanup_namespace: 1156bad_fork_cleanup_namespace:
@@ -1175,19 +1177,22 @@ bad_fork_cleanup_security:
1175bad_fork_cleanup_policy: 1177bad_fork_cleanup_policy:
1176#ifdef CONFIG_NUMA 1178#ifdef CONFIG_NUMA
1177 mpol_free(p->mempolicy); 1179 mpol_free(p->mempolicy);
1180bad_fork_cleanup_cpuset:
1178#endif 1181#endif
1182 cpuset_exit(p);
1179bad_fork_cleanup: 1183bad_fork_cleanup:
1180 if (p->binfmt) 1184 if (p->binfmt)
1181 module_put(p->binfmt->module); 1185 module_put(p->binfmt->module);
1182bad_fork_cleanup_put_domain: 1186bad_fork_cleanup_put_domain:
1183 module_put(p->thread_info->exec_domain->module); 1187 module_put(task_thread_info(p)->exec_domain->module);
1184bad_fork_cleanup_count: 1188bad_fork_cleanup_count:
1185 put_group_info(p->group_info); 1189 put_group_info(p->group_info);
1186 atomic_dec(&p->user->processes); 1190 atomic_dec(&p->user->processes);
1187 free_uid(p->user); 1191 free_uid(p->user);
1188bad_fork_free: 1192bad_fork_free:
1189 free_task(p); 1193 free_task(p);
1190 goto fork_out; 1194fork_out:
1195 return ERR_PTR(retval);
1191} 1196}
1192 1197
1193struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1198struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
@@ -1293,6 +1298,10 @@ long do_fork(unsigned long clone_flags,
1293 return pid; 1298 return pid;
1294} 1299}
1295 1300
1301#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1302#define ARCH_MIN_MMSTRUCT_ALIGN 0
1303#endif
1304
1296void __init proc_caches_init(void) 1305void __init proc_caches_init(void)
1297{ 1306{
1298 sighand_cachep = kmem_cache_create("sighand_cache", 1307 sighand_cachep = kmem_cache_create("sighand_cache",
@@ -1311,6 +1320,6 @@ void __init proc_caches_init(void)
1311 sizeof(struct vm_area_struct), 0, 1320 sizeof(struct vm_area_struct), 0,
1312 SLAB_PANIC, NULL, NULL); 1321 SLAB_PANIC, NULL, NULL);
1313 mm_cachep = kmem_cache_create("mm_struct", 1322 mm_cachep = kmem_cache_create("mm_struct",
1314 sizeof(struct mm_struct), 0, 1323 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1315 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1324 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1316} 1325}
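The tail of copy_process() is rearranged so the success path returns p directly and fork_out serves only the error unwinding, now including the extra bad_fork_cleanup_cpuset step. A compact userspace sketch of that goto-unwind pattern, with hypothetical malloc()ed resources in place of the kernel's caches:

/* Userspace sketch of the goto-unwind error handling copy_process() uses:
 * each failure jumps to a label that releases everything acquired so far,
 * in reverse order, and only the error path reaches fork_out. */
#include <stdlib.h>
#include <stdio.h>

struct task { void *stack, *creds, *files; };

static struct task *copy_task(int fail_at)
{
        struct task *p = malloc(sizeof(*p));

        if (!p)
                goto fork_out;

        p->stack = (fail_at == 1) ? NULL : malloc(64);
        if (!p->stack)
                goto bad_free_task;

        p->creds = (fail_at == 2) ? NULL : malloc(64);
        if (!p->creds)
                goto bad_free_stack;

        p->files = (fail_at == 3) ? NULL : malloc(64);
        if (!p->files)
                goto bad_free_creds;

        return p;               /* success: return the new task directly */

bad_free_creds:
        free(p->creds);
bad_free_stack:
        free(p->stack);
bad_free_task:
        free(p);
fork_out:
        return NULL;            /* the kernel returns ERR_PTR(retval) here */
}

int main(void)
{
        struct task *t = copy_task(0);

        printf("success=%p failure=%p\n", (void *)t, (void *)copy_task(2));
        if (t) {
                free(t->files);
                free(t->creds);
                free(t->stack);
                free(t);
        }
        return 0;
}

Each label releases exactly what was acquired before the jump to it, so adding a new resource (cpuset_fork() here) means adding one allocation site and one cleanup label.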
diff --git a/kernel/futex.c b/kernel/futex.c
index aca8d10704f6..5efa2f978032 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -201,21 +201,6 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
201 * from swap. But that's a lot of code to duplicate here 201 * from swap. But that's a lot of code to duplicate here
202 * for a rare case, so we simply fetch the page. 202 * for a rare case, so we simply fetch the page.
203 */ 203 */
204
205 /*
206 * Do a quick atomic lookup first - this is the fastpath.
207 */
208 page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
209 if (likely(page != NULL)) {
210 key->shared.pgoff =
211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
212 put_page(page);
213 return 0;
214 }
215
216 /*
217 * Do it the general way.
218 */
219 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); 204 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
220 if (err >= 0) { 205 if (err >= 0) {
221 key->shared.pgoff = 206 key->shared.pgoff =
@@ -285,7 +270,13 @@ static void wake_futex(struct futex_q *q)
285 /* 270 /*
286 * The waiting task can free the futex_q as soon as this is written, 271 * The waiting task can free the futex_q as soon as this is written,
287 * without taking any locks. This must come last. 272 * without taking any locks. This must come last.
273 *
274 * A memory barrier is required here to prevent the following store
275 * to lock_ptr from getting ahead of the wakeup. Clearing the lock
276 * at the end of wake_up_all() does not prevent this store from
277 * moving.
288 */ 278 */
279 wmb();
289 q->lock_ptr = NULL; 280 q->lock_ptr = NULL;
290} 281}
291 282
@@ -365,6 +356,13 @@ retry:
365 if (bh1 != bh2) 356 if (bh1 != bh2)
366 spin_unlock(&bh2->lock); 357 spin_unlock(&bh2->lock);
367 358
359#ifndef CONFIG_MMU
360 /* we don't get EFAULT from MMU faults if we don't have an MMU,
361 * but we might get them from range checking */
362 ret = op_ret;
363 goto out;
364#endif
365
368 if (unlikely(op_ret != -EFAULT)) { 366 if (unlikely(op_ret != -EFAULT)) {
369 ret = op_ret; 367 ret = op_ret;
370 goto out; 368 goto out;
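The comment added to wake_futex() explains why the wmb() is needed: the store to q->lock_ptr is the handoff that lets the waiter free its futex_q, so nothing done on the waiter's behalf may be reordered past it. A simplified userspace analogue of that ordering, using C11 release/acquire in place of the kernel barrier (hypothetical names, not the futex code itself):

/* Userspace analogue of the ordering the new wmb() in wake_futex()
 * enforces: work done on behalf of the waiter must be visible before the
 * store that lets the waiter run off and free its structure. */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int wake_payload;        /* work done before the handoff */
static _Atomic int handoff;     /* plays the role of q->lock_ptr */

static void *waker(void *arg)
{
        (void)arg;
        wake_payload = 42;      /* the "wake up the task" side effects */
        atomic_store_explicit(&handoff, 1, memory_order_release);
        return NULL;
}

static void *waiter(void *arg)
{
        (void)arg;
        while (!atomic_load_explicit(&handoff, memory_order_acquire))
                ;               /* spin until the handoff is published */
        printf("payload seen by waiter: %d\n", wake_payload);  /* always 42 */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, waiter, NULL);
        pthread_create(&b, NULL, waker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}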
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
new file mode 100644
index 000000000000..f1c4155b49ac
--- /dev/null
+++ b/kernel/hrtimer.c
@@ -0,0 +1,826 @@
1/*
2 * linux/kernel/hrtimer.c
3 *
4 * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
6 *
7 * High-resolution kernel timers
8 *
9 * In contrast to the low-resolution timeout API implemented in
10 * kernel/timer.c, hrtimers provide finer resolution and accuracy
11 * depending on system configuration and capabilities.
12 *
13 * These timers are currently used for:
14 * - itimers
15 * - POSIX timers
16 * - nanosleep
17 * - precise in-kernel timing
18 *
19 * Started by: Thomas Gleixner and Ingo Molnar
20 *
21 * Credits:
22 * based on kernel/timer.c
23 *
24 * For licencing details see kernel-base/COPYING
25 */
26
27#include <linux/cpu.h>
28#include <linux/module.h>
29#include <linux/percpu.h>
30#include <linux/hrtimer.h>
31#include <linux/notifier.h>
32#include <linux/syscalls.h>
33#include <linux/interrupt.h>
34
35#include <asm/uaccess.h>
36
37/**
38 * ktime_get - get the monotonic time in ktime_t format
39 *
40 * returns the time in ktime_t format
41 */
42static ktime_t ktime_get(void)
43{
44 struct timespec now;
45
46 ktime_get_ts(&now);
47
48 return timespec_to_ktime(now);
49}
50
51/**
52 * ktime_get_real - get the real (wall-) time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56static ktime_t ktime_get_real(void)
57{
58 struct timespec now;
59
60 getnstimeofday(&now);
61
62 return timespec_to_ktime(now);
63}
64
65EXPORT_SYMBOL_GPL(ktime_get_real);
66
67/*
68 * The timer bases:
69 */
70
71#define MAX_HRTIMER_BASES 2
72
73static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
74{
75 {
76 .index = CLOCK_REALTIME,
77 .get_time = &ktime_get_real,
78 .resolution = KTIME_REALTIME_RES,
79 },
80 {
81 .index = CLOCK_MONOTONIC,
82 .get_time = &ktime_get,
83 .resolution = KTIME_MONOTONIC_RES,
84 },
85};
86
87/**
88 * ktime_get_ts - get the monotonic clock in timespec format
89 *
90 * @ts: pointer to timespec variable
91 *
92 * The function calculates the monotonic clock from the realtime
93 * clock and the wall_to_monotonic offset and stores the result
94 * in normalized timespec format in the variable pointed to by ts.
95 */
96void ktime_get_ts(struct timespec *ts)
97{
98 struct timespec tomono;
99 unsigned long seq;
100
101 do {
102 seq = read_seqbegin(&xtime_lock);
103 getnstimeofday(ts);
104 tomono = wall_to_monotonic;
105
106 } while (read_seqretry(&xtime_lock, seq));
107
108 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
109 ts->tv_nsec + tomono.tv_nsec);
110}
111EXPORT_SYMBOL_GPL(ktime_get_ts);
112
113/*
114 * Functions and macros which are different for UP/SMP systems are kept in a
115 * single place
116 */
117#ifdef CONFIG_SMP
118
119#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)
120
121/*
122 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
123 * means that all timers which are tied to this base via timer->base are
124 * locked, and the base itself is locked too.
125 *
126 * So __run_timers/migrate_timers can safely modify all timers which could
127 * be found on the lists/queues.
128 *
129 * When the timer's base is locked, and the timer removed from list, it is
130 * possible to set timer->base = NULL and drop the lock: the timer remains
131 * locked.
132 */
133static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
134 unsigned long *flags)
135{
136 struct hrtimer_base *base;
137
138 for (;;) {
139 base = timer->base;
140 if (likely(base != NULL)) {
141 spin_lock_irqsave(&base->lock, *flags);
142 if (likely(base == timer->base))
143 return base;
144 /* The timer has migrated to another CPU: */
145 spin_unlock_irqrestore(&base->lock, *flags);
146 }
147 cpu_relax();
148 }
149}
150
151/*
152 * Switch the timer base to the current CPU when possible.
153 */
154static inline struct hrtimer_base *
155switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
156{
157 struct hrtimer_base *new_base;
158
159 new_base = &__get_cpu_var(hrtimer_bases[base->index]);
160
161 if (base != new_base) {
162 /*
163 * We are trying to schedule the timer on the local CPU.
164 * However we can't change timer's base while it is running,
165 * so we keep it on the same CPU. No hassle vs. reprogramming
166 * the event source in the high resolution case. The softirq
167 * code will take care of this when the timer function has
168 * completed. There is no conflict as we hold the lock until
169 * the timer is enqueued.
170 */
171 if (unlikely(base->curr_timer == timer))
172 return base;
173
174 /* See the comment in lock_timer_base() */
175 timer->base = NULL;
176 spin_unlock(&base->lock);
177 spin_lock(&new_base->lock);
178 timer->base = new_base;
179 }
180 return new_base;
181}
182
183#else /* CONFIG_SMP */
184
185#define set_curr_timer(b, t) do { } while (0)
186
187static inline struct hrtimer_base *
188lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
189{
190 struct hrtimer_base *base = timer->base;
191
192 spin_lock_irqsave(&base->lock, *flags);
193
194 return base;
195}
196
197#define switch_hrtimer_base(t, b) (b)
198
199#endif /* !CONFIG_SMP */
200
201/*
202 * Functions for the union type storage format of ktime_t which are
203 * too large for inlining:
204 */
205#if BITS_PER_LONG < 64
206# ifndef CONFIG_KTIME_SCALAR
207/**
208 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
209 *
210 * @kt: addend
211 * @nsec: the scalar nsec value to add
212 *
213 * Returns the sum of kt and nsec in ktime_t format
214 */
215ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
216{
217 ktime_t tmp;
218
219 if (likely(nsec < NSEC_PER_SEC)) {
220 tmp.tv64 = nsec;
221 } else {
222 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
223
224 tmp = ktime_set((long)nsec, rem);
225 }
226
227 return ktime_add(kt, tmp);
228}
229
230#else /* CONFIG_KTIME_SCALAR */
231
232# endif /* !CONFIG_KTIME_SCALAR */
233
234/*
235 * Divide a ktime value by a nanosecond value
236 */
237static unsigned long ktime_divns(const ktime_t kt, nsec_t div)
238{
239 u64 dclc, inc, dns;
240 int sft = 0;
241
242 dclc = dns = ktime_to_ns(kt);
243 inc = div;
244 /* Make sure the divisor is less than 2^32: */
245 while (div >> 32) {
246 sft++;
247 div >>= 1;
248 }
249 dclc >>= sft;
250 do_div(dclc, (unsigned long) div);
251
252 return (unsigned long) dclc;
253}
254
255#else /* BITS_PER_LONG < 64 */
256# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
257#endif /* BITS_PER_LONG >= 64 */
258
259/*
260 * Counterpart to lock_hrtimer_base() above:
261 */
262static inline
263void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
264{
265 spin_unlock_irqrestore(&timer->base->lock, *flags);
266}
267
268/**
269 * hrtimer_forward - forward the timer expiry
270 *
271 * @timer: hrtimer to forward
272 * @interval: the interval to forward
273 *
274 * Forward the timer expiry so it will expire in the future.
275 * Returns the number of overruns.
276 */
277unsigned long
278hrtimer_forward(struct hrtimer *timer, ktime_t interval)
279{
280 unsigned long orun = 1;
281 ktime_t delta, now;
282
283 now = timer->base->get_time();
284
285 delta = ktime_sub(now, timer->expires);
286
287 if (delta.tv64 < 0)
288 return 0;
289
290 if (interval.tv64 < timer->base->resolution.tv64)
291 interval.tv64 = timer->base->resolution.tv64;
292
293 if (unlikely(delta.tv64 >= interval.tv64)) {
294 nsec_t incr = ktime_to_ns(interval);
295
296 orun = ktime_divns(delta, incr);
297 timer->expires = ktime_add_ns(timer->expires, incr * orun);
298 if (timer->expires.tv64 > now.tv64)
299 return orun;
300 /*
301 * This (and the ktime_add() below) is the
302 * correction for exact:
303 */
304 orun++;
305 }
306 timer->expires = ktime_add(timer->expires, interval);
307
308 return orun;
309}
310
311/*
312 * enqueue_hrtimer - internal function to (re)start a timer
313 *
314 * The timer is inserted in expiry order. Insertion into the
315 * red black tree is O(log(n)). Must hold the base lock.
316 */
317static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
318{
319 struct rb_node **link = &base->active.rb_node;
320 struct rb_node *parent = NULL;
321 struct hrtimer *entry;
322
323 /*
324 * Find the right place in the rbtree:
325 */
326 while (*link) {
327 parent = *link;
328 entry = rb_entry(parent, struct hrtimer, node);
329 /*
330 * We don't care about collisions. Nodes with
331 * the same expiry time stay together.
332 */
333 if (timer->expires.tv64 < entry->expires.tv64)
334 link = &(*link)->rb_left;
335 else
336 link = &(*link)->rb_right;
337 }
338
339 /*
340 * Insert the timer to the rbtree and check whether it
341 * replaces the first pending timer
342 */
343 rb_link_node(&timer->node, parent, link);
344 rb_insert_color(&timer->node, &base->active);
345
346 timer->state = HRTIMER_PENDING;
347
348 if (!base->first || timer->expires.tv64 <
349 rb_entry(base->first, struct hrtimer, node)->expires.tv64)
350 base->first = &timer->node;
351}
352
353/*
354 * __remove_hrtimer - internal function to remove a timer
355 *
356 * Caller must hold the base lock.
357 */
358static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
359{
360 /*
361 * Remove the timer from the rbtree and replace the
362 * first entry pointer if necessary.
363 */
364 if (base->first == &timer->node)
365 base->first = rb_next(&timer->node);
366 rb_erase(&timer->node, &base->active);
367}
368
369/*
370 * remove hrtimer, called with base lock held
371 */
372static inline int
373remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
374{
375 if (hrtimer_active(timer)) {
376 __remove_hrtimer(timer, base);
377 timer->state = HRTIMER_INACTIVE;
378 return 1;
379 }
380 return 0;
381}
382
383/**
384 * hrtimer_start - (re)start a relative timer on the current CPU
385 *
386 * @timer: the timer to be added
387 * @tim: expiry time
388 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
389 *
390 * Returns:
391 * 0 on success
392 * 1 when the timer was active
393 */
394int
395hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
396{
397 struct hrtimer_base *base, *new_base;
398 unsigned long flags;
399 int ret;
400
401 base = lock_hrtimer_base(timer, &flags);
402
403 /* Remove an active timer from the queue: */
404 ret = remove_hrtimer(timer, base);
405
406 /* Switch the timer base, if necessary: */
407 new_base = switch_hrtimer_base(timer, base);
408
409 if (mode == HRTIMER_REL)
410 tim = ktime_add(tim, new_base->get_time());
411 timer->expires = tim;
412
413 enqueue_hrtimer(timer, new_base);
414
415 unlock_hrtimer_base(timer, &flags);
416
417 return ret;
418}
419
420/**
421 * hrtimer_try_to_cancel - try to deactivate a timer
422 *
423 * @timer: hrtimer to stop
424 *
425 * Returns:
426 * 0 when the timer was not active
427 * 1 when the timer was active
428 * -1 when the timer is currently executing the callback function and
429 * cannot be stopped
430 */
431int hrtimer_try_to_cancel(struct hrtimer *timer)
432{
433 struct hrtimer_base *base;
434 unsigned long flags;
435 int ret = -1;
436
437 base = lock_hrtimer_base(timer, &flags);
438
439 if (base->curr_timer != timer)
440 ret = remove_hrtimer(timer, base);
441
442 unlock_hrtimer_base(timer, &flags);
443
444 return ret;
445
446}
447
448/**
449 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
450 *
451 * @timer: the timer to be cancelled
452 *
453 * Returns:
454 * 0 when the timer was not active
455 * 1 when the timer was active
456 */
457int hrtimer_cancel(struct hrtimer *timer)
458{
459 for (;;) {
460 int ret = hrtimer_try_to_cancel(timer);
461
462 if (ret >= 0)
463 return ret;
464 }
465}
466
467/**
468 * hrtimer_get_remaining - get remaining time for the timer
469 *
470 * @timer: the timer to read
471 */
472ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
473{
474 struct hrtimer_base *base;
475 unsigned long flags;
476 ktime_t rem;
477
478 base = lock_hrtimer_base(timer, &flags);
479 rem = ktime_sub(timer->expires, timer->base->get_time());
480 unlock_hrtimer_base(timer, &flags);
481
482 return rem;
483}
484
485/**
486 * hrtimer_rebase - rebase an initialized hrtimer to a different base
487 *
488 * @timer: the timer to be rebased
489 * @clock_id: the clock to be used
490 */
491void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id)
492{
493 struct hrtimer_base *bases;
494
495 bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
496 timer->base = &bases[clock_id];
497}
498
499/**
500 * hrtimer_init - initialize a timer to the given clock
501 *
502 * @timer: the timer to be initialized
503 * @clock_id: the clock to be used
504 */
505void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id)
506{
507 memset(timer, 0, sizeof(struct hrtimer));
508 hrtimer_rebase(timer, clock_id);
509}
510
511/**
512 * hrtimer_get_res - get the timer resolution for a clock
513 *
514 * @which_clock: which clock to query
515 * @tp: pointer to timespec variable to store the resolution
516 *
517 * Store the resolution of the clock selected by which_clock in the
518 * variable pointed to by tp.
519 */
520int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
521{
522 struct hrtimer_base *bases;
523
524 bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
525 *tp = ktime_to_timespec(bases[which_clock].resolution);
526
527 return 0;
528}
529
530/*
531 * Expire the per base hrtimer-queue:
532 */
533static inline void run_hrtimer_queue(struct hrtimer_base *base)
534{
535 ktime_t now = base->get_time();
536 struct rb_node *node;
537
538 spin_lock_irq(&base->lock);
539
540 while ((node = base->first)) {
541 struct hrtimer *timer;
542 int (*fn)(void *);
543 int restart;
544 void *data;
545
546 timer = rb_entry(node, struct hrtimer, node);
547 if (now.tv64 <= timer->expires.tv64)
548 break;
549
550 fn = timer->function;
551 data = timer->data;
552 set_curr_timer(base, timer);
553 __remove_hrtimer(timer, base);
554 spin_unlock_irq(&base->lock);
555
556 /*
557 * fn == NULL is a special case for the simplest timer
558 * variant - wake up process and do not restart:
559 */
560 if (!fn) {
561 wake_up_process(data);
562 restart = HRTIMER_NORESTART;
563 } else
564 restart = fn(data);
565
566 spin_lock_irq(&base->lock);
567
568 if (restart == HRTIMER_RESTART)
569 enqueue_hrtimer(timer, base);
570 else
571 timer->state = HRTIMER_EXPIRED;
572 }
573 set_curr_timer(base, NULL);
574 spin_unlock_irq(&base->lock);
575}
576
577/*
578 * Called from timer softirq every jiffy, expire hrtimers:
579 */
580void hrtimer_run_queues(void)
581{
582 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
583 int i;
584
585 for (i = 0; i < MAX_HRTIMER_BASES; i++)
586 run_hrtimer_queue(&base[i]);
587}
588
589/*
590 * Sleep related functions:
591 */
592
593/**
594 * schedule_hrtimer - sleep until timeout
595 *
596 * @timer: hrtimer variable initialized with the correct clock base
597 * @mode: timeout value is abs/rel
598 *
599 * Make the current task sleep until @timeout has
600 * elapsed.
601 *
602 * You can set the task state as follows -
603 *
604 * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
605 * pass before the routine returns. The routine will return 0
606 *
607 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
608 * delivered to the current task. In this case the remaining time
609 * will be returned
610 *
611 * The current task state is guaranteed to be TASK_RUNNING when this
612 * routine returns.
613 */
614static ktime_t __sched
615schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
616{
617 /* fn stays NULL, meaning single-shot wakeup: */
618 timer->data = current;
619
620 hrtimer_start(timer, timer->expires, mode);
621
622 schedule();
623 hrtimer_cancel(timer);
624
625 /* Return the remaining time: */
626 if (timer->state != HRTIMER_EXPIRED)
627 return ktime_sub(timer->expires, timer->base->get_time());
628 else
629 return (ktime_t) {.tv64 = 0 };
630}
631
632static inline ktime_t __sched
633schedule_hrtimer_interruptible(struct hrtimer *timer,
634 const enum hrtimer_mode mode)
635{
636 set_current_state(TASK_INTERRUPTIBLE);
637
638 return schedule_hrtimer(timer, mode);
639}
640
641static long __sched
642nanosleep_restart(struct restart_block *restart, clockid_t clockid)
643{
644 struct timespec __user *rmtp;
645 struct timespec tu;
646 void *rfn_save = restart->fn;
647 struct hrtimer timer;
648 ktime_t rem;
649
650 restart->fn = do_no_restart_syscall;
651
652 hrtimer_init(&timer, clockid);
653
654 timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
655
656 rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
657
658 if (rem.tv64 <= 0)
659 return 0;
660
661 rmtp = (struct timespec __user *) restart->arg2;
662 tu = ktime_to_timespec(rem);
663 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
664 return -EFAULT;
665
666 restart->fn = rfn_save;
667
668 /* The other values in restart are already filled in */
669 return -ERESTART_RESTARTBLOCK;
670}
671
672static long __sched nanosleep_restart_mono(struct restart_block *restart)
673{
674 return nanosleep_restart(restart, CLOCK_MONOTONIC);
675}
676
677static long __sched nanosleep_restart_real(struct restart_block *restart)
678{
679 return nanosleep_restart(restart, CLOCK_REALTIME);
680}
681
682long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
683 const enum hrtimer_mode mode, const clockid_t clockid)
684{
685 struct restart_block *restart;
686 struct hrtimer timer;
687 struct timespec tu;
688 ktime_t rem;
689
690 hrtimer_init(&timer, clockid);
691
692 timer.expires = timespec_to_ktime(*rqtp);
693
694 rem = schedule_hrtimer_interruptible(&timer, mode);
695 if (rem.tv64 <= 0)
696 return 0;
697
698 /* Absolute timers do not update the rmtp value: */
699 if (mode == HRTIMER_ABS)
700 return -ERESTARTNOHAND;
701
702 tu = ktime_to_timespec(rem);
703
704 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
705 return -EFAULT;
706
707 restart = &current_thread_info()->restart_block;
708 restart->fn = (clockid == CLOCK_MONOTONIC) ?
709 nanosleep_restart_mono : nanosleep_restart_real;
710 restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
711 restart->arg1 = timer.expires.tv64 >> 32;
712 restart->arg2 = (unsigned long) rmtp;
713
714 return -ERESTART_RESTARTBLOCK;
715}
716
717asmlinkage long
718sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
719{
720 struct timespec tu;
721
722 if (copy_from_user(&tu, rqtp, sizeof(tu)))
723 return -EFAULT;
724
725 if (!timespec_valid(&tu))
726 return -EINVAL;
727
728 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC);
729}
730
731/*
732 * Functions related to boot-time initialization:
733 */
734static void __devinit init_hrtimers_cpu(int cpu)
735{
736 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
737 int i;
738
739 for (i = 0; i < MAX_HRTIMER_BASES; i++) {
740 spin_lock_init(&base->lock);
741 base++;
742 }
743}
744
745#ifdef CONFIG_HOTPLUG_CPU
746
747static void migrate_hrtimer_list(struct hrtimer_base *old_base,
748 struct hrtimer_base *new_base)
749{
750 struct hrtimer *timer;
751 struct rb_node *node;
752
753 while ((node = rb_first(&old_base->active))) {
754 timer = rb_entry(node, struct hrtimer, node);
755 __remove_hrtimer(timer, old_base);
756 timer->base = new_base;
757 enqueue_hrtimer(timer, new_base);
758 }
759}
760
761static void migrate_hrtimers(int cpu)
762{
763 struct hrtimer_base *old_base, *new_base;
764 int i;
765
766 BUG_ON(cpu_online(cpu));
767 old_base = per_cpu(hrtimer_bases, cpu);
768 new_base = get_cpu_var(hrtimer_bases);
769
770 local_irq_disable();
771
772 for (i = 0; i < MAX_HRTIMER_BASES; i++) {
773
774 spin_lock(&new_base->lock);
775 spin_lock(&old_base->lock);
776
777 BUG_ON(old_base->curr_timer);
778
779 migrate_hrtimer_list(old_base, new_base);
780
781 spin_unlock(&old_base->lock);
782 spin_unlock(&new_base->lock);
783 old_base++;
784 new_base++;
785 }
786
787 local_irq_enable();
788 put_cpu_var(hrtimer_bases);
789}
790#endif /* CONFIG_HOTPLUG_CPU */
791
792static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
793 unsigned long action, void *hcpu)
794{
795 long cpu = (long)hcpu;
796
797 switch (action) {
798
799 case CPU_UP_PREPARE:
800 init_hrtimers_cpu(cpu);
801 break;
802
803#ifdef CONFIG_HOTPLUG_CPU
804 case CPU_DEAD:
805 migrate_hrtimers(cpu);
806 break;
807#endif
808
809 default:
810 break;
811 }
812
813 return NOTIFY_OK;
814}
815
816static struct notifier_block __devinitdata hrtimers_nb = {
817 .notifier_call = hrtimer_cpu_notify,
818};
819
820void __init hrtimers_init(void)
821{
822 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
823 (void *)(long)smp_processor_id());
824 register_cpu_notifier(&hrtimers_nb);
825}
826
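hrtimer_forward() above is the piece most worth tracing by hand: it advances the expiry past "now" in whole intervals and reports how many periods were missed. A userspace sketch of the same arithmetic with plain 64-bit nanoseconds instead of ktime_t (the clamp of the interval to the clock resolution is dropped here):

/* Userspace sketch of the overrun arithmetic in hrtimer_forward(): push
 * the expiry past "now" in whole intervals and report how many periods
 * were missed. */
#include <stdint.h>
#include <assert.h>
#include <stdio.h>

static uint64_t forward(int64_t *expires, int64_t now, int64_t interval)
{
        uint64_t orun = 1;
        int64_t delta = now - *expires;

        if (delta < 0)
                return 0;               /* not expired yet: nothing to do */

        if (delta >= interval) {
                orun = delta / interval;
                *expires += orun * interval;
                if (*expires > now)
                        return orun;
                orun++;                 /* landed exactly on or before now */
        }
        *expires += interval;
        return orun;
}

int main(void)
{
        int64_t expires = 1000;

        /* now = 3500, period = 1000: the periods at 1000, 2000 and 3000
         * were missed, the next expiry becomes 4000 */
        uint64_t orun = forward(&expires, 3500, 1000);

        printf("orun=%llu next=%lld\n",
               (unsigned long long)orun, (long long)expires);
        assert(orun == 3 && expires == 4000);
        return 0;
}

The extra orun++ covers the case where the computed expiry lands exactly on or before now, so the expiry handed back is always strictly in the future.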
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3bd7226d15fa..97d5559997d2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -36,6 +36,9 @@ void synchronize_irq(unsigned int irq)
36{ 36{
37 struct irq_desc *desc = irq_desc + irq; 37 struct irq_desc *desc = irq_desc + irq;
38 38
39 if (irq >= NR_IRQS)
40 return;
41
39 while (desc->status & IRQ_INPROGRESS) 42 while (desc->status & IRQ_INPROGRESS)
40 cpu_relax(); 43 cpu_relax();
41} 44}
@@ -60,6 +63,9 @@ void disable_irq_nosync(unsigned int irq)
60 irq_desc_t *desc = irq_desc + irq; 63 irq_desc_t *desc = irq_desc + irq;
61 unsigned long flags; 64 unsigned long flags;
62 65
66 if (irq >= NR_IRQS)
67 return;
68
63 spin_lock_irqsave(&desc->lock, flags); 69 spin_lock_irqsave(&desc->lock, flags);
64 if (!desc->depth++) { 70 if (!desc->depth++) {
65 desc->status |= IRQ_DISABLED; 71 desc->status |= IRQ_DISABLED;
@@ -86,6 +92,9 @@ void disable_irq(unsigned int irq)
86{ 92{
87 irq_desc_t *desc = irq_desc + irq; 93 irq_desc_t *desc = irq_desc + irq;
88 94
95 if (irq >= NR_IRQS)
96 return;
97
89 disable_irq_nosync(irq); 98 disable_irq_nosync(irq);
90 if (desc->action) 99 if (desc->action)
91 synchronize_irq(irq); 100 synchronize_irq(irq);
@@ -108,6 +117,9 @@ void enable_irq(unsigned int irq)
108 irq_desc_t *desc = irq_desc + irq; 117 irq_desc_t *desc = irq_desc + irq;
109 unsigned long flags; 118 unsigned long flags;
110 119
120 if (irq >= NR_IRQS)
121 return;
122
111 spin_lock_irqsave(&desc->lock, flags); 123 spin_lock_irqsave(&desc->lock, flags);
112 switch (desc->depth) { 124 switch (desc->depth) {
113 case 0: 125 case 0:
@@ -163,6 +175,9 @@ int setup_irq(unsigned int irq, struct irqaction * new)
163 unsigned long flags; 175 unsigned long flags;
164 int shared = 0; 176 int shared = 0;
165 177
178 if (irq >= NR_IRQS)
179 return -EINVAL;
180
166 if (desc->handler == &no_irq_type) 181 if (desc->handler == &no_irq_type)
167 return -ENOSYS; 182 return -ENOSYS;
168 /* 183 /*
@@ -351,6 +366,8 @@ int request_irq(unsigned int irq,
351 action->next = NULL; 366 action->next = NULL;
352 action->dev_id = dev_id; 367 action->dev_id = dev_id;
353 368
369 select_smp_affinity(irq);
370
354 retval = setup_irq(irq, action); 371 retval = setup_irq(irq, action);
355 if (retval) 372 if (retval)
356 kfree(action); 373 kfree(action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f26e534c6585..d03b5eef8ce0 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -10,6 +10,8 @@
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12 12
13#include "internals.h"
14
13static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; 15static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
14 16
15#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
@@ -68,7 +70,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
68 */ 70 */
69 cpus_and(tmp, new_value, cpu_online_map); 71 cpus_and(tmp, new_value, cpu_online_map);
70 if (cpus_empty(tmp)) 72 if (cpus_empty(tmp))
71 return -EINVAL; 73 /* Special case for empty set - allow the architecture
74 code to set default SMP affinity. */
75 return select_smp_affinity(irq) ? -EINVAL : full_count;
72 76
73 proc_set_irq_affinity(irq, new_value); 77 proc_set_irq_affinity(irq, new_value);
74 78
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 7c1b25e25e47..c2c05c4ff28d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -12,36 +12,46 @@
12#include <linux/syscalls.h> 12#include <linux/syscalls.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/posix-timers.h> 14#include <linux/posix-timers.h>
15#include <linux/hrtimer.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
18static unsigned long it_real_value(struct signal_struct *sig) 19/**
20 * itimer_get_remtime - get remaining time for the timer
21 *
22 * @timer: the timer to read
23 *
24 * Returns the delta between the expiry time and now, which can be
25 * less than zero or 1 usec for a pending expired timer
26 */
27static struct timeval itimer_get_remtime(struct hrtimer *timer)
19{ 28{
20 unsigned long val = 0; 29 ktime_t rem = hrtimer_get_remaining(timer);
21 if (timer_pending(&sig->real_timer)) {
22 val = sig->real_timer.expires - jiffies;
23 30
24 /* look out for negative/zero itimer.. */ 31 /*
25 if ((long) val <= 0) 32 * Racy but safe: if the itimer expires after the above
26 val = 1; 33 * hrtimer_get_remtime() call but before this condition
27 } 34 * then we return 0 - which is correct.
28 return val; 35 */
36 if (hrtimer_active(timer)) {
37 if (rem.tv64 <= 0)
38 rem.tv64 = NSEC_PER_USEC;
39 } else
40 rem.tv64 = 0;
41
42 return ktime_to_timeval(rem);
29} 43}
30 44
31int do_getitimer(int which, struct itimerval *value) 45int do_getitimer(int which, struct itimerval *value)
32{ 46{
33 struct task_struct *tsk = current; 47 struct task_struct *tsk = current;
34 unsigned long interval, val;
35 cputime_t cinterval, cval; 48 cputime_t cinterval, cval;
36 49
37 switch (which) { 50 switch (which) {
38 case ITIMER_REAL: 51 case ITIMER_REAL:
39 spin_lock_irq(&tsk->sighand->siglock); 52 value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
40 interval = tsk->signal->it_real_incr; 53 value->it_interval =
41 val = it_real_value(tsk->signal); 54 ktime_to_timeval(tsk->signal->it_real_incr);
42 spin_unlock_irq(&tsk->sighand->siglock);
43 jiffies_to_timeval(val, &value->it_value);
44 jiffies_to_timeval(interval, &value->it_interval);
45 break; 55 break;
46 case ITIMER_VIRTUAL: 56 case ITIMER_VIRTUAL:
47 read_lock(&tasklist_lock); 57 read_lock(&tasklist_lock);
@@ -113,59 +123,45 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
113} 123}
114 124
115 125
116void it_real_fn(unsigned long __data) 126/*
127 * The timer is automagically restarted when interval != 0
128 */
129int it_real_fn(void *data)
117{ 130{
118 struct task_struct * p = (struct task_struct *) __data; 131 struct task_struct *tsk = (struct task_struct *) data;
119 unsigned long inc = p->signal->it_real_incr;
120 132
121 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); 133 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk);
122 134
123 /* 135 if (tsk->signal->it_real_incr.tv64 != 0) {
124 * Now restart the timer if necessary. We don't need any locking 136 hrtimer_forward(&tsk->signal->real_timer,
125 * here because do_setitimer makes sure we have finished running 137 tsk->signal->it_real_incr);
126 * before it touches anything. 138
127 * Note, we KNOW we are (or should be) at a jiffie edge here so 139 return HRTIMER_RESTART;
128 * we don't need the +1 stuff. Also, we want to use the prior 140 }
129 * expire value so as to not "slip" a jiffie if we are late. 141 return HRTIMER_NORESTART;
130 * Deal with requesting a time prior to "now" here rather than
131 * in add_timer.
132 */
133 if (!inc)
134 return;
135 while (time_before_eq(p->signal->real_timer.expires, jiffies))
136 p->signal->real_timer.expires += inc;
137 add_timer(&p->signal->real_timer);
138} 142}
139 143
140int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) 144int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
141{ 145{
142 struct task_struct *tsk = current; 146 struct task_struct *tsk = current;
143 unsigned long val, interval, expires; 147 struct hrtimer *timer;
148 ktime_t expires;
144 cputime_t cval, cinterval, nval, ninterval; 149 cputime_t cval, cinterval, nval, ninterval;
145 150
146 switch (which) { 151 switch (which) {
147 case ITIMER_REAL: 152 case ITIMER_REAL:
148again: 153 timer = &tsk->signal->real_timer;
149 spin_lock_irq(&tsk->sighand->siglock); 154 hrtimer_cancel(timer);
150 interval = tsk->signal->it_real_incr;
151 val = it_real_value(tsk->signal);
152 /* We are sharing ->siglock with it_real_fn() */
153 if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) {
154 spin_unlock_irq(&tsk->sighand->siglock);
155 goto again;
156 }
157 tsk->signal->it_real_incr =
158 timeval_to_jiffies(&value->it_interval);
159 expires = timeval_to_jiffies(&value->it_value);
160 if (expires)
161 mod_timer(&tsk->signal->real_timer,
162 jiffies + 1 + expires);
163 spin_unlock_irq(&tsk->sighand->siglock);
164 if (ovalue) { 155 if (ovalue) {
165 jiffies_to_timeval(val, &ovalue->it_value); 156 ovalue->it_value = itimer_get_remtime(timer);
166 jiffies_to_timeval(interval, 157 ovalue->it_interval
167 &ovalue->it_interval); 158 = ktime_to_timeval(tsk->signal->it_real_incr);
168 } 159 }
160 tsk->signal->it_real_incr =
161 timeval_to_ktime(value->it_interval);
162 expires = timeval_to_ktime(value->it_value);
163 if (expires.tv64 != 0)
164 hrtimer_start(timer, expires, HRTIMER_REL);
169 break; 165 break;
170 case ITIMER_VIRTUAL: 166 case ITIMER_VIRTUAL:
171 nval = timeval_to_cputime(&value->it_value); 167 nval = timeval_to_cputime(&value->it_value);
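itimer_get_remtime() in the hunk above encodes a small reporting rule: while the timer is still queued, an already-expired remaining time is clamped to 1 usec rather than reported as zero, and an inactive timer reports zero. A short sketch of that rule with plain nanosecond integers standing in for ktime_t:

/* Sketch of the reporting rule itimer_get_remtime() implements: a queued
 * but expired timer reports 1 usec, an inactive timer reports 0. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000LL

static int64_t remaining(int active, int64_t expires, int64_t now)
{
        int64_t rem = expires - now;

        if (!active)
                return 0;               /* timer fired and was not rearmed */
        if (rem <= 0)
                return NSEC_PER_USEC;   /* pending but expired: 1 usec, not 0 */
        return rem;
}

int main(void)
{
        printf("%lld\n", (long long)remaining(1, 2000, 1000));  /* 1000: still running */
        printf("%lld\n", (long long)remaining(1, 1000, 2000));  /* 1000 ns = 1 usec clamp */
        printf("%lld\n", (long long)remaining(0, 1000, 2000));  /* 0: inactive */
        return 0;
}

Userspace relies on that convention through getitimer(2), where a zero it_value means the timer is disarmed, so a pending-but-expired timer must never be reported as zero.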
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2c95848fbce8..bf39d28e4c0e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,7 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#include <linux/capability.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/file.h> 11#include <linux/file.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
@@ -26,6 +27,9 @@
26#include <asm/system.h> 27#include <asm/system.h>
27#include <asm/semaphore.h> 28#include <asm/semaphore.h>
28 29
30/* Per cpu memory for storing cpu states in case of system crash. */
31note_buf_t* crash_notes;
32
29/* Location of the reserved area for the crash kernel */ 33/* Location of the reserved area for the crash kernel */
30struct resource crashk_res = { 34struct resource crashk_res = {
31 .name = "Crash kernel", 35 .name = "Crash kernel",
@@ -1054,9 +1058,24 @@ void crash_kexec(struct pt_regs *regs)
1054 if (!locked) { 1058 if (!locked) {
1055 image = xchg(&kexec_crash_image, NULL); 1059 image = xchg(&kexec_crash_image, NULL);
1056 if (image) { 1060 if (image) {
1057 machine_crash_shutdown(regs); 1061 struct pt_regs fixed_regs;
1062 crash_setup_regs(&fixed_regs, regs);
1063 machine_crash_shutdown(&fixed_regs);
1058 machine_kexec(image); 1064 machine_kexec(image);
1059 } 1065 }
1060 xchg(&kexec_lock, 0); 1066 xchg(&kexec_lock, 0);
1061 } 1067 }
1062} 1068}
1069
1070static int __init crash_notes_memory_init(void)
1071{
1072 /* Allocate memory for saving cpu registers. */
1073 crash_notes = alloc_percpu(note_buf_t);
1074 if (!crash_notes) {
1075 printk("Kexec: Memory allocation for saving cpu register"
1076 " states failed\n");
1077 return -ENOMEM;
1078 }
1079 return 0;
1080}
1081module_init(crash_notes_memory_init)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5beda378cc75..3ea6325228da 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,10 +48,11 @@
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */ 51DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
55/* 56/*
56 * kprobe->ainsn.insn points to the copy of the instruction to be 57 * kprobe->ainsn.insn points to the copy of the instruction to be
57 * single-stepped. x86_64, POWER4 and above have no-exec support and 58 * single-stepped. x86_64, POWER4 and above have no-exec support and
@@ -151,6 +152,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
151 } 152 }
152 } 153 }
153} 154}
155#endif
154 156
155/* We have preemption disabled.. so it is safe to use __ versions */ 157/* We have preemption disabled.. so it is safe to use __ versions */
156static inline void set_kprobe_instance(struct kprobe *kp) 158static inline void set_kprobe_instance(struct kprobe *kp)
@@ -165,7 +167,7 @@ static inline void reset_kprobe_instance(void)
165 167
166/* 168/*
167 * This routine is called either: 169 * This routine is called either:
168 * - under the kprobe_lock spinlock - during kprobe_[un]register() 170 * - under the kprobe_mutex - during kprobe_[un]register()
169 * OR 171 * OR
170 * - with preemption disabled - from arch/xxx/kernel/kprobes.c 172 * - with preemption disabled - from arch/xxx/kernel/kprobes.c
171 */ 173 */
@@ -246,6 +248,19 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
246 return ret; 248 return ret;
247} 249}
248 250
251/* Walks the list and increments nmissed count for multiprobe case */
252void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
253{
254 struct kprobe *kp;
255 if (p->pre_handler != aggr_pre_handler) {
256 p->nmissed++;
257 } else {
258 list_for_each_entry_rcu(kp, &p->list, list)
259 kp->nmissed++;
260 }
261 return;
262}
263
249/* Called with kretprobe_lock held */ 264/* Called with kretprobe_lock held */
250struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) 265struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
251{ 266{
@@ -399,16 +414,12 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
399 INIT_LIST_HEAD(&ap->list); 414 INIT_LIST_HEAD(&ap->list);
400 list_add_rcu(&p->list, &ap->list); 415 list_add_rcu(&p->list, &ap->list);
401 416
402 INIT_HLIST_NODE(&ap->hlist); 417 hlist_replace_rcu(&p->hlist, &ap->hlist);
403 hlist_del_rcu(&p->hlist);
404 hlist_add_head_rcu(&ap->hlist,
405 &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
406} 418}
407 419
408/* 420/*
409 * This is the second or subsequent kprobe at the address - handle 421 * This is the second or subsequent kprobe at the address - handle
410 * the intricacies 422 * the intricacies
411 * TODO: Move kcalloc outside the spin_lock
412 */ 423 */
413static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 424static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
414 struct kprobe *p) 425 struct kprobe *p)
@@ -420,7 +431,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
420 copy_kprobe(old_p, p); 431 copy_kprobe(old_p, p);
421 ret = add_new_kprobe(old_p, p); 432 ret = add_new_kprobe(old_p, p);
422 } else { 433 } else {
423 ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); 434 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
424 if (!ap) 435 if (!ap)
425 return -ENOMEM; 436 return -ENOMEM;
426 add_aggr_kprobe(ap, old_p); 437 add_aggr_kprobe(ap, old_p);
@@ -430,25 +441,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
430 return ret; 441 return ret;
431} 442}
432 443
433/* kprobe removal house-keeping routines */
434static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
435{
436 arch_disarm_kprobe(p);
437 hlist_del_rcu(&p->hlist);
438 spin_unlock_irqrestore(&kprobe_lock, flags);
439 arch_remove_kprobe(p);
440}
441
442static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
443 struct kprobe *p, unsigned long flags)
444{
445 list_del_rcu(&p->list);
446 if (list_empty(&old_p->list))
447 cleanup_kprobe(old_p, flags);
448 else
449 spin_unlock_irqrestore(&kprobe_lock, flags);
450}
451
452static int __kprobes in_kprobes_functions(unsigned long addr) 444static int __kprobes in_kprobes_functions(unsigned long addr)
453{ 445{
454 if (addr >= (unsigned long)__kprobes_text_start 446 if (addr >= (unsigned long)__kprobes_text_start
@@ -457,26 +449,44 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
457 return 0; 449 return 0;
458} 450}
459 451
460int __kprobes register_kprobe(struct kprobe *p) 452static int __kprobes __register_kprobe(struct kprobe *p,
453 unsigned long called_from)
461{ 454{
462 int ret = 0; 455 int ret = 0;
463 unsigned long flags = 0;
464 struct kprobe *old_p; 456 struct kprobe *old_p;
457 struct module *probed_mod;
465 458
466 if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) 459 if ((!kernel_text_address((unsigned long) p->addr)) ||
467 return ret; 460 in_kprobes_functions((unsigned long) p->addr))
468 if ((ret = arch_prepare_kprobe(p)) != 0) 461 return -EINVAL;
469 goto rm_kprobe; 462
463 p->mod_refcounted = 0;
464 /* Check are we probing a module */
465 if ((probed_mod = module_text_address((unsigned long) p->addr))) {
466 struct module *calling_mod = module_text_address(called_from);
467 /* We must allow modules to probe themself and
468 * in this case avoid incrementing the module refcount,
469 * so as to allow unloading of self probing modules.
470 */
471 if (calling_mod && (calling_mod != probed_mod)) {
472 if (unlikely(!try_module_get(probed_mod)))
473 return -EINVAL;
474 p->mod_refcounted = 1;
475 } else
476 probed_mod = NULL;
477 }
470 478
471 p->nmissed = 0; 479 p->nmissed = 0;
472 spin_lock_irqsave(&kprobe_lock, flags); 480 down(&kprobe_mutex);
473 old_p = get_kprobe(p->addr); 481 old_p = get_kprobe(p->addr);
474 if (old_p) { 482 if (old_p) {
475 ret = register_aggr_kprobe(old_p, p); 483 ret = register_aggr_kprobe(old_p, p);
476 goto out; 484 goto out;
477 } 485 }
478 486
479 arch_copy_kprobe(p); 487 if ((ret = arch_prepare_kprobe(p)) != 0)
488 goto out;
489
480 INIT_HLIST_NODE(&p->hlist); 490 INIT_HLIST_NODE(&p->hlist);
481 hlist_add_head_rcu(&p->hlist, 491 hlist_add_head_rcu(&p->hlist,
482 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 492 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
@@ -484,33 +494,66 @@ int __kprobes register_kprobe(struct kprobe *p)
484 arch_arm_kprobe(p); 494 arch_arm_kprobe(p);
485 495
486out: 496out:
487 spin_unlock_irqrestore(&kprobe_lock, flags); 497 up(&kprobe_mutex);
488rm_kprobe: 498
489 if (ret == -EEXIST) 499 if (ret && probed_mod)
490 arch_remove_kprobe(p); 500 module_put(probed_mod);
491 return ret; 501 return ret;
492} 502}
493 503
504int __kprobes register_kprobe(struct kprobe *p)
505{
506 return __register_kprobe(p,
507 (unsigned long)__builtin_return_address(0));
508}
509
494void __kprobes unregister_kprobe(struct kprobe *p) 510void __kprobes unregister_kprobe(struct kprobe *p)
495{ 511{
496 unsigned long flags; 512 struct module *mod;
497 struct kprobe *old_p; 513 struct kprobe *old_p, *list_p;
514 int cleanup_p;
498 515
499 spin_lock_irqsave(&kprobe_lock, flags); 516 down(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 517 old_p = get_kprobe(p->addr);
501 if (old_p) { 518 if (unlikely(!old_p)) {
502 /* cleanup_*_kprobe() does the spin_unlock_irqrestore */ 519 up(&kprobe_mutex);
503 if (old_p->pre_handler == aggr_pre_handler) 520 return;
504 cleanup_aggr_kprobe(old_p, p, flags); 521 }
505 else 522 if (p != old_p) {
506 cleanup_kprobe(p, flags); 523 list_for_each_entry_rcu(list_p, &old_p->list, list)
507 524 if (list_p == p)
508 synchronize_sched(); 525 /* kprobe p is a valid probe */
509 if (old_p->pre_handler == aggr_pre_handler && 526 goto valid_p;
510 list_empty(&old_p->list)) 527 up(&kprobe_mutex);
528 return;
529 }
530valid_p:
531 if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
532 (p->list.next == &old_p->list) &&
533 (p->list.prev == &old_p->list))) {
534 /* Only probe on the hash list */
535 arch_disarm_kprobe(p);
536 hlist_del_rcu(&old_p->hlist);
537 cleanup_p = 1;
538 } else {
539 list_del_rcu(&p->list);
540 cleanup_p = 0;
541 }
542
543 up(&kprobe_mutex);
544
545 synchronize_sched();
546 if (p->mod_refcounted &&
547 (mod = module_text_address((unsigned long)p->addr)))
548 module_put(mod);
549
550 if (cleanup_p) {
551 if (p != old_p) {
552 list_del_rcu(&p->list);
511 kfree(old_p); 553 kfree(old_p);
512 } else 554 }
513 spin_unlock_irqrestore(&kprobe_lock, flags); 555 arch_remove_kprobe(p);
556 }
514} 557}
515 558
516static struct notifier_block kprobe_exceptions_nb = { 559static struct notifier_block kprobe_exceptions_nb = {
@@ -524,7 +567,8 @@ int __kprobes register_jprobe(struct jprobe *jp)
524 jp->kp.pre_handler = setjmp_pre_handler; 567 jp->kp.pre_handler = setjmp_pre_handler;
525 jp->kp.break_handler = longjmp_break_handler; 568 jp->kp.break_handler = longjmp_break_handler;
526 569
527 return register_kprobe(&jp->kp); 570 return __register_kprobe(&jp->kp,
571 (unsigned long)__builtin_return_address(0));
528} 572}
529 573
530void __kprobes unregister_jprobe(struct jprobe *jp) 574void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -564,7 +608,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
564 608
565 rp->nmissed = 0; 609 rp->nmissed = 0;
566 /* Establish function entry probe point */ 610 /* Establish function entry probe point */
567 if ((ret = register_kprobe(&rp->kp)) != 0) 611 if ((ret = __register_kprobe(&rp->kp,
612 (unsigned long)__builtin_return_address(0))) != 0)
568 free_rp_inst(rp); 613 free_rp_inst(rp);
569 return ret; 614 return ret;
570} 615}
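register_kprobe() now funnels through __register_kprobe(), which records the caller so that a module probing another module pins it with try_module_get(), while a module probing itself is exempt. A hedged sketch of that self-probing case from the client side; all names and messages below are invented.

/* Sketch: a module placing a kprobe on one of its own functions, the
 * case the patch above explicitly allows without taking a module
 * reference. Illustrative only. */
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static noinline void demo_target(void)
{
	/* the probed function; noinline so its address is meaningful */
	printk(KERN_INFO "demo_target ran\n");
}

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;			/* let the probed instruction run */
}

static struct kprobe demo_kp;

static int __init demo_kprobe_init(void)
{
	int ret;

	demo_kp.addr = (kprobe_opcode_t *)demo_target;
	demo_kp.pre_handler = demo_pre;
	ret = register_kprobe(&demo_kp);
	if (ret)
		return ret;
	demo_target();			/* fires the probe once */
	return 0;
}

static void __exit demo_kprobe_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_kprobe_init);
module_exit(demo_kprobe_exit);
MODULE_LICENSE("GPL");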
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 015fb69ad94d..d5eeae0fa5bc 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,6 +15,9 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/init.h> 16#include <linux/init.h>
17 17
18u64 uevent_seqnum;
19char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
20
18#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 22static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
20 23
@@ -23,21 +26,29 @@ static struct subsys_attribute _name##_attr = \
23 __ATTR(_name, 0644, _name##_show, _name##_store) 26 __ATTR(_name, 0644, _name##_show, _name##_store)
24 27
25#ifdef CONFIG_HOTPLUG 28#ifdef CONFIG_HOTPLUG
26static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) 29/* current uevent sequence number */
30static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
27{ 31{
28 return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); 32 return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
29} 33}
30KERNEL_ATTR_RO(hotplug_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
31#endif
32
33#ifdef CONFIG_KEXEC
34#include <asm/kexec.h>
35 35
36static ssize_t crash_notes_show(struct subsystem *subsys, char *page) 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
37{ 38{
38 return sprintf(page, "%p\n", (void *)crash_notes); 39 return sprintf(page, "%s\n", uevent_helper);
39} 40}
40KERNEL_ATTR_RO(crash_notes); 41static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
42{
43 if (count+1 > UEVENT_HELPER_PATH_LEN)
44 return -ENOENT;
45 memcpy(uevent_helper, page, count);
46 uevent_helper[count] = '\0';
47 if (count && uevent_helper[count-1] == '\n')
48 uevent_helper[count-1] = '\0';
49 return count;
50}
51KERNEL_ATTR_RW(uevent_helper);
41#endif 52#endif
42 53
43decl_subsys(kernel, NULL, NULL); 54decl_subsys(kernel, NULL, NULL);
@@ -45,10 +56,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys);
45 56
46static struct attribute * kernel_attrs[] = { 57static struct attribute * kernel_attrs[] = {
47#ifdef CONFIG_HOTPLUG 58#ifdef CONFIG_HOTPLUG
48 &hotplug_seqnum_attr.attr, 59 &uevent_seqnum_attr.attr,
49#endif 60 &uevent_helper_attr.attr,
50#ifdef CONFIG_KEXEC
51 &crash_notes_attr.attr,
52#endif 61#endif
53 NULL 62 NULL
54}; 63};
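The kernel subsystem attributes switch from hotplug_seqnum/crash_notes to a read-only uevent_seqnum and a writable uevent_helper. A user-space sketch that reads both; the /sys/kernel/... paths are assumed from decl_subsys(kernel, ...) rather than spelled out in the hunk.

/* User-space sketch: read the two attributes added above. The
 * /sys/kernel/... paths are an assumption based on decl_subsys(kernel). */
#include <stdio.h>

static void dump_attr(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	dump_attr("/sys/kernel/uevent_seqnum");	/* KERNEL_ATTR_RO */
	dump_attr("/sys/kernel/uevent_helper");	/* KERNEL_ATTR_RW */
	return 0;
}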
diff --git a/kernel/module.c b/kernel/module.c
index 2ea929d51ad0..618ed6e23ecc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -28,6 +28,7 @@
28#include <linux/syscalls.h> 28#include <linux/syscalls.h>
29#include <linux/fcntl.h> 29#include <linux/fcntl.h>
30#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
31#include <linux/capability.h>
31#include <linux/cpu.h> 32#include <linux/cpu.h>
32#include <linux/moduleparam.h> 33#include <linux/moduleparam.h>
33#include <linux/errno.h> 34#include <linux/errno.h>
@@ -496,15 +497,15 @@ static void module_unload_free(struct module *mod)
496} 497}
497 498
498#ifdef CONFIG_MODULE_FORCE_UNLOAD 499#ifdef CONFIG_MODULE_FORCE_UNLOAD
499static inline int try_force(unsigned int flags) 500static inline int try_force_unload(unsigned int flags)
500{ 501{
501 int ret = (flags & O_TRUNC); 502 int ret = (flags & O_TRUNC);
502 if (ret) 503 if (ret)
503 add_taint(TAINT_FORCED_MODULE); 504 add_taint(TAINT_FORCED_RMMOD);
504 return ret; 505 return ret;
505} 506}
506#else 507#else
507static inline int try_force(unsigned int flags) 508static inline int try_force_unload(unsigned int flags)
508{ 509{
509 return 0; 510 return 0;
510} 511}
@@ -524,7 +525,7 @@ static int __try_stop_module(void *_sref)
524 525
525 /* If it's not unused, quit unless we are told to block. */ 526 /* If it's not unused, quit unless we are told to block. */
526 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { 527 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
527 if (!(*sref->forced = try_force(sref->flags))) 528 if (!(*sref->forced = try_force_unload(sref->flags)))
528 return -EWOULDBLOCK; 529 return -EWOULDBLOCK;
529 } 530 }
530 531
@@ -609,7 +610,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
609 /* If it has an init func, it must have an exit func to unload */ 610 /* If it has an init func, it must have an exit func to unload */
610 if ((mod->init != NULL && mod->exit == NULL) 611 if ((mod->init != NULL && mod->exit == NULL)
611 || mod->unsafe) { 612 || mod->unsafe) {
612 forced = try_force(flags); 613 forced = try_force_unload(flags);
613 if (!forced) { 614 if (!forced) {
614 /* This module can't be removed */ 615 /* This module can't be removed */
615 ret = -EBUSY; 616 ret = -EBUSY;
@@ -958,7 +959,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
958 unsigned long ret; 959 unsigned long ret;
959 const unsigned long *crc; 960 const unsigned long *crc;
960 961
961 spin_lock_irq(&modlist_lock);
962 ret = __find_symbol(name, &owner, &crc, mod->license_gplok); 962 ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
963 if (ret) { 963 if (ret) {
964 /* use_module can fail due to OOM, or module unloading */ 964 /* use_module can fail due to OOM, or module unloading */
@@ -966,7 +966,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
966 !use_module(mod, owner)) 966 !use_module(mod, owner))
967 ret = 0; 967 ret = 0;
968 } 968 }
969 spin_unlock_irq(&modlist_lock);
970 return ret; 969 return ret;
971} 970}
972 971
@@ -1204,6 +1203,39 @@ void *__symbol_get(const char *symbol)
1204} 1203}
1205EXPORT_SYMBOL_GPL(__symbol_get); 1204EXPORT_SYMBOL_GPL(__symbol_get);
1206 1205
1206/*
1207 * Ensure that an exported symbol [global namespace] does not already exist
1208 * in the Kernel or in some other modules exported symbol table.
1209 */
1210static int verify_export_symbols(struct module *mod)
1211{
1212 const char *name = NULL;
1213 unsigned long i, ret = 0;
1214 struct module *owner;
1215 const unsigned long *crc;
1216
1217 for (i = 0; i < mod->num_syms; i++)
1218 if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) {
1219 name = mod->syms[i].name;
1220 ret = -ENOEXEC;
1221 goto dup;
1222 }
1223
1224 for (i = 0; i < mod->num_gpl_syms; i++)
1225 if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) {
1226 name = mod->gpl_syms[i].name;
1227 ret = -ENOEXEC;
1228 goto dup;
1229 }
1230
1231dup:
1232 if (ret)
1233 printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n",
1234 mod->name, name, module_name(owner));
1235
1236 return ret;
1237}
1238
1207/* Change all symbols so that sh_value encodes the pointer directly. */ 1239/* Change all symbols so that sh_value encodes the pointer directly. */
1208static int simplify_symbols(Elf_Shdr *sechdrs, 1240static int simplify_symbols(Elf_Shdr *sechdrs,
1209 unsigned int symindex, 1241 unsigned int symindex,
@@ -1715,6 +1747,11 @@ static struct module *load_module(void __user *umod,
1715 /* Set up license info based on the info section */ 1747 /* Set up license info based on the info section */
1716 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1748 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1717 1749
1750 if (strcmp(mod->name, "ndiswrapper") == 0)
1751 add_taint(TAINT_PROPRIETARY_MODULE);
1752 if (strcmp(mod->name, "driverloader") == 0)
1753 add_taint(TAINT_PROPRIETARY_MODULE);
1754
1718#ifdef CONFIG_MODULE_UNLOAD 1755#ifdef CONFIG_MODULE_UNLOAD
1719 /* Set up MODINFO_ATTR fields */ 1756 /* Set up MODINFO_ATTR fields */
1720 setup_modinfo(mod, sechdrs, infoindex); 1757 setup_modinfo(mod, sechdrs, infoindex);
@@ -1767,6 +1804,12 @@ static struct module *load_module(void __user *umod,
1767 goto cleanup; 1804 goto cleanup;
1768 } 1805 }
1769 1806
1807 /* Find duplicate symbols */
1808 err = verify_export_symbols(mod);
1809
1810 if (err < 0)
1811 goto cleanup;
1812
1770 /* Set up and sort exception table */ 1813 /* Set up and sort exception table */
1771 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); 1814 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
1772 mod->extable = extable = (void *)sechdrs[exindex].sh_addr; 1815 mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
@@ -1854,8 +1897,7 @@ static struct module *load_module(void __user *umod,
1854 kfree(args); 1897 kfree(args);
1855 free_hdr: 1898 free_hdr:
1856 vfree(hdr); 1899 vfree(hdr);
1857 if (err < 0) return ERR_PTR(err); 1900 return ERR_PTR(err);
1858 else return ptr;
1859 1901
1860 truncated: 1902 truncated:
1861 printk(KERN_ERR "Module len %lu truncated\n", len); 1903 printk(KERN_ERR "Module len %lu truncated\n", len);
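verify_export_symbols() makes load_module() fail with -ENOEXEC when a new module exports a symbol name already present in the kernel or in another loaded module. A hedged sketch of the case it now rejects: build the source below twice under two different module names and the second insmod is refused with the "exports duplicate symbol" message.

/* Sketch of what verify_export_symbols() rejects. If one copy of this
 * module is loaded, a second module exporting the same symbol name is
 * refused with -ENOEXEC. All names are invented for illustration. */
#include <linux/module.h>

int demo_shared_counter;
EXPORT_SYMBOL(demo_shared_counter);

static int __init demo_dup_init(void)
{
	return 0;
}

static void __exit demo_dup_exit(void)
{
}

module_init(demo_dup_init);
module_exit(demo_dup_exit);
MODULE_LICENSE("GPL");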
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
new file mode 100644
index 000000000000..f4913c376950
--- /dev/null
+++ b/kernel/mutex-debug.c
@@ -0,0 +1,462 @@
1/*
2 * kernel/mutex-debug.c
3 *
4 * Debugging code for mutexes
5 *
6 * Started by Ingo Molnar:
7 *
8 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 *
10 * lock debugging, locking tree, deadlock detection started by:
11 *
12 * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
13 * Released under the General Public License (GPL).
14 */
15#include <linux/mutex.h>
16#include <linux/sched.h>
17#include <linux/delay.h>
18#include <linux/module.h>
19#include <linux/spinlock.h>
20#include <linux/kallsyms.h>
21#include <linux/interrupt.h>
22
23#include "mutex-debug.h"
24
25/*
26 * We need a global lock when we walk through the multi-process
27 * lock tree. Only used in the deadlock-debugging case.
28 */
29DEFINE_SPINLOCK(debug_mutex_lock);
30
31/*
32 * All locks held by all tasks, in a single global list:
33 */
34LIST_HEAD(debug_mutex_held_locks);
35
36/*
37 * In the debug case we carry the caller's instruction pointer into
38 * other functions, but we dont want the function argument overhead
39 * in the nondebug case - hence these macros:
40 */
41#define __IP_DECL__ , unsigned long ip
42#define __IP__ , ip
43#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
44
45/*
46 * "mutex debugging enabled" flag. We turn it off when we detect
47 * the first problem because we dont want to recurse back
48 * into the tracing code when doing error printk or
49 * executing a BUG():
50 */
51int debug_mutex_on = 1;
52
53static void printk_task(struct task_struct *p)
54{
55 if (p)
56 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
57 else
58 printk("<none>");
59}
60
61static void printk_ti(struct thread_info *ti)
62{
63 if (ti)
64 printk_task(ti->task);
65 else
66 printk("<none>");
67}
68
69static void printk_task_short(struct task_struct *p)
70{
71 if (p)
72 printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
73 else
74 printk("<none>");
75}
76
77static void printk_lock(struct mutex *lock, int print_owner)
78{
79 printk(" [%p] {%s}\n", lock, lock->name);
80
81 if (print_owner && lock->owner) {
82 printk(".. held by: ");
83 printk_ti(lock->owner);
84 printk("\n");
85 }
86 if (lock->owner) {
87 printk("... acquired at: ");
88 print_symbol("%s\n", lock->acquire_ip);
89 }
90}
91
92/*
93 * printk locks held by a task:
94 */
95static void show_task_locks(struct task_struct *p)
96{
97 switch (p->state) {
98 case TASK_RUNNING: printk("R"); break;
99 case TASK_INTERRUPTIBLE: printk("S"); break;
100 case TASK_UNINTERRUPTIBLE: printk("D"); break;
101 case TASK_STOPPED: printk("T"); break;
102 case EXIT_ZOMBIE: printk("Z"); break;
103 case EXIT_DEAD: printk("X"); break;
104 default: printk("?"); break;
105 }
106 printk_task(p);
107 if (p->blocked_on) {
108 struct mutex *lock = p->blocked_on->lock;
109
110 printk(" blocked on mutex:");
111 printk_lock(lock, 1);
112 } else
113 printk(" (not blocked on mutex)\n");
114}
115
116/*
117 * printk all locks held in the system (if filter == NULL),
118 * or all locks belonging to a single task (if filter != NULL):
119 */
120void show_held_locks(struct task_struct *filter)
121{
122 struct list_head *curr, *cursor = NULL;
123 struct mutex *lock;
124 struct thread_info *t;
125 unsigned long flags;
126 int count = 0;
127
128 if (filter) {
129 printk("------------------------------\n");
130 printk("| showing all locks held by: | (");
131 printk_task_short(filter);
132 printk("):\n");
133 printk("------------------------------\n");
134 } else {
135 printk("---------------------------\n");
136 printk("| showing all locks held: |\n");
137 printk("---------------------------\n");
138 }
139
140 /*
141 * Play safe and acquire the global trace lock. We
142 * cannot printk with that lock held so we iterate
143 * very carefully:
144 */
145next:
146 debug_spin_lock_save(&debug_mutex_lock, flags);
147 list_for_each(curr, &debug_mutex_held_locks) {
148 if (cursor && curr != cursor)
149 continue;
150 lock = list_entry(curr, struct mutex, held_list);
151 t = lock->owner;
152 if (filter && (t != filter->thread_info))
153 continue;
154 count++;
155 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags);
157
158 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1);
160 goto next;
161 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags);
163 printk("\n");
164}
165
166void mutex_debug_show_all_locks(void)
167{
168 struct task_struct *g, *p;
169 int count = 10;
170 int unlock = 1;
171
172 printk("\nShowing all blocking locks in the system:\n");
173
174 /*
175 * Here we try to get the tasklist_lock as hard as possible,
176 * if not successful after 2 seconds we ignore it (but keep
177 * trying). This is to enable a debug printout even if a
178 * tasklist_lock-holding task deadlocks or crashes.
179 */
180retry:
181 if (!read_trylock(&tasklist_lock)) {
182 if (count == 10)
183 printk("hm, tasklist_lock locked, retrying... ");
184 if (count) {
185 count--;
186 printk(" #%d", 10-count);
187 mdelay(200);
188 goto retry;
189 }
190 printk(" ignoring it.\n");
191 unlock = 0;
192 }
193 if (count != 10)
194 printk(" locked it.\n");
195
196 do_each_thread(g, p) {
197 show_task_locks(p);
198 if (!unlock)
199 if (read_trylock(&tasklist_lock))
200 unlock = 1;
201 } while_each_thread(g, p);
202
203 printk("\n");
204 show_held_locks(NULL);
205 printk("=============================================\n\n");
206
207 if (unlock)
208 read_unlock(&tasklist_lock);
209}
210
211static void report_deadlock(struct task_struct *task, struct mutex *lock,
212 struct mutex *lockblk, unsigned long ip)
213{
214 printk("\n%s/%d is trying to acquire this lock:\n",
215 current->comm, current->pid);
216 printk_lock(lock, 1);
217 printk("... trying at: ");
218 print_symbol("%s\n", ip);
219 show_held_locks(current);
220
221 if (lockblk) {
222 printk("but %s/%d is deadlocking current task %s/%d!\n\n",
223 task->comm, task->pid, current->comm, current->pid);
224 printk("\n%s/%d is blocked on this lock:\n",
225 task->comm, task->pid);
226 printk_lock(lockblk, 1);
227
228 show_held_locks(task);
229
230 printk("\n%s/%d's [blocked] stackdump:\n\n",
231 task->comm, task->pid);
232 show_stack(task, NULL);
233 }
234
235 printk("\n%s/%d's [current] stackdump:\n\n",
236 current->comm, current->pid);
237 dump_stack();
238 mutex_debug_show_all_locks();
239 printk("[ turning off deadlock detection. Please report this. ]\n\n");
240 local_irq_disable();
241}
242
243/*
244 * Recursively check for mutex deadlocks:
245 */
246static int check_deadlock(struct mutex *lock, int depth,
247 struct thread_info *ti, unsigned long ip)
248{
249 struct mutex *lockblk;
250 struct task_struct *task;
251
252 if (!debug_mutex_on)
253 return 0;
254
255 ti = lock->owner;
256 if (!ti)
257 return 0;
258
259 task = ti->task;
260 lockblk = NULL;
261 if (task->blocked_on)
262 lockblk = task->blocked_on->lock;
263
264 /* Self-deadlock: */
265 if (current == task) {
266 DEBUG_OFF();
267 if (depth)
268 return 1;
269 printk("\n==========================================\n");
270 printk( "[ BUG: lock recursion deadlock detected! |\n");
271 printk( "------------------------------------------\n");
272 report_deadlock(task, lock, NULL, ip);
273 return 0;
274 }
275
276 /* Ugh, something corrupted the lock data structure? */
277 if (depth > 20) {
278 DEBUG_OFF();
279 printk("\n===========================================\n");
280 printk( "[ BUG: infinite lock dependency detected!? |\n");
281 printk( "-------------------------------------------\n");
282 report_deadlock(task, lock, lockblk, ip);
283 return 0;
284 }
285
286 /* Recursively check for dependencies: */
287 if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) {
288 printk("\n============================================\n");
289 printk( "[ BUG: circular locking deadlock detected! ]\n");
290 printk( "--------------------------------------------\n");
291 report_deadlock(task, lock, lockblk, ip);
292 return 0;
293 }
294 return 0;
295}
296
297/*
298 * Called when a task exits, this function checks whether the
299 * task is holding any locks, and reports the first one if so:
300 */
301void mutex_debug_check_no_locks_held(struct task_struct *task)
302{
303 struct list_head *curr, *next;
304 struct thread_info *t;
305 unsigned long flags;
306 struct mutex *lock;
307
308 if (!debug_mutex_on)
309 return;
310
311 debug_spin_lock_save(&debug_mutex_lock, flags);
312 list_for_each_safe(curr, next, &debug_mutex_held_locks) {
313 lock = list_entry(curr, struct mutex, held_list);
314 t = lock->owner;
315 if (t != task->thread_info)
316 continue;
317 list_del_init(curr);
318 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags);
320
321 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid);
323 printk_lock(lock, 1);
324 if (lock->owner != task->thread_info)
325 printk("exiting task is not even the owner??\n");
326 return;
327 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags);
329}
330
331/*
332 * Called when kernel memory is freed (or unmapped), or if a mutex
333 * is destroyed or reinitialized - this code checks whether there is
334 * any held lock in the memory range of <from> to <to>:
335 */
336void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
337{
338 struct list_head *curr, *next;
339 const void *to = from + len;
340 unsigned long flags;
341 struct mutex *lock;
342 void *lock_addr;
343
344 if (!debug_mutex_on)
345 return;
346
347 debug_spin_lock_save(&debug_mutex_lock, flags);
348 list_for_each_safe(curr, next, &debug_mutex_held_locks) {
349 lock = list_entry(curr, struct mutex, held_list);
350 lock_addr = lock;
351 if (lock_addr < from || lock_addr >= to)
352 continue;
353 list_del_init(curr);
354 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags);
356
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to);
359 dump_stack();
360 printk_lock(lock, 1);
361 if (lock->owner != current_thread_info())
362 printk("freeing task is not even the owner??\n");
363 return;
364 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags);
366}
367
368/*
369 * Must be called with lock->wait_lock held.
370 */
371void debug_mutex_set_owner(struct mutex *lock,
372 struct thread_info *new_owner __IP_DECL__)
373{
374 lock->owner = new_owner;
375 DEBUG_WARN_ON(!list_empty(&lock->held_list));
376 if (debug_mutex_on) {
377 list_add_tail(&lock->held_list, &debug_mutex_held_locks);
378 lock->acquire_ip = ip;
379 }
380}
381
382void debug_mutex_init_waiter(struct mutex_waiter *waiter)
383{
384 memset(waiter, 0x11, sizeof(*waiter));
385 waiter->magic = waiter;
386 INIT_LIST_HEAD(&waiter->list);
387}
388
389void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
390{
391 SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
392 DEBUG_WARN_ON(list_empty(&lock->wait_list));
393 DEBUG_WARN_ON(waiter->magic != waiter);
394 DEBUG_WARN_ON(list_empty(&waiter->list));
395}
396
397void debug_mutex_free_waiter(struct mutex_waiter *waiter)
398{
399 DEBUG_WARN_ON(!list_empty(&waiter->list));
400 memset(waiter, 0x22, sizeof(*waiter));
401}
402
403void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
404 struct thread_info *ti __IP_DECL__)
405{
406 SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
407 check_deadlock(lock, 0, ti, ip);
408 /* Mark the current thread as blocked on the lock: */
409 ti->task->blocked_on = waiter;
410 waiter->lock = lock;
411}
412
413void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
414 struct thread_info *ti)
415{
416 DEBUG_WARN_ON(list_empty(&waiter->list));
417 DEBUG_WARN_ON(waiter->task != ti->task);
418 DEBUG_WARN_ON(ti->task->blocked_on != waiter);
419 ti->task->blocked_on = NULL;
420
421 list_del_init(&waiter->list);
422 waiter->task = NULL;
423}
424
425void debug_mutex_unlock(struct mutex *lock)
426{
427 DEBUG_WARN_ON(lock->magic != lock);
428 DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
429 DEBUG_WARN_ON(lock->owner != current_thread_info());
430 if (debug_mutex_on) {
431 DEBUG_WARN_ON(list_empty(&lock->held_list));
432 list_del_init(&lock->held_list);
433 }
434}
435
436void debug_mutex_init(struct mutex *lock, const char *name)
437{
438 /*
439 * Make sure we are not reinitializing a held lock:
440 */
441 mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock));
442 lock->owner = NULL;
443 INIT_LIST_HEAD(&lock->held_list);
444 lock->name = name;
445 lock->magic = lock;
446}
447
448/***
449 * mutex_destroy - mark a mutex unusable
450 * @lock: the mutex to be destroyed
451 *
452 * This function marks the mutex uninitialized, and any subsequent
453 * use of the mutex is forbidden. The mutex must not be locked when
454 * this function is called.
455 */
456void fastcall mutex_destroy(struct mutex *lock)
457{
458 DEBUG_WARN_ON(mutex_is_locked(lock));
459 lock->magic = NULL;
460}
461
462EXPORT_SYMBOL_GPL(mutex_destroy);
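The debug variant tracks every held mutex on debug_mutex_held_locks and warns when a mutex is unlocked by a non-owner, still held at task exit, or destroyed/freed while locked. A deliberately buggy sketch of that last case, purely illustrative: mutex_destroy() on a held lock trips the DEBUG_WARN_ON(mutex_is_locked()) check shown above.

/* Intentionally incorrect sketch of a misuse CONFIG_DEBUG_MUTEXES
 * flags: destroying a mutex that is still locked. Do not load this
 * anywhere that matters; names are invented. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>

static struct mutex demo_lock;

static int __init demo_bad_init(void)
{
	mutex_init(&demo_lock);
	mutex_lock(&demo_lock);
	mutex_destroy(&demo_lock);	/* DEBUG_WARN_ON(mutex_is_locked()) fires */
	mutex_unlock(&demo_lock);	/* also warned: magic was cleared above */
	return 0;
}

static void __exit demo_bad_exit(void)
{
}

module_init(demo_bad_init);
module_exit(demo_bad_exit);
MODULE_LICENSE("GPL");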
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
new file mode 100644
index 000000000000..fd384050acb1
--- /dev/null
+++ b/kernel/mutex-debug.h
@@ -0,0 +1,134 @@
1/*
2 * Mutexes: blocking mutual exclusion locks
3 *
4 * started by Ingo Molnar:
5 *
6 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 *
8 * This file contains mutex debugging related internal declarations,
9 * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
10 * More details are in kernel/mutex-debug.c.
11 */
12
13extern spinlock_t debug_mutex_lock;
14extern struct list_head debug_mutex_held_locks;
15extern int debug_mutex_on;
16
17/*
18 * In the debug case we carry the caller's instruction pointer into
19 * other functions, but we dont want the function argument overhead
20 * in the nondebug case - hence these macros:
21 */
22#define __IP_DECL__ , unsigned long ip
23#define __IP__ , ip
24#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
25
26/*
27 * This must be called with lock->wait_lock held.
28 */
29extern void debug_mutex_set_owner(struct mutex *lock,
30 struct thread_info *new_owner __IP_DECL__);
31
32static inline void debug_mutex_clear_owner(struct mutex *lock)
33{
34 lock->owner = NULL;
35}
36
37extern void debug_mutex_init_waiter(struct mutex_waiter *waiter);
38extern void debug_mutex_wake_waiter(struct mutex *lock,
39 struct mutex_waiter *waiter);
40extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
41extern void debug_mutex_add_waiter(struct mutex *lock,
42 struct mutex_waiter *waiter,
43 struct thread_info *ti __IP_DECL__);
44extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
45 struct thread_info *ti);
46extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name);
48
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55
56#define debug_spin_unlock(lock) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \
65 do { \
66 local_irq_save(flags); \
67 if (debug_mutex_on) \
68 spin_lock(lock); \
69 } while (0)
70
71#define debug_spin_lock_restore(lock, flags) \
72 do { \
73 if (debug_mutex_on) \
74 spin_unlock(lock); \
75 local_irq_restore(flags); \
76 preempt_check_resched(); \
77 } while (0)
78
79#define spin_lock_mutex(lock) \
80 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \
83 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \
85 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \
87 } while (0)
88
89#define spin_unlock_mutex(lock) \
90 do { \
91 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \
93 } while (0)
94
95#define DEBUG_OFF() \
96do { \
97 if (debug_mutex_on) { \
98 debug_mutex_on = 0; \
99 console_verbose(); \
100 if (spin_is_locked(&debug_mutex_lock)) \
101 spin_unlock(&debug_mutex_lock); \
102 } \
103} while (0)
104
105#define DEBUG_BUG() \
106do { \
107 if (debug_mutex_on) { \
108 DEBUG_OFF(); \
109 BUG(); \
110 } \
111} while (0)
112
113#define DEBUG_WARN_ON(c) \
114do { \
115 if (unlikely(c && debug_mutex_on)) { \
116 DEBUG_OFF(); \
117 WARN_ON(1); \
118 } \
119} while (0)
120
121# define DEBUG_BUG_ON(c) \
122do { \
123 if (unlikely(c)) \
124 DEBUG_BUG(); \
125} while (0)
126
127#ifdef CONFIG_SMP
128# define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c)
129# define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c)
130#else
131# define SMP_DEBUG_WARN_ON(c) do { } while (0)
132# define SMP_DEBUG_BUG_ON(c) do { } while (0)
133#endif
134
diff --git a/kernel/mutex.c b/kernel/mutex.c
new file mode 100644
index 000000000000..5449b210d9ed
--- /dev/null
+++ b/kernel/mutex.c
@@ -0,0 +1,315 @@
1/*
2 * kernel/mutex.c
3 *
4 * Mutexes: blocking mutual exclusion locks
5 *
6 * Started by Ingo Molnar:
7 *
8 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 *
10 * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
11 * David Howells for suggestions and improvements.
12 *
13 * Also see Documentation/mutex-design.txt.
14 */
15#include <linux/mutex.h>
16#include <linux/sched.h>
17#include <linux/module.h>
18#include <linux/spinlock.h>
19#include <linux/interrupt.h>
20
21/*
22 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
23 * which forces all calls into the slowpath:
24 */
25#ifdef CONFIG_DEBUG_MUTEXES
26# include "mutex-debug.h"
27# include <asm-generic/mutex-null.h>
28#else
29# include "mutex.h"
30# include <asm/mutex.h>
31#endif
32
33/***
34 * mutex_init - initialize the mutex
35 * @lock: the mutex to be initialized
36 *
37 * Initialize the mutex to unlocked state.
38 *
39 * It is not allowed to initialize an already locked mutex.
40 */
41void fastcall __mutex_init(struct mutex *lock, const char *name)
42{
43 atomic_set(&lock->count, 1);
44 spin_lock_init(&lock->wait_lock);
45 INIT_LIST_HEAD(&lock->wait_list);
46
47 debug_mutex_init(lock, name);
48}
49
50EXPORT_SYMBOL(__mutex_init);
51
52/*
53 * We split the mutex lock/unlock logic into separate fastpath and
54 * slowpath functions, to reduce the register pressure on the fastpath.
55 * We also put the fastpath first in the kernel image, to make sure the
56 * branch is predicted by the CPU as default-untaken.
57 */
58static void fastcall noinline __sched
59__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
60
61/***
62 * mutex_lock - acquire the mutex
63 * @lock: the mutex to be acquired
64 *
65 * Lock the mutex exclusively for this task. If the mutex is not
66 * available right now, it will sleep until it can get it.
67 *
68 * The mutex must later on be released by the same task that
69 * acquired it. Recursive locking is not allowed. The task
70 * may not exit without first unlocking the mutex. Also, kernel
 71 * memory where the mutex resides must not be freed with
72 * the mutex still locked. The mutex must first be initialized
73 * (or statically defined) before it can be locked. memset()-ing
74 * the mutex to 0 is not allowed.
75 *
76 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
77 * checks that will enforce the restrictions and will also do
78 * deadlock debugging. )
79 *
80 * This function is similar to (but not equivalent to) down().
81 */
82void fastcall __sched mutex_lock(struct mutex *lock)
83{
84 might_sleep();
85 /*
86 * The locking fastpath is the 1->0 transition from
87 * 'unlocked' into 'locked' state.
88 */
89 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
90}
91
92EXPORT_SYMBOL(mutex_lock);
93
94static void fastcall noinline __sched
95__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__);
96
97/***
98 * mutex_unlock - release the mutex
99 * @lock: the mutex to be released
100 *
101 * Unlock a mutex that has been locked by this task previously.
102 *
103 * This function must not be used in interrupt context. Unlocking
104 * of a not locked mutex is not allowed.
105 *
106 * This function is similar to (but not equivalent to) up().
107 */
108void fastcall __sched mutex_unlock(struct mutex *lock)
109{
110 /*
111 * The unlocking fastpath is the 0->1 transition from 'locked'
112 * into 'unlocked' state:
113 */
114 __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
115}
116
117EXPORT_SYMBOL(mutex_unlock);
118
119/*
120 * Lock a mutex (possibly interruptible), slowpath:
121 */
122static inline int __sched
123__mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
124{
125 struct task_struct *task = current;
126 struct mutex_waiter waiter;
127 unsigned int old_val;
128
129 debug_mutex_init_waiter(&waiter);
130
131 spin_lock_mutex(&lock->wait_lock);
132
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
134
135 /* add waiting tasks to the end of the waitqueue (FIFO): */
136 list_add_tail(&waiter.list, &lock->wait_list);
137 waiter.task = task;
138
139 for (;;) {
140 /*
141 * Lets try to take the lock again - this is needed even if
142 * we get here for the first time (shortly after failing to
143 * acquire the lock), to make sure that we get a wakeup once
144 * it's unlocked. Later on, if we sleep, this is the
145 * operation that gives us the lock. We xchg it to -1, so
146 * that when we release the lock, we properly wake up the
147 * other waiters:
148 */
149 old_val = atomic_xchg(&lock->count, -1);
150 if (old_val == 1)
151 break;
152
153 /*
154 * got a signal? (This code gets eliminated in the
155 * TASK_UNINTERRUPTIBLE case.)
156 */
157 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock);
161
162 debug_mutex_free_waiter(&waiter);
163 return -EINTR;
164 }
165 __set_task_state(task, state);
166
167 /* didnt get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock);
169 schedule();
170 spin_lock_mutex(&lock->wait_lock);
171 }
172
173 /* got the lock - rejoice! */
174 mutex_remove_waiter(lock, &waiter, task->thread_info);
175 debug_mutex_set_owner(lock, task->thread_info __IP__);
176
177 /* set it to 0 if there are no waiters left: */
178 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0);
180
181 spin_unlock_mutex(&lock->wait_lock);
182
183 debug_mutex_free_waiter(&waiter);
184
185 DEBUG_WARN_ON(list_empty(&lock->held_list));
186 DEBUG_WARN_ON(lock->owner != task->thread_info);
187
188 return 0;
189}
190
191static void fastcall noinline __sched
192__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__)
193{
194 struct mutex *lock = container_of(lock_count, struct mutex, count);
195
196 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__);
197}
198
199/*
200 * Release the lock, slowpath:
201 */
202static fastcall noinline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
204{
205 struct mutex *lock = container_of(lock_count, struct mutex, count);
206
207 DEBUG_WARN_ON(lock->owner != current_thread_info());
208
209 spin_lock_mutex(&lock->wait_lock);
210
211 /*
212 * some architectures leave the lock unlocked in the fastpath failure
213 * case, others need to leave it locked. In the later case we have to
214 * unlock it here
215 */
216 if (__mutex_slowpath_needs_to_unlock())
217 atomic_set(&lock->count, 1);
218
219 debug_mutex_unlock(lock);
220
221 if (!list_empty(&lock->wait_list)) {
222 /* get the first entry from the wait-list: */
223 struct mutex_waiter *waiter =
224 list_entry(lock->wait_list.next,
225 struct mutex_waiter, list);
226
227 debug_mutex_wake_waiter(lock, waiter);
228
229 wake_up_process(waiter->task);
230 }
231
232 debug_mutex_clear_owner(lock);
233
234 spin_unlock_mutex(&lock->wait_lock);
235}
236
237/*
238 * Here come the less common (and hence less performance-critical) APIs:
239 * mutex_lock_interruptible() and mutex_trylock().
240 */
241static int fastcall noinline __sched
242__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__);
243
244/***
 245 * mutex_lock_interruptible - acquire the mutex, interruptible
246 * @lock: the mutex to be acquired
247 *
248 * Lock the mutex like mutex_lock(), and return 0 if the mutex has
249 * been acquired or sleep until the mutex becomes available. If a
250 * signal arrives while waiting for the lock then this function
251 * returns -EINTR.
252 *
253 * This function is similar to (but not equivalent to) down_interruptible().
254 */
255int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
256{
257 might_sleep();
258 return __mutex_fastpath_lock_retval
259 (&lock->count, __mutex_lock_interruptible_slowpath);
260}
261
262EXPORT_SYMBOL(mutex_lock_interruptible);
263
264static int fastcall noinline __sched
265__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
266{
267 struct mutex *lock = container_of(lock_count, struct mutex, count);
268
269 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__);
270}
271
272/*
273 * Spinlock based trylock, we take the spinlock and check whether we
274 * can get the lock:
275 */
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{
278 struct mutex *lock = container_of(lock_count, struct mutex, count);
279 int prev;
280
281 spin_lock_mutex(&lock->wait_lock);
282
283 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1))
285 debug_mutex_set_owner(lock, current_thread_info() __RET_IP__);
286 /* Set it back to 0 if there are no waiters: */
287 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0);
289
290 spin_unlock_mutex(&lock->wait_lock);
291
292 return prev == 1;
293}
294
295/***
296 * mutex_trylock - try acquire the mutex, without waiting
297 * @lock: the mutex to be acquired
298 *
299 * Try to acquire the mutex atomically. Returns 1 if the mutex
300 * has been acquired successfully, and 0 on contention.
301 *
302 * NOTE: this function follows the spin_trylock() convention, so
303 * it is negated to the down_trylock() return values! Be careful
304 * about this when converting semaphore users to mutexes.
305 *
306 * This function must not be used in interrupt context. The
307 * mutex must be released by the same task that acquired it.
308 */
309int fastcall mutex_trylock(struct mutex *lock)
310{
311 return __mutex_fastpath_trylock(&lock->count,
312 __mutex_trylock_slowpath);
313}
314
315EXPORT_SYMBOL(mutex_trylock);
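The new API mirrors semaphores used as mutexes: mutex_lock()/mutex_unlock() for the common case, mutex_lock_interruptible() when a pending signal should abort the wait, and mutex_trylock() following the spin_trylock() convention (1 on success, 0 on contention). A hedged sketch of typical driver-style usage; struct demo_dev and the function names are invented.

/* Sketch of typical use of the API above. All names (demo_dev,
 * demo_write, ...) are illustrative only. */
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/string.h>

struct demo_dev {
	struct mutex lock;		/* protects buf */
	char buf[64];
};

static void demo_dev_setup(struct demo_dev *dev)
{
	mutex_init(&dev->lock);		/* never re-initialize a held mutex */
}

static int demo_write(struct demo_dev *dev, const char *src, size_t len)
{
	int ret;

	if (len > sizeof(dev->buf))
		return -EINVAL;

	/* sleep until the mutex is ours; -EINTR if a signal arrives */
	ret = mutex_lock_interruptible(&dev->lock);
	if (ret)
		return ret;
	memcpy(dev->buf, src, len);
	mutex_unlock(&dev->lock);
	return 0;
}

static int demo_poke(struct demo_dev *dev)
{
	/* trylock: 1 on success, 0 on contention (opposite of down_trylock) */
	if (!mutex_trylock(&dev->lock))
		return -EBUSY;
	dev->buf[0] = 0;
	mutex_unlock(&dev->lock);
	return 0;
}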
diff --git a/kernel/mutex.h b/kernel/mutex.h
new file mode 100644
index 000000000000..00fe84e7b672
--- /dev/null
+++ b/kernel/mutex.h
@@ -0,0 +1,35 @@
1/*
2 * Mutexes: blocking mutual exclusion locks
3 *
4 * started by Ingo Molnar:
5 *
6 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 *
8 * This file contains mutex debugging related internal prototypes, for the
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */
11
12#define spin_lock_mutex(lock) spin_lock(lock)
13#define spin_unlock_mutex(lock) spin_unlock(lock)
14#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next)
16
17#define DEBUG_WARN_ON(c) do { } while (0)
18#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
19#define debug_mutex_clear_owner(lock) do { } while (0)
20#define debug_mutex_init_waiter(waiter) do { } while (0)
21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
22#define debug_mutex_free_waiter(waiter) do { } while (0)
23#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0)
24#define debug_mutex_unlock(lock) do { } while (0)
25#define debug_mutex_init(lock, name) do { } while (0)
26
27/*
28 * Return-address parameters/declarations. They are very useful for
29 * debugging, but add overhead in the !DEBUG case - so we go the
30 * trouble of using this not too elegant but zero-cost solution:
31 */
32#define __IP_DECL__
33#define __IP__
34#define __RET_IP__
35
diff --git a/kernel/panic.c b/kernel/panic.c
index aabc5f86fa3f..c5c4ab255834 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...)
60 long i; 60 long i;
61 static char buf[1024]; 61 static char buf[1024];
62 va_list args; 62 va_list args;
63#if defined(CONFIG_ARCH_S390) 63#if defined(CONFIG_S390)
64 unsigned long caller = (unsigned long) __builtin_return_address(0); 64 unsigned long caller = (unsigned long) __builtin_return_address(0);
65#endif 65#endif
66 66
@@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...)
125 printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); 125 printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
126 } 126 }
127#endif 127#endif
128#if defined(CONFIG_ARCH_S390) 128#if defined(CONFIG_S390)
129 disabled_wait(caller); 129 disabled_wait(caller);
130#endif 130#endif
131 local_irq_enable(); 131 local_irq_enable();
diff --git a/kernel/params.c b/kernel/params.c
index 47ba69547945..c76ad25e6a21 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -619,7 +619,7 @@ static void __init param_sysfs_builtin(void)
619 619
620 620
621/* module-related sysfs stuff */ 621/* module-related sysfs stuff */
622#ifdef CONFIG_MODULES 622#ifdef CONFIG_SYSFS
623 623
624#define to_module_attr(n) container_of(n, struct module_attribute, attr); 624#define to_module_attr(n) container_of(n, struct module_attribute, attr);
625#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 625#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
diff --git a/kernel/pid.c b/kernel/pid.c
index edba31c681ac..1acc07246991 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
136 struct hlist_node *elem; 136 struct hlist_node *elem;
137 struct pid *pid; 137 struct pid *pid;
138 138
139 hlist_for_each_entry(pid, elem, 139 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 140 &pid_hash[type][pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 141 if (pid->nr == nr)
142 return pid; 142 return pid;
@@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
150 150
151 task_pid = &task->pids[type]; 151 task_pid = &task->pids[type];
152 pid = find_pid(type, nr); 152 pid = find_pid(type, nr);
153 task_pid->nr = nr;
153 if (pid == NULL) { 154 if (pid == NULL) {
154 hlist_add_head(&task_pid->pid_chain,
155 &pid_hash[type][pid_hashfn(nr)]);
156 INIT_LIST_HEAD(&task_pid->pid_list); 155 INIT_LIST_HEAD(&task_pid->pid_list);
156 hlist_add_head_rcu(&task_pid->pid_chain,
157 &pid_hash[type][pid_hashfn(nr)]);
157 } else { 158 } else {
158 INIT_HLIST_NODE(&task_pid->pid_chain); 159 INIT_HLIST_NODE(&task_pid->pid_chain);
159 list_add_tail(&task_pid->pid_list, &pid->pid_list); 160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
160 } 161 }
161 task_pid->nr = nr;
162 162
163 return 0; 163 return 0;
164} 164}
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type)
170 170
171 pid = &task->pids[type]; 171 pid = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) { 172 if (!hlist_unhashed(&pid->pid_chain)) {
173 hlist_del(&pid->pid_chain);
174 173
175 if (list_empty(&pid->pid_list)) 174 if (list_empty(&pid->pid_list)) {
176 nr = pid->nr; 175 nr = pid->nr;
177 else { 176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next, 178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list); 179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */ 180 /* insert next pid from pid_list to hash */
181 hlist_add_head(&pid_next->pid_chain, 181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_hash[type][pid_hashfn(pid_next->nr)]); 182 &pid_next->pid_chain);
183 } 183 }
184 } 184 }
185 185
186 list_del(&pid->pid_list); 186 list_del_rcu(&pid->pid_list);
187 pid->nr = 0; 187 pid->nr = 0;
188 188
189 return nr; 189 return nr;
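With the pid hash converted to the _rcu list primitives, lookups such as find_pid() can run without the old spinlock provided the reader stays inside rcu_read_lock()/rcu_read_unlock() and uses hlist_for_each_entry_rcu(); writers still serialize among themselves and defer reuse past a grace period. A hedged read-side sketch as it would sit inside kernel/pid.c, reusing the four-argument iterator form from the hunk; only the wrapper name is invented.

/* Read-side pattern the conversion above relies on (sketch, as if placed
 * in kernel/pid.c next to find_pid()). Only demo_pid_hashed is invented. */
static int demo_pid_hashed(enum pid_type type, int nr)
{
	struct hlist_node *elem;
	struct pid *pid;
	int hashed = 0;

	rcu_read_lock();		/* readers take no spinlock */
	hlist_for_each_entry_rcu(pid, elem,
			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
		if (pid->nr == nr) {
			hashed = 1;	/* *pid is only safe to use inside
					 * the rcu_read_lock() section */
			break;
		}
	}
	rcu_read_unlock();
	return hashed;
}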
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 84af54c39e1b..520f6c59948d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,7 +7,7 @@
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9 9
10static int check_clock(clockid_t which_clock) 10static int check_clock(const clockid_t which_clock)
11{ 11{
12 int error = 0; 12 int error = 0;
13 struct task_struct *p; 13 struct task_struct *p;
@@ -31,19 +31,19 @@ static int check_clock(clockid_t which_clock)
31} 31}
32 32
33static inline union cpu_time_count 33static inline union cpu_time_count
34timespec_to_sample(clockid_t which_clock, const struct timespec *tp) 34timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
35{ 35{
36 union cpu_time_count ret; 36 union cpu_time_count ret;
37 ret.sched = 0; /* high half always zero when .cpu used */ 37 ret.sched = 0; /* high half always zero when .cpu used */
38 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 38 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
39 ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; 39 ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
40 } else { 40 } else {
41 ret.cpu = timespec_to_cputime(tp); 41 ret.cpu = timespec_to_cputime(tp);
42 } 42 }
43 return ret; 43 return ret;
44} 44}
45 45
46static void sample_to_timespec(clockid_t which_clock, 46static void sample_to_timespec(const clockid_t which_clock,
47 union cpu_time_count cpu, 47 union cpu_time_count cpu,
48 struct timespec *tp) 48 struct timespec *tp)
49{ 49{
@@ -55,7 +55,7 @@ static void sample_to_timespec(clockid_t which_clock,
55 } 55 }
56} 56}
57 57
58static inline int cpu_time_before(clockid_t which_clock, 58static inline int cpu_time_before(const clockid_t which_clock,
59 union cpu_time_count now, 59 union cpu_time_count now,
60 union cpu_time_count then) 60 union cpu_time_count then)
61{ 61{
@@ -65,7 +65,7 @@ static inline int cpu_time_before(clockid_t which_clock,
65 return cputime_lt(now.cpu, then.cpu); 65 return cputime_lt(now.cpu, then.cpu);
66 } 66 }
67} 67}
68static inline void cpu_time_add(clockid_t which_clock, 68static inline void cpu_time_add(const clockid_t which_clock,
69 union cpu_time_count *acc, 69 union cpu_time_count *acc,
70 union cpu_time_count val) 70 union cpu_time_count val)
71{ 71{
@@ -75,7 +75,7 @@ static inline void cpu_time_add(clockid_t which_clock,
75 acc->cpu = cputime_add(acc->cpu, val.cpu); 75 acc->cpu = cputime_add(acc->cpu, val.cpu);
76 } 76 }
77} 77}
78static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, 78static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
79 union cpu_time_count a, 79 union cpu_time_count a,
80 union cpu_time_count b) 80 union cpu_time_count b)
81{ 81{
@@ -151,7 +151,7 @@ static inline unsigned long long sched_ns(struct task_struct *p)
151 return (p == current) ? current_sched_time(p) : p->sched_time; 151 return (p == current) ? current_sched_time(p) : p->sched_time;
152} 152}
153 153
154int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) 154int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
155{ 155{
156 int error = check_clock(which_clock); 156 int error = check_clock(which_clock);
157 if (!error) { 157 if (!error) {
@@ -169,7 +169,7 @@ int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
169 return error; 169 return error;
170} 170}
171 171
172int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) 172int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
173{ 173{
174 /* 174 /*
175 * You can never reset a CPU clock, but we check for other errors 175 * You can never reset a CPU clock, but we check for other errors
@@ -186,7 +186,7 @@ int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
186/* 186/*
187 * Sample a per-thread clock for the given task. 187 * Sample a per-thread clock for the given task.
188 */ 188 */
189static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, 189static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
190 union cpu_time_count *cpu) 190 union cpu_time_count *cpu)
191{ 191{
192 switch (CPUCLOCK_WHICH(which_clock)) { 192 switch (CPUCLOCK_WHICH(which_clock)) {
@@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
238 while ((t = next_thread(t)) != p) { 238 while ((t = next_thread(t)) != p) {
239 cpu->sched += t->sched_time; 239 cpu->sched += t->sched_time;
240 } 240 }
241 if (p->tgid == current->tgid) { 241 cpu->sched += sched_ns(p);
242 /*
243 * We're sampling ourselves, so include the
244 * cycles not yet banked. We still omit
245 * other threads running on other CPUs,
246 * so the total can always be behind as
247 * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
248 */
249 cpu->sched += current_sched_time(current);
250 } else {
251 cpu->sched += p->sched_time;
252 }
253 break; 242 break;
254 } 243 }
255 return 0; 244 return 0;
@@ -259,7 +248,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
259 * Sample a process (thread group) clock for the given group_leader task. 248 * Sample a process (thread group) clock for the given group_leader task.
260 * Must be called with tasklist_lock held for reading. 249 * Must be called with tasklist_lock held for reading.
261 */ 250 */
262static int cpu_clock_sample_group(clockid_t which_clock, 251static int cpu_clock_sample_group(const clockid_t which_clock,
263 struct task_struct *p, 252 struct task_struct *p,
264 union cpu_time_count *cpu) 253 union cpu_time_count *cpu)
265{ 254{
@@ -273,7 +262,7 @@ static int cpu_clock_sample_group(clockid_t which_clock,
273} 262}
274 263
275 264
276int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) 265int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
277{ 266{
278 const pid_t pid = CPUCLOCK_PID(which_clock); 267 const pid_t pid = CPUCLOCK_PID(which_clock);
279 int error = -EINVAL; 268 int error = -EINVAL;
@@ -1410,8 +1399,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1410 1399
1411static long posix_cpu_clock_nanosleep_restart(struct restart_block *); 1400static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
1412 1401
1413int posix_cpu_nsleep(clockid_t which_clock, int flags, 1402int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1414 struct timespec *rqtp) 1403 struct timespec *rqtp, struct timespec __user *rmtp)
1415{ 1404{
1416 struct restart_block *restart_block = 1405 struct restart_block *restart_block =
1417 &current_thread_info()->restart_block; 1406 &current_thread_info()->restart_block;
@@ -1436,7 +1425,6 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
1436 error = posix_cpu_timer_create(&timer); 1425 error = posix_cpu_timer_create(&timer);
1437 timer.it_process = current; 1426 timer.it_process = current;
1438 if (!error) { 1427 if (!error) {
1439 struct timespec __user *rmtp;
1440 static struct itimerspec zero_it; 1428 static struct itimerspec zero_it;
1441 struct itimerspec it = { .it_value = *rqtp, 1429 struct itimerspec it = { .it_value = *rqtp,
1442 .it_interval = {} }; 1430 .it_interval = {} };
@@ -1483,7 +1471,6 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
1483 /* 1471 /*
1484 * Report back to the user the time still remaining. 1472 * Report back to the user the time still remaining.
1485 */ 1473 */
1486 rmtp = (struct timespec __user *) restart_block->arg1;
1487 if (rmtp != NULL && !(flags & TIMER_ABSTIME) && 1474 if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
1488 copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1475 copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1489 return -EFAULT; 1476 return -EFAULT;
@@ -1491,6 +1478,7 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
1491 restart_block->fn = posix_cpu_clock_nanosleep_restart; 1478 restart_block->fn = posix_cpu_clock_nanosleep_restart;
1492 /* Caller already set restart_block->arg1 */ 1479 /* Caller already set restart_block->arg1 */
1493 restart_block->arg0 = which_clock; 1480 restart_block->arg0 = which_clock;
1481 restart_block->arg1 = (unsigned long) rmtp;
1494 restart_block->arg2 = rqtp->tv_sec; 1482 restart_block->arg2 = rqtp->tv_sec;
1495 restart_block->arg3 = rqtp->tv_nsec; 1483 restart_block->arg3 = rqtp->tv_nsec;
1496 1484
@@ -1504,21 +1492,28 @@ static long
1504posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) 1492posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
1505{ 1493{
1506 clockid_t which_clock = restart_block->arg0; 1494 clockid_t which_clock = restart_block->arg0;
1507 struct timespec t = { .tv_sec = restart_block->arg2, 1495 struct timespec __user *rmtp;
1508 .tv_nsec = restart_block->arg3 }; 1496 struct timespec t;
1497
1498 rmtp = (struct timespec __user *) restart_block->arg1;
1499 t.tv_sec = restart_block->arg2;
1500 t.tv_nsec = restart_block->arg3;
1501
1509 restart_block->fn = do_no_restart_syscall; 1502 restart_block->fn = do_no_restart_syscall;
1510 return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); 1503 return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
1511} 1504}
1512 1505
1513 1506
1514#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1507#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1515#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1508#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1516 1509
1517static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) 1510static int process_cpu_clock_getres(const clockid_t which_clock,
1511 struct timespec *tp)
1518{ 1512{
1519 return posix_cpu_clock_getres(PROCESS_CLOCK, tp); 1513 return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1520} 1514}
1521static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) 1515static int process_cpu_clock_get(const clockid_t which_clock,
1516 struct timespec *tp)
1522{ 1517{
1523 return posix_cpu_clock_get(PROCESS_CLOCK, tp); 1518 return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1524} 1519}
@@ -1527,16 +1522,19 @@ static int process_cpu_timer_create(struct k_itimer *timer)
1527 timer->it_clock = PROCESS_CLOCK; 1522 timer->it_clock = PROCESS_CLOCK;
1528 return posix_cpu_timer_create(timer); 1523 return posix_cpu_timer_create(timer);
1529} 1524}
1530static int process_cpu_nsleep(clockid_t which_clock, int flags, 1525static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1531 struct timespec *rqtp) 1526 struct timespec *rqtp,
1527 struct timespec __user *rmtp)
1532{ 1528{
1533 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); 1529 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1534} 1530}
1535static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) 1531static int thread_cpu_clock_getres(const clockid_t which_clock,
1532 struct timespec *tp)
1536{ 1533{
1537 return posix_cpu_clock_getres(THREAD_CLOCK, tp); 1534 return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1538} 1535}
1539static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) 1536static int thread_cpu_clock_get(const clockid_t which_clock,
1537 struct timespec *tp)
1540{ 1538{
1541 return posix_cpu_clock_get(THREAD_CLOCK, tp); 1539 return posix_cpu_clock_get(THREAD_CLOCK, tp);
1542} 1540}
@@ -1545,8 +1543,8 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1545 timer->it_clock = THREAD_CLOCK; 1543 timer->it_clock = THREAD_CLOCK;
1546 return posix_cpu_timer_create(timer); 1544 return posix_cpu_timer_create(timer);
1547} 1545}
1548static int thread_cpu_nsleep(clockid_t which_clock, int flags, 1546static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
1549 struct timespec *rqtp) 1547 struct timespec *rqtp, struct timespec __user *rmtp)
1550{ 1548{
1551 return -EINVAL; 1549 return -EINVAL;
1552} 1550}
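
The posix-cpu-timers hunks above thread the user's rmtp pointer into posix_cpu_nsleep() as an explicit argument instead of recovering it from restart_block->arg1 at the syscall layer. A minimal user-space sketch (not part of the commit) of the path this serves — a relative sleep on the process CPU-time clock that reports the unslept CPU time through rmtp if a signal interrupts it — assuming the glibc clock_nanosleep() wrapper (link with -lpthread, and -lrt on older toolchains):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Keep the process CPU clock advancing while the main thread sleeps on it. */
static void *burn_cpu(void *unused)
{
	for (;;)
		;
	return NULL;
}

int main(void)
{
	struct timespec req = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
	struct timespec rem = { 0, 0 };
	pthread_t t;
	int err;

	pthread_create(&t, NULL, burn_cpu, NULL);

	/* Relative 100ms sleep measured against the process CPU-time clock. */
	err = clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &req, &rem);
	if (err)
		fprintf(stderr, "interrupted: %ld.%09ld s of CPU time left\n",
			(long)rem.tv_sec, rem.tv_nsec);
	return 0;
}
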
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ea55c7a1cd75..197208b3aa2a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,21 +48,6 @@
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/module.h> 49#include <linux/module.h>
50 50
51#ifndef div_long_long_rem
52#include <asm/div64.h>
53
54#define div_long_long_rem(dividend,divisor,remainder) ({ \
55 u64 result = dividend; \
56 *remainder = do_div(result,divisor); \
57 result; })
58
59#endif
60#define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */
61
62static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2)
63{
64 return (u64)mpy1 * mpy2;
65}
66/* 51/*
67 * Management arrays for POSIX timers. Timers are kept in slab memory 52 * Management arrays for POSIX timers. Timers are kept in slab memory
68 * Timer ids are allocated by an external routine that keeps track of the 53 * Timer ids are allocated by an external routine that keeps track of the
@@ -148,18 +133,18 @@ static DEFINE_SPINLOCK(idr_lock);
148 */ 133 */
149 134
150static struct k_clock posix_clocks[MAX_CLOCKS]; 135static struct k_clock posix_clocks[MAX_CLOCKS];
136
151/* 137/*
152 * We only have one real clock that can be set so we need only one abs list, 138 * These ones are defined below.
153 * even if we should want to have several clocks with differing resolutions.
154 */ 139 */
155static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), 140static int common_nsleep(const clockid_t, int flags, struct timespec *t,
156 .lock = SPIN_LOCK_UNLOCKED}; 141 struct timespec __user *rmtp);
142static void common_timer_get(struct k_itimer *, struct itimerspec *);
143static int common_timer_set(struct k_itimer *, int,
144 struct itimerspec *, struct itimerspec *);
145static int common_timer_del(struct k_itimer *timer);
157 146
158static void posix_timer_fn(unsigned long); 147static int posix_timer_fn(void *data);
159static u64 do_posix_clock_monotonic_gettime_parts(
160 struct timespec *tp, struct timespec *mo);
161int do_posix_clock_monotonic_gettime(struct timespec *tp);
162static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp);
163 148
164static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 149static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
165 150
@@ -184,7 +169,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
184 * the function pointer CALL in struct k_clock. 169 * the function pointer CALL in struct k_clock.
185 */ 170 */
186 171
187static inline int common_clock_getres(clockid_t which_clock, 172static inline int common_clock_getres(const clockid_t which_clock,
188 struct timespec *tp) 173 struct timespec *tp)
189{ 174{
190 tp->tv_sec = 0; 175 tp->tv_sec = 0;
@@ -192,39 +177,33 @@ static inline int common_clock_getres(clockid_t which_clock,
192 return 0; 177 return 0;
193} 178}
194 179
195static inline int common_clock_get(clockid_t which_clock, struct timespec *tp) 180/*
181 * Get real time for posix timers
182 */
183static int common_clock_get(clockid_t which_clock, struct timespec *tp)
196{ 184{
197 getnstimeofday(tp); 185 ktime_get_real_ts(tp);
198 return 0; 186 return 0;
199} 187}
200 188
201static inline int common_clock_set(clockid_t which_clock, struct timespec *tp) 189static inline int common_clock_set(const clockid_t which_clock,
190 struct timespec *tp)
202{ 191{
203 return do_sys_settimeofday(tp, NULL); 192 return do_sys_settimeofday(tp, NULL);
204} 193}
205 194
206static inline int common_timer_create(struct k_itimer *new_timer) 195static int common_timer_create(struct k_itimer *new_timer)
207{ 196{
208 INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); 197 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock);
209 init_timer(&new_timer->it.real.timer); 198 new_timer->it.real.timer.data = new_timer;
210 new_timer->it.real.timer.data = (unsigned long) new_timer;
211 new_timer->it.real.timer.function = posix_timer_fn; 199 new_timer->it.real.timer.function = posix_timer_fn;
212 return 0; 200 return 0;
213} 201}
214 202
215/* 203/*
216 * These ones are defined below. 204 * Return nonzero if we know a priori this clockid_t value is bogus.
217 */
218static int common_nsleep(clockid_t, int flags, struct timespec *t);
219static void common_timer_get(struct k_itimer *, struct itimerspec *);
220static int common_timer_set(struct k_itimer *, int,
221 struct itimerspec *, struct itimerspec *);
222static int common_timer_del(struct k_itimer *timer);
223
224/*
225 * Return nonzero iff we know a priori this clockid_t value is bogus.
226 */ 205 */
227static inline int invalid_clockid(clockid_t which_clock) 206static inline int invalid_clockid(const clockid_t which_clock)
228{ 207{
229 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 208 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
230 return 0; 209 return 0;
@@ -232,26 +211,32 @@ static inline int invalid_clockid(clockid_t which_clock)
232 return 1; 211 return 1;
233 if (posix_clocks[which_clock].clock_getres != NULL) 212 if (posix_clocks[which_clock].clock_getres != NULL)
234 return 0; 213 return 0;
235#ifndef CLOCK_DISPATCH_DIRECT
236 if (posix_clocks[which_clock].res != 0) 214 if (posix_clocks[which_clock].res != 0)
237 return 0; 215 return 0;
238#endif
239 return 1; 216 return 1;
240} 217}
241 218
219/*
220 * Get monotonic time for posix timers
221 */
222static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
223{
224 ktime_get_ts(tp);
225 return 0;
226}
242 227
243/* 228/*
244 * Initialize everything, well, just everything in Posix clocks/timers ;) 229 * Initialize everything, well, just everything in Posix clocks/timers ;)
245 */ 230 */
246static __init int init_posix_timers(void) 231static __init int init_posix_timers(void)
247{ 232{
248 struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES, 233 struct k_clock clock_realtime = {
249 .abs_struct = &abs_list 234 .clock_getres = hrtimer_get_res,
250 }; 235 };
251 struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, 236 struct k_clock clock_monotonic = {
252 .abs_struct = NULL, 237 .clock_getres = hrtimer_get_res,
253 .clock_get = do_posix_clock_monotonic_get, 238 .clock_get = posix_ktime_get_ts,
254 .clock_set = do_posix_clock_nosettime 239 .clock_set = do_posix_clock_nosettime,
255 }; 240 };
256 241
257 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 242 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -265,117 +250,17 @@ static __init int init_posix_timers(void)
265 250
266__initcall(init_posix_timers); 251__initcall(init_posix_timers);
267 252
268static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
269{
270 long sec = tp->tv_sec;
271 long nsec = tp->tv_nsec + res - 1;
272
273 if (nsec > NSEC_PER_SEC) {
274 sec++;
275 nsec -= NSEC_PER_SEC;
276 }
277
278 /*
279 * The scaling constants are defined in <linux/time.h>
280 * The difference between there and here is that we do the
281 * res rounding and compute a 64-bit result (well so does that
282 * but it then throws away the high bits).
283 */
284 *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) +
285 (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >>
286 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
287}
288
289/*
290 * This function adjusts the timer as needed as a result of the clock
291 * being set. It should only be called for absolute timers, and then
292 * under the abs_list lock. It computes the time difference and sets
293 * the new jiffies value in the timer. It also updates the timers
294 * reference wall_to_monotonic value. It is complicated by the fact
295 * that tstojiffies() only handles positive times and it needs to work
296 * with both positive and negative times. Also, for negative offsets,
297 * we need to defeat the res round up.
298 *
299 * Return is true if there is a new time, else false.
300 */
301static long add_clockset_delta(struct k_itimer *timr,
302 struct timespec *new_wall_to)
303{
304 struct timespec delta;
305 int sign = 0;
306 u64 exp;
307
308 set_normalized_timespec(&delta,
309 new_wall_to->tv_sec -
310 timr->it.real.wall_to_prev.tv_sec,
311 new_wall_to->tv_nsec -
312 timr->it.real.wall_to_prev.tv_nsec);
313 if (likely(!(delta.tv_sec | delta.tv_nsec)))
314 return 0;
315 if (delta.tv_sec < 0) {
316 set_normalized_timespec(&delta,
317 -delta.tv_sec,
318 1 - delta.tv_nsec -
319 posix_clocks[timr->it_clock].res);
320 sign++;
321 }
322 tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp);
323 timr->it.real.wall_to_prev = *new_wall_to;
324 timr->it.real.timer.expires += (sign ? -exp : exp);
325 return 1;
326}
327
328static void remove_from_abslist(struct k_itimer *timr)
329{
330 if (!list_empty(&timr->it.real.abs_timer_entry)) {
331 spin_lock(&abs_list.lock);
332 list_del_init(&timr->it.real.abs_timer_entry);
333 spin_unlock(&abs_list.lock);
334 }
335}
336
337static void schedule_next_timer(struct k_itimer *timr) 253static void schedule_next_timer(struct k_itimer *timr)
338{ 254{
339 struct timespec new_wall_to; 255 if (timr->it.real.interval.tv64 == 0)
340 struct now_struct now;
341 unsigned long seq;
342
343 /*
344 * Set up the timer for the next interval (if there is one).
345 * Note: this code uses the abs_timer_lock to protect
346 * it.real.wall_to_prev and must hold it until exp is set, not exactly
347 * obvious...
348
349 * This function is used for CLOCK_REALTIME* and
350 * CLOCK_MONOTONIC* timers. If we ever want to handle other
351 * CLOCKs, the calling code (do_schedule_next_timer) would need
352 * to pull the "clock" info from the timer and dispatch the
353 * "other" CLOCKs "next timer" code (which, I suppose should
354 * also be added to the k_clock structure).
355 */
356 if (!timr->it.real.incr)
357 return; 256 return;
358 257
359 do { 258 timr->it_overrun += hrtimer_forward(&timr->it.real.timer,
360 seq = read_seqbegin(&xtime_lock); 259 timr->it.real.interval);
361 new_wall_to = wall_to_monotonic;
362 posix_get_now(&now);
363 } while (read_seqretry(&xtime_lock, seq));
364
365 if (!list_empty(&timr->it.real.abs_timer_entry)) {
366 spin_lock(&abs_list.lock);
367 add_clockset_delta(timr, &new_wall_to);
368
369 posix_bump_timer(timr, now);
370
371 spin_unlock(&abs_list.lock);
372 } else {
373 posix_bump_timer(timr, now);
374 }
375 timr->it_overrun_last = timr->it_overrun; 260 timr->it_overrun_last = timr->it_overrun;
376 timr->it_overrun = -1; 261 timr->it_overrun = -1;
377 ++timr->it_requeue_pending; 262 ++timr->it_requeue_pending;
378 add_timer(&timr->it.real.timer); 263 hrtimer_restart(&timr->it.real.timer);
379} 264}
380 265
381/* 266/*
@@ -396,31 +281,23 @@ void do_schedule_next_timer(struct siginfo *info)
396 281
397 timr = lock_timer(info->si_tid, &flags); 282 timr = lock_timer(info->si_tid, &flags);
398 283
399 if (!timr || timr->it_requeue_pending != info->si_sys_private) 284 if (timr && timr->it_requeue_pending == info->si_sys_private) {
400 goto exit; 285 if (timr->it_clock < 0)
286 posix_cpu_timer_schedule(timr);
287 else
288 schedule_next_timer(timr);
401 289
402 if (timr->it_clock < 0) /* CPU clock */ 290 info->si_overrun = timr->it_overrun_last;
403 posix_cpu_timer_schedule(timr); 291 }
404 else 292
405 schedule_next_timer(timr); 293 unlock_timer(timr, flags);
406 info->si_overrun = timr->it_overrun_last;
407exit:
408 if (timr)
409 unlock_timer(timr, flags);
410} 294}
411 295
412int posix_timer_event(struct k_itimer *timr,int si_private) 296int posix_timer_event(struct k_itimer *timr,int si_private)
413{ 297{
414 memset(&timr->sigq->info, 0, sizeof(siginfo_t)); 298 memset(&timr->sigq->info, 0, sizeof(siginfo_t));
415 timr->sigq->info.si_sys_private = si_private; 299 timr->sigq->info.si_sys_private = si_private;
416 /* 300 /* Send signal to the process that owns this timer.*/
417 * Send signal to the process that owns this timer.
418
419 * This code assumes that all the possible abs_lists share the
420 * same lock (there is only one list at this time). If this is
421 * not the case, the CLOCK info would need to be used to find
422 * the proper abs list lock.
423 */
424 301
425 timr->sigq->info.si_signo = timr->it_sigev_signo; 302 timr->sigq->info.si_signo = timr->it_sigev_signo;
426 timr->sigq->info.si_errno = 0; 303 timr->sigq->info.si_errno = 0;
@@ -454,66 +331,37 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
454 331
455 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 332 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
456 */ 333 */
457static void posix_timer_fn(unsigned long __data) 334static int posix_timer_fn(void *data)
458{ 335{
459 struct k_itimer *timr = (struct k_itimer *) __data; 336 struct k_itimer *timr = data;
460 unsigned long flags; 337 unsigned long flags;
461 unsigned long seq; 338 int si_private = 0;
462 struct timespec delta, new_wall_to; 339 int ret = HRTIMER_NORESTART;
463 u64 exp = 0;
464 int do_notify = 1;
465 340
466 spin_lock_irqsave(&timr->it_lock, flags); 341 spin_lock_irqsave(&timr->it_lock, flags);
467 if (!list_empty(&timr->it.real.abs_timer_entry)) {
468 spin_lock(&abs_list.lock);
469 do {
470 seq = read_seqbegin(&xtime_lock);
471 new_wall_to = wall_to_monotonic;
472 } while (read_seqretry(&xtime_lock, seq));
473 set_normalized_timespec(&delta,
474 new_wall_to.tv_sec -
475 timr->it.real.wall_to_prev.tv_sec,
476 new_wall_to.tv_nsec -
477 timr->it.real.wall_to_prev.tv_nsec);
478 if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) {
479 /* do nothing, timer is on time */
480 } else if (delta.tv_sec < 0) {
481 /* do nothing, timer is already late */
482 } else {
483 /* timer is early due to a clock set */
484 tstojiffie(&delta,
485 posix_clocks[timr->it_clock].res,
486 &exp);
487 timr->it.real.wall_to_prev = new_wall_to;
488 timr->it.real.timer.expires += exp;
489 add_timer(&timr->it.real.timer);
490 do_notify = 0;
491 }
492 spin_unlock(&abs_list.lock);
493 342
494 } 343 if (timr->it.real.interval.tv64 != 0)
495 if (do_notify) { 344 si_private = ++timr->it_requeue_pending;
496 int si_private=0;
497 345
498 if (timr->it.real.incr) 346 if (posix_timer_event(timr, si_private)) {
499 si_private = ++timr->it_requeue_pending; 347 /*
500 else { 348 * signal was not sent because of sig_ignore
501 remove_from_abslist(timr); 349 * we will not get a call back to restart it AND
350 * it should be restarted.
351 */
352 if (timr->it.real.interval.tv64 != 0) {
353 timr->it_overrun +=
354 hrtimer_forward(&timr->it.real.timer,
355 timr->it.real.interval);
356 ret = HRTIMER_RESTART;
502 } 357 }
503
504 if (posix_timer_event(timr, si_private))
505 /*
506 * signal was not sent because of sig_ignor
507 * we will not get a call back to restart it AND
508 * it should be restarted.
509 */
510 schedule_next_timer(timr);
511 } 358 }
512 unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */
513}
514 359
360 unlock_timer(timr, flags);
361 return ret;
362}
515 363
516static inline struct task_struct * good_sigevent(sigevent_t * event) 364static struct task_struct * good_sigevent(sigevent_t * event)
517{ 365{
518 struct task_struct *rtn = current->group_leader; 366 struct task_struct *rtn = current->group_leader;
519 367
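
The rewritten posix_timer_fn() above is an hrtimer callback rather than a timer_list handler: it tells the hrtimer core whether to re-queue the timer through its return value. A condensed sketch of that contract, using only the hrtimer calls visible in this diff (hrtimer_forward() pushes the expiry past now and returns the number of missed periods); the my_periodic names are illustrative only:

#include <linux/hrtimer.h>

struct my_periodic {
	struct hrtimer	timer;		/* set up with hrtimer_init() */
	ktime_t		interval;	/* interval.tv64 == 0 means one-shot */
	int		overrun;
};

static int my_periodic_fn(void *data)
{
	struct my_periodic *p = data;	/* comes from timer.data, as above */

	if (p->interval.tv64 == 0)
		return HRTIMER_NORESTART;	/* one-shot: do not re-queue */

	/* Advance the expiry past now; the return value counts missed periods. */
	p->overrun += hrtimer_forward(&p->timer, p->interval);
	return HRTIMER_RESTART;			/* core re-arms the timer */
}
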
@@ -530,7 +378,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event)
530 return rtn; 378 return rtn;
531} 379}
532 380
533void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) 381void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
534{ 382{
535 if ((unsigned) clock_id >= MAX_CLOCKS) { 383 if ((unsigned) clock_id >= MAX_CLOCKS) {
536 printk("POSIX clock register failed for clock_id %d\n", 384 printk("POSIX clock register failed for clock_id %d\n",
@@ -576,7 +424,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
576/* Create a POSIX.1b interval timer. */ 424/* Create a POSIX.1b interval timer. */
577 425
578asmlinkage long 426asmlinkage long
579sys_timer_create(clockid_t which_clock, 427sys_timer_create(const clockid_t which_clock,
580 struct sigevent __user *timer_event_spec, 428 struct sigevent __user *timer_event_spec,
581 timer_t __user * created_timer_id) 429 timer_t __user * created_timer_id)
582{ 430{
@@ -602,8 +450,7 @@ sys_timer_create(clockid_t which_clock,
602 goto out; 450 goto out;
603 } 451 }
604 spin_lock_irq(&idr_lock); 452 spin_lock_irq(&idr_lock);
605 error = idr_get_new(&posix_timers_id, 453 error = idr_get_new(&posix_timers_id, (void *) new_timer,
606 (void *) new_timer,
607 &new_timer_id); 454 &new_timer_id);
608 spin_unlock_irq(&idr_lock); 455 spin_unlock_irq(&idr_lock);
609 if (error == -EAGAIN) 456 if (error == -EAGAIN)
@@ -704,27 +551,6 @@ out:
704} 551}
705 552
706/* 553/*
707 * good_timespec
708 *
709 * This function checks the elements of a timespec structure.
710 *
711 * Arguments:
712 * ts : Pointer to the timespec structure to check
713 *
714 * Return value:
715 * If a NULL pointer was passed in, or the tv_nsec field was less than 0
716 * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0,
717 * this function returns 0. Otherwise it returns 1.
718 */
719static int good_timespec(const struct timespec *ts)
720{
721 if ((!ts) || (ts->tv_sec < 0) ||
722 ((unsigned) ts->tv_nsec >= NSEC_PER_SEC))
723 return 0;
724 return 1;
725}
726
727/*
728 * Locking issues: We need to protect the result of the id look up until 554 * Locking issues: We need to protect the result of the id look up until
729 * we get the timer locked down so it is not deleted under us. The 555 * we get the timer locked down so it is not deleted under us. The
730 * removal is done under the idr spinlock so we use that here to bridge 556 * removal is done under the idr spinlock so we use that here to bridge
@@ -776,39 +602,39 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
776static void 602static void
777common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) 603common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
778{ 604{
779 unsigned long expires; 605 ktime_t remaining;
780 struct now_struct now; 606 struct hrtimer *timer = &timr->it.real.timer;
781 607
782 do 608 memset(cur_setting, 0, sizeof(struct itimerspec));
783 expires = timr->it.real.timer.expires; 609 remaining = hrtimer_get_remaining(timer);
784 while ((volatile long) (timr->it.real.timer.expires) != expires);
785
786 posix_get_now(&now);
787
788 if (expires &&
789 ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) &&
790 !timr->it.real.incr &&
791 posix_time_before(&timr->it.real.timer, &now))
792 timr->it.real.timer.expires = expires = 0;
793 if (expires) {
794 if (timr->it_requeue_pending & REQUEUE_PENDING ||
795 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
796 posix_bump_timer(timr, now);
797 expires = timr->it.real.timer.expires;
798 }
799 else
800 if (!timer_pending(&timr->it.real.timer))
801 expires = 0;
802 if (expires)
803 expires -= now.jiffies;
804 }
805 jiffies_to_timespec(expires, &cur_setting->it_value);
806 jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval);
807 610
808 if (cur_setting->it_value.tv_sec < 0) { 611 /* Time left ? or timer pending */
612 if (remaining.tv64 > 0 || hrtimer_active(timer))
613 goto calci;
614 /* interval timer ? */
615 if (timr->it.real.interval.tv64 == 0)
616 return;
617 /*
618 * When a requeue is pending or this is a SIGEV_NONE timer
619 * move the expiry time forward by intervals, so expiry is >
620 * now.
621 */
622 if (timr->it_requeue_pending & REQUEUE_PENDING ||
623 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
624 timr->it_overrun +=
625 hrtimer_forward(timer, timr->it.real.interval);
626 remaining = hrtimer_get_remaining(timer);
627 }
628 calci:
629 /* interval timer ? */
630 if (timr->it.real.interval.tv64 != 0)
631 cur_setting->it_interval =
632 ktime_to_timespec(timr->it.real.interval);
633 /* Return 0 only, when the timer is expired and not pending */
634 if (remaining.tv64 <= 0)
809 cur_setting->it_value.tv_nsec = 1; 635 cur_setting->it_value.tv_nsec = 1;
810 cur_setting->it_value.tv_sec = 0; 636 else
811 } 637 cur_setting->it_value = ktime_to_timespec(remaining);
812} 638}
813 639
814/* Get the time remaining on a POSIX.1b interval timer. */ 640/* Get the time remaining on a POSIX.1b interval timer. */
@@ -832,6 +658,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
832 658
833 return 0; 659 return 0;
834} 660}
661
835/* 662/*
836 * Get the number of overruns of a POSIX.1b interval timer. This is to 663 * Get the number of overruns of a POSIX.1b interval timer. This is to
837 * be the overrun of the timer last delivered. At the same time we are 664 * be the overrun of the timer last delivered. At the same time we are
@@ -841,7 +668,6 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
841 * the call back to do_schedule_next_timer(). So all we need to do is 668 * the call back to do_schedule_next_timer(). So all we need to do is
842 * to pick up the frozen overrun. 669 * to pick up the frozen overrun.
843 */ 670 */
844
845asmlinkage long 671asmlinkage long
846sys_timer_getoverrun(timer_t timer_id) 672sys_timer_getoverrun(timer_t timer_id)
847{ 673{
@@ -858,153 +684,55 @@ sys_timer_getoverrun(timer_t timer_id)
858 684
859 return overrun; 685 return overrun;
860} 686}
861/*
862 * Adjust for absolute time
863 *
864 * If absolute time is given and it is not CLOCK_MONOTONIC, we need to
865 * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and
866 * what ever clock he is using.
867 *
868 * If it is relative time, we need to add the current (CLOCK_MONOTONIC)
869 * time to it to get the proper time for the timer.
870 */
871static int adjust_abs_time(struct k_clock *clock, struct timespec *tp,
872 int abs, u64 *exp, struct timespec *wall_to)
873{
874 struct timespec now;
875 struct timespec oc = *tp;
876 u64 jiffies_64_f;
877 int rtn =0;
878
879 if (abs) {
880 /*
881 * The mask pick up the 4 basic clocks
882 */
883 if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) {
884 jiffies_64_f = do_posix_clock_monotonic_gettime_parts(
885 &now, wall_to);
886 /*
887 * If we are doing a MONOTONIC clock
888 */
889 if((clock - &posix_clocks[0]) & CLOCKS_MONO){
890 now.tv_sec += wall_to->tv_sec;
891 now.tv_nsec += wall_to->tv_nsec;
892 }
893 } else {
894 /*
895 * Not one of the basic clocks
896 */
897 clock->clock_get(clock - posix_clocks, &now);
898 jiffies_64_f = get_jiffies_64();
899 }
900 /*
901 * Take away now to get delta and normalize
902 */
903 set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec,
904 oc.tv_nsec - now.tv_nsec);
905 }else{
906 jiffies_64_f = get_jiffies_64();
907 }
908 /*
909 * Check if the requested time is prior to now (if so set now)
910 */
911 if (oc.tv_sec < 0)
912 oc.tv_sec = oc.tv_nsec = 0;
913
914 if (oc.tv_sec | oc.tv_nsec)
915 set_normalized_timespec(&oc, oc.tv_sec,
916 oc.tv_nsec + clock->res);
917 tstojiffie(&oc, clock->res, exp);
918
919 /*
920 * Check if the requested time is more than the timer code
921 * can handle (if so we error out but return the value too).
922 */
923 if (*exp > ((u64)MAX_JIFFY_OFFSET))
924 /*
925 * This is a considered response, not exactly in
926 * line with the standard (in fact it is silent on
927 * possible overflows). We assume such a large
928 * value is ALMOST always a programming error and
929 * try not to compound it by setting a really dumb
930 * value.
931 */
932 rtn = -EINVAL;
933 /*
934 * return the actual jiffies expire time, full 64 bits
935 */
936 *exp += jiffies_64_f;
937 return rtn;
938}
939 687
940/* Set a POSIX.1b interval timer. */ 688/* Set a POSIX.1b interval timer. */
941/* timr->it_lock is taken. */ 689/* timr->it_lock is taken. */
942static inline int 690static int
943common_timer_set(struct k_itimer *timr, int flags, 691common_timer_set(struct k_itimer *timr, int flags,
944 struct itimerspec *new_setting, struct itimerspec *old_setting) 692 struct itimerspec *new_setting, struct itimerspec *old_setting)
945{ 693{
946 struct k_clock *clock = &posix_clocks[timr->it_clock]; 694 struct hrtimer *timer = &timr->it.real.timer;
947 u64 expire_64;
948 695
949 if (old_setting) 696 if (old_setting)
950 common_timer_get(timr, old_setting); 697 common_timer_get(timr, old_setting);
951 698
952 /* disable the timer */ 699 /* disable the timer */
953 timr->it.real.incr = 0; 700 timr->it.real.interval.tv64 = 0;
954 /* 701 /*
955 * careful here. If smp we could be in the "fire" routine which will 702 * careful here. If smp we could be in the "fire" routine which will
956 * be spinning as we hold the lock. But this is ONLY an SMP issue. 703 * be spinning as we hold the lock. But this is ONLY an SMP issue.
957 */ 704 */
958 if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { 705 if (hrtimer_try_to_cancel(timer) < 0)
959#ifdef CONFIG_SMP
960 /*
961 * It can only be active if on an other cpu. Since
962 * we have cleared the interval stuff above, it should
963 * clear once we release the spin lock. Of course once
964 * we do that anything could happen, including the
965 * complete melt down of the timer. So return with
966 * a "retry" exit status.
967 */
968 return TIMER_RETRY; 706 return TIMER_RETRY;
969#endif
970 }
971
972 remove_from_abslist(timr);
973 707
974 timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 708 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
975 ~REQUEUE_PENDING; 709 ~REQUEUE_PENDING;
976 timr->it_overrun_last = 0; 710 timr->it_overrun_last = 0;
977 timr->it_overrun = -1;
978 /*
979 *switch off the timer when it_value is zero
980 */
981 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) {
982 timr->it.real.timer.expires = 0;
983 return 0;
984 }
985 711
986 if (adjust_abs_time(clock, 712 /* switch off the timer when it_value is zero */
987 &new_setting->it_value, flags & TIMER_ABSTIME, 713 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
988 &expire_64, &(timr->it.real.wall_to_prev))) { 714 return 0;
989 return -EINVAL;
990 }
991 timr->it.real.timer.expires = (unsigned long)expire_64;
992 tstojiffie(&new_setting->it_interval, clock->res, &expire_64);
993 timr->it.real.incr = (unsigned long)expire_64;
994 715
995 /* 716 /* Posix madness. Only absolute CLOCK_REALTIME timers
996 * We do not even queue SIGEV_NONE timers! But we do put them 717 are affected by clock sets. So we must reinitialize
997 * in the abs list so we can do that right. 718 * the timer.
998 */ 719 */
999 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) 720 if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME))
1000 add_timer(&timr->it.real.timer); 721 hrtimer_rebase(timer, CLOCK_REALTIME);
1001 722 else
1002 if (flags & TIMER_ABSTIME && clock->abs_struct) { 723 hrtimer_rebase(timer, CLOCK_MONOTONIC);
1003 spin_lock(&clock->abs_struct->lock); 724
1004 list_add_tail(&(timr->it.real.abs_timer_entry), 725 timer->expires = timespec_to_ktime(new_setting->it_value);
1005 &(clock->abs_struct->list)); 726
1006 spin_unlock(&clock->abs_struct->lock); 727 /* Convert interval */
1007 } 728 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
729
730 /* SIGEV_NONE timers are not queued ! See common_timer_get */
731 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
732 return 0;
733
734 hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ?
735 HRTIMER_ABS : HRTIMER_REL);
1008 return 0; 736 return 0;
1009} 737}
1010 738
@@ -1026,8 +754,8 @@ sys_timer_settime(timer_t timer_id, int flags,
1026 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) 754 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
1027 return -EFAULT; 755 return -EFAULT;
1028 756
1029 if ((!good_timespec(&new_spec.it_interval)) || 757 if (!timespec_valid(&new_spec.it_interval) ||
1030 (!good_timespec(&new_spec.it_value))) 758 !timespec_valid(&new_spec.it_value))
1031 return -EINVAL; 759 return -EINVAL;
1032retry: 760retry:
1033 timr = lock_timer(timer_id, &flag); 761 timr = lock_timer(timer_id, &flag);
@@ -1043,8 +771,8 @@ retry:
1043 goto retry; 771 goto retry;
1044 } 772 }
1045 773
1046 if (old_setting && !error && copy_to_user(old_setting, 774 if (old_setting && !error &&
1047 &old_spec, sizeof (old_spec))) 775 copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
1048 error = -EFAULT; 776 error = -EFAULT;
1049 777
1050 return error; 778 return error;
@@ -1052,24 +780,10 @@ retry:
1052 780
1053static inline int common_timer_del(struct k_itimer *timer) 781static inline int common_timer_del(struct k_itimer *timer)
1054{ 782{
1055 timer->it.real.incr = 0; 783 timer->it.real.interval.tv64 = 0;
1056 784
1057 if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { 785 if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
1058#ifdef CONFIG_SMP
1059 /*
1060 * It can only be active if on an other cpu. Since
1061 * we have cleared the interval stuff above, it should
1062 * clear once we release the spin lock. Of course once
1063 * we do that anything could happen, including the
1064 * complete melt down of the timer. So return with
1065 * a "retry" exit status.
1066 */
1067 return TIMER_RETRY; 786 return TIMER_RETRY;
1068#endif
1069 }
1070
1071 remove_from_abslist(timer);
1072
1073 return 0; 787 return 0;
1074} 788}
1075 789
@@ -1085,24 +799,16 @@ sys_timer_delete(timer_t timer_id)
1085 struct k_itimer *timer; 799 struct k_itimer *timer;
1086 long flags; 800 long flags;
1087 801
1088#ifdef CONFIG_SMP
1089 int error;
1090retry_delete: 802retry_delete:
1091#endif
1092 timer = lock_timer(timer_id, &flags); 803 timer = lock_timer(timer_id, &flags);
1093 if (!timer) 804 if (!timer)
1094 return -EINVAL; 805 return -EINVAL;
1095 806
1096#ifdef CONFIG_SMP 807 if (timer_delete_hook(timer) == TIMER_RETRY) {
1097 error = timer_delete_hook(timer);
1098
1099 if (error == TIMER_RETRY) {
1100 unlock_timer(timer, flags); 808 unlock_timer(timer, flags);
1101 goto retry_delete; 809 goto retry_delete;
1102 } 810 }
1103#else 811
1104 timer_delete_hook(timer);
1105#endif
1106 spin_lock(&current->sighand->siglock); 812 spin_lock(&current->sighand->siglock);
1107 list_del(&timer->list); 813 list_del(&timer->list);
1108 spin_unlock(&current->sighand->siglock); 814 spin_unlock(&current->sighand->siglock);
@@ -1119,29 +825,21 @@ retry_delete:
1119 release_posix_timer(timer, IT_ID_SET); 825 release_posix_timer(timer, IT_ID_SET);
1120 return 0; 826 return 0;
1121} 827}
828
1122/* 829/*
1123 * return timer owned by the process, used by exit_itimers 830 * return timer owned by the process, used by exit_itimers
1124 */ 831 */
1125static inline void itimer_delete(struct k_itimer *timer) 832static void itimer_delete(struct k_itimer *timer)
1126{ 833{
1127 unsigned long flags; 834 unsigned long flags;
1128 835
1129#ifdef CONFIG_SMP
1130 int error;
1131retry_delete: 836retry_delete:
1132#endif
1133 spin_lock_irqsave(&timer->it_lock, flags); 837 spin_lock_irqsave(&timer->it_lock, flags);
1134 838
1135#ifdef CONFIG_SMP 839 if (timer_delete_hook(timer) == TIMER_RETRY) {
1136 error = timer_delete_hook(timer);
1137
1138 if (error == TIMER_RETRY) {
1139 unlock_timer(timer, flags); 840 unlock_timer(timer, flags);
1140 goto retry_delete; 841 goto retry_delete;
1141 } 842 }
1142#else
1143 timer_delete_hook(timer);
1144#endif
1145 list_del(&timer->list); 843 list_del(&timer->list);
1146 /* 844 /*
1147 * This keeps any tasks waiting on the spin lock from thinking 845 * This keeps any tasks waiting on the spin lock from thinking
@@ -1170,61 +868,8 @@ void exit_itimers(struct signal_struct *sig)
1170 } 868 }
1171} 869}
1172 870
1173/* 871/* Not available / possible... functions */
1174 * And now for the "clock" calls 872int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
1175 *
1176 * These functions are called both from timer functions (with the timer
1177 * spin_lock_irq() held and from clock calls with no locking. They must
1178 * use the save flags versions of locks.
1179 */
1180
1181/*
1182 * We do ticks here to avoid the irq lock ( they take sooo long).
1183 * The seqlock is great here. Since we a reader, we don't really care
1184 * if we are interrupted since we don't take lock that will stall us or
1185 * any other cpu. Voila, no irq lock is needed.
1186 *
1187 */
1188
1189static u64 do_posix_clock_monotonic_gettime_parts(
1190 struct timespec *tp, struct timespec *mo)
1191{
1192 u64 jiff;
1193 unsigned int seq;
1194
1195 do {
1196 seq = read_seqbegin(&xtime_lock);
1197 getnstimeofday(tp);
1198 *mo = wall_to_monotonic;
1199 jiff = jiffies_64;
1200
1201 } while(read_seqretry(&xtime_lock, seq));
1202
1203 return jiff;
1204}
1205
1206static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
1207{
1208 struct timespec wall_to_mono;
1209
1210 do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
1211
1212 tp->tv_sec += wall_to_mono.tv_sec;
1213 tp->tv_nsec += wall_to_mono.tv_nsec;
1214
1215 if ((tp->tv_nsec - NSEC_PER_SEC) > 0) {
1216 tp->tv_nsec -= NSEC_PER_SEC;
1217 tp->tv_sec++;
1218 }
1219 return 0;
1220}
1221
1222int do_posix_clock_monotonic_gettime(struct timespec *tp)
1223{
1224 return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp);
1225}
1226
1227int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp)
1228{ 873{
1229 return -EINVAL; 874 return -EINVAL;
1230} 875}
@@ -1236,7 +881,8 @@ int do_posix_clock_notimer_create(struct k_itimer *timer)
1236} 881}
1237EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); 882EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
1238 883
1239int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) 884int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
885 struct timespec *t, struct timespec __user *r)
1240{ 886{
1241#ifndef ENOTSUP 887#ifndef ENOTSUP
1242 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ 888 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
@@ -1246,8 +892,8 @@ int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t)
1246} 892}
1247EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); 893EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
1248 894
1249asmlinkage long 895asmlinkage long sys_clock_settime(const clockid_t which_clock,
1250sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) 896 const struct timespec __user *tp)
1251{ 897{
1252 struct timespec new_tp; 898 struct timespec new_tp;
1253 899
@@ -1260,7 +906,7 @@ sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp)
1260} 906}
1261 907
1262asmlinkage long 908asmlinkage long
1263sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) 909sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp)
1264{ 910{
1265 struct timespec kernel_tp; 911 struct timespec kernel_tp;
1266 int error; 912 int error;
@@ -1277,7 +923,7 @@ sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp)
1277} 923}
1278 924
1279asmlinkage long 925asmlinkage long
1280sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) 926sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
1281{ 927{
1282 struct timespec rtn_tp; 928 struct timespec rtn_tp;
1283 int error; 929 int error;
@@ -1296,117 +942,34 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
1296} 942}
1297 943
1298/* 944/*
1299 * The standard says that an absolute nanosleep call MUST wake up at 945 * nanosleep for monotonic and realtime clocks
1300 * the requested time in spite of clock settings. Here is what we do:
1301 * For each nanosleep call that needs it (only absolute and not on
1302 * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure
1303 * into the "nanosleep_abs_list". All we need is the task_struct pointer.
1304 * When ever the clock is set we just wake up all those tasks. The rest
1305 * is done by the while loop in clock_nanosleep().
1306 *
1307 * On locking, clock_was_set() is called from update_wall_clock which
1308 * holds (or has held for it) a write_lock_irq( xtime_lock) and is
1309 * called from the timer bh code. Thus we need the irq save locks.
1310 *
1311 * Also, on the call from update_wall_clock, that is done as part of a
1312 * softirq thing. We don't want to delay the system that much (possibly
1313 * long list of timers to fix), so we defer that work to keventd.
1314 */ 946 */
1315 947static int common_nsleep(const clockid_t which_clock, int flags,
1316static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue); 948 struct timespec *tsave, struct timespec __user *rmtp)
1317static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL); 949{
1318 950 int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
1319static DECLARE_MUTEX(clock_was_set_lock); 951 int clockid = which_clock;
1320 952
1321void clock_was_set(void) 953 switch (which_clock) {
1322{ 954 case CLOCK_REALTIME:
1323 struct k_itimer *timr; 955 /* Posix madness. Only absolute timers on clock realtime
1324 struct timespec new_wall_to; 956 are affected by clock set. */
1325 LIST_HEAD(cws_list); 957 if (mode != HRTIMER_ABS)
1326 unsigned long seq; 958 clockid = CLOCK_MONOTONIC;
1327 959 case CLOCK_MONOTONIC:
1328 960 break;
1329 if (unlikely(in_interrupt())) { 961 default:
1330 schedule_work(&clock_was_set_work); 962 return -EINVAL;
1331 return;
1332 } 963 }
1333 wake_up_all(&nanosleep_abs_wqueue); 964 return hrtimer_nanosleep(tsave, rmtp, mode, clockid);
1334
1335 /*
1336 * Check if there exist TIMER_ABSTIME timers to correct.
1337 *
1338 * Notes on locking: This code is run in task context with irq
1339 * on. We CAN be interrupted! All other usage of the abs list
1340 * lock is under the timer lock which holds the irq lock as
1341 * well. We REALLY don't want to scan the whole list with the
1342 * interrupt system off, AND we would like a sequence lock on
1343 * this code as well. Since we assume that the clock will not
1344 * be set often, it seems ok to take and release the irq lock
1345 * for each timer. In fact add_timer will do this, so this is
1346 * not an issue. So we know when we are done, we will move the
1347 * whole list to a new location. Then as we process each entry,
1348 * we will move it to the actual list again. This way, when our
1349 * copy is empty, we are done. We are not all that concerned
1350 * about preemption so we will use a semaphore lock to protect
1351 * aginst reentry. This way we will not stall another
1352 * processor. It is possible that this may delay some timers
1353 * that should have expired, given the new clock, but even this
1354 * will be minimal as we will always update to the current time,
1355 * even if it was set by a task that is waiting for entry to
1356 * this code. Timers that expire too early will be caught by
1357 * the expire code and restarted.
1358
1359 * Absolute timers that repeat are left in the abs list while
1360 * waiting for the task to pick up the signal. This means we
1361 * may find timers that are not in the "add_timer" list, but are
1362 * in the abs list. We do the same thing for these, save
1363 * putting them back in the "add_timer" list. (Note, these are
1364 * left in the abs list mainly to indicate that they are
1365 * ABSOLUTE timers, a fact that is used by the re-arm code, and
1366 * for which we have no other flag.)
1367
1368 */
1369
1370 down(&clock_was_set_lock);
1371 spin_lock_irq(&abs_list.lock);
1372 list_splice_init(&abs_list.list, &cws_list);
1373 spin_unlock_irq(&abs_list.lock);
1374 do {
1375 do {
1376 seq = read_seqbegin(&xtime_lock);
1377 new_wall_to = wall_to_monotonic;
1378 } while (read_seqretry(&xtime_lock, seq));
1379
1380 spin_lock_irq(&abs_list.lock);
1381 if (list_empty(&cws_list)) {
1382 spin_unlock_irq(&abs_list.lock);
1383 break;
1384 }
1385 timr = list_entry(cws_list.next, struct k_itimer,
1386 it.real.abs_timer_entry);
1387
1388 list_del_init(&timr->it.real.abs_timer_entry);
1389 if (add_clockset_delta(timr, &new_wall_to) &&
1390 del_timer(&timr->it.real.timer)) /* timer run yet? */
1391 add_timer(&timr->it.real.timer);
1392 list_add(&timr->it.real.abs_timer_entry, &abs_list.list);
1393 spin_unlock_irq(&abs_list.lock);
1394 } while (1);
1395
1396 up(&clock_was_set_lock);
1397} 965}
1398 966
1399long clock_nanosleep_restart(struct restart_block *restart_block);
1400
1401asmlinkage long 967asmlinkage long
1402sys_clock_nanosleep(clockid_t which_clock, int flags, 968sys_clock_nanosleep(const clockid_t which_clock, int flags,
1403 const struct timespec __user *rqtp, 969 const struct timespec __user *rqtp,
1404 struct timespec __user *rmtp) 970 struct timespec __user *rmtp)
1405{ 971{
1406 struct timespec t; 972 struct timespec t;
1407 struct restart_block *restart_block =
1408 &(current_thread_info()->restart_block);
1409 int ret;
1410 973
1411 if (invalid_clockid(which_clock)) 974 if (invalid_clockid(which_clock))
1412 return -EINVAL; 975 return -EINVAL;
@@ -1414,125 +977,9 @@ sys_clock_nanosleep(clockid_t which_clock, int flags,
1414 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 977 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1415 return -EFAULT; 978 return -EFAULT;
1416 979
1417 if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) 980 if (!timespec_valid(&t))
1418 return -EINVAL; 981 return -EINVAL;
1419 982
1420 /* 983 return CLOCK_DISPATCH(which_clock, nsleep,
1421 * Do this here as nsleep function does not have the real address. 984 (which_clock, flags, &t, rmtp));
1422 */
1423 restart_block->arg1 = (unsigned long)rmtp;
1424
1425 ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t));
1426
1427 if ((ret == -ERESTART_RESTARTBLOCK) && rmtp &&
1428 copy_to_user(rmtp, &t, sizeof (t)))
1429 return -EFAULT;
1430 return ret;
1431}
1432
1433
1434static int common_nsleep(clockid_t which_clock,
1435 int flags, struct timespec *tsave)
1436{
1437 struct timespec t, dum;
1438 DECLARE_WAITQUEUE(abs_wqueue, current);
1439 u64 rq_time = (u64)0;
1440 s64 left;
1441 int abs;
1442 struct restart_block *restart_block =
1443 &current_thread_info()->restart_block;
1444
1445 abs_wqueue.flags = 0;
1446 abs = flags & TIMER_ABSTIME;
1447
1448 if (restart_block->fn == clock_nanosleep_restart) {
1449 /*
1450 * Interrupted by a non-delivered signal, pick up remaining
1451 * time and continue. Remaining time is in arg2 & 3.
1452 */
1453 restart_block->fn = do_no_restart_syscall;
1454
1455 rq_time = restart_block->arg3;
1456 rq_time = (rq_time << 32) + restart_block->arg2;
1457 if (!rq_time)
1458 return -EINTR;
1459 left = rq_time - get_jiffies_64();
1460 if (left <= (s64)0)
1461 return 0; /* Already passed */
1462 }
1463
1464 if (abs && (posix_clocks[which_clock].clock_get !=
1465 posix_clocks[CLOCK_MONOTONIC].clock_get))
1466 add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue);
1467
1468 do {
1469 t = *tsave;
1470 if (abs || !rq_time) {
1471 adjust_abs_time(&posix_clocks[which_clock], &t, abs,
1472 &rq_time, &dum);
1473 }
1474
1475 left = rq_time - get_jiffies_64();
1476 if (left >= (s64)MAX_JIFFY_OFFSET)
1477 left = (s64)MAX_JIFFY_OFFSET;
1478 if (left < (s64)0)
1479 break;
1480
1481 schedule_timeout_interruptible(left);
1482
1483 left = rq_time - get_jiffies_64();
1484 } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
1485
1486 if (abs_wqueue.task_list.next)
1487 finish_wait(&nanosleep_abs_wqueue, &abs_wqueue);
1488
1489 if (left > (s64)0) {
1490
1491 /*
1492 * Always restart abs calls from scratch to pick up any
1493 * clock shifting that happened while we are away.
1494 */
1495 if (abs)
1496 return -ERESTARTNOHAND;
1497
1498 left *= TICK_NSEC;
1499 tsave->tv_sec = div_long_long_rem(left,
1500 NSEC_PER_SEC,
1501 &tsave->tv_nsec);
1502 /*
1503 * Restart works by saving the time remaing in
1504 * arg2 & 3 (it is 64-bits of jiffies). The other
1505 * info we need is the clock_id (saved in arg0).
1506 * The sys_call interface needs the users
1507 * timespec return address which _it_ saves in arg1.
1508 * Since we have cast the nanosleep call to a clock_nanosleep
1509 * both can be restarted with the same code.
1510 */
1511 restart_block->fn = clock_nanosleep_restart;
1512 restart_block->arg0 = which_clock;
1513 /*
1514 * Caller sets arg1
1515 */
1516 restart_block->arg2 = rq_time & 0xffffffffLL;
1517 restart_block->arg3 = rq_time >> 32;
1518
1519 return -ERESTART_RESTARTBLOCK;
1520 }
1521
1522 return 0;
1523}
1524/*
1525 * This will restart clock_nanosleep.
1526 */
1527long
1528clock_nanosleep_restart(struct restart_block *restart_block)
1529{
1530 struct timespec t;
1531 int ret = common_nsleep(restart_block->arg0, 0, &t);
1532
1533 if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 &&
1534 copy_to_user((struct timespec __user *)(restart_block->arg1), &t,
1535 sizeof (t)))
1536 return -EFAULT;
1537 return ret;
1538} 985}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 46a5e5acff97..9fd8d4f03595 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,6 +19,15 @@ config PM
19 will issue the hlt instruction if nothing is to be done, thereby 19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY
23 bool "Legacy Power Management API"
24 depends on PM
25 default y
26 ---help---
27 Support for pm_register() and friends.
28
29 If unsure, say Y.
30
22config PM_DEBUG 31config PM_DEBUG
23 bool "Power Management Debug Support" 32 bool "Power Management Debug Support"
24 depends on PM 33 depends on PM
@@ -29,7 +38,7 @@ config PM_DEBUG
29 38
30config SOFTWARE_SUSPEND 39config SOFTWARE_SUSPEND
31 bool "Software Suspend" 40 bool "Software Suspend"
32 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP) 41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
33 ---help--- 42 ---help---
34 Enable the possibility of suspending the machine. 43 Enable the possibility of suspending the machine.
35 It doesn't need APM. 44 It doesn't need APM.
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c71eb4579c07..04be7d0d96a7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,7 +3,8 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o process.o console.o pm.o 6obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o
7obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o
8 9
9obj-$(CONFIG_SUSPEND_SMP) += smp.o 10obj-$(CONFIG_SUSPEND_SMP) += smp.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 027322a564f4..e24446f8d8cd 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -24,10 +24,11 @@
24 24
25extern suspend_disk_method_t pm_disk_mode; 25extern suspend_disk_method_t pm_disk_mode;
26 26
27extern int swsusp_shrink_memory(void);
27extern int swsusp_suspend(void); 28extern int swsusp_suspend(void);
28extern int swsusp_write(void); 29extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
29extern int swsusp_check(void); 30extern int swsusp_check(void);
30extern int swsusp_read(void); 31extern int swsusp_read(struct pbe **pblist_ptr);
31extern void swsusp_close(void); 32extern void swsusp_close(void);
32extern int swsusp_resume(void); 33extern int swsusp_resume(void);
33 34
@@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode)
73static int in_suspend __nosavedata = 0; 74static int in_suspend __nosavedata = 0;
74 75
75 76
76/**
77 * free_some_memory - Try to free as much memory as possible
78 *
79 * ... but do not OOM-kill anyone
80 *
81 * Notice: all userland should be stopped at this point, or
82 * livelock is possible.
83 */
84
85static void free_some_memory(void)
86{
87 unsigned int i = 0;
88 unsigned int tmp;
89 unsigned long pages = 0;
90 char *p = "-\\|/";
91
92 printk("Freeing memory... ");
93 while ((tmp = shrink_all_memory(10000))) {
94 pages += tmp;
95 printk("\b%c", p[i++ % 4]);
96 }
97 printk("\bdone (%li pages freed)\n", pages);
98}
99
100
101static inline void platform_finish(void) 77static inline void platform_finish(void)
102{ 78{
103 if (pm_disk_mode == PM_DISK_PLATFORM) { 79 if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -127,8 +103,8 @@ static int prepare_processes(void)
127 } 103 }
128 104
129 /* Free memory before shutting down devices. */ 105 /* Free memory before shutting down devices. */
130 free_some_memory(); 106 if (!(error = swsusp_shrink_memory()))
131 return 0; 107 return 0;
132thaw: 108thaw:
133 thaw_processes(); 109 thaw_processes();
134 enable_nonboot_cpus(); 110 enable_nonboot_cpus();
@@ -176,7 +152,7 @@ int pm_suspend_disk(void)
176 if (in_suspend) { 152 if (in_suspend) {
177 device_resume(); 153 device_resume();
178 pr_debug("PM: writing image.\n"); 154 pr_debug("PM: writing image.\n");
179 error = swsusp_write(); 155 error = swsusp_write(pagedir_nosave, nr_copy_pages);
180 if (!error) 156 if (!error)
181 power_down(pm_disk_mode); 157 power_down(pm_disk_mode);
182 else { 158 else {
@@ -247,7 +223,7 @@ static int software_resume(void)
247 223
248 pr_debug("PM: Reading swsusp image.\n"); 224 pr_debug("PM: Reading swsusp image.\n");
249 225
250 if ((error = swsusp_read())) { 226 if ((error = swsusp_read(&pagedir_nosave))) {
251 swsusp_free(); 227 swsusp_free();
252 goto Thaw; 228 goto Thaw;
253 } 229 }
@@ -363,37 +339,55 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf)
363 MINOR(swsusp_resume_device)); 339 MINOR(swsusp_resume_device));
364} 340}
365 341
366static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) 342static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
367{ 343{
368 int len;
369 char *p;
370 unsigned int maj, min; 344 unsigned int maj, min;
371 int error = -EINVAL;
372 dev_t res; 345 dev_t res;
346 int ret = -EINVAL;
373 347
374 p = memchr(buf, '\n', n); 348 if (sscanf(buf, "%u:%u", &maj, &min) != 2)
375 len = p ? p - buf : n; 349 goto out;
376 350
377 if (sscanf(buf, "%u:%u", &maj, &min) == 2) { 351 res = MKDEV(maj,min);
378 res = MKDEV(maj,min); 352 if (maj != MAJOR(res) || min != MINOR(res))
379 if (maj == MAJOR(res) && min == MINOR(res)) { 353 goto out;
380 down(&pm_sem);
381 swsusp_resume_device = res;
382 up(&pm_sem);
383 printk("Attempting manual resume\n");
384 noresume = 0;
385 software_resume();
386 }
387 }
388 354
389 return error >= 0 ? n : error; 355 down(&pm_sem);
356 swsusp_resume_device = res;
357 up(&pm_sem);
358 printk("Attempting manual resume\n");
359 noresume = 0;
360 software_resume();
361 ret = n;
362out:
363 return ret;
390} 364}
391 365
392power_attr(resume); 366power_attr(resume);
393 367
368static ssize_t image_size_show(struct subsystem * subsys, char *buf)
369{
370 return sprintf(buf, "%u\n", image_size);
371}
372
373static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
374{
375 unsigned int size;
376
377 if (sscanf(buf, "%u", &size) == 1) {
378 image_size = size;
379 return n;
380 }
381
382 return -EINVAL;
383}
384
385power_attr(image_size);
386
394static struct attribute * g[] = { 387static struct attribute * g[] = {
395 &disk_attr.attr, 388 &disk_attr.attr,
396 &resume_attr.attr, 389 &resume_attr.attr,
390 &image_size_attr.attr,
397 NULL, 391 NULL,
398}; 392};
399 393
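
The disk.c changes above replace the ad-hoc free_some_memory() loop with swsusp_shrink_memory() and expose a new image_size attribute next to resume (in megabytes, per the comment added to power.h further down). A small user-space sketch (not part of the commit) of reading and adjusting it; the 400 MB value is arbitrary and the /sys/power path assumes the usual sysfs location of the power subsystem:

#include <stdio.h>

int main(void)
{
	unsigned int size;
	FILE *f;

	/* Read the current preferred suspend image size (in MB). */
	f = fopen("/sys/power/image_size", "r");
	if (f) {
		if (fscanf(f, "%u", &size) == 1)
			printf("image_size = %u MB\n", size);
		fclose(f);
	}

	/* Ask swsusp_shrink_memory() to aim for a smaller image next time. */
	f = fopen("/sys/power/image_size", "w");
	if (f) {
		fprintf(f, "400\n");
		fclose(f);
	}
	return 0;
}
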
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6ee2cad530e8..d253f3ae2fa5 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -24,7 +24,7 @@
24 24
25DECLARE_MUTEX(pm_sem); 25DECLARE_MUTEX(pm_sem);
26 26
27struct pm_ops * pm_ops = NULL; 27struct pm_ops *pm_ops;
28suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; 28suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
29 29
30/** 30/**
@@ -151,6 +151,18 @@ static char *pm_states[PM_SUSPEND_MAX] = {
151#endif 151#endif
152}; 152};
153 153
154static inline int valid_state(suspend_state_t state)
155{
156 /* Suspend-to-disk does not really need low-level support.
157 * It can work with reboot if needed. */
158 if (state == PM_SUSPEND_DISK)
159 return 1;
160
161 if (pm_ops && pm_ops->valid && !pm_ops->valid(state))
162 return 0;
163 return 1;
164}
165
154 166
155/** 167/**
156 * enter_state - Do common work of entering low-power state. 168 * enter_state - Do common work of entering low-power state.
@@ -167,7 +179,7 @@ static int enter_state(suspend_state_t state)
167{ 179{
168 int error; 180 int error;
169 181
170 if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) 182 if (!valid_state(state))
171 return -ENODEV; 183 return -ENODEV;
172 if (down_trylock(&pm_sem)) 184 if (down_trylock(&pm_sem))
173 return -EBUSY; 185 return -EBUSY;
@@ -238,9 +250,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
238 char * s = buf; 250 char * s = buf;
239 251
240 for (i = 0; i < PM_SUSPEND_MAX; i++) { 252 for (i = 0; i < PM_SUSPEND_MAX; i++) {
241 if (pm_states[i] && pm_ops && (!pm_ops->valid 253 if (pm_states[i] && valid_state(i))
242 ||(pm_ops->valid && pm_ops->valid(i)))) 254 s += sprintf(s,"%s ", pm_states[i]);
243 s += sprintf(s,"%s ",pm_states[i]);
244 } 255 }
245 s += sprintf(s,"\n"); 256 s += sprintf(s,"\n");
246 return (s - buf); 257 return (s - buf);
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 159149321b3c..33c508e857dd 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -23,6 +23,7 @@
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/pm_legacy.h>
26#include <linux/interrupt.h> 27#include <linux/interrupt.h>
27 28
28int pm_active; 29int pm_active;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6c042b5ee14b..7e8492fd1423 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -9,19 +9,13 @@
9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
10#endif 10#endif
11 11
12#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \
13 - 4 - 3*sizeof(unsigned long) - sizeof(int) \
14 - sizeof(void *)) / sizeof(swp_entry_t))
15
16struct swsusp_info { 12struct swsusp_info {
17 struct new_utsname uts; 13 struct new_utsname uts;
18 u32 version_code; 14 u32 version_code;
19 unsigned long num_physpages; 15 unsigned long num_physpages;
20 int cpus; 16 int cpus;
21 unsigned long image_pages; 17 unsigned long image_pages;
22 unsigned long pagedir_pages; 18 unsigned long pages;
23 suspend_pagedir_t * suspend_pagedir;
24 swp_entry_t pagedir[MAX_PBES];
25} __attribute__((aligned(PAGE_SIZE))); 19} __attribute__((aligned(PAGE_SIZE)));
26 20
27 21
@@ -48,25 +42,27 @@ static struct subsys_attribute _name##_attr = { \
48 42
49extern struct subsystem power_subsys; 43extern struct subsystem power_subsys;
50 44
51extern int freeze_processes(void);
52extern void thaw_processes(void);
53
54extern int pm_prepare_console(void); 45extern int pm_prepare_console(void);
55extern void pm_restore_console(void); 46extern void pm_restore_console(void);
56 47
57
58/* References to section boundaries */ 48/* References to section boundaries */
59extern const void __nosave_begin, __nosave_end; 49extern const void __nosave_begin, __nosave_end;
60 50
61extern unsigned int nr_copy_pages; 51extern unsigned int nr_copy_pages;
62extern suspend_pagedir_t *pagedir_nosave; 52extern struct pbe *pagedir_nosave;
63extern suspend_pagedir_t *pagedir_save; 53
54/* Preferred image size in MB (default 500) */
55extern unsigned int image_size;
64 56
65extern asmlinkage int swsusp_arch_suspend(void); 57extern asmlinkage int swsusp_arch_suspend(void);
66extern asmlinkage int swsusp_arch_resume(void); 58extern asmlinkage int swsusp_arch_resume(void);
67 59
60extern unsigned int count_data_pages(void);
68extern void free_pagedir(struct pbe *pblist); 61extern void free_pagedir(struct pbe *pblist);
62extern void release_eaten_pages(void);
69extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); 63extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
70extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
71extern void swsusp_free(void); 64extern void swsusp_free(void);
72extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); 65extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
66extern unsigned int snapshot_nr_pages(void);
67extern struct pbe *snapshot_pblist(void);
68extern void snapshot_pblist_set(struct pbe *pblist);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4a6dbcefd378..41f66365f0d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -33,7 +33,35 @@
33 33
34#include "power.h" 34#include "power.h"
35 35
36struct pbe *pagedir_nosave;
37unsigned int nr_copy_pages;
38
36#ifdef CONFIG_HIGHMEM 39#ifdef CONFIG_HIGHMEM
40unsigned int count_highmem_pages(void)
41{
42 struct zone *zone;
43 unsigned long zone_pfn;
44 unsigned int n = 0;
45
46 for_each_zone (zone)
47 if (is_highmem(zone)) {
48 mark_free_pages(zone);
49 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
50 struct page *page;
51 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
52 if (!pfn_valid(pfn))
53 continue;
54 page = pfn_to_page(pfn);
55 if (PageReserved(page))
56 continue;
57 if (PageNosaveFree(page))
58 continue;
59 n++;
60 }
61 }
62 return n;
63}
64
37struct highmem_page { 65struct highmem_page {
38 char *data; 66 char *data;
39 struct page *page; 67 struct page *page;
@@ -149,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn)
149 BUG_ON(PageReserved(page) && PageNosave(page)); 177 BUG_ON(PageReserved(page) && PageNosave(page));
150 if (PageNosave(page)) 178 if (PageNosave(page))
151 return 0; 179 return 0;
152 if (PageReserved(page) && pfn_is_nosave(pfn)) { 180 if (PageReserved(page) && pfn_is_nosave(pfn))
153 pr_debug("[nosave pfn 0x%lx]", pfn);
154 return 0; 181 return 0;
155 }
156 if (PageNosaveFree(page)) 182 if (PageNosaveFree(page))
157 return 0; 183 return 0;
158 184
159 return 1; 185 return 1;
160} 186}
161 187
162static unsigned count_data_pages(void) 188unsigned int count_data_pages(void)
163{ 189{
164 struct zone *zone; 190 struct zone *zone;
165 unsigned long zone_pfn; 191 unsigned long zone_pfn;
@@ -244,7 +270,7 @@ static inline void fill_pb_page(struct pbe *pbpage)
244 * of memory pages allocated with alloc_pagedir() 270 * of memory pages allocated with alloc_pagedir()
245 */ 271 */
246 272
247void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) 273static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
248{ 274{
249 struct pbe *pbpage, *p; 275 struct pbe *pbpage, *p;
250 unsigned int num = PBES_PER_PAGE; 276 unsigned int num = PBES_PER_PAGE;
@@ -261,7 +287,35 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
261 p->next = p + 1; 287 p->next = p + 1;
262 p->next = NULL; 288 p->next = NULL;
263 } 289 }
264 pr_debug("create_pbe_list(): initialized %d PBEs\n", num); 290}
291
292/**
293 * On resume it is necessary to trace and eventually free the unsafe
294 * pages that have been allocated, because they are needed for I/O
295 * (on x86-64 we likely will "eat" these pages once again while
296 * creating the temporary page translation tables)
297 */
298
299struct eaten_page {
300 struct eaten_page *next;
301 char padding[PAGE_SIZE - sizeof(void *)];
302};
303
304static struct eaten_page *eaten_pages = NULL;
305
306void release_eaten_pages(void)
307{
308 struct eaten_page *p, *q;
309
310 p = eaten_pages;
311 while (p) {
312 q = p->next;
313 /* We don't want swsusp_free() to free this page again */
314 ClearPageNosave(virt_to_page(p));
315 free_page((unsigned long)p);
316 p = q;
317 }
318 eaten_pages = NULL;
265} 319}
266 320
267/** 321/**
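The eaten-page bookkeeping added above threads each unusable page onto a singly linked list through the page's own first bytes, so resume can hand them all back in one pass. A small stand-alone model of the pattern follows; it is not kernel code, malloc()/free() take the place of get_zeroed_page()/free_page(), and the page size is an assumed constant.

/* Stand-alone model (not kernel code) of the eaten-pages list above. */
#include <stdio.h>
#include <stdlib.h>

#define SKETCH_PAGE_SIZE 4096

struct eaten_page {
	struct eaten_page *next;
	char padding[SKETCH_PAGE_SIZE - sizeof(void *)];
};

static struct eaten_page *eaten_pages;

static void eat_page(void *page)
{
	struct eaten_page *p = page;

	p->next = eaten_pages;		/* the page itself is the list node */
	eaten_pages = p;
}

static void release_eaten_pages(void)
{
	struct eaten_page *p = eaten_pages, *q;

	while (p) {
		q = p->next;
		free(p);
		p = q;
	}
	eaten_pages = NULL;
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++) {
		void *page = calloc(1, SKETCH_PAGE_SIZE);

		if (page)
			eat_page(page);
	}
	release_eaten_pages();
	printf("all eaten pages released\n");
	return 0;
}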
@@ -282,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
282 if (safe_needed) 336 if (safe_needed)
283 do { 337 do {
284 res = (void *)get_zeroed_page(gfp_mask); 338 res = (void *)get_zeroed_page(gfp_mask);
285 if (res && PageNosaveFree(virt_to_page(res))) 339 if (res && PageNosaveFree(virt_to_page(res))) {
286 /* This is for swsusp_free() */ 340 /* This is for swsusp_free() */
287 SetPageNosave(virt_to_page(res)); 341 SetPageNosave(virt_to_page(res));
342 ((struct eaten_page *)res)->next = eaten_pages;
343 eaten_pages = res;
344 }
288 } while (res && PageNosaveFree(virt_to_page(res))); 345 } while (res && PageNosaveFree(virt_to_page(res)));
289 else 346 else
290 res = (void *)get_zeroed_page(gfp_mask); 347 res = (void *)get_zeroed_page(gfp_mask);
@@ -332,7 +389,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
332 if (!pbe) { /* get_zeroed_page() failed */ 389 if (!pbe) { /* get_zeroed_page() failed */
333 free_pagedir(pblist); 390 free_pagedir(pblist);
334 pblist = NULL; 391 pblist = NULL;
335 } 392 } else
393 create_pbe_list(pblist, nr_pages);
336 return pblist; 394 return pblist;
337} 395}
338 396
@@ -370,8 +428,14 @@ void swsusp_free(void)
370 428
371static int enough_free_mem(unsigned int nr_pages) 429static int enough_free_mem(unsigned int nr_pages)
372{ 430{
373 pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); 431 struct zone *zone;
374 return nr_free_pages() > (nr_pages + PAGES_FOR_IO + 432 unsigned int n = 0;
433
434 for_each_zone (zone)
435 if (!is_highmem(zone))
436 n += zone->free_pages;
437 pr_debug("swsusp: available memory: %u pages\n", n);
438 return n > (nr_pages + PAGES_FOR_IO +
375 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 439 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
376} 440}
377 441
@@ -395,7 +459,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages)
395 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 459 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
396 return NULL; 460 return NULL;
397 } 461 }
398 create_pbe_list(pblist, nr_pages);
399 462
400 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { 463 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
401 printk(KERN_ERR "suspend: Allocating image pages failed.\n"); 464 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
@@ -421,10 +484,6 @@ asmlinkage int swsusp_save(void)
421 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, 484 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
422 PAGES_FOR_IO, nr_free_pages()); 485 PAGES_FOR_IO, nr_free_pages());
423 486
424 /* This is needed because of the fixed size of swsusp_info */
425 if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
426 return -ENOSPC;
427
428 if (!enough_free_mem(nr_pages)) { 487 if (!enough_free_mem(nr_pages)) {
429 printk(KERN_ERR "swsusp: Not enough free memory\n"); 488 printk(KERN_ERR "swsusp: Not enough free memory\n");
430 return -ENOMEM; 489 return -ENOMEM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c05f46e7348f..55a18d26abed 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -30,8 +30,8 @@
30 * Alex Badea <vampire@go.ro>: 30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init 31 * Fixed runaway init
32 * 32 *
33 * Andreas Steinmetz <ast@domdv.de>: 33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Added encrypted suspend option 34 * Added the swap map data structure and reworked the handling of swap
35 * 35 *
36 * More state savers are welcome. Especially for the scsi layer... 36 * More state savers are welcome. Especially for the scsi layer...
37 * 37 *
@@ -67,44 +67,33 @@
67#include <asm/tlbflush.h> 67#include <asm/tlbflush.h>
68#include <asm/io.h> 68#include <asm/io.h>
69 69
70#include <linux/random.h>
71#include <linux/crypto.h>
72#include <asm/scatterlist.h>
73
74#include "power.h" 70#include "power.h"
75 71
72/*
73 * Preferred image size in MB (tunable via /sys/power/image_size).
74 * When it is set to N, swsusp will do its best to ensure the image
75 * size will not exceed N MB, but if that is impossible, it will
76 * try to create the smallest image possible.
77 */
78unsigned int image_size = 500;
79
76#ifdef CONFIG_HIGHMEM 80#ifdef CONFIG_HIGHMEM
81unsigned int count_highmem_pages(void);
77int save_highmem(void); 82int save_highmem(void);
78int restore_highmem(void); 83int restore_highmem(void);
79#else 84#else
80static int save_highmem(void) { return 0; } 85static int save_highmem(void) { return 0; }
81static int restore_highmem(void) { return 0; } 86static int restore_highmem(void) { return 0; }
87static unsigned int count_highmem_pages(void) { return 0; }
82#endif 88#endif
83 89
84#define CIPHER "aes"
85#define MAXKEY 32
86#define MAXIV 32
87
88extern char resume_file[]; 90extern char resume_file[];
89 91
90/* Local variables that should not be affected by save */
91unsigned int nr_copy_pages __nosavedata = 0;
92
93/* Suspend pagedir is allocated before final copy, therefore it
94 must be freed after resume
95
96 Warning: this is even more evil than it seems. Pagedirs this file
97 talks about are completely different from page directories used by
98 MMU hardware.
99 */
100suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
101
102#define SWSUSP_SIG "S1SUSPEND" 92#define SWSUSP_SIG "S1SUSPEND"
103 93
104static struct swsusp_header { 94static struct swsusp_header {
105 char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; 95 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
106 u8 key_iv[MAXKEY+MAXIV]; 96 swp_entry_t image;
107 swp_entry_t swsusp_info;
108 char orig_sig[10]; 97 char orig_sig[10];
109 char sig[10]; 98 char sig[10];
110} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 99} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
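The image_size tunable introduced above is a soft target: swsusp tries to keep the image at or below that many megabytes but falls back to the smallest image it can build. A trivial stand-alone sketch of how the megabyte preference becomes the page budget later compared against in swsusp_shrink_memory(); this is not kernel code and a 4 KiB page size is assumed.

#include <stdio.h>

int main(void)
{
	unsigned int image_size = 500;			/* MB, default from the hunk above */
	unsigned long page_size = 4096;			/* assumed */
	unsigned long budget = (unsigned long)image_size * 1024 * 1024 / page_size;

	printf("%u MB -> %lu pages\n", image_size, budget);	/* 500 MB -> 128000 pages */
	return 0;
}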
@@ -115,140 +104,9 @@ static struct swsusp_info swsusp_info;
115 * Saving part... 104 * Saving part...
116 */ 105 */
117 106
118/* We memorize in swapfile_used what swap devices are used for suspension */ 107static unsigned short root_swap = 0xffff;
119#define SWAPFILE_UNUSED 0
120#define SWAPFILE_SUSPEND 1 /* This is the suspending device */
121#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */
122
123static unsigned short swapfile_used[MAX_SWAPFILES];
124static unsigned short root_swap;
125
126static int write_page(unsigned long addr, swp_entry_t *loc);
127static int bio_read_page(pgoff_t page_off, void *page);
128
129static u8 key_iv[MAXKEY+MAXIV];
130
131#ifdef CONFIG_SWSUSP_ENCRYPT
132
133static int crypto_init(int mode, void **mem)
134{
135 int error = 0;
136 int len;
137 char *modemsg;
138 struct crypto_tfm *tfm;
139
140 modemsg = mode ? "suspend not possible" : "resume not possible";
141
142 tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
143 if(!tfm) {
144 printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
145 error = -EINVAL;
146 goto out;
147 }
148
149 if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
150 printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
151 error = -ENOKEY;
152 goto fail;
153 }
154
155 if (mode)
156 get_random_bytes(key_iv, MAXKEY+MAXIV);
157
158 len = crypto_tfm_alg_max_keysize(tfm);
159 if (len > MAXKEY)
160 len = MAXKEY;
161
162 if (crypto_cipher_setkey(tfm, key_iv, len)) {
163 printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
164 error = -EKEYREJECTED;
165 goto fail;
166 }
167
168 len = crypto_tfm_alg_ivsize(tfm);
169
170 if (MAXIV < len) {
171 printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
172 error = -EOVERFLOW;
173 goto fail;
174 }
175
176 crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
177
178 *mem=(void *)tfm;
179
180 goto out;
181
182fail: crypto_free_tfm(tfm);
183out: return error;
184}
185
186static __inline__ void crypto_exit(void *mem)
187{
188 crypto_free_tfm((struct crypto_tfm *)mem);
189}
190
191static __inline__ int crypto_write(struct pbe *p, void *mem)
192{
193 int error = 0;
194 struct scatterlist src, dst;
195
196 src.page = virt_to_page(p->address);
197 src.offset = 0;
198 src.length = PAGE_SIZE;
199 dst.page = virt_to_page((void *)&swsusp_header);
200 dst.offset = 0;
201 dst.length = PAGE_SIZE;
202
203 error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
204 PAGE_SIZE);
205
206 if (!error)
207 error = write_page((unsigned long)&swsusp_header,
208 &(p->swap_address));
209 return error;
210}
211
212static __inline__ int crypto_read(struct pbe *p, void *mem)
213{
214 int error = 0;
215 struct scatterlist src, dst;
216
217 error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
218 if (!error) {
219 src.offset = 0;
220 src.length = PAGE_SIZE;
221 dst.offset = 0;
222 dst.length = PAGE_SIZE;
223 src.page = dst.page = virt_to_page((void *)p->address);
224
225 error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
226 &src, PAGE_SIZE);
227 }
228 return error;
229}
230#else
231static __inline__ int crypto_init(int mode, void *mem)
232{
233 return 0;
234}
235
236static __inline__ void crypto_exit(void *mem)
237{
238}
239
240static __inline__ int crypto_write(struct pbe *p, void *mem)
241{
242 return write_page(p->address, &(p->swap_address));
243}
244 108
245static __inline__ int crypto_read(struct pbe *p, void *mem) 109static int mark_swapfiles(swp_entry_t start)
246{
247 return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
248}
249#endif
250
251static int mark_swapfiles(swp_entry_t prev)
252{ 110{
253 int error; 111 int error;
254 112
@@ -259,8 +117,7 @@ static int mark_swapfiles(swp_entry_t prev)
259 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 117 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
260 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 118 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
261 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 119 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
262 memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); 120 swsusp_header.image = start;
263 swsusp_header.swsusp_info = prev;
264 error = rw_swap_page_sync(WRITE, 121 error = rw_swap_page_sync(WRITE,
265 swp_entry(root_swap, 0), 122 swp_entry(root_swap, 0),
266 virt_to_page((unsigned long) 123 virt_to_page((unsigned long)
@@ -283,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
283 * devfs, since the resume code can only recognize the form /dev/hda4, 140 * devfs, since the resume code can only recognize the form /dev/hda4,
284 * but the suspend code would see the long name.) 141 * but the suspend code would see the long name.)
285 */ 142 */
286static int is_resume_device(const struct swap_info_struct *swap_info) 143static inline int is_resume_device(const struct swap_info_struct *swap_info)
287{ 144{
288 struct file *file = swap_info->swap_file; 145 struct file *file = swap_info->swap_file;
289 struct inode *inode = file->f_dentry->d_inode; 146 struct inode *inode = file->f_dentry->d_inode;
@@ -294,54 +151,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
294 151
295static int swsusp_swap_check(void) /* This is called before saving image */ 152static int swsusp_swap_check(void) /* This is called before saving image */
296{ 153{
297 int i, len;
298
299 len=strlen(resume_file);
300 root_swap = 0xFFFF;
301
302 spin_lock(&swap_lock);
303 for (i=0; i<MAX_SWAPFILES; i++) {
304 if (!(swap_info[i].flags & SWP_WRITEOK)) {
305 swapfile_used[i]=SWAPFILE_UNUSED;
306 } else {
307 if (!len) {
308 printk(KERN_WARNING "resume= option should be used to set suspend device" );
309 if (root_swap == 0xFFFF) {
310 swapfile_used[i] = SWAPFILE_SUSPEND;
311 root_swap = i;
312 } else
313 swapfile_used[i] = SWAPFILE_IGNORED;
314 } else {
315 /* we ignore all swap devices that are not the resume_file */
316 if (is_resume_device(&swap_info[i])) {
317 swapfile_used[i] = SWAPFILE_SUSPEND;
318 root_swap = i;
319 } else {
320 swapfile_used[i] = SWAPFILE_IGNORED;
321 }
322 }
323 }
324 }
325 spin_unlock(&swap_lock);
326 return (root_swap != 0xffff) ? 0 : -ENODEV;
327}
328
329/**
330 * This is called after saving image so modification
331 * will be lost after resume... and that's what we want.
332 * we make the device unusable. A new call to
333 * lock_swapdevices can unlock the devices.
334 */
335static void lock_swapdevices(void)
336{
337 int i; 154 int i;
338 155
156 if (!swsusp_resume_device)
157 return -ENODEV;
339 spin_lock(&swap_lock); 158 spin_lock(&swap_lock);
340 for (i = 0; i< MAX_SWAPFILES; i++) 159 for (i = 0; i < MAX_SWAPFILES; i++) {
341 if (swapfile_used[i] == SWAPFILE_IGNORED) { 160 if (!(swap_info[i].flags & SWP_WRITEOK))
342 swap_info[i].flags ^= SWP_WRITEOK; 161 continue;
162 if (is_resume_device(swap_info + i)) {
163 spin_unlock(&swap_lock);
164 root_swap = i;
165 return 0;
343 } 166 }
167 }
344 spin_unlock(&swap_lock); 168 spin_unlock(&swap_lock);
169 return -ENODEV;
345} 170}
346 171
347/** 172/**
@@ -359,72 +184,217 @@ static void lock_swapdevices(void)
359static int write_page(unsigned long addr, swp_entry_t *loc) 184static int write_page(unsigned long addr, swp_entry_t *loc)
360{ 185{
361 swp_entry_t entry; 186 swp_entry_t entry;
362 int error = 0; 187 int error = -ENOSPC;
363 188
364 entry = get_swap_page(); 189 entry = get_swap_page_of_type(root_swap);
365 if (swp_offset(entry) && 190 if (swp_offset(entry)) {
366 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { 191 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
367 error = rw_swap_page_sync(WRITE, entry, 192 if (!error || error == -EIO)
368 virt_to_page(addr));
369 if (error == -EIO)
370 error = 0;
371 if (!error)
372 *loc = entry; 193 *loc = entry;
373 } else 194 }
374 error = -ENOSPC;
375 return error; 195 return error;
376} 196}
377 197
378/** 198/**
379 * data_free - Free the swap entries used by the saved image. 199 * Swap map-handling functions
200 *
201 * The swap map is a data structure used for keeping track of each page
202 * written to the swap. It consists of many swap_map_page structures
203 * that contain each an array of MAP_PAGE_SIZE swap entries.
204 * These structures are linked together with the help of either the
205 * .next (in memory) or the .next_swap (in swap) member.
380 * 206 *
381 * Walk the list of used swap entries and free each one. 207 * The swap map is created during suspend. At that time we need to keep
382 * This is only used for cleanup when suspend fails. 208 * it in memory, because we have to free all of the allocated swap
209 * entries if an error occurs. The memory needed is preallocated
210 * so that we know in advance if there's enough of it.
211 *
212 * The first swap_map_page structure is filled with the swap entries that
213 * correspond to the first MAP_PAGE_SIZE data pages written to swap and
214 * so on. After all of the data pages have been written, the order
215 * of the swap_map_page structures in the map is reversed so that they
216 * can be read from swap in the original order. This causes the data
217 * pages to be loaded in exactly the same order in which they have been
218 * saved.
219 *
220 * During resume we only need to use one swap_map_page structure
221 * at a time, which means that we only need to use two memory pages for
222 * reading the image - one for reading the swap_map_page structures
223 * and the second for reading the data pages from swap.
383 */ 224 */
384static void data_free(void) 225
226#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
227 / sizeof(swp_entry_t))
228
229struct swap_map_page {
230 swp_entry_t entries[MAP_PAGE_SIZE];
231 swp_entry_t next_swap;
232 struct swap_map_page *next;
233};
234
235static inline void free_swap_map(struct swap_map_page *swap_map)
385{ 236{
386 swp_entry_t entry; 237 struct swap_map_page *swp;
387 struct pbe *p;
388 238
389 for_each_pbe (p, pagedir_nosave) { 239 while (swap_map) {
390 entry = p->swap_address; 240 swp = swap_map->next;
391 if (entry.val) 241 free_page((unsigned long)swap_map);
392 swap_free(entry); 242 swap_map = swp;
393 else
394 break;
395 } 243 }
396} 244}
397 245
246static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
247{
248 struct swap_map_page *swap_map, *swp;
249 unsigned n = 0;
250
251 if (!nr_pages)
252 return NULL;
253
254 pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
255 swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
256 swp = swap_map;
257 for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
258 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
259 swp = swp->next;
260 if (!swp) {
261 free_swap_map(swap_map);
262 return NULL;
263 }
264 }
265 return swap_map;
266}
267
398/** 268/**
399 * data_write - Write saved image to swap. 269 * reverse_swap_map - reverse the order of pages in the swap map
400 * 270 * @swap_map
401 * Walk the list of pages in the image and sync each one to swap.
402 */ 271 */
403static int data_write(void) 272
273static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
404{ 274{
405 int error = 0, i = 0; 275 struct swap_map_page *prev, *next;
406 unsigned int mod = nr_copy_pages / 100; 276
407 struct pbe *p; 277 prev = NULL;
408 void *tfm; 278 while (swap_map) {
279 next = swap_map->next;
280 swap_map->next = prev;
281 prev = swap_map;
282 swap_map = next;
283 }
284 return prev;
285}
409 286
410 if ((error = crypto_init(1, &tfm))) 287/**
411 return error; 288 * free_swap_map_entries - free the swap entries allocated to store
289 * the swap map @swap_map (this is only called in case of an error)
290 */
291static inline void free_swap_map_entries(struct swap_map_page *swap_map)
292{
293 while (swap_map) {
294 if (swap_map->next_swap.val)
295 swap_free(swap_map->next_swap);
296 swap_map = swap_map->next;
297 }
298}
412 299
413 if (!mod) 300/**
414 mod = 1; 301 * save_swap_map - save the swap map used for tracing the data pages
302 * stored in the swap
303 */
415 304
416 printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); 305static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
417 for_each_pbe (p, pagedir_nosave) { 306{
418 if (!(i%mod)) 307 swp_entry_t entry = (swp_entry_t){0};
419 printk( "\b\b\b\b%3d%%", i / mod ); 308 int error;
420 if ((error = crypto_write(p, tfm))) { 309
421 crypto_exit(tfm); 310 while (swap_map) {
311 swap_map->next_swap = entry;
312 if ((error = write_page((unsigned long)swap_map, &entry)))
422 return error; 313 return error;
423 } 314 swap_map = swap_map->next;
424 i++;
425 } 315 }
426 printk("\b\b\b\bdone\n"); 316 *start = entry;
427 crypto_exit(tfm); 317 return 0;
318}
319
320/**
321 * free_image_entries - free the swap entries allocated to store
322 * the image data pages (this is only called in case of an error)
323 */
324
325static inline void free_image_entries(struct swap_map_page *swp)
326{
327 unsigned k;
328
329 while (swp) {
330 for (k = 0; k < MAP_PAGE_SIZE; k++)
331 if (swp->entries[k].val)
332 swap_free(swp->entries[k]);
333 swp = swp->next;
334 }
335}
336
337/**
338 * The swap_map_handle structure is used for handling the swap map in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 unsigned int k;
345};
346
347static inline void init_swap_map_handle(struct swap_map_handle *handle,
348 struct swap_map_page *map)
349{
350 handle->cur = map;
351 handle->k = 0;
352}
353
354static inline int swap_map_write_page(struct swap_map_handle *handle,
355 unsigned long addr)
356{
357 int error;
358
359 error = write_page(addr, handle->cur->entries + handle->k);
360 if (error)
361 return error;
362 if (++handle->k >= MAP_PAGE_SIZE) {
363 handle->cur = handle->cur->next;
364 handle->k = 0;
365 }
366 return 0;
367}
368
369/**
370 * save_image_data - save the data pages pointed to by the PBEs
371 * from the list @pblist using the swap map handle @handle
372 * (assume there are @nr_pages data pages to save)
373 */
374
375static int save_image_data(struct pbe *pblist,
376 struct swap_map_handle *handle,
377 unsigned int nr_pages)
378{
379 unsigned int m;
380 struct pbe *p;
381 int error = 0;
382
383 printk("Saving image data pages (%u pages) ... ", nr_pages);
384 m = nr_pages / 100;
385 if (!m)
386 m = 1;
387 nr_pages = 0;
388 for_each_pbe (p, pblist) {
389 error = swap_map_write_page(handle, p->address);
390 if (error)
391 break;
392 if (!(nr_pages % m))
393 printk("\b\b\b\b%3d%%", nr_pages / m);
394 nr_pages++;
395 }
396 if (!error)
397 printk("\b\b\b\bdone\n");
428 return error; 398 return error;
429} 399}
430 400
@@ -440,70 +410,70 @@ static void dump_info(void)
440 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); 410 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
441 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); 411 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
442 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); 412 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
443 pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); 413 pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
444} 414}
445 415
446static void init_header(void) 416static void init_header(unsigned int nr_pages)
447{ 417{
448 memset(&swsusp_info, 0, sizeof(swsusp_info)); 418 memset(&swsusp_info, 0, sizeof(swsusp_info));
449 swsusp_info.version_code = LINUX_VERSION_CODE; 419 swsusp_info.version_code = LINUX_VERSION_CODE;
450 swsusp_info.num_physpages = num_physpages; 420 swsusp_info.num_physpages = num_physpages;
451 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); 421 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
452 422
453 swsusp_info.suspend_pagedir = pagedir_nosave;
454 swsusp_info.cpus = num_online_cpus(); 423 swsusp_info.cpus = num_online_cpus();
455 swsusp_info.image_pages = nr_copy_pages; 424 swsusp_info.image_pages = nr_pages;
456} 425 swsusp_info.pages = nr_pages +
457 426 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
458static int close_swap(void)
459{
460 swp_entry_t entry;
461 int error;
462
463 dump_info();
464 error = write_page((unsigned long)&swsusp_info, &entry);
465 if (!error) {
466 printk( "S" );
467 error = mark_swapfiles(entry);
468 printk( "|\n" );
469 }
470 return error;
471} 427}
472 428
473/** 429/**
474 * free_pagedir_entries - Free pages used by the page directory. 430 * pack_orig_addresses - the .orig_address fields of the PBEs from the
475 * 431 * list starting at @pbe are stored in the array @buf[] (1 page)
476 * This is used during suspend for error recovery.
477 */ 432 */
478 433
479static void free_pagedir_entries(void) 434static inline struct pbe *pack_orig_addresses(unsigned long *buf,
435 struct pbe *pbe)
480{ 436{
481 int i; 437 int j;
482 438
483 for (i = 0; i < swsusp_info.pagedir_pages; i++) 439 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
484 swap_free(swsusp_info.pagedir[i]); 440 buf[j] = pbe->orig_address;
441 pbe = pbe->next;
442 }
443 if (!pbe)
444 for (; j < PAGE_SIZE / sizeof(long); j++)
445 buf[j] = 0;
446 return pbe;
485} 447}
486 448
487
488/** 449/**
489 * write_pagedir - Write the array of pages holding the page directory. 450 * save_image_metadata - save the .orig_address fields of the PBEs
490 * @last: Last swap entry we write (needed for header). 451 * from the list @pblist using the swap map handle @handle
491 */ 452 */
492 453
493static int write_pagedir(void) 454static int save_image_metadata(struct pbe *pblist,
455 struct swap_map_handle *handle)
494{ 456{
495 int error = 0; 457 unsigned long *buf;
496 unsigned int n = 0; 458 unsigned int n = 0;
497 struct pbe *pbe; 459 struct pbe *p;
460 int error = 0;
498 461
499 printk( "Writing pagedir..."); 462 printk("Saving image metadata ... ");
500 for_each_pb_page (pbe, pagedir_nosave) { 463 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
501 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) 464 if (!buf)
502 return error; 465 return -ENOMEM;
466 p = pblist;
467 while (p) {
468 p = pack_orig_addresses(buf, p);
469 error = swap_map_write_page(handle, (unsigned long)buf);
470 if (error)
471 break;
472 n++;
503 } 473 }
504 474 free_page((unsigned long)buf);
505 swsusp_info.pagedir_pages = n; 475 if (!error)
506 printk("done (%u pages)\n", n); 476 printk("done (%u pages saved)\n", n);
507 return error; 477 return error;
508} 478}
509 479
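save_image_metadata() above stores only the .orig_address field of each PBE, packing them one page at a time and zero-padding the last buffer; load_image_metadata() later undoes this. A stand-alone sketch of the packing step follows; it is not kernel code and the buffer holds four slots instead of a page worth of longs.

/* Stand-alone sketch (not kernel code) of the packing done by pack_orig_addresses(). */
#include <stdio.h>
#include <stddef.h>

#define SLOTS 4		/* tiny buffer just for the sketch */

struct entry {
	unsigned long orig_address;
	struct entry *next;
};

static struct entry *pack(unsigned long *buf, struct entry *e)
{
	size_t j;

	for (j = 0; j < SLOTS && e; j++) {
		buf[j] = e->orig_address;
		e = e->next;
	}
	for (; j < SLOTS; j++)		/* zero-pad once the list runs out */
		buf[j] = 0;
	return e;			/* where the next buffer should continue */
}

int main(void)
{
	struct entry c = { 0x3000, NULL }, b = { 0x2000, &c }, a = { 0x1000, &b };
	unsigned long buf[SLOTS];
	struct entry *rest = &a;

	while (rest) {
		size_t j;

		rest = pack(buf, rest);
		for (j = 0; j < SLOTS; j++)
			printf("%#lx ", buf[j]);
		printf("\n");
	}
	return 0;
}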
@@ -511,75 +481,125 @@ static int write_pagedir(void)
511 * enough_swap - Make sure we have enough swap to save the image. 481 * enough_swap - Make sure we have enough swap to save the image.
512 * 482 *
513 * Returns TRUE or FALSE after checking the total amount of swap 483 * Returns TRUE or FALSE after checking the total amount of swap
514 * space available. 484 * space available from the resume partition.
515 *
516 * FIXME: si_swapinfo(&i) returns all swap devices information.
517 * We should only consider resume_device.
518 */ 485 */
519 486
520static int enough_swap(unsigned int nr_pages) 487static int enough_swap(unsigned int nr_pages)
521{ 488{
522 struct sysinfo i; 489 unsigned int free_swap = swap_info[root_swap].pages -
490 swap_info[root_swap].inuse_pages;
523 491
524 si_swapinfo(&i); 492 pr_debug("swsusp: free swap pages: %u\n", free_swap);
525 pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); 493 return free_swap > (nr_pages + PAGES_FOR_IO +
526 return i.freeswap > (nr_pages + PAGES_FOR_IO +
527 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 494 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
528} 495}
529 496
530/** 497/**
531 * write_suspend_image - Write entire image and metadata. 498 * swsusp_write - Write entire image and metadata.
532 * 499 *
500 * It is important _NOT_ to umount filesystems at this point. We want
501 * them synced (in case something goes wrong) but we DO not want to mark
502 * filesystem clean: it is not. (And it does not matter, if we resume
503 * correctly, we'll mark system clean, anyway.)
533 */ 504 */
534static int write_suspend_image(void) 505
506int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
535{ 507{
508 struct swap_map_page *swap_map;
509 struct swap_map_handle handle;
510 swp_entry_t start;
536 int error; 511 int error;
537 512
538 if (!enough_swap(nr_copy_pages)) { 513 if ((error = swsusp_swap_check())) {
514 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
515 return error;
516 }
517 if (!enough_swap(nr_pages)) {
539 printk(KERN_ERR "swsusp: Not enough free swap\n"); 518 printk(KERN_ERR "swsusp: Not enough free swap\n");
540 return -ENOSPC; 519 return -ENOSPC;
541 } 520 }
542 521
543 init_header(); 522 init_header(nr_pages);
544 if ((error = data_write())) 523 swap_map = alloc_swap_map(swsusp_info.pages);
545 goto FreeData; 524 if (!swap_map)
525 return -ENOMEM;
526 init_swap_map_handle(&handle, swap_map);
527
528 error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
529 if (!error)
530 error = save_image_metadata(pblist, &handle);
531 if (!error)
532 error = save_image_data(pblist, &handle, nr_pages);
533 if (error)
534 goto Free_image_entries;
546 535
547 if ((error = write_pagedir())) 536 swap_map = reverse_swap_map(swap_map);
548 goto FreePagedir; 537 error = save_swap_map(swap_map, &start);
538 if (error)
539 goto Free_map_entries;
549 540
550 if ((error = close_swap())) 541 dump_info();
551 goto FreePagedir; 542 printk( "S" );
552 Done: 543 error = mark_swapfiles(start);
553 memset(key_iv, 0, MAXKEY+MAXIV); 544 printk( "|\n" );
545 if (error)
546 goto Free_map_entries;
547
548Free_swap_map:
549 free_swap_map(swap_map);
554 return error; 550 return error;
555 FreePagedir: 551
556 free_pagedir_entries(); 552Free_map_entries:
557 FreeData: 553 free_swap_map_entries(swap_map);
558 data_free(); 554Free_image_entries:
559 goto Done; 555 free_image_entries(swap_map);
556 goto Free_swap_map;
560} 557}
561 558
562/* It is important _NOT_ to umount filesystems at this point. We want 559/**
563 * them synced (in case something goes wrong) but we DO not want to mark 560 * swsusp_shrink_memory - Try to free as much memory as needed
564 * filesystem clean: it is not. (And it does not matter, if we resume 561 *
565 * correctly, we'll mark system clean, anyway.) 562 * ... but do not OOM-kill anyone
563 *
564 * Notice: all userland should be stopped before it is called, or
565 * livelock is possible.
566 */ 566 */
567int swsusp_write(void)
568{
569 int error;
570 567
571 if ((error = swsusp_swap_check())) { 568#define SHRINK_BITE 10000
572 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
573 return error;
574 }
575 lock_swapdevices();
576 error = write_suspend_image();
577 /* This will unlock ignored swap devices since writing is finished */
578 lock_swapdevices();
579 return error;
580}
581 569
570int swsusp_shrink_memory(void)
571{
572 long size, tmp;
573 struct zone *zone;
574 unsigned long pages = 0;
575 unsigned int i = 0;
576 char *p = "-\\|/";
577
578 printk("Shrinking memory... ");
579 do {
580 size = 2 * count_highmem_pages();
581 size += size / 50 + count_data_pages();
582 size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
583 PAGES_FOR_IO;
584 tmp = size;
585 for_each_zone (zone)
586 if (!is_highmem(zone))
587 tmp -= zone->free_pages;
588 if (tmp > 0) {
589 tmp = shrink_all_memory(SHRINK_BITE);
590 if (!tmp)
591 return -ENOMEM;
592 pages += tmp;
593 } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) {
594 tmp = shrink_all_memory(SHRINK_BITE);
595 pages += tmp;
596 }
597 printk("\b%c", p[i++%4]);
598 } while (tmp > 0);
599 printk("\bdone (%lu pages freed)\n", pages);
582 600
601 return 0;
602}
583 603
584int swsusp_suspend(void) 604int swsusp_suspend(void)
585{ 605{
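swsusp_shrink_memory() above loops on shrink_all_memory() until its estimate of the image (twice the highmem pages, a 2% margin, the data pages, the pagedir pages and PAGES_FOR_IO) fits into free low memory and, where possible, under the image_size budget. A stand-alone sketch of that estimate follows; it is not kernel code and the two constants are assumptions for the example, not the kernel's real values.

#include <stdio.h>

#define SKETCH_PBES_PER_PAGE	510UL	/* assumed PBEs per pagedir page */
#define SKETCH_PAGES_FOR_IO	1024UL	/* assumed I/O reserve */

static unsigned long estimate_image_pages(unsigned long data_pages,
					  unsigned long highmem_pages)
{
	unsigned long size = 2 * highmem_pages;

	size += size / 50 + data_pages;		/* 2% margin plus the data itself */
	size += (size + SKETCH_PBES_PER_PAGE - 1) / SKETCH_PBES_PER_PAGE
		+ SKETCH_PAGES_FOR_IO;
	return size;
}

int main(void)
{
	printf("estimated image: %lu pages\n", estimate_image_pages(100000, 0));
	return 0;
}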
@@ -677,7 +697,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
677 /* We assume both lists contain the same number of elements */ 697 /* We assume both lists contain the same number of elements */
678 while (src) { 698 while (src) {
679 dst->orig_address = src->orig_address; 699 dst->orig_address = src->orig_address;
680 dst->swap_address = src->swap_address;
681 dst = dst->next; 700 dst = dst->next;
682 src = src->next; 701 src = src->next;
683 } 702 }
@@ -757,198 +776,224 @@ static int bio_write_page(pgoff_t page_off, void *page)
757 return submit(WRITE, page_off, page); 776 return submit(WRITE, page_off, page);
758} 777}
759 778
760/* 779/**
761 * Sanity check if this image makes sense with this kernel/swap context 780 * The following functions allow us to read data using a swap map
762 * I really don't think that it's foolproof but more than nothing.. 781 * in a file-alike way
763 */ 782 */
764 783
765static const char *sanity_check(void) 784static inline void release_swap_map_reader(struct swap_map_handle *handle)
766{ 785{
767 dump_info(); 786 if (handle->cur)
768 if (swsusp_info.version_code != LINUX_VERSION_CODE) 787 free_page((unsigned long)handle->cur);
769 return "kernel version"; 788 handle->cur = NULL;
770 if (swsusp_info.num_physpages != num_physpages)
771 return "memory size";
772 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
773 return "system type";
774 if (strcmp(swsusp_info.uts.release,system_utsname.release))
775 return "kernel release";
776 if (strcmp(swsusp_info.uts.version,system_utsname.version))
777 return "version";
778 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
779 return "machine";
780#if 0
781 /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
782 if (swsusp_info.cpus != num_possible_cpus())
783 return "number of cpus";
784#endif
785 return NULL;
786} 789}
787 790
788 791static inline int get_swap_map_reader(struct swap_map_handle *handle,
789static int check_header(void) 792 swp_entry_t start)
790{ 793{
791 const char *reason = NULL;
792 int error; 794 int error;
793 795
794 if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) 796 if (!swp_offset(start))
797 return -EINVAL;
798 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
799 if (!handle->cur)
800 return -ENOMEM;
801 error = bio_read_page(swp_offset(start), handle->cur);
802 if (error) {
803 release_swap_map_reader(handle);
795 return error; 804 return error;
796
797 /* Is this same machine? */
798 if ((reason = sanity_check())) {
799 printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
800 return -EPERM;
801 } 805 }
802 nr_copy_pages = swsusp_info.image_pages; 806 handle->k = 0;
803 return error; 807 return 0;
804} 808}
805 809
806static int check_sig(void) 810static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
807{ 811{
812 unsigned long offset;
808 int error; 813 int error;
809 814
810 memset(&swsusp_header, 0, sizeof(swsusp_header)); 815 if (!handle->cur)
811 if ((error = bio_read_page(0, &swsusp_header))) 816 return -EINVAL;
812 return error; 817 offset = swp_offset(handle->cur->entries[handle->k]);
813 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 818 if (!offset)
814 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
815 memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
816 memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
817
818 /*
819 * Reset swap signature now.
820 */
821 error = bio_write_page(0, &swsusp_header);
822 } else {
823 return -EINVAL; 819 return -EINVAL;
820 error = bio_read_page(offset, buf);
821 if (error)
822 return error;
823 if (++handle->k >= MAP_PAGE_SIZE) {
824 handle->k = 0;
825 offset = swp_offset(handle->cur->next_swap);
826 if (!offset)
827 release_swap_map_reader(handle);
828 else
829 error = bio_read_page(offset, handle->cur);
824 } 830 }
825 if (!error)
826 pr_debug("swsusp: Signature found, resuming\n");
827 return error; 831 return error;
828} 832}
829 833
830/** 834static int check_header(void)
831 * data_read - Read image pages from swap.
832 *
833 * You do not need to check for overlaps, check_pagedir()
834 * already did that.
835 */
836
837static int data_read(struct pbe *pblist)
838{ 835{
839 struct pbe *p; 836 char *reason = NULL;
840 int error = 0;
841 int i = 0;
842 int mod = swsusp_info.image_pages / 100;
843 void *tfm;
844
845 if ((error = crypto_init(0, &tfm)))
846 return error;
847
848 if (!mod)
849 mod = 1;
850
851 printk("swsusp: Reading image data (%lu pages): ",
852 swsusp_info.image_pages);
853
854 for_each_pbe (p, pblist) {
855 if (!(i % mod))
856 printk("\b\b\b\b%3d%%", i / mod);
857 837
858 if ((error = crypto_read(p, tfm))) { 838 dump_info();
859 crypto_exit(tfm); 839 if (swsusp_info.version_code != LINUX_VERSION_CODE)
860 return error; 840 reason = "kernel version";
861 } 841 if (swsusp_info.num_physpages != num_physpages)
862 842 reason = "memory size";
863 i++; 843 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
844 reason = "system type";
845 if (strcmp(swsusp_info.uts.release,system_utsname.release))
846 reason = "kernel release";
847 if (strcmp(swsusp_info.uts.version,system_utsname.version))
848 reason = "version";
849 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
850 reason = "machine";
851 if (reason) {
852 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
853 return -EPERM;
864 } 854 }
865 printk("\b\b\b\bdone\n"); 855 return 0;
866 crypto_exit(tfm);
867 return error;
868} 856}
869 857
870/** 858/**
871 * read_pagedir - Read page backup list pages from swap 859 * load_image_data - load the image data using the swap map handle
860 * @handle and store them using the page backup list @pblist
861 * (assume there are @nr_pages pages to load)
872 */ 862 */
873 863
874static int read_pagedir(struct pbe *pblist) 864static int load_image_data(struct pbe *pblist,
865 struct swap_map_handle *handle,
866 unsigned int nr_pages)
875{ 867{
876 struct pbe *pbpage, *p;
877 unsigned int i = 0;
878 int error; 868 int error;
869 unsigned int m;
870 struct pbe *p;
879 871
880 if (!pblist) 872 if (!pblist)
881 return -EFAULT; 873 return -EINVAL;
882 874 printk("Loading image data pages (%u pages) ... ", nr_pages);
883 printk("swsusp: Reading pagedir (%lu pages)\n", 875 m = nr_pages / 100;
884 swsusp_info.pagedir_pages); 876 if (!m)
885 877 m = 1;
886 for_each_pb_page (pbpage, pblist) { 878 nr_pages = 0;
887 unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); 879 p = pblist;
888 880 while (p) {
889 error = -EFAULT; 881 error = swap_map_read_page(handle, (void *)p->address);
890 if (offset) {
891 p = (pbpage + PB_PAGE_SKIP)->next;
892 error = bio_read_page(offset, (void *)pbpage);
893 (pbpage + PB_PAGE_SKIP)->next = p;
894 }
895 if (error) 882 if (error)
896 break; 883 break;
884 p = p->next;
885 if (!(nr_pages % m))
886 printk("\b\b\b\b%3d%%", nr_pages / m);
887 nr_pages++;
897 } 888 }
898
899 if (!error) 889 if (!error)
900 BUG_ON(i != swsusp_info.pagedir_pages); 890 printk("\b\b\b\bdone\n");
901
902 return error; 891 return error;
903} 892}
904 893
894/**
895 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
896 * the PBEs in the list starting at @pbe
897 */
905 898
906static int check_suspend_image(void) 899static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
900 struct pbe *pbe)
907{ 901{
908 int error = 0; 902 int j;
909 903
910 if ((error = check_sig())) 904 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
911 return error; 905 pbe->orig_address = buf[j];
912 906 pbe = pbe->next;
913 if ((error = check_header())) 907 }
914 return error; 908 return pbe;
915
916 return 0;
917} 909}
918 910
919static int read_suspend_image(void) 911/**
912 * load_image_metadata - load the image metadata using the swap map
913 * handle @handle and put them into the PBEs in the list @pblist
914 */
915
916static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
920{ 917{
921 int error = 0;
922 struct pbe *p; 918 struct pbe *p;
919 unsigned long *buf;
920 unsigned int n = 0;
921 int error = 0;
923 922
924 if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) 923 printk("Loading image metadata ... ");
924 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
925 if (!buf)
925 return -ENOMEM; 926 return -ENOMEM;
926 927 p = pblist;
927 if ((error = read_pagedir(p))) 928 while (p) {
928 return error; 929 error = swap_map_read_page(handle, buf);
929 create_pbe_list(p, nr_copy_pages); 930 if (error)
930 mark_unsafe_pages(p); 931 break;
931 pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 932 p = unpack_orig_addresses(buf, p);
932 if (pagedir_nosave) { 933 n++;
933 create_pbe_list(pagedir_nosave, nr_copy_pages);
934 copy_page_backup_list(pagedir_nosave, p);
935 } 934 }
936 free_pagedir(p); 935 free_page((unsigned long)buf);
937 if (!pagedir_nosave) 936 if (!error)
938 return -ENOMEM; 937 printk("done (%u pages loaded)\n", n);
938 return error;
939}
939 940
940 /* Allocate memory for the image and read the data from swap */ 941int swsusp_read(struct pbe **pblist_ptr)
942{
943 int error;
944 struct pbe *p, *pblist;
945 struct swap_map_handle handle;
946 unsigned int nr_pages;
941 947
942 error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); 948 if (IS_ERR(resume_bdev)) {
949 pr_debug("swsusp: block device not initialised\n");
950 return PTR_ERR(resume_bdev);
951 }
943 952
953 error = get_swap_map_reader(&handle, swsusp_header.image);
944 if (!error) 954 if (!error)
945 error = data_read(pagedir_nosave); 955 error = swap_map_read_page(&handle, &swsusp_info);
956 if (!error)
957 error = check_header();
958 if (error)
959 return error;
960 nr_pages = swsusp_info.image_pages;
961 p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
962 if (!p)
963 return -ENOMEM;
964 error = load_image_metadata(p, &handle);
965 if (!error) {
966 mark_unsafe_pages(p);
967 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
968 if (pblist)
969 copy_page_backup_list(pblist, p);
970 free_pagedir(p);
971 if (!pblist)
972 error = -ENOMEM;
973
974 /* Allocate memory for the image and read the data from swap */
975 if (!error)
976 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
977 if (!error) {
978 release_eaten_pages();
979 error = load_image_data(pblist, &handle, nr_pages);
980 }
981 if (!error)
982 *pblist_ptr = pblist;
983 }
984 release_swap_map_reader(&handle);
946 985
986 blkdev_put(resume_bdev);
987
988 if (!error)
989 pr_debug("swsusp: Reading resume file was successful\n");
990 else
991 pr_debug("swsusp: Error %d resuming\n", error);
947 return error; 992 return error;
948} 993}
949 994
950/** 995/**
951 * swsusp_check - Check for saved image in swap 996 * swsusp_check - Check for swsusp signature in the resume device
952 */ 997 */
953 998
954int swsusp_check(void) 999int swsusp_check(void)
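The read side above treats the swap map as a file: a swap_map_handle keeps the current map page and an index into it, and swap_map_read_page() returns the next entry, moving on to the next map page when the index wraps. A stand-alone model of that cursor follows; it is not kernel code, entries are plain numbers, and the per-page entry count is shrunk.

/* Stand-alone model (not kernel code) of the file-alike swap map cursor. */
#include <stdio.h>

#define ENTRIES_PER_PAGE 4	/* tiny value just for the sketch */

struct map_page {
	unsigned long entries[ENTRIES_PER_PAGE];
	struct map_page *next;
};

struct map_handle {
	struct map_page *cur;
	unsigned int k;
};

/* returns the next entry, or 0 once the map is exhausted */
static unsigned long next_entry(struct map_handle *h)
{
	unsigned long e;

	if (!h->cur)
		return 0;
	e = h->cur->entries[h->k];
	if (++h->k >= ENTRIES_PER_PAGE) {	/* index wrapped: advance the page */
		h->cur = h->cur->next;
		h->k = 0;
	}
	return e;
}

int main(void)
{
	struct map_page b = { { 5, 6, 7, 8 }, NULL };
	struct map_page a = { { 1, 2, 3, 4 }, &b };
	struct map_handle h = { &a, 0 };
	unsigned long e;

	while ((e = next_entry(&h)))
		printf("%lu\n", e);		/* prints 1 through 8 in order */
	return 0;
}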
@@ -958,40 +1003,27 @@ int swsusp_check(void)
958 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 1003 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
959 if (!IS_ERR(resume_bdev)) { 1004 if (!IS_ERR(resume_bdev)) {
960 set_blocksize(resume_bdev, PAGE_SIZE); 1005 set_blocksize(resume_bdev, PAGE_SIZE);
961 error = check_suspend_image(); 1006 memset(&swsusp_header, 0, sizeof(swsusp_header));
1007 if ((error = bio_read_page(0, &swsusp_header)))
1008 return error;
1009 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1010 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1011 /* Reset swap signature now */
1012 error = bio_write_page(0, &swsusp_header);
1013 } else {
1014 return -EINVAL;
1015 }
962 if (error) 1016 if (error)
963 blkdev_put(resume_bdev); 1017 blkdev_put(resume_bdev);
964 } else 1018 else
1019 pr_debug("swsusp: Signature found, resuming\n");
1020 } else {
965 error = PTR_ERR(resume_bdev); 1021 error = PTR_ERR(resume_bdev);
966
967 if (!error)
968 pr_debug("swsusp: resume file found\n");
969 else
970 pr_debug("swsusp: Error %d check for resume file\n", error);
971 return error;
972}
973
974/**
975 * swsusp_read - Read saved image from swap.
976 */
977
978int swsusp_read(void)
979{
980 int error;
981
982 if (IS_ERR(resume_bdev)) {
983 pr_debug("swsusp: block device not initialised\n");
984 return PTR_ERR(resume_bdev);
985 } 1022 }
986 1023
987 error = read_suspend_image(); 1024 if (error)
988 blkdev_put(resume_bdev); 1025 pr_debug("swsusp: Error %d check for resume file\n", error);
989 memset(key_iv, 0, MAXKEY+MAXIV);
990 1026
991 if (!error)
992 pr_debug("swsusp: Reading resume file was successful\n");
993 else
994 pr_debug("swsusp: Error %d resuming\n", error);
995 return error; 1027 return error;
996} 1028}
997 1029
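swsusp_check() now does the signature handling inline: the first swap page keeps the swap area's original signature in orig_sig while sig holds "S1SUSPEND", and finding the latter both proves an image exists and lets the code put the former back so the swap area stays usable. A stand-alone model of that check-and-restore step follows; it is not kernel code and bio_read_page()/bio_write_page() are left out.

/* Stand-alone model (not kernel code) of the signature check and restore. */
#include <stdio.h>
#include <string.h>

struct header {
	char orig_sig[10];
	char sig[10];
};

static int check_and_restore(struct header *h)
{
	if (memcmp("S1SUSPEND", h->sig, 10))
		return -1;			/* no image: leave the header alone */
	memcpy(h->sig, h->orig_sig, 10);	/* reset the swap signature */
	return 0;
}

int main(void)
{
	struct header h;

	memcpy(h.orig_sig, "SWAPSPACE2", 10);	/* what mkswap originally wrote */
	memcpy(h.sig, "S1SUSPEND", 10);		/* what mark_swapfiles() writes */
	if (!check_and_restore(&h))
		printf("image found, signature restored to %.10s\n", h.sig);
	return 0;
}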
diff --git a/kernel/printk.c b/kernel/printk.c
index e9be027bc930..13ced0f7828f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -11,7 +11,7 @@
11 * Ted Ts'o, 2/11/93. 11 * Ted Ts'o, 2/11/93.
12 * Modified for sysctl support, 1/8/97, Chris Horn. 12 * Modified for sysctl support, 1/8/97, Chris Horn.
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfreds@colorfullife.com 14 * manfred@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au>
17 */ 17 */
@@ -491,7 +491,10 @@ __attribute__((weak)) unsigned long long printk_clock(void)
491 return sched_clock(); 491 return sched_clock();
492} 492}
493 493
494/* 494/**
495 * printk - print a kernel message
496 * @fmt: format string
497 *
495 * This is printk. It can be called from any context. We want it to work. 498 * This is printk. It can be called from any context. We want it to work.
496 * 499 *
497 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 500 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
@@ -503,6 +506,9 @@ __attribute__((weak)) unsigned long long printk_clock(void)
503 * One effect of this deferred printing is that code which calls printk() and 506 * One effect of this deferred printing is that code which calls printk() and
504 * then changes console_loglevel may break. This is because console_loglevel 507 * then changes console_loglevel may break. This is because console_loglevel
505 * is inspected when the actual printing occurs. 508 * is inspected when the actual printing occurs.
509 *
510 * See also:
511 * printf(3)
506 */ 512 */
507 513
508asmlinkage int printk(const char *fmt, ...) 514asmlinkage int printk(const char *fmt, ...)
@@ -563,7 +569,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
563 p[1] <= '7' && p[2] == '>') { 569 p[1] <= '7' && p[2] == '>') {
564 loglev_char = p[1]; 570 loglev_char = p[1];
565 p += 3; 571 p += 3;
566 printed_len += 3; 572 printed_len -= 3;
567 } else { 573 } else {
568 loglev_char = default_message_loglevel 574 loglev_char = default_message_loglevel
569 + '0'; 575 + '0';
@@ -578,7 +584,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
578 584
579 for (tp = tbuf; tp < tbuf + tlen; tp++) 585 for (tp = tbuf; tp < tbuf + tlen; tp++)
580 emit_log_char(*tp); 586 emit_log_char(*tp);
581 printed_len += tlen - 3; 587 printed_len += tlen;
582 } else { 588 } else {
583 if (p[0] != '<' || p[1] < '0' || 589 if (p[0] != '<' || p[1] < '0' ||
584 p[1] > '7' || p[2] != '>') { 590 p[1] > '7' || p[2] != '>') {
@@ -586,8 +592,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
586 emit_log_char(default_message_loglevel 592 emit_log_char(default_message_loglevel
587 + '0'); 593 + '0');
588 emit_log_char('>'); 594 emit_log_char('>');
595 printed_len += 3;
589 } 596 }
590 printed_len += 3;
591 } 597 }
592 log_level_unknown = 0; 598 log_level_unknown = 0;
593 if (!*p) 599 if (!*p)
@@ -655,6 +661,9 @@ static void call_console_drivers(unsigned long start, unsigned long end)
655 661
656/** 662/**
657 * add_preferred_console - add a device to the list of preferred consoles. 663 * add_preferred_console - add a device to the list of preferred consoles.
664 * @name: device name
665 * @idx: device index
666 * @options: options for this console
658 * 667 *
659 * The last preferred console added will be used for kernel messages 668 * The last preferred console added will be used for kernel messages
660 * and stdin/out/err for init. Normally this is used by console_setup 669 * and stdin/out/err for init. Normally this is used by console_setup
@@ -764,7 +773,8 @@ void release_console_sem(void)
764} 773}
765EXPORT_SYMBOL(release_console_sem); 774EXPORT_SYMBOL(release_console_sem);
766 775
767/** console_conditional_schedule - yield the CPU if required 776/**
777 * console_conditional_schedule - yield the CPU if required
768 * 778 *
769 * If the console code is currently allowed to sleep, and 779 * If the console code is currently allowed to sleep, and
770 * if this CPU should yield the CPU to another task, do 780 * if this CPU should yield the CPU to another task, do
@@ -946,7 +956,7 @@ int unregister_console(struct console *console)
946 if (console_drivers == console) { 956 if (console_drivers == console) {
947 console_drivers=console->next; 957 console_drivers=console->next;
948 res = 0; 958 res = 0;
949 } else { 959 } else if (console_drivers) {
950 for (a=console_drivers->next, b=console_drivers ; 960 for (a=console_drivers->next, b=console_drivers ;
951 a; b=a, a=b->next) { 961 a; b=a, a=b->next) {
952 if (a == console) { 962 if (a == console) {
@@ -976,6 +986,8 @@ EXPORT_SYMBOL(unregister_console);
976 986
977/** 987/**
978 * tty_write_message - write a message to a certain tty, not just the console. 988 * tty_write_message - write a message to a certain tty, not just the console.
989 * @tty: the destination tty_struct
990 * @msg: the message to write
979 * 991 *
980 * This is used for messages that need to be redirected to a specific tty. 992 * This is used for messages that need to be redirected to a specific tty.
981 * We don't put it into the syslog queue right now maybe in the future if 993 * We don't put it into the syslog queue right now maybe in the future if
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b88d4186cd7a..5f33cdb6fff5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -7,6 +7,7 @@
7 * to continually duplicate across every architecture. 7 * to continually duplicate across every architecture.
8 */ 8 */
9 9
10#include <linux/capability.h>
10#include <linux/module.h> 11#include <linux/module.h>
11#include <linux/sched.h> 12#include <linux/sched.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
@@ -241,7 +242,8 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
241 if (write) { 242 if (write) {
242 copy_to_user_page(vma, page, addr, 243 copy_to_user_page(vma, page, addr,
243 maddr + offset, buf, bytes); 244 maddr + offset, buf, bytes);
244 set_page_dirty_lock(page); 245 if (!PageCompound(page))
246 set_page_dirty_lock(page);
245 } else { 247 } else {
246 copy_from_user_page(vma, page, addr, 248 copy_from_user_page(vma, page, addr,
247 buf, maddr + offset, bytes); 249 buf, maddr + offset, bytes);
@@ -407,54 +409,62 @@ int ptrace_request(struct task_struct *child, long request,
407 return ret; 409 return ret;
408} 410}
409 411
410#ifndef __ARCH_SYS_PTRACE 412/**
411static int ptrace_get_task_struct(long request, long pid, 413 * ptrace_traceme -- helper for PTRACE_TRACEME
412 struct task_struct **childp) 414 *
415 * Performs checks and sets PT_PTRACED.
416 * Should be used by all ptrace implementations for PTRACE_TRACEME.
417 */
418int ptrace_traceme(void)
413{ 419{
414 struct task_struct *child;
415 int ret; 420 int ret;
416 421
417 /* 422 /*
418 * Callers use child == NULL as an indication to exit early even 423 * Are we already being traced?
419 * when the return value is 0, so make sure it is non-NULL here. 424 */
425 if (current->ptrace & PT_PTRACED)
426 return -EPERM;
427 ret = security_ptrace(current->parent, current);
428 if (ret)
429 return -EPERM;
430 /*
431 * Set the ptrace bit in the process ptrace flags.
420 */ 432 */
421 *childp = NULL; 433 current->ptrace |= PT_PTRACED;
434 return 0;
435}
422 436
423 if (request == PTRACE_TRACEME) { 437/**
424 /* 438 * ptrace_get_task_struct -- grab a task struct reference for ptrace
425 * Are we already being traced? 439 * @pid: process id to grab a task_struct reference of
426 */ 440 *
427 if (current->ptrace & PT_PTRACED) 441 * This function is a helper for ptrace implementations. It checks
428 return -EPERM; 442 * permissions and then grabs a task struct for use of the actual
429 ret = security_ptrace(current->parent, current); 443 * ptrace implementation.
430 if (ret) 444 *
431 return -EPERM; 445 * Returns the task_struct for @pid or an ERR_PTR() on failure.
432 /* 446 */
433 * Set the ptrace bit in the process ptrace flags. 447struct task_struct *ptrace_get_task_struct(pid_t pid)
434 */ 448{
435 current->ptrace |= PT_PTRACED; 449 struct task_struct *child;
436 return 0;
437 }
438 450
439 /* 451 /*
440 * You may not mess with init 452 * Tracing init is not allowed.
441 */ 453 */
442 if (pid == 1) 454 if (pid == 1)
443 return -EPERM; 455 return ERR_PTR(-EPERM);
444 456
445 ret = -ESRCH;
446 read_lock(&tasklist_lock); 457 read_lock(&tasklist_lock);
447 child = find_task_by_pid(pid); 458 child = find_task_by_pid(pid);
448 if (child) 459 if (child)
449 get_task_struct(child); 460 get_task_struct(child);
450 read_unlock(&tasklist_lock); 461 read_unlock(&tasklist_lock);
451 if (!child) 462 if (!child)
452 return -ESRCH; 463 return ERR_PTR(-ESRCH);
453 464 return child;
454 *childp = child;
455 return 0;
456} 465}
457 466
467#ifndef __ARCH_SYS_PTRACE
458asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 468asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
459{ 469{
460 struct task_struct *child; 470 struct task_struct *child;
@@ -464,13 +474,20 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
464 * This lock_kernel fixes a subtle race with suid exec 474 * This lock_kernel fixes a subtle race with suid exec
465 */ 475 */
466 lock_kernel(); 476 lock_kernel();
467 ret = ptrace_get_task_struct(request, pid, &child); 477 if (request == PTRACE_TRACEME) {
468 if (!child) 478 ret = ptrace_traceme();
469 goto out; 479 goto out;
480 }
481
482 child = ptrace_get_task_struct(pid);
483 if (IS_ERR(child)) {
484 ret = PTR_ERR(child);
485 goto out;
486 }
470 487
471 if (request == PTRACE_ATTACH) { 488 if (request == PTRACE_ATTACH) {
472 ret = ptrace_attach(child); 489 ret = ptrace_attach(child);
473 goto out; 490 goto out_put_task_struct;
474 } 491 }
475 492
476 ret = ptrace_check_attach(child, request == PTRACE_KILL); 493 ret = ptrace_check_attach(child, request == PTRACE_KILL);
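
The refactoring above replaces the old three-argument ptrace_get_task_struct(request, pid, &child) with ptrace_traceme() plus a ptrace_get_task_struct(pid) that reports errors through the ERR_PTR()/IS_ERR()/PTR_ERR() convention. A rough sketch of how an architecture-private sys_ptrace() might consume the new helpers; the function name and the shape of the error handling are illustrative, not taken from any particular architecture:

#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/err.h>

long example_arch_sys_ptrace(long request, long pid, long addr, long data)
{
	struct task_struct *child;
	long ret;

	if (request == PTRACE_TRACEME)
		return ptrace_traceme();	/* no child lookup needed */

	child = ptrace_get_task_struct(pid);	/* takes a task reference */
	if (IS_ERR(child))
		return PTR_ERR(child);		/* -EPERM or -ESRCH */

	if (request == PTRACE_ATTACH) {
		ret = ptrace_attach(child);
	} else {
		ret = ptrace_check_attach(child, request == PTRACE_KILL);
		if (!ret)
			ret = ptrace_request(child, request, addr, data);
	}

	put_task_struct(child);			/* drop the reference */
	return ret;
}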
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c4d159a21e04..0cf8146bd585 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -35,6 +35,7 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/spinlock.h> 36#include <linux/spinlock.h>
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/rcupdate.h>
38#include <linux/interrupt.h> 39#include <linux/interrupt.h>
39#include <linux/sched.h> 40#include <linux/sched.h>
40#include <asm/atomic.h> 41#include <asm/atomic.h>
@@ -45,26 +46,21 @@
45#include <linux/percpu.h> 46#include <linux/percpu.h>
46#include <linux/notifier.h> 47#include <linux/notifier.h>
47#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
48#include <linux/rcuref.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50 50
51/* Definition for rcupdate control block. */ 51/* Definition for rcupdate control block. */
52struct rcu_ctrlblk rcu_ctrlblk = 52struct rcu_ctrlblk rcu_ctrlblk = {
53 { .cur = -300, .completed = -300 }; 53 .cur = -300,
54struct rcu_ctrlblk rcu_bh_ctrlblk = 54 .completed = -300,
55 { .cur = -300, .completed = -300 }; 55 .lock = SPIN_LOCK_UNLOCKED,
56 56 .cpumask = CPU_MASK_NONE,
57/* Bookkeeping of the progress of the grace period */ 57};
58struct rcu_state { 58struct rcu_ctrlblk rcu_bh_ctrlblk = {
59 spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ 59 .cur = -300,
60 cpumask_t cpumask; /* CPUs that need to switch in order */ 60 .completed = -300,
61 /* for current batch to proceed. */ 61 .lock = SPIN_LOCK_UNLOCKED,
62 .cpumask = CPU_MASK_NONE,
62}; 63};
63
64static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
65 {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
66static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
67 {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
68 64
69DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 65DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
70DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; 66DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
@@ -73,19 +69,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
73static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 69static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
74static int maxbatch = 10000; 70static int maxbatch = 10000;
75 71
76#ifndef __HAVE_ARCH_CMPXCHG
77/*
78 * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
79 * 32 bit atomic_t implementations, and a hash function similar to that
80 * for our refcounting needs.
81 * Can't help multiprocessors which donot have cmpxchg :(
82 */
83
84spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
85 [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
86};
87#endif
88
89/** 72/**
90 * call_rcu - Queue an RCU callback for invocation after a grace period. 73 * call_rcu - Queue an RCU callback for invocation after a grace period.
91 * @head: structure to be used for queueing the RCU updates. 74 * @head: structure to be used for queueing the RCU updates.
@@ -116,6 +99,10 @@ void fastcall call_rcu(struct rcu_head *head,
116 local_irq_restore(flags); 99 local_irq_restore(flags);
117} 100}
118 101
102static atomic_t rcu_barrier_cpu_count;
103static struct semaphore rcu_barrier_sema;
104static struct completion rcu_barrier_completion;
105
119/** 106/**
120 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. 107 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
121 * @head: structure to be used for queueing the RCU updates. 108 * @head: structure to be used for queueing the RCU updates.
@@ -162,6 +149,42 @@ long rcu_batches_completed(void)
162 return rcu_ctrlblk.completed; 149 return rcu_ctrlblk.completed;
163} 150}
164 151
152static void rcu_barrier_callback(struct rcu_head *notused)
153{
154 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
155 complete(&rcu_barrier_completion);
156}
157
158/*
159 * Called with preemption disabled, and from cross-cpu IRQ context.
160 */
161static void rcu_barrier_func(void *notused)
162{
163 int cpu = smp_processor_id();
164 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
165 struct rcu_head *head;
166
167 head = &rdp->barrier;
168 atomic_inc(&rcu_barrier_cpu_count);
169 call_rcu(head, rcu_barrier_callback);
170}
171
172/**
173 * rcu_barrier - Wait until all the in-flight RCUs are complete.
174 */
175void rcu_barrier(void)
176{
177 BUG_ON(in_interrupt());
178 /* Take cpucontrol semaphore to protect against CPU hotplug */
179 down(&rcu_barrier_sema);
180 init_completion(&rcu_barrier_completion);
181 atomic_set(&rcu_barrier_cpu_count, 0);
182 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
183 wait_for_completion(&rcu_barrier_completion);
184 up(&rcu_barrier_sema);
185}
186EXPORT_SYMBOL_GPL(rcu_barrier);
187
165/* 188/*
166 * Invoke the completed RCU callbacks. They are expected to be in 189 * Invoke the completed RCU callbacks. They are expected to be in
167 * a per-cpu list. 190 * a per-cpu list.
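
rcu_barrier(), added above, posts one callback on every CPU via on_each_cpu() and waits for the completion, which guarantees that every call_rcu() callback queued before the barrier has run. A hedged sketch of the usual use case, a module that frees objects through RCU and must not be unloaded while callbacks are still in flight (struct foo, foo_cache and the function names are made up for illustration):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/module.h>

struct foo {
	struct rcu_head rcu;
	int data;
};

static kmem_cache_t *foo_cache;		/* created during module init */

static void foo_rcu_free(struct rcu_head *head)
{
	struct foo *f = container_of(head, struct foo, rcu);

	kmem_cache_free(foo_cache, f);
}

/* Readers may still hold RCU-protected references, so defer the free. */
void foo_release(struct foo *f)
{
	call_rcu(&f->rcu, foo_rcu_free);
}

static void __exit foo_exit(void)
{
	/*
	 * Wait for every foo_rcu_free() queued by foo_release() to finish
	 * before the slab cache (and this module's text) disappears.
	 */
	rcu_barrier();
	kmem_cache_destroy(foo_cache);
}
module_exit(foo_exit);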
@@ -193,13 +216,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
193 * This is done by rcu_start_batch. The start is not broadcasted to 216 * This is done by rcu_start_batch. The start is not broadcasted to
194 * all cpus, they must pick this up by comparing rcp->cur with 217 * all cpus, they must pick this up by comparing rcp->cur with
195 * rdp->quiescbatch. All cpus are recorded in the 218 * rdp->quiescbatch. All cpus are recorded in the
196 * rcu_state.cpumask bitmap. 219 * rcu_ctrlblk.cpumask bitmap.
197 * - All cpus must go through a quiescent state. 220 * - All cpus must go through a quiescent state.
198 * Since the start of the grace period is not broadcasted, at least two 221 * Since the start of the grace period is not broadcasted, at least two
199 * calls to rcu_check_quiescent_state are required: 222 * calls to rcu_check_quiescent_state are required:
200 * The first call just notices that a new grace period is running. The 223 * The first call just notices that a new grace period is running. The
201 * following calls check if there was a quiescent state since the beginning 224 * following calls check if there was a quiescent state since the beginning
202 * of the grace period. If so, it updates rcu_state.cpumask. If 225 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
203 * the bitmap is empty, then the grace period is completed. 226 * the bitmap is empty, then the grace period is completed.
204 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 227 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
205 * period (if necessary). 228 * period (if necessary).
@@ -207,25 +230,29 @@ static void rcu_do_batch(struct rcu_data *rdp)
207/* 230/*
208 * Register a new batch of callbacks, and start it up if there is currently no 231 * Register a new batch of callbacks, and start it up if there is currently no
209 * active batch and the batch to be registered has not already occurred. 232 * active batch and the batch to be registered has not already occurred.
210 * Caller must hold rcu_state.lock. 233 * Caller must hold rcu_ctrlblk.lock.
211 */ 234 */
212static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, 235static void rcu_start_batch(struct rcu_ctrlblk *rcp)
213 int next_pending)
214{ 236{
215 if (next_pending)
216 rcp->next_pending = 1;
217
218 if (rcp->next_pending && 237 if (rcp->next_pending &&
219 rcp->completed == rcp->cur) { 238 rcp->completed == rcp->cur) {
220 /* Can't change, since spin lock held. */
221 cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
222
223 rcp->next_pending = 0; 239 rcp->next_pending = 0;
224 /* next_pending == 0 must be visible in __rcu_process_callbacks() 240 /*
225 * before it can see new value of cur. 241 * next_pending == 0 must be visible in
242 * __rcu_process_callbacks() before it can see new value of cur.
226 */ 243 */
227 smp_wmb(); 244 smp_wmb();
228 rcp->cur++; 245 rcp->cur++;
246
247 /*
248 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
 249 * barrier. Otherwise it can cause tickless idle CPUs to be
 250 * included in rcp->cpumask, which will extend grace periods
251 * unnecessarily.
252 */
253 smp_mb();
254 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
255
229 } 256 }
230} 257}
231 258
@@ -234,13 +261,13 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp,
234 * Clear it from the cpu mask and complete the grace period if it was the last 261 * Clear it from the cpu mask and complete the grace period if it was the last
235 * cpu. Start another grace period if someone has further entries pending 262 * cpu. Start another grace period if someone has further entries pending
236 */ 263 */
237static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) 264static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
238{ 265{
239 cpu_clear(cpu, rsp->cpumask); 266 cpu_clear(cpu, rcp->cpumask);
240 if (cpus_empty(rsp->cpumask)) { 267 if (cpus_empty(rcp->cpumask)) {
241 /* batch completed ! */ 268 /* batch completed ! */
242 rcp->completed = rcp->cur; 269 rcp->completed = rcp->cur;
243 rcu_start_batch(rcp, rsp, 0); 270 rcu_start_batch(rcp);
244 } 271 }
245} 272}
246 273
@@ -250,7 +277,7 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
250 * quiescent cycle, then indicate that it has done so. 277 * quiescent cycle, then indicate that it has done so.
251 */ 278 */
252static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, 279static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
253 struct rcu_state *rsp, struct rcu_data *rdp) 280 struct rcu_data *rdp)
254{ 281{
255 if (rdp->quiescbatch != rcp->cur) { 282 if (rdp->quiescbatch != rcp->cur) {
256 /* start new grace period: */ 283 /* start new grace period: */
@@ -275,15 +302,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
275 return; 302 return;
276 rdp->qs_pending = 0; 303 rdp->qs_pending = 0;
277 304
278 spin_lock(&rsp->lock); 305 spin_lock(&rcp->lock);
279 /* 306 /*
280 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync 307 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
281 * during cpu startup. Ignore the quiescent state. 308 * during cpu startup. Ignore the quiescent state.
282 */ 309 */
283 if (likely(rdp->quiescbatch == rcp->cur)) 310 if (likely(rdp->quiescbatch == rcp->cur))
284 cpu_quiet(rdp->cpu, rcp, rsp); 311 cpu_quiet(rdp->cpu, rcp);
285 312
286 spin_unlock(&rsp->lock); 313 spin_unlock(&rcp->lock);
287} 314}
288 315
289 316
@@ -304,28 +331,29 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
304} 331}
305 332
306static void __rcu_offline_cpu(struct rcu_data *this_rdp, 333static void __rcu_offline_cpu(struct rcu_data *this_rdp,
307 struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) 334 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
308{ 335{
309 /* if the cpu going offline owns the grace period 336 /* if the cpu going offline owns the grace period
310 * we can block indefinitely waiting for it, so flush 337 * we can block indefinitely waiting for it, so flush
311 * it here 338 * it here
312 */ 339 */
313 spin_lock_bh(&rsp->lock); 340 spin_lock_bh(&rcp->lock);
314 if (rcp->cur != rcp->completed) 341 if (rcp->cur != rcp->completed)
315 cpu_quiet(rdp->cpu, rcp, rsp); 342 cpu_quiet(rdp->cpu, rcp);
316 spin_unlock_bh(&rsp->lock); 343 spin_unlock_bh(&rcp->lock);
317 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 344 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
318 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); 345 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
319 346 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
320} 347}
348
321static void rcu_offline_cpu(int cpu) 349static void rcu_offline_cpu(int cpu)
322{ 350{
323 struct rcu_data *this_rdp = &get_cpu_var(rcu_data); 351 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
324 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); 352 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
325 353
326 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, 354 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
327 &per_cpu(rcu_data, cpu)); 355 &per_cpu(rcu_data, cpu));
328 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state, 356 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
329 &per_cpu(rcu_bh_data, cpu)); 357 &per_cpu(rcu_bh_data, cpu));
330 put_cpu_var(rcu_data); 358 put_cpu_var(rcu_data);
331 put_cpu_var(rcu_bh_data); 359 put_cpu_var(rcu_bh_data);
@@ -344,7 +372,7 @@ static void rcu_offline_cpu(int cpu)
344 * This does the RCU processing work from tasklet context. 372 * This does the RCU processing work from tasklet context.
345 */ 373 */
346static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, 374static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
347 struct rcu_state *rsp, struct rcu_data *rdp) 375 struct rcu_data *rdp)
348{ 376{
349 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { 377 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
350 *rdp->donetail = rdp->curlist; 378 *rdp->donetail = rdp->curlist;
@@ -374,24 +402,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
374 402
375 if (!rcp->next_pending) { 403 if (!rcp->next_pending) {
376 /* and start it/schedule start if it's a new batch */ 404 /* and start it/schedule start if it's a new batch */
377 spin_lock(&rsp->lock); 405 spin_lock(&rcp->lock);
378 rcu_start_batch(rcp, rsp, 1); 406 rcp->next_pending = 1;
379 spin_unlock(&rsp->lock); 407 rcu_start_batch(rcp);
408 spin_unlock(&rcp->lock);
380 } 409 }
381 } else { 410 } else {
382 local_irq_enable(); 411 local_irq_enable();
383 } 412 }
384 rcu_check_quiescent_state(rcp, rsp, rdp); 413 rcu_check_quiescent_state(rcp, rdp);
385 if (rdp->donelist) 414 if (rdp->donelist)
386 rcu_do_batch(rdp); 415 rcu_do_batch(rdp);
387} 416}
388 417
389static void rcu_process_callbacks(unsigned long unused) 418static void rcu_process_callbacks(unsigned long unused)
390{ 419{
391 __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, 420 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
392 &__get_cpu_var(rcu_data)); 421 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
393 __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state, 422}
394 &__get_cpu_var(rcu_bh_data)); 423
424static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
425{
426 /* This cpu has pending rcu entries and the grace period
427 * for them has completed.
428 */
429 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
430 return 1;
431
432 /* This cpu has no pending entries, but there are new entries */
433 if (!rdp->curlist && rdp->nxtlist)
434 return 1;
435
436 /* This cpu has finished callbacks to invoke */
437 if (rdp->donelist)
438 return 1;
439
440 /* The rcu core waits for a quiescent state from the cpu */
441 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
442 return 1;
443
444 /* nothing to do */
445 return 0;
446}
447
448int rcu_pending(int cpu)
449{
450 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
451 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
395} 452}
396 453
397void rcu_check_callbacks(int cpu, int user) 454void rcu_check_callbacks(int cpu, int user)
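
rcu_pending() above gives the tick path a cheap way to skip the RCU core entirely when this CPU has nothing queued, nothing completed and no quiescent state to report. A short sketch of the intended call-site shape, assuming the per-CPU timer tick is the caller (the timer-tick hookup itself is not shown in this hunk):

#include <linux/rcupdate.h>
#include <linux/smp.h>

/*
 * Illustrative tick hook: only enter rcu_check_callbacks() when
 * rcu_pending() reports outstanding RCU work for this CPU.
 */
static void example_timer_tick(int user_tick)
{
	int cpu = smp_processor_id();

	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_tick);
}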
@@ -457,6 +514,7 @@ static struct notifier_block __devinitdata rcu_nb = {
457 */ 514 */
458void __init rcu_init(void) 515void __init rcu_init(void)
459{ 516{
517 sema_init(&rcu_barrier_sema, 1);
460 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 518 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
461 (void *)(long)smp_processor_id()); 519 (void *)(long)smp_processor_id());
462 /* Register notifier for non-boot CPUs */ 520 /* Register notifier for non-boot CPUs */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b58f1eff3ca..773219907dd8 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,7 +39,6 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/rcuref.h>
43#include <linux/cpu.h> 42#include <linux/cpu.h>
44#include <linux/random.h> 43#include <linux/random.h>
45#include <linux/delay.h> 44#include <linux/delay.h>
@@ -49,9 +48,11 @@
49MODULE_LICENSE("GPL"); 48MODULE_LICENSE("GPL");
50 49
51static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ 50static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */
52static int stat_interval = 0; /* Interval between stats, in seconds. */ 51static int stat_interval; /* Interval between stats, in seconds. */
53 /* Defaults to "only at end of test". */ 52 /* Defaults to "only at end of test". */
54static int verbose = 0; /* Print more debug info. */ 53static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
55 56
56MODULE_PARM(nreaders, "i"); 57MODULE_PARM(nreaders, "i");
57MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -59,6 +60,10 @@ MODULE_PARM(stat_interval, "i");
59MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 60MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
60MODULE_PARM(verbose, "i"); 61MODULE_PARM(verbose, "i");
61MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 62MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
63MODULE_PARM(test_no_idle_hz, "i");
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65MODULE_PARM(shuffle_interval, "i");
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
62#define TORTURE_FLAG "rcutorture: " 67#define TORTURE_FLAG "rcutorture: "
63#define PRINTK_STRING(s) \ 68#define PRINTK_STRING(s) \
64 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 69 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
@@ -73,6 +78,7 @@ static int nrealreaders;
73static struct task_struct *writer_task; 78static struct task_struct *writer_task;
74static struct task_struct **reader_tasks; 79static struct task_struct **reader_tasks;
75static struct task_struct *stats_task; 80static struct task_struct *stats_task;
81static struct task_struct *shuffler_task;
76 82
77#define RCU_TORTURE_PIPE_LEN 10 83#define RCU_TORTURE_PIPE_LEN 10
78 84
@@ -80,6 +86,7 @@ struct rcu_torture {
80 struct rcu_head rtort_rcu; 86 struct rcu_head rtort_rcu;
81 int rtort_pipe_count; 87 int rtort_pipe_count;
82 struct list_head rtort_free; 88 struct list_head rtort_free;
89 int rtort_mbtest;
83}; 90};
84 91
85static int fullstop = 0; /* stop generating callbacks at test end. */ 92static int fullstop = 0; /* stop generating callbacks at test end. */
@@ -96,11 +103,13 @@ static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
96atomic_t n_rcu_torture_alloc; 103atomic_t n_rcu_torture_alloc;
97atomic_t n_rcu_torture_alloc_fail; 104atomic_t n_rcu_torture_alloc_fail;
98atomic_t n_rcu_torture_free; 105atomic_t n_rcu_torture_free;
106atomic_t n_rcu_torture_mberror;
107atomic_t n_rcu_torture_error;
99 108
100/* 109/*
101 * Allocate an element from the rcu_tortures pool. 110 * Allocate an element from the rcu_tortures pool.
102 */ 111 */
103struct rcu_torture * 112static struct rcu_torture *
104rcu_torture_alloc(void) 113rcu_torture_alloc(void)
105{ 114{
106 struct list_head *p; 115 struct list_head *p;
@@ -145,9 +154,10 @@ rcu_torture_cb(struct rcu_head *p)
145 if (i > RCU_TORTURE_PIPE_LEN) 154 if (i > RCU_TORTURE_PIPE_LEN)
146 i = RCU_TORTURE_PIPE_LEN; 155 i = RCU_TORTURE_PIPE_LEN;
147 atomic_inc(&rcu_torture_wcount[i]); 156 atomic_inc(&rcu_torture_wcount[i]);
148 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) 157 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
158 rp->rtort_mbtest = 0;
149 rcu_torture_free(rp); 159 rcu_torture_free(rp);
150 else 160 } else
151 call_rcu(p, rcu_torture_cb); 161 call_rcu(p, rcu_torture_cb);
152} 162}
153 163
@@ -195,6 +205,8 @@ rcu_torture_writer(void *arg)
195 static DEFINE_RCU_RANDOM(rand); 205 static DEFINE_RCU_RANDOM(rand);
196 206
197 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 207 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
208 set_user_nice(current, 19);
209
198 do { 210 do {
199 schedule_timeout_uninterruptible(1); 211 schedule_timeout_uninterruptible(1);
200 if (rcu_batches_completed() == oldbatch) 212 if (rcu_batches_completed() == oldbatch)
@@ -204,6 +216,7 @@ rcu_torture_writer(void *arg)
204 rp->rtort_pipe_count = 0; 216 rp->rtort_pipe_count = 0;
205 udelay(rcu_random(&rand) & 0x3ff); 217 udelay(rcu_random(&rand) & 0x3ff);
206 old_rp = rcu_torture_current; 218 old_rp = rcu_torture_current;
219 rp->rtort_mbtest = 1;
207 rcu_assign_pointer(rcu_torture_current, rp); 220 rcu_assign_pointer(rcu_torture_current, rp);
208 smp_wmb(); 221 smp_wmb();
209 if (old_rp != NULL) { 222 if (old_rp != NULL) {
@@ -238,6 +251,8 @@ rcu_torture_reader(void *arg)
238 int pipe_count; 251 int pipe_count;
239 252
240 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 253 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
254 set_user_nice(current, 19);
255
241 do { 256 do {
242 rcu_read_lock(); 257 rcu_read_lock();
243 completed = rcu_batches_completed(); 258 completed = rcu_batches_completed();
@@ -248,6 +263,8 @@ rcu_torture_reader(void *arg)
248 schedule_timeout_interruptible(HZ); 263 schedule_timeout_interruptible(HZ);
249 continue; 264 continue;
250 } 265 }
266 if (p->rtort_mbtest == 0)
267 atomic_inc(&n_rcu_torture_mberror);
251 udelay(rcu_random(&rand) & 0x7f); 268 udelay(rcu_random(&rand) & 0x7f);
252 preempt_disable(); 269 preempt_disable();
253 pipe_count = p->rtort_pipe_count; 270 pipe_count = p->rtort_pipe_count;
@@ -296,16 +313,22 @@ rcu_torture_printk(char *page)
296 } 313 }
297 cnt += sprintf(&page[cnt], "rcutorture: "); 314 cnt += sprintf(&page[cnt], "rcutorture: ");
298 cnt += sprintf(&page[cnt], 315 cnt += sprintf(&page[cnt],
299 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d", 316 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
317 "rtmbe: %d",
300 rcu_torture_current, 318 rcu_torture_current,
301 rcu_torture_current_version, 319 rcu_torture_current_version,
302 list_empty(&rcu_torture_freelist), 320 list_empty(&rcu_torture_freelist),
303 atomic_read(&n_rcu_torture_alloc), 321 atomic_read(&n_rcu_torture_alloc),
304 atomic_read(&n_rcu_torture_alloc_fail), 322 atomic_read(&n_rcu_torture_alloc_fail),
305 atomic_read(&n_rcu_torture_free)); 323 atomic_read(&n_rcu_torture_free),
324 atomic_read(&n_rcu_torture_mberror));
325 if (atomic_read(&n_rcu_torture_mberror) != 0)
326 cnt += sprintf(&page[cnt], " !!!");
306 cnt += sprintf(&page[cnt], "\nrcutorture: "); 327 cnt += sprintf(&page[cnt], "\nrcutorture: ");
307 if (i > 1) 328 if (i > 1) {
308 cnt += sprintf(&page[cnt], "!!! "); 329 cnt += sprintf(&page[cnt], "!!! ");
330 atomic_inc(&n_rcu_torture_error);
331 }
309 cnt += sprintf(&page[cnt], "Reader Pipe: "); 332 cnt += sprintf(&page[cnt], "Reader Pipe: ");
310 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 333 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
311 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 334 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
@@ -359,12 +382,77 @@ rcu_torture_stats(void *arg)
359 return 0; 382 return 0;
360} 383}
361 384
385static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
386
387/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
 388 * is when @rcu_idle_cpu = -1, in which case the tasks may run on all CPUs.
389 */
390void rcu_torture_shuffle_tasks(void)
391{
392 cpumask_t tmp_mask = CPU_MASK_ALL;
393 int i;
394
395 lock_cpu_hotplug();
396
397 /* No point in shuffling if there is only one online CPU (ex: UP) */
398 if (num_online_cpus() == 1) {
399 unlock_cpu_hotplug();
400 return;
401 }
402
403 if (rcu_idle_cpu != -1)
404 cpu_clear(rcu_idle_cpu, tmp_mask);
405
406 set_cpus_allowed(current, tmp_mask);
407
408 if (reader_tasks != NULL) {
409 for (i = 0; i < nrealreaders; i++)
410 if (reader_tasks[i])
411 set_cpus_allowed(reader_tasks[i], tmp_mask);
412 }
413
414 if (writer_task)
415 set_cpus_allowed(writer_task, tmp_mask);
416
417 if (stats_task)
418 set_cpus_allowed(stats_task, tmp_mask);
419
420 if (rcu_idle_cpu == -1)
421 rcu_idle_cpu = num_online_cpus() - 1;
422 else
423 rcu_idle_cpu--;
424
425 unlock_cpu_hotplug();
426}
427
428/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
 429 * system to become idle one at a time and cut off its timer ticks. This is
 430 * meant to test RCU's support for such tickless idle CPUs.
431 */
432static int
433rcu_torture_shuffle(void *arg)
434{
435 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
436 do {
437 schedule_timeout_interruptible(shuffle_interval * HZ);
438 rcu_torture_shuffle_tasks();
439 } while (!kthread_should_stop());
440 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
441 return 0;
442}
443
362static void 444static void
363rcu_torture_cleanup(void) 445rcu_torture_cleanup(void)
364{ 446{
365 int i; 447 int i;
366 448
367 fullstop = 1; 449 fullstop = 1;
450 if (shuffler_task != NULL) {
451 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
452 kthread_stop(shuffler_task);
453 }
454 shuffler_task = NULL;
455
368 if (writer_task != NULL) { 456 if (writer_task != NULL) {
369 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 457 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
370 kthread_stop(writer_task); 458 kthread_stop(writer_task);
@@ -392,11 +480,12 @@ rcu_torture_cleanup(void)
392 stats_task = NULL; 480 stats_task = NULL;
393 481
394 /* Wait for all RCU callbacks to fire. */ 482 /* Wait for all RCU callbacks to fire. */
483 rcu_barrier();
395 484
396 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
397 synchronize_rcu();
398 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 485 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
399 PRINTK_STRING("--- End of test"); 486 printk(KERN_ALERT TORTURE_FLAG
487 "--- End of test: %s\n",
488 atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE");
400} 489}
401 490
402static int 491static int
@@ -412,15 +501,18 @@ rcu_torture_init(void)
412 nrealreaders = nreaders; 501 nrealreaders = nreaders;
413 else 502 else
414 nrealreaders = 2 * num_online_cpus(); 503 nrealreaders = 2 * num_online_cpus();
415 printk(KERN_ALERT TORTURE_FLAG 504 printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d "
416 "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", 505 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
417 nrealreaders, stat_interval, verbose); 506 "shuffle_interval = %d\n",
507 nrealreaders, stat_interval, verbose, test_no_idle_hz,
508 shuffle_interval);
418 fullstop = 0; 509 fullstop = 0;
419 510
420 /* Set up the freelist. */ 511 /* Set up the freelist. */
421 512
422 INIT_LIST_HEAD(&rcu_torture_freelist); 513 INIT_LIST_HEAD(&rcu_torture_freelist);
423 for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { 514 for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
515 rcu_tortures[i].rtort_mbtest = 0;
424 list_add_tail(&rcu_tortures[i].rtort_free, 516 list_add_tail(&rcu_tortures[i].rtort_free,
425 &rcu_torture_freelist); 517 &rcu_torture_freelist);
426 } 518 }
@@ -432,6 +524,8 @@ rcu_torture_init(void)
432 atomic_set(&n_rcu_torture_alloc, 0); 524 atomic_set(&n_rcu_torture_alloc, 0);
433 atomic_set(&n_rcu_torture_alloc_fail, 0); 525 atomic_set(&n_rcu_torture_alloc_fail, 0);
434 atomic_set(&n_rcu_torture_free, 0); 526 atomic_set(&n_rcu_torture_free, 0);
527 atomic_set(&n_rcu_torture_mberror, 0);
528 atomic_set(&n_rcu_torture_error, 0);
435 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 529 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
436 atomic_set(&rcu_torture_wcount[i], 0); 530 atomic_set(&rcu_torture_wcount[i], 0);
437 for_each_cpu(cpu) { 531 for_each_cpu(cpu) {
@@ -481,6 +575,18 @@ rcu_torture_init(void)
481 goto unwind; 575 goto unwind;
482 } 576 }
483 } 577 }
578 if (test_no_idle_hz) {
579 rcu_idle_cpu = num_online_cpus() - 1;
580 /* Create the shuffler thread */
581 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
582 "rcu_torture_shuffle");
583 if (IS_ERR(shuffler_task)) {
584 firsterr = PTR_ERR(shuffler_task);
585 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
586 shuffler_task = NULL;
587 goto unwind;
588 }
589 }
484 return 0; 590 return 0;
485 591
486unwind: 592unwind:
diff --git a/kernel/resource.c b/kernel/resource.c
index 92285d822de6..e3080fcc66a3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -464,7 +464,7 @@ struct resource * __request_region(struct resource *parent, unsigned long start,
464 464
465EXPORT_SYMBOL(__request_region); 465EXPORT_SYMBOL(__request_region);
466 466
467int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n) 467int __check_region(struct resource *parent, unsigned long start, unsigned long n)
468{ 468{
469 struct resource * res; 469 struct resource * res;
470 470
diff --git a/kernel/sched.c b/kernel/sched.c
index b6506671b2be..3ee2ae45125f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -27,12 +27,14 @@
27#include <linux/smp_lock.h> 27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 28#include <asm/mmu_context.h>
29#include <linux/interrupt.h> 29#include <linux/interrupt.h>
30#include <linux/capability.h>
30#include <linux/completion.h> 31#include <linux/completion.h>
31#include <linux/kernel_stat.h> 32#include <linux/kernel_stat.h>
32#include <linux/security.h> 33#include <linux/security.h>
33#include <linux/notifier.h> 34#include <linux/notifier.h>
34#include <linux/profile.h> 35#include <linux/profile.h>
35#include <linux/suspend.h> 36#include <linux/suspend.h>
37#include <linux/vmalloc.h>
36#include <linux/blkdev.h> 38#include <linux/blkdev.h>
37#include <linux/delay.h> 39#include <linux/delay.h>
38#include <linux/smp.h> 40#include <linux/smp.h>
@@ -176,6 +178,13 @@ static unsigned int task_timeslice(task_t *p)
176#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 178#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
177 < (long long) (sd)->cache_hot_time) 179 < (long long) (sd)->cache_hot_time)
178 180
181void __put_task_struct_cb(struct rcu_head *rhp)
182{
183 __put_task_struct(container_of(rhp, struct task_struct, rcu));
184}
185
186EXPORT_SYMBOL_GPL(__put_task_struct_cb);
187
179/* 188/*
180 * These are the runqueue data structures: 189 * These are the runqueue data structures:
181 */ 190 */
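
__put_task_struct_cb() above is the callback half of freeing task_struct through RCU, so that code which found the task under rcu_read_lock() cannot see it freed out from under it. A sketch of the assumed release side, with put_task_struct() queueing the final free via call_rcu(); the helper carries an "example_" prefix because the real definition lives in the headers, outside this hunk:

#include <linux/sched.h>
#include <linux/rcupdate.h>

/*
 * Assumed header-side counterpart of __put_task_struct_cb(): drop a
 * reference and, on the final put, hand the task_struct to RCU rather
 * than freeing it immediately.
 */
static inline void example_put_task_struct(struct task_struct *t)
{
	if (atomic_dec_and_test(&t->usage))
		call_rcu(&t->rcu, __put_task_struct_cb);
}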
@@ -512,7 +521,7 @@ static inline void sched_info_dequeued(task_t *t)
512 * long it was waiting to run. We also note when it began so that we 521 * long it was waiting to run. We also note when it began so that we
513 * can keep stats on how long its timeslice is. 522 * can keep stats on how long its timeslice is.
514 */ 523 */
515static inline void sched_info_arrive(task_t *t) 524static void sched_info_arrive(task_t *t)
516{ 525{
517 unsigned long now = jiffies, diff = 0; 526 unsigned long now = jiffies, diff = 0;
518 struct runqueue *rq = task_rq(t); 527 struct runqueue *rq = task_rq(t);
@@ -739,10 +748,14 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
739 unsigned long long __sleep_time = now - p->timestamp; 748 unsigned long long __sleep_time = now - p->timestamp;
740 unsigned long sleep_time; 749 unsigned long sleep_time;
741 750
742 if (__sleep_time > NS_MAX_SLEEP_AVG) 751 if (unlikely(p->policy == SCHED_BATCH))
743 sleep_time = NS_MAX_SLEEP_AVG; 752 sleep_time = 0;
744 else 753 else {
745 sleep_time = (unsigned long)__sleep_time; 754 if (__sleep_time > NS_MAX_SLEEP_AVG)
755 sleep_time = NS_MAX_SLEEP_AVG;
756 else
757 sleep_time = (unsigned long)__sleep_time;
758 }
746 759
747 if (likely(sleep_time > 0)) { 760 if (likely(sleep_time > 0)) {
748 /* 761 /*
@@ -994,7 +1007,7 @@ void kick_process(task_t *p)
994 * We want to under-estimate the load of migration sources, to 1007 * We want to under-estimate the load of migration sources, to
995 * balance conservatively. 1008 * balance conservatively.
996 */ 1009 */
997static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) 1010static unsigned long __source_load(int cpu, int type, enum idle_type idle)
998{ 1011{
999 runqueue_t *rq = cpu_rq(cpu); 1012 runqueue_t *rq = cpu_rq(cpu);
1000 unsigned long running = rq->nr_running; 1013 unsigned long running = rq->nr_running;
@@ -1281,6 +1294,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1281 } 1294 }
1282 } 1295 }
1283 1296
1297 if (p->last_waker_cpu != this_cpu)
1298 goto out_set_cpu;
1299
1284 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1300 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1285 goto out_set_cpu; 1301 goto out_set_cpu;
1286 1302
@@ -1351,6 +1367,8 @@ out_set_cpu:
1351 cpu = task_cpu(p); 1367 cpu = task_cpu(p);
1352 } 1368 }
1353 1369
1370 p->last_waker_cpu = this_cpu;
1371
1354out_activate: 1372out_activate:
1355#endif /* CONFIG_SMP */ 1373#endif /* CONFIG_SMP */
1356 if (old_state == TASK_UNINTERRUPTIBLE) { 1374 if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1432,12 +1450,15 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1432#ifdef CONFIG_SCHEDSTATS 1450#ifdef CONFIG_SCHEDSTATS
1433 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1451 memset(&p->sched_info, 0, sizeof(p->sched_info));
1434#endif 1452#endif
1435#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1453#if defined(CONFIG_SMP)
1454 p->last_waker_cpu = cpu;
1455#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
1436 p->oncpu = 0; 1456 p->oncpu = 0;
1437#endif 1457#endif
1458#endif
1438#ifdef CONFIG_PREEMPT 1459#ifdef CONFIG_PREEMPT
1439 /* Want to start with kernel preemption disabled. */ 1460 /* Want to start with kernel preemption disabled. */
1440 p->thread_info->preempt_count = 1; 1461 task_thread_info(p)->preempt_count = 1;
1441#endif 1462#endif
1442 /* 1463 /*
1443 * Share the timeslice between parent and child, thus the 1464 * Share the timeslice between parent and child, thus the
@@ -1849,7 +1870,7 @@ void sched_exec(void)
1849 * pull_task - move a task from a remote runqueue to the local runqueue. 1870 * pull_task - move a task from a remote runqueue to the local runqueue.
1850 * Both runqueues must be locked. 1871 * Both runqueues must be locked.
1851 */ 1872 */
1852static inline 1873static
1853void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 1874void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1854 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1875 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1855{ 1876{
@@ -1871,7 +1892,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1871/* 1892/*
1872 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 1893 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1873 */ 1894 */
1874static inline 1895static
1875int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1896int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1876 struct sched_domain *sd, enum idle_type idle, 1897 struct sched_domain *sd, enum idle_type idle,
1877 int *all_pinned) 1898 int *all_pinned)
@@ -2357,7 +2378,7 @@ out_balanced:
2357 * idle_balance is called by schedule() if this_cpu is about to become 2378 * idle_balance is called by schedule() if this_cpu is about to become
2358 * idle. Attempts to pull tasks from other CPUs. 2379 * idle. Attempts to pull tasks from other CPUs.
2359 */ 2380 */
2360static inline void idle_balance(int this_cpu, runqueue_t *this_rq) 2381static void idle_balance(int this_cpu, runqueue_t *this_rq)
2361{ 2382{
2362 struct sched_domain *sd; 2383 struct sched_domain *sd;
2363 2384
@@ -2741,7 +2762,7 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
2741 resched_task(rq->idle); 2762 resched_task(rq->idle);
2742} 2763}
2743 2764
2744static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2765static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2745{ 2766{
2746 struct sched_domain *tmp, *sd = NULL; 2767 struct sched_domain *tmp, *sd = NULL;
2747 cpumask_t sibling_map; 2768 cpumask_t sibling_map;
@@ -2795,7 +2816,7 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2795 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 2816 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2796} 2817}
2797 2818
2798static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2819static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2799{ 2820{
2800 struct sched_domain *tmp, *sd = NULL; 2821 struct sched_domain *tmp, *sd = NULL;
2801 cpumask_t sibling_map; 2822 cpumask_t sibling_map;
@@ -3543,7 +3564,7 @@ void set_user_nice(task_t *p, long nice)
3543 * The RT priorities are set via sched_setscheduler(), but we still 3564 * The RT priorities are set via sched_setscheduler(), but we still
3544 * allow the 'normal' nice value to be set - but as expected 3565 * allow the 'normal' nice value to be set - but as expected
3545 * it wont have any effect on scheduling until the task is 3566 * it wont have any effect on scheduling until the task is
3546 * not SCHED_NORMAL: 3567 * not SCHED_NORMAL/SCHED_BATCH:
3547 */ 3568 */
3548 if (rt_task(p)) { 3569 if (rt_task(p)) {
3549 p->static_prio = NICE_TO_PRIO(nice); 3570 p->static_prio = NICE_TO_PRIO(nice);
@@ -3689,10 +3710,16 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3689 BUG_ON(p->array); 3710 BUG_ON(p->array);
3690 p->policy = policy; 3711 p->policy = policy;
3691 p->rt_priority = prio; 3712 p->rt_priority = prio;
3692 if (policy != SCHED_NORMAL) 3713 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
3693 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3714 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
3694 else 3715 } else {
3695 p->prio = p->static_prio; 3716 p->prio = p->static_prio;
3717 /*
3718 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3719 */
3720 if (policy == SCHED_BATCH)
3721 p->sleep_avg = 0;
3722 }
3696} 3723}
3697 3724
3698/** 3725/**
@@ -3716,29 +3743,35 @@ recheck:
3716 if (policy < 0) 3743 if (policy < 0)
3717 policy = oldpolicy = p->policy; 3744 policy = oldpolicy = p->policy;
3718 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3745 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3719 policy != SCHED_NORMAL) 3746 policy != SCHED_NORMAL && policy != SCHED_BATCH)
3720 return -EINVAL; 3747 return -EINVAL;
3721 /* 3748 /*
3722 * Valid priorities for SCHED_FIFO and SCHED_RR are 3749 * Valid priorities for SCHED_FIFO and SCHED_RR are
3723 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3750 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
3751 * SCHED_BATCH is 0.
3724 */ 3752 */
3725 if (param->sched_priority < 0 || 3753 if (param->sched_priority < 0 ||
3726 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3754 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3727 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3755 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3728 return -EINVAL; 3756 return -EINVAL;
3729 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3757 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
3758 != (param->sched_priority == 0))
3730 return -EINVAL; 3759 return -EINVAL;
3731 3760
3732 /* 3761 /*
3733 * Allow unprivileged RT tasks to decrease priority: 3762 * Allow unprivileged RT tasks to decrease priority:
3734 */ 3763 */
3735 if (!capable(CAP_SYS_NICE)) { 3764 if (!capable(CAP_SYS_NICE)) {
3736 /* can't change policy */ 3765 /*
3737 if (policy != p->policy && 3766 * can't change policy, except between SCHED_NORMAL
3738 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3767 * and SCHED_BATCH:
3768 */
3769 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
3770 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
3771 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3739 return -EPERM; 3772 return -EPERM;
3740 /* can't increase priority */ 3773 /* can't increase priority */
3741 if (policy != SCHED_NORMAL && 3774 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
3742 param->sched_priority > p->rt_priority && 3775 param->sched_priority > p->rt_priority &&
3743 param->sched_priority > 3776 param->sched_priority >
3744 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3777 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
@@ -3817,6 +3850,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3817asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 3850asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3818 struct sched_param __user *param) 3851 struct sched_param __user *param)
3819{ 3852{
3853 /* negative values for policy are not valid */
3854 if (policy < 0)
3855 return -EINVAL;
3856
3820 return do_sched_setscheduler(pid, policy, param); 3857 return do_sched_setscheduler(pid, policy, param);
3821} 3858}
3822 3859
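
The scheduler hunks above introduce SCHED_BATCH: it shares priority 0 with SCHED_NORMAL, is treated as a perpetual CPU hog (its sleep_avg is zeroed), and an unprivileged task may switch between SCHED_NORMAL and SCHED_BATCH. A minimal user-space sketch of opting a process in; the fallback value 3 for SCHED_BATCH is an assumption for header sets that do not define it yet:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_BATCH
#define SCHED_BATCH 3	/* assumed value; prefer the kernel headers */
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 0 };	/* must be 0 */

	/* Ask the scheduler to treat this process as a batch job. */
	if (sched_setscheduler(0, SCHED_BATCH, &param) != 0) {
		perror("sched_setscheduler(SCHED_BATCH)");
		return 1;
	}
	printf("now running under SCHED_BATCH\n");
	return 0;
}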
@@ -3972,12 +4009,12 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3972 * method, such as ACPI for e.g. 4009 * method, such as ACPI for e.g.
3973 */ 4010 */
3974 4011
3975cpumask_t cpu_present_map; 4012cpumask_t cpu_present_map __read_mostly;
3976EXPORT_SYMBOL(cpu_present_map); 4013EXPORT_SYMBOL(cpu_present_map);
3977 4014
3978#ifndef CONFIG_SMP 4015#ifndef CONFIG_SMP
3979cpumask_t cpu_online_map = CPU_MASK_ALL; 4016cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
3980cpumask_t cpu_possible_map = CPU_MASK_ALL; 4017cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
3981#endif 4018#endif
3982 4019
3983long sched_getaffinity(pid_t pid, cpumask_t *mask) 4020long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4216,6 +4253,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4216 ret = MAX_USER_RT_PRIO-1; 4253 ret = MAX_USER_RT_PRIO-1;
4217 break; 4254 break;
4218 case SCHED_NORMAL: 4255 case SCHED_NORMAL:
4256 case SCHED_BATCH:
4219 ret = 0; 4257 ret = 0;
4220 break; 4258 break;
4221 } 4259 }
@@ -4239,6 +4277,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4239 ret = 1; 4277 ret = 1;
4240 break; 4278 break;
4241 case SCHED_NORMAL: 4279 case SCHED_NORMAL:
4280 case SCHED_BATCH:
4242 ret = 0; 4281 ret = 0;
4243 } 4282 }
4244 return ret; 4283 return ret;
@@ -4327,10 +4366,10 @@ static void show_task(task_t *p)
4327#endif 4366#endif
4328#ifdef CONFIG_DEBUG_STACK_USAGE 4367#ifdef CONFIG_DEBUG_STACK_USAGE
4329 { 4368 {
4330 unsigned long *n = (unsigned long *) (p->thread_info+1); 4369 unsigned long *n = end_of_stack(p);
4331 while (!*n) 4370 while (!*n)
4332 n++; 4371 n++;
4333 free = (unsigned long) n - (unsigned long)(p->thread_info+1); 4372 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4334 } 4373 }
4335#endif 4374#endif
4336 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); 4375 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
@@ -4379,6 +4418,7 @@ void show_state(void)
4379 } while_each_thread(g, p); 4418 } while_each_thread(g, p);
4380 4419
4381 read_unlock(&tasklist_lock); 4420 read_unlock(&tasklist_lock);
4421 mutex_debug_show_all_locks();
4382} 4422}
4383 4423
4384/** 4424/**
@@ -4410,9 +4450,9 @@ void __devinit init_idle(task_t *idle, int cpu)
4410 4450
4411 /* Set the preempt count _outside_ the spinlocks! */ 4451 /* Set the preempt count _outside_ the spinlocks! */
4412#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4452#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4413 idle->thread_info->preempt_count = (idle->lock_depth >= 0); 4453 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4414#else 4454#else
4415 idle->thread_info->preempt_count = 0; 4455 task_thread_info(idle)->preempt_count = 0;
4416#endif 4456#endif
4417} 4457}
4418 4458
@@ -5073,7 +5113,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5073 5113
5074#define SD_NODES_PER_DOMAIN 16 5114#define SD_NODES_PER_DOMAIN 16
5075 5115
5116/*
5117 * Self-tuning task migration cost measurement between source and target CPUs.
5118 *
5119 * This is done by measuring the cost of manipulating buffers of varying
5120 * sizes. For a given buffer-size here are the steps that are taken:
5121 *
5122 * 1) the source CPU reads+dirties a shared buffer
5123 * 2) the target CPU reads+dirties the same shared buffer
5124 *
5125 * We measure how long they take, in the following 4 scenarios:
5126 *
5127 * - source: CPU1, target: CPU2 | cost1
5128 * - source: CPU2, target: CPU1 | cost2
5129 * - source: CPU1, target: CPU1 | cost3
5130 * - source: CPU2, target: CPU2 | cost4
5131 *
 5132 * We then calculate the cost1+cost2-cost3-cost4 difference - this is
5133 * the cost of migration.
5134 *
5135 * We then start off from a small buffer-size and iterate up to larger
5136 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5137 * doing a maximum search for the cost. (The maximum cost for a migration
5138 * normally occurs when the working set size is around the effective cache
5139 * size.)
5140 */
5141#define SEARCH_SCOPE 2
5142#define MIN_CACHE_SIZE (64*1024U)
5143#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5144#define ITERATIONS 2
5145#define SIZE_THRESH 130
5146#define COST_THRESH 130
5147
5148/*
5149 * The migration cost is a function of 'domain distance'. Domain
5150 * distance is the number of steps a CPU has to iterate down its
5151 * domain tree to share a domain with the other CPU. The farther
5152 * two CPUs are from each other, the larger the distance gets.
5153 *
5154 * Note that we use the distance only to cache measurement results,
5155 * the distance value is not used numerically otherwise. When two
5156 * CPUs have the same distance it is assumed that the migration
5157 * cost is the same. (this is a simplification but quite practical)
5158 */
5159#define MAX_DOMAIN_DISTANCE 32
5160
5161static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5162 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
5163
5164/*
5165 * Allow override of migration cost - in units of microseconds.
5166 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5167 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5168 */
5169static int __init migration_cost_setup(char *str)
5170{
5171 int ints[MAX_DOMAIN_DISTANCE+1], i;
5172
5173 str = get_options(str, ARRAY_SIZE(ints), ints);
5174
5175 printk("#ints: %d\n", ints[0]);
5176 for (i = 1; i <= ints[0]; i++) {
5177 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5178 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5179 }
5180 return 1;
5181}
5182
5183__setup ("migration_cost=", migration_cost_setup);
5184
5185/*
5186 * Global multiplier (divisor) for migration-cutoff values,
5187 * in percentiles. E.g. use a value of 150 to get 1.5 times
5188 * longer cache-hot cutoff times.
5189 *
 5190 * (We scale it from 100 to 128 to make long long handling easier.)
5191 */
5192
5193#define MIGRATION_FACTOR_SCALE 128
5194
5195static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5196
5197static int __init setup_migration_factor(char *str)
5198{
5199 get_option(&str, &migration_factor);
5200 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5201 return 1;
5202}
5203
5204__setup("migration_factor=", setup_migration_factor);
5205
5206/*
5207 * Estimated distance of two CPUs, measured via the number of domains
5208 * we have to pass for the two CPUs to be in the same span:
5209 */
5210static unsigned long domain_distance(int cpu1, int cpu2)
5211{
5212 unsigned long distance = 0;
5213 struct sched_domain *sd;
5214
5215 for_each_domain(cpu1, sd) {
5216 WARN_ON(!cpu_isset(cpu1, sd->span));
5217 if (cpu_isset(cpu2, sd->span))
5218 return distance;
5219 distance++;
5220 }
5221 if (distance >= MAX_DOMAIN_DISTANCE) {
5222 WARN_ON(1);
5223 distance = MAX_DOMAIN_DISTANCE-1;
5224 }
5225
5226 return distance;
5227}
5228
5229static unsigned int migration_debug;
5230
5231static int __init setup_migration_debug(char *str)
5232{
5233 get_option(&str, &migration_debug);
5234 return 1;
5235}
5236
5237__setup("migration_debug=", setup_migration_debug);
5238
5239/*
5240 * Maximum cache-size that the scheduler should try to measure.
5241 * Architectures with larger caches should tune this up during
5242 * bootup. Gets used in the domain-setup code (i.e. during SMP
5243 * bootup).
5244 */
5245unsigned int max_cache_size;
5246
5247static int __init setup_max_cache_size(char *str)
5248{
5249 get_option(&str, &max_cache_size);
5250 return 1;
5251}
5252
5253__setup("max_cache_size=", setup_max_cache_size);
5254
5255/*
5256 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5257 * is the operation that is timed, so we try to generate unpredictable
5258 * cachemisses that still end up filling the L2 cache:
5259 */
5260static void touch_cache(void *__cache, unsigned long __size)
5261{
5262 unsigned long size = __size/sizeof(long), chunk1 = size/3,
5263 chunk2 = 2*size/3;
5264 unsigned long *cache = __cache;
5265 int i;
5266
5267 for (i = 0; i < size/6; i += 8) {
5268 switch (i % 6) {
5269 case 0: cache[i]++;
5270 case 1: cache[size-1-i]++;
5271 case 2: cache[chunk1-i]++;
5272 case 3: cache[chunk1+i]++;
5273 case 4: cache[chunk2-i]++;
5274 case 5: cache[chunk2+i]++;
5275 }
5276 }
5277}
5278
5279/*
5280 * Measure the cache-cost of one task migration. Returns in units of nsec.
5281 */
5282static unsigned long long measure_one(void *cache, unsigned long size,
5283 int source, int target)
5284{
5285 cpumask_t mask, saved_mask;
5286 unsigned long long t0, t1, t2, t3, cost;
5287
5288 saved_mask = current->cpus_allowed;
5289
5290 /*
5291 * Flush source caches to RAM and invalidate them:
5292 */
5293 sched_cacheflush();
5294
5295 /*
5296 * Migrate to the source CPU:
5297 */
5298 mask = cpumask_of_cpu(source);
5299 set_cpus_allowed(current, mask);
5300 WARN_ON(smp_processor_id() != source);
5301
5302 /*
5303 * Dirty the working set:
5304 */
5305 t0 = sched_clock();
5306 touch_cache(cache, size);
5307 t1 = sched_clock();
5308
5309 /*
5310 * Migrate to the target CPU, dirty the L2 cache and access
5311 * the shared buffer. (which represents the working set
5312 * of a migrated task.)
5313 */
5314 mask = cpumask_of_cpu(target);
5315 set_cpus_allowed(current, mask);
5316 WARN_ON(smp_processor_id() != target);
5317
5318 t2 = sched_clock();
5319 touch_cache(cache, size);
5320 t3 = sched_clock();
5321
5322 cost = t1-t0 + t3-t2;
5323
5324 if (migration_debug >= 2)
5325 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
5326 source, target, t1-t0, t1-t0, t3-t2, cost);
5327 /*
5328 * Flush target caches to RAM and invalidate them:
5329 */
5330 sched_cacheflush();
5331
5332 set_cpus_allowed(current, saved_mask);
5333
5334 return cost;
5335}
5336
5337/*
5338 * Measure a series of task migrations and return the average
5339 * result. Since this code runs early during bootup the system
5340 * is 'undisturbed' and the average latency makes sense.
5341 *
5342 * The algorithm in essence auto-detects the relevant cache-size,
5343 * so it will properly detect different cachesizes for different
5344 * cache-hierarchies, depending on how the CPUs are connected.
5345 *
5346 * Architectures can prime the upper limit of the search range via
5347 * max_cache_size, otherwise the search range defaults to 20MB...64K.
5348 */
5349static unsigned long long
5350measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5351{
5352 unsigned long long cost1, cost2;
5353 int i;
5354
5355 /*
5356 * Measure the migration cost of 'size' bytes, over an
5357 * average of 10 runs:
5358 *
5359 * (We perturb the cache size by a small (0..4k)
5360 * value to compensate size/alignment related artifacts.
5361 * We also subtract the cost of the operation done on
5362 * the same CPU.)
5363 */
5364 cost1 = 0;
5365
5366 /*
5367 * dry run, to make sure we start off cache-cold on cpu1,
5368 * and to get any vmalloc pagefaults in advance:
5369 */
5370 measure_one(cache, size, cpu1, cpu2);
5371 for (i = 0; i < ITERATIONS; i++)
5372 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
5373
5374 measure_one(cache, size, cpu2, cpu1);
5375 for (i = 0; i < ITERATIONS; i++)
5376 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
5377
5378 /*
5379 * (We measure the non-migrating [cached] cost on both
5380 * cpu1 and cpu2, to handle CPUs with different speeds)
5381 */
5382 cost2 = 0;
5383
5384 measure_one(cache, size, cpu1, cpu1);
5385 for (i = 0; i < ITERATIONS; i++)
5386 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
5387
5388 measure_one(cache, size, cpu2, cpu2);
5389 for (i = 0; i < ITERATIONS; i++)
5390 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
5391
5392 /*
5393 * Get the per-iteration migration cost:
5394 */
5395 do_div(cost1, 2*ITERATIONS);
5396 do_div(cost2, 2*ITERATIONS);
5397
5398 return cost1 - cost2;
5399}
5400
5401static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5402{
5403 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
5404 unsigned int max_size, size, size_found = 0;
5405 long long cost = 0, prev_cost;
5406 void *cache;
5407
5408 /*
5409 * Search from max_cache_size*5 down to 64K - the real relevant
 5410 * cachesize has to lie somewhere in between.
5411 */
5412 if (max_cache_size) {
5413 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
5414 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
5415 } else {
5416 /*
5417 * Since we have no estimate of the relevant search
5418 * range, fall back to generous defaults:
5419 */
5420 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
5421 size = MIN_CACHE_SIZE;
5422 }
5423
5424 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
5425 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
5426 return 0;
5427 }
5428
5429 /*
5430 * Allocate the working set:
5431 */
5432 cache = vmalloc(max_size);
5433 if (!cache) {
5434 printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
5435 return 1000000; // return 1 msec on very small boxen
5436 }
5437
5438 while (size <= max_size) {
5439 prev_cost = cost;
5440 cost = measure_cost(cpu1, cpu2, cache, size);
5441
5442 /*
5443 * Update the max:
5444 */
5445 if (cost > 0) {
5446 if (max_cost < cost) {
5447 max_cost = cost;
5448 size_found = size;
5449 }
5450 }
5451 /*
5452 * Calculate average fluctuation, we use this to prevent
5453 * noise from triggering an early break out of the loop:
5454 */
5455 fluct = abs(cost - prev_cost);
5456 avg_fluct = (avg_fluct + fluct)/2;
5457
5458 if (migration_debug)
5459 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
5460 cpu1, cpu2, size,
5461 (long)cost / 1000000,
5462 ((long)cost / 100000) % 10,
5463 (long)max_cost / 1000000,
5464 ((long)max_cost / 100000) % 10,
5465 domain_distance(cpu1, cpu2),
5466 cost, avg_fluct);
5467
5468 /*
5469 * If we iterated at least 20% past the previous maximum,
5470 * and the cost has dropped by more than 20% already,
5471 * (taking fluctuations into account) then we assume to
5472 * have found the maximum and break out of the loop early:
5473 */
5474 if (size_found && (size*100 > size_found*SIZE_THRESH))
5475 if (cost+avg_fluct <= 0 ||
5476 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
5477
5478 if (migration_debug)
5479 printk("-> found max.\n");
5480 break;
5481 }
5482 /*
5483 * Increase the cachesize in 5% steps:
5484 */
5485 size = size * 20 / 19;
5486 }
5487
5488 if (migration_debug)
5489 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
5490 cpu1, cpu2, size_found, max_cost);
5491
5492 vfree(cache);
5493
5494 /*
5495 * A task is considered 'cache cold' if at least 2 times
5496 * the worst-case cost of migration has passed.
5497 *
5498 * (this limit is only listened to if the load-balancing
5499 * situation is 'nice' - if there is a large imbalance we
5500 * ignore it for the sake of CPU utilization and
5501 * processing fairness.)
5502 */
5503 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
5504}
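
To get a feel for how many measurements the loop above performs: with the default 64K..20MB search range mentioned earlier, the ~5% (x20/19) growth step yields roughly 113 sizes per CPU pair. A throwaway user-space sketch of that arithmetic (constants taken from the comments above; illustrative, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned int size = 64 * 1024;			/* lower bound of the search */
		unsigned int max_size = 20 * 1024 * 1024;	/* upper bound of the search */
		int steps = 0;

		while (size <= max_size) {
			size = size * 20 / 19;			/* same ~5% growth as the loop above */
			steps++;
		}
		printf("%d size steps\n", steps);		/* about 113 for this range */
		return 0;
	}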
5505
5506static void calibrate_migration_costs(const cpumask_t *cpu_map)
5507{
5508 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
5509 unsigned long j0, j1, distance, max_distance = 0;
5510 struct sched_domain *sd;
5511
5512 j0 = jiffies;
5513
5514 /*
5515 * First pass - calculate the cacheflush times:
5516 */
5517 for_each_cpu_mask(cpu1, *cpu_map) {
5518 for_each_cpu_mask(cpu2, *cpu_map) {
5519 if (cpu1 == cpu2)
5520 continue;
5521 distance = domain_distance(cpu1, cpu2);
5522 max_distance = max(max_distance, distance);
5523 /*
5524 * No result cached yet?
5525 */
5526 if (migration_cost[distance] == -1LL)
5527 migration_cost[distance] =
5528 measure_migration_cost(cpu1, cpu2);
5529 }
5530 }
5531 /*
5532 * Second pass - update the sched domain hierarchy with
5533 * the new cache-hot-time estimations:
5534 */
5535 for_each_cpu_mask(cpu, *cpu_map) {
5536 distance = 0;
5537 for_each_domain(cpu, sd) {
5538 sd->cache_hot_time = migration_cost[distance];
5539 distance++;
5540 }
5541 }
5542 /*
5543 * Print the matrix:
5544 */
5545 if (migration_debug)
5546 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
5547 max_cache_size,
5548#ifdef CONFIG_X86
5549 cpu_khz/1000
5550#else
5551 -1
5552#endif
5553 );
5554 printk("migration_cost=");
5555 for (distance = 0; distance <= max_distance; distance++) {
5556 if (distance)
5557 printk(",");
5558 printk("%ld", (long)migration_cost[distance] / 1000);
5559 }
5560 printk("\n");
5561 j1 = jiffies;
5562 if (migration_debug)
5563 printk("migration: %ld seconds\n", (j1-j0)/HZ);
5564
5565 /*
5566 * Move back to the original CPU. NUMA-Q gets confused
5567 * if we migrate to another quad during bootup.
5568 */
5569 if (raw_smp_processor_id() != orig_cpu) {
5570 cpumask_t mask = cpumask_of_cpu(orig_cpu),
5571 saved_mask = current->cpus_allowed;
5572
5573 set_cpus_allowed(current, mask);
5574 set_cpus_allowed(current, saved_mask);
5575 }
5576}
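
Note that only the first printk above is guarded by migration_debug, so the summary line is emitted whenever the domains are built. With two distinct domain distances it takes the form (the numbers here are purely hypothetical; values are printed in microseconds because the nanosecond costs are divided by 1000):

	migration_cost=4000,8000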
5577
5076#ifdef CONFIG_NUMA 5578#ifdef CONFIG_NUMA
5579
5077/** 5580/**
5078 * find_next_best_node - find the next node to include in a sched_domain 5581 * find_next_best_node - find the next node to include in a sched_domain
5079 * @node: node whose sched_domain we're building 5582 * @node: node whose sched_domain we're building
@@ -5439,6 +5942,10 @@ next_sg:
5439#endif 5942#endif
5440 cpu_attach_domain(sd, i); 5943 cpu_attach_domain(sd, i);
5441 } 5944 }
5945 /*
5946 * Tune cache-hot values:
5947 */
5948 calibrate_migration_costs(cpu_map);
5442} 5949}
5443/* 5950/*
5444 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5951 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
@@ -5505,7 +6012,7 @@ next_sg:
5505 * Detach sched domains from a group of cpus specified in cpu_map 6012 * Detach sched domains from a group of cpus specified in cpu_map
5506 * These cpus will now be attached to the NULL domain 6013 * These cpus will now be attached to the NULL domain
5507 */ 6014 */
5508static inline void detach_destroy_domains(const cpumask_t *cpu_map) 6015static void detach_destroy_domains(const cpumask_t *cpu_map)
5509{ 6016{
5510 int i; 6017 int i;
5511 6018
diff --git a/kernel/signal.c b/kernel/signal.c
index 80789a59b4db..d3efafd8109a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/audit.h> 27#include <linux/audit.h>
28#include <linux/capability.h>
28#include <asm/param.h> 29#include <asm/param.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/unistd.h> 31#include <asm/unistd.h>
@@ -329,13 +330,20 @@ void __exit_sighand(struct task_struct *tsk)
329 /* Ok, we're done with the signal handlers */ 330 /* Ok, we're done with the signal handlers */
330 tsk->sighand = NULL; 331 tsk->sighand = NULL;
331 if (atomic_dec_and_test(&sighand->count)) 332 if (atomic_dec_and_test(&sighand->count))
332 kmem_cache_free(sighand_cachep, sighand); 333 sighand_free(sighand);
333} 334}
334 335
335void exit_sighand(struct task_struct *tsk) 336void exit_sighand(struct task_struct *tsk)
336{ 337{
337 write_lock_irq(&tasklist_lock); 338 write_lock_irq(&tasklist_lock);
338 __exit_sighand(tsk); 339 rcu_read_lock();
340 if (tsk->sighand != NULL) {
341 struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
342 spin_lock(&sighand->siglock);
343 __exit_sighand(tsk);
344 spin_unlock(&sighand->siglock);
345 }
346 rcu_read_unlock();
339 write_unlock_irq(&tasklist_lock); 347 write_unlock_irq(&tasklist_lock);
340} 348}
341 349
@@ -345,19 +353,20 @@ void exit_sighand(struct task_struct *tsk)
345void __exit_signal(struct task_struct *tsk) 353void __exit_signal(struct task_struct *tsk)
346{ 354{
347 struct signal_struct * sig = tsk->signal; 355 struct signal_struct * sig = tsk->signal;
348 struct sighand_struct * sighand = tsk->sighand; 356 struct sighand_struct * sighand;
349 357
350 if (!sig) 358 if (!sig)
351 BUG(); 359 BUG();
352 if (!atomic_read(&sig->count)) 360 if (!atomic_read(&sig->count))
353 BUG(); 361 BUG();
362 rcu_read_lock();
363 sighand = rcu_dereference(tsk->sighand);
354 spin_lock(&sighand->siglock); 364 spin_lock(&sighand->siglock);
355 posix_cpu_timers_exit(tsk); 365 posix_cpu_timers_exit(tsk);
356 if (atomic_dec_and_test(&sig->count)) { 366 if (atomic_dec_and_test(&sig->count)) {
357 posix_cpu_timers_exit_group(tsk); 367 posix_cpu_timers_exit_group(tsk);
358 if (tsk == sig->curr_target)
359 sig->curr_target = next_thread(tsk);
360 tsk->signal = NULL; 368 tsk->signal = NULL;
369 __exit_sighand(tsk);
361 spin_unlock(&sighand->siglock); 370 spin_unlock(&sighand->siglock);
362 flush_sigqueue(&sig->shared_pending); 371 flush_sigqueue(&sig->shared_pending);
363 } else { 372 } else {
@@ -389,9 +398,11 @@ void __exit_signal(struct task_struct *tsk)
389 sig->nvcsw += tsk->nvcsw; 398 sig->nvcsw += tsk->nvcsw;
390 sig->nivcsw += tsk->nivcsw; 399 sig->nivcsw += tsk->nivcsw;
391 sig->sched_time += tsk->sched_time; 400 sig->sched_time += tsk->sched_time;
401 __exit_sighand(tsk);
392 spin_unlock(&sighand->siglock); 402 spin_unlock(&sighand->siglock);
393 sig = NULL; /* Marker for below. */ 403 sig = NULL; /* Marker for below. */
394 } 404 }
405 rcu_read_unlock();
395 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 406 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
396 flush_sigqueue(&tsk->pending); 407 flush_sigqueue(&tsk->pending);
397 if (sig) { 408 if (sig) {
@@ -465,7 +476,7 @@ unblock_all_signals(void)
465 spin_unlock_irqrestore(&current->sighand->siglock, flags); 476 spin_unlock_irqrestore(&current->sighand->siglock, flags);
466} 477}
467 478
468static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info) 479static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
469{ 480{
470 struct sigqueue *q, *first = NULL; 481 struct sigqueue *q, *first = NULL;
471 int still_pending = 0; 482 int still_pending = 0;
@@ -513,16 +524,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
513{ 524{
514 int sig = 0; 525 int sig = 0;
515 526
516 /* SIGKILL must have priority, otherwise it is quite easy 527 sig = next_signal(pending, mask);
517 * to create an unkillable process, sending sig < SIGKILL
518 * to self */
519 if (unlikely(sigismember(&pending->signal, SIGKILL))) {
520 if (!sigismember(mask, SIGKILL))
521 sig = SIGKILL;
522 }
523
524 if (likely(!sig))
525 sig = next_signal(pending, mask);
526 if (sig) { 528 if (sig) {
527 if (current->notifier) { 529 if (current->notifier) {
528 if (sigismember(current->notifier_mask, sig)) { 530 if (sigismember(current->notifier_mask, sig)) {
@@ -622,6 +624,33 @@ void signal_wake_up(struct task_struct *t, int resume)
622 * Returns 1 if any signals were found. 624 * Returns 1 if any signals were found.
623 * 625 *
624 * All callers must be holding the siglock. 626 * All callers must be holding the siglock.
627 *
628 * This version takes a sigset mask and looks at all signals,
629 * not just those in the first mask word.
630 */
631static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
632{
633 struct sigqueue *q, *n;
634 sigset_t m;
635
636 sigandsets(&m, mask, &s->signal);
637 if (sigisemptyset(&m))
638 return 0;
639
640 signandsets(&s->signal, &s->signal, mask);
641 list_for_each_entry_safe(q, n, &s->list, list) {
642 if (sigismember(mask, q->info.si_signo)) {
643 list_del_init(&q->list);
644 __sigqueue_free(q);
645 }
646 }
647 return 1;
648}
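
Unlike rm_from_queue() below, which takes a plain unsigned long and therefore only covers the first word of signals, rm_from_queue_full() is handed a complete sigset_t, so it also works for real-time signals. A hypothetical call site (illustrative only; the real callers are in the do_sigaction() hunk further down, and the siglock must be held as the comment says):

	sigset_t mask;

	sigemptyset(&mask);
	sigaddset(&mask, SIGRTMIN);		/* signals beyond the first word are fine */
	rm_from_queue_full(&mask, &t->pending);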
649/*
650 * Remove signals in mask from the pending set and queue.
651 * Returns 1 if any signals were found.
652 *
653 * All callers must be holding the siglock.
625 */ 654 */
626static int rm_from_queue(unsigned long mask, struct sigpending *s) 655static int rm_from_queue(unsigned long mask, struct sigpending *s)
627{ 656{
@@ -1089,18 +1118,29 @@ void zap_other_threads(struct task_struct *p)
1089} 1118}
1090 1119
1091/* 1120/*
1092 * Must be called with the tasklist_lock held for reading! 1121 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
1093 */ 1122 */
1094int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1123int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1095{ 1124{
1096 unsigned long flags; 1125 unsigned long flags;
1126 struct sighand_struct *sp;
1097 int ret; 1127 int ret;
1098 1128
1129retry:
1099 ret = check_kill_permission(sig, info, p); 1130 ret = check_kill_permission(sig, info, p);
1100 if (!ret && sig && p->sighand) { 1131 if (!ret && sig && (sp = rcu_dereference(p->sighand))) {
1101 spin_lock_irqsave(&p->sighand->siglock, flags); 1132 spin_lock_irqsave(&sp->siglock, flags);
1133 if (p->sighand != sp) {
1134 spin_unlock_irqrestore(&sp->siglock, flags);
1135 goto retry;
1136 }
1137 if ((atomic_read(&sp->count) == 0) ||
1138 (atomic_read(&p->usage) == 0)) {
1139 spin_unlock_irqrestore(&sp->siglock, flags);
1140 return -ESRCH;
1141 }
1102 ret = __group_send_sig_info(sig, info, p); 1142 ret = __group_send_sig_info(sig, info, p);
1103 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1143 spin_unlock_irqrestore(&sp->siglock, flags);
1104 } 1144 }
1105 1145
1106 return ret; 1146 return ret;
@@ -1145,14 +1185,21 @@ int
1145kill_proc_info(int sig, struct siginfo *info, pid_t pid) 1185kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1146{ 1186{
1147 int error; 1187 int error;
1188 int acquired_tasklist_lock = 0;
1148 struct task_struct *p; 1189 struct task_struct *p;
1149 1190
1150 read_lock(&tasklist_lock); 1191 rcu_read_lock();
1192 if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) {
1193 read_lock(&tasklist_lock);
1194 acquired_tasklist_lock = 1;
1195 }
1151 p = find_task_by_pid(pid); 1196 p = find_task_by_pid(pid);
1152 error = -ESRCH; 1197 error = -ESRCH;
1153 if (p) 1198 if (p)
1154 error = group_send_sig_info(sig, info, p); 1199 error = group_send_sig_info(sig, info, p);
1155 read_unlock(&tasklist_lock); 1200 if (unlikely(acquired_tasklist_lock))
1201 read_unlock(&tasklist_lock);
1202 rcu_read_unlock();
1156 return error; 1203 return error;
1157} 1204}
1158 1205
@@ -1172,8 +1219,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1172 ret = -ESRCH; 1219 ret = -ESRCH;
1173 goto out_unlock; 1220 goto out_unlock;
1174 } 1221 }
1175 if ((!info || ((unsigned long)info != 1 && 1222 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
1176 (unsigned long)info != 2 && SI_FROMUSER(info)))
1177 && (euid != p->suid) && (euid != p->uid) 1223 && (euid != p->suid) && (euid != p->uid)
1178 && (uid != p->suid) && (uid != p->uid)) { 1224 && (uid != p->suid) && (uid != p->uid)) {
1179 ret = -EPERM; 1225 ret = -EPERM;
@@ -1364,16 +1410,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1364{ 1410{
1365 unsigned long flags; 1411 unsigned long flags;
1366 int ret = 0; 1412 int ret = 0;
1413 struct sighand_struct *sh;
1367 1414
1368 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1415 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1369 read_lock(&tasklist_lock); 1416
1417 /*
1418 * The rcu based delayed sighand destroy makes it possible to
1419 * run this without tasklist lock held. The task struct itself
1420 * cannot go away as create_timer did get_task_struct().
1421 *
1422 * We return -1, when the task is marked exiting, so
1423 * posix_timer_event can redirect it to the group leader
1424 */
1425 rcu_read_lock();
1370 1426
1371 if (unlikely(p->flags & PF_EXITING)) { 1427 if (unlikely(p->flags & PF_EXITING)) {
1372 ret = -1; 1428 ret = -1;
1373 goto out_err; 1429 goto out_err;
1374 } 1430 }
1375 1431
1376 spin_lock_irqsave(&p->sighand->siglock, flags); 1432retry:
1433 sh = rcu_dereference(p->sighand);
1434
1435 spin_lock_irqsave(&sh->siglock, flags);
1436 if (p->sighand != sh) {
1437 /* We raced with exec() in a multithreaded process... */
1438 spin_unlock_irqrestore(&sh->siglock, flags);
1439 goto retry;
1440 }
1441
1442 /*
1443 * We do the check here again to handle the following scenario:
1444 *
1445 * CPU 0 CPU 1
1446 * send_sigqueue
1447 * check PF_EXITING
1448 * interrupt exit code running
1449 * __exit_signal
1450 * lock sighand->siglock
1451 * unlock sighand->siglock
1452 * lock sh->siglock
1453 * add(tsk->pending) flush_sigqueue(tsk->pending)
1454 *
1455 */
1456
1457 if (unlikely(p->flags & PF_EXITING)) {
1458 ret = -1;
1459 goto out;
1460 }
1377 1461
1378 if (unlikely(!list_empty(&q->list))) { 1462 if (unlikely(!list_empty(&q->list))) {
1379 /* 1463 /*
@@ -1397,9 +1481,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1397 signal_wake_up(p, sig == SIGKILL); 1481 signal_wake_up(p, sig == SIGKILL);
1398 1482
1399out: 1483out:
1400 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1484 spin_unlock_irqrestore(&sh->siglock, flags);
1401out_err: 1485out_err:
1402 read_unlock(&tasklist_lock); 1486 rcu_read_unlock();
1403 1487
1404 return ret; 1488 return ret;
1405} 1489}
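
The retry dance above is the general pattern for taking a lock that lives inside an RCU-protected object: dereference the pointer, take the lock, then re-check that the pointer still refers to the object just locked. Stripped of the signal-specific details it reduces to this sketch (not additional kernel code):

	rcu_read_lock();
retry:
	sh = rcu_dereference(p->sighand);
	spin_lock_irqsave(&sh->siglock, flags);
	if (p->sighand != sh) {
		/* the object was replaced under us (e.g. by exec) - try again */
		spin_unlock_irqrestore(&sh->siglock, flags);
		goto retry;
	}
	/* ... sh is pinned until the unlock ... */
	spin_unlock_irqrestore(&sh->siglock, flags);
	rcu_read_unlock();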
@@ -1411,7 +1495,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1411 int ret = 0; 1495 int ret = 0;
1412 1496
1413 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1497 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1498
1414 read_lock(&tasklist_lock); 1499 read_lock(&tasklist_lock);
1500 /* Since it_lock is held, p->sighand cannot be NULL. */
1415 spin_lock_irqsave(&p->sighand->siglock, flags); 1501 spin_lock_irqsave(&p->sighand->siglock, flags);
1416 handle_stop_signal(sig, p); 1502 handle_stop_signal(sig, p);
1417 1503
@@ -1445,7 +1531,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1445out: 1531out:
1446 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1532 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1447 read_unlock(&tasklist_lock); 1533 read_unlock(&tasklist_lock);
1448 return(ret); 1534 return ret;
1449} 1535}
1450 1536
1451/* 1537/*
@@ -1795,7 +1881,7 @@ do_signal_stop(int signr)
1795 * We return zero if we still hold the siglock and should look 1881 * We return zero if we still hold the siglock and should look
1796 * for another signal without checking group_stop_count again. 1882 * for another signal without checking group_stop_count again.
1797 */ 1883 */
1798static inline int handle_group_stop(void) 1884static int handle_group_stop(void)
1799{ 1885{
1800 int stop_count; 1886 int stop_count;
1801 1887
@@ -2347,6 +2433,7 @@ int
2347do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) 2433do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2348{ 2434{
2349 struct k_sigaction *k; 2435 struct k_sigaction *k;
2436 sigset_t mask;
2350 2437
2351 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 2438 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
2352 return -EINVAL; 2439 return -EINVAL;
@@ -2394,9 +2481,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2394 *k = *act; 2481 *k = *act;
2395 sigdelsetmask(&k->sa.sa_mask, 2482 sigdelsetmask(&k->sa.sa_mask,
2396 sigmask(SIGKILL) | sigmask(SIGSTOP)); 2483 sigmask(SIGKILL) | sigmask(SIGSTOP));
2397 rm_from_queue(sigmask(sig), &t->signal->shared_pending); 2484 sigemptyset(&mask);
2485 sigaddset(&mask, sig);
2486 rm_from_queue_full(&mask, &t->signal->shared_pending);
2398 do { 2487 do {
2399 rm_from_queue(sigmask(sig), &t->pending); 2488 rm_from_queue_full(&mask, &t->pending);
2400 recalc_sigpending_tsk(t); 2489 recalc_sigpending_tsk(t);
2401 t = next_thread(t); 2490 t = next_thread(t);
2402 } while (t != current); 2491 } while (t != current);
@@ -2632,6 +2721,32 @@ sys_pause(void)
2632 2721
2633#endif 2722#endif
2634 2723
2724#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
2725asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2726{
2727 sigset_t newset;
2728
2729 /* XXX: Don't preclude handling different sized sigset_t's. */
2730 if (sigsetsize != sizeof(sigset_t))
2731 return -EINVAL;
2732
2733 if (copy_from_user(&newset, unewset, sizeof(newset)))
2734 return -EFAULT;
2735 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2736
2737 spin_lock_irq(&current->sighand->siglock);
2738 current->saved_sigmask = current->blocked;
2739 current->blocked = newset;
2740 recalc_sigpending();
2741 spin_unlock_irq(&current->sighand->siglock);
2742
2743 current->state = TASK_INTERRUPTIBLE;
2744 schedule();
2745 set_thread_flag(TIF_RESTORE_SIGMASK);
2746 return -ERESTARTNOHAND;
2747}
2748#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
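
In user space this syscall is what glibc's sigsuspend() is typically built on: it atomically installs a temporary signal mask and sleeps until a signal is delivered, restoring the old mask on return (via the TIF_RESTORE_SIGMASK path above). A small illustrative program, assuming a glibc environment (not part of the patch):

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>

	static volatile sig_atomic_t got_usr1;

	static void on_usr1(int sig)
	{
		got_usr1 = 1;
	}

	int main(void)
	{
		struct sigaction sa;
		sigset_t block, old, wait_mask;

		memset(&sa, 0, sizeof(sa));
		sa.sa_handler = on_usr1;
		sigemptyset(&sa.sa_mask);
		sigaction(SIGUSR1, &sa, NULL);

		sigemptyset(&block);
		sigaddset(&block, SIGUSR1);
		sigprocmask(SIG_BLOCK, &block, &old);	/* SIGUSR1 stays pending for now */

		/* ... work that must not be interrupted by SIGUSR1 ... */

		wait_mask = old;			/* the original mask, without SIGUSR1 */
		sigsuspend(&wait_mask);			/* atomically unblock and sleep */

		printf("woke up, got_usr1=%d\n", got_usr1);
		sigprocmask(SIG_SETMASK, &old, NULL);
		return 0;
	}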
2749
2635void __init signals_init(void) 2750void __init signals_init(void)
2636{ 2751{
2637 sigqueue_cachep = 2752 sigqueue_cachep =
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84a9d18aa8da..dcfb5d731466 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -87,13 +87,9 @@ static int stop_machine(void)
87{ 87{
88 int i, ret = 0; 88 int i, ret = 0;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 mm_segment_t old_fs = get_fs();
91 90
92 /* One high-prio thread per cpu. We'll do this one. */ 91 /* One high-prio thread per cpu. We'll do this one. */
93 set_fs(KERNEL_DS); 92 sched_setscheduler(current, SCHED_FIFO, &param);
94 sys_sched_setscheduler(current->pid, SCHED_FIFO,
95 (struct sched_param __user *)&param);
96 set_fs(old_fs);
97 93
98 atomic_set(&stopmachine_thread_ack, 0); 94 atomic_set(&stopmachine_thread_ack, 0);
99 stopmachine_num_threads = 0; 95 stopmachine_num_threads = 0;
@@ -119,13 +115,12 @@ static int stop_machine(void)
119 return ret; 115 return ret;
120 } 116 }
121 117
122 /* Don't schedule us away at this point, please. */
123 local_irq_disable();
124
125 /* Now they are all started, make them hold the CPUs, ready. */ 118 /* Now they are all started, make them hold the CPUs, ready. */
119 preempt_disable();
126 stopmachine_set_state(STOPMACHINE_PREPARE); 120 stopmachine_set_state(STOPMACHINE_PREPARE);
127 121
128 /* Make them disable irqs. */ 122 /* Make them disable irqs. */
123 local_irq_disable();
129 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); 124 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
130 125
131 return 0; 126 return 0;
@@ -135,6 +130,7 @@ static void restart_machine(void)
135{ 130{
136 stopmachine_set_state(STOPMACHINE_EXIT); 131 stopmachine_set_state(STOPMACHINE_EXIT);
137 local_irq_enable(); 132 local_irq_enable();
133 preempt_enable_no_resched();
138} 134}
139 135
140struct stop_machine_data 136struct stop_machine_data
diff --git a/kernel/sys.c b/kernel/sys.c
index bce933ebb29f..d09cac23fdfd 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -19,6 +19,7 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
21#include <linux/workqueue.h> 21#include <linux/workqueue.h>
22#include <linux/capability.h>
22#include <linux/device.h> 23#include <linux/device.h>
23#include <linux/key.h> 24#include <linux/key.h>
24#include <linux/times.h> 25#include <linux/times.h>
@@ -32,6 +33,7 @@
32 33
33#include <linux/compat.h> 34#include <linux/compat.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/kprobes.h>
35 37
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37#include <asm/io.h> 39#include <asm/io.h>
@@ -168,7 +170,7 @@ EXPORT_SYMBOL(notifier_chain_unregister);
168 * of the last notifier function called. 170 * of the last notifier function called.
169 */ 171 */
170 172
171int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 173int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
172{ 174{
173 int ret=NOTIFY_DONE; 175 int ret=NOTIFY_DONE;
174 struct notifier_block *nb = *n; 176 struct notifier_block *nb = *n;
@@ -222,6 +224,18 @@ int unregister_reboot_notifier(struct notifier_block * nb)
222 224
223EXPORT_SYMBOL(unregister_reboot_notifier); 225EXPORT_SYMBOL(unregister_reboot_notifier);
224 226
227#ifndef CONFIG_SECURITY
228int capable(int cap)
229{
230 if (cap_raised(current->cap_effective, cap)) {
231 current->flags |= PF_SUPERPRIV;
232 return 1;
233 }
234 return 0;
235}
236EXPORT_SYMBOL(capable);
237#endif
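
This fallback keeps the standard permission-check idiom working when no security module provides its own capable(). A typical (illustrative) call site elsewhere in the kernel looks like:

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;	/* caller lacks the required capability */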
238
225static int set_one_prio(struct task_struct *p, int niceval, int error) 239static int set_one_prio(struct task_struct *p, int niceval, int error)
226{ 240{
227 int no_nice; 241 int no_nice;
@@ -488,6 +502,12 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
488 magic2 != LINUX_REBOOT_MAGIC2C)) 502 magic2 != LINUX_REBOOT_MAGIC2C))
489 return -EINVAL; 503 return -EINVAL;
490 504
505 /* Instead of trying to make the power_off code look like
506 * halt when pm_power_off is not set do it the easy way.
507 */
508 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
509 cmd = LINUX_REBOOT_CMD_HALT;
510
491 lock_kernel(); 511 lock_kernel();
492 switch (cmd) { 512 switch (cmd) {
493 case LINUX_REBOOT_CMD_RESTART: 513 case LINUX_REBOOT_CMD_RESTART:
@@ -1083,10 +1103,11 @@ asmlinkage long sys_times(struct tms __user * tbuf)
1083asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) 1103asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1084{ 1104{
1085 struct task_struct *p; 1105 struct task_struct *p;
1106 struct task_struct *group_leader = current->group_leader;
1086 int err = -EINVAL; 1107 int err = -EINVAL;
1087 1108
1088 if (!pid) 1109 if (!pid)
1089 pid = current->pid; 1110 pid = group_leader->pid;
1090 if (!pgid) 1111 if (!pgid)
1091 pgid = pid; 1112 pgid = pid;
1092 if (pgid < 0) 1113 if (pgid < 0)
@@ -1106,16 +1127,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1106 if (!thread_group_leader(p)) 1127 if (!thread_group_leader(p))
1107 goto out; 1128 goto out;
1108 1129
1109 if (p->parent == current || p->real_parent == current) { 1130 if (p->real_parent == group_leader) {
1110 err = -EPERM; 1131 err = -EPERM;
1111 if (p->signal->session != current->signal->session) 1132 if (p->signal->session != group_leader->signal->session)
1112 goto out; 1133 goto out;
1113 err = -EACCES; 1134 err = -EACCES;
1114 if (p->did_exec) 1135 if (p->did_exec)
1115 goto out; 1136 goto out;
1116 } else { 1137 } else {
1117 err = -ESRCH; 1138 err = -ESRCH;
1118 if (p != current) 1139 if (p != group_leader)
1119 goto out; 1140 goto out;
1120 } 1141 }
1121 1142
@@ -1127,7 +1148,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1127 struct task_struct *p; 1148 struct task_struct *p;
1128 1149
1129 do_each_task_pid(pgid, PIDTYPE_PGID, p) { 1150 do_each_task_pid(pgid, PIDTYPE_PGID, p) {
1130 if (p->signal->session == current->signal->session) 1151 if (p->signal->session == group_leader->signal->session)
1131 goto ok_pgid; 1152 goto ok_pgid;
1132 } while_each_task_pid(pgid, PIDTYPE_PGID, p); 1153 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
1133 goto out; 1154 goto out;
@@ -1207,24 +1228,22 @@ asmlinkage long sys_getsid(pid_t pid)
1207 1228
1208asmlinkage long sys_setsid(void) 1229asmlinkage long sys_setsid(void)
1209{ 1230{
1231 struct task_struct *group_leader = current->group_leader;
1210 struct pid *pid; 1232 struct pid *pid;
1211 int err = -EPERM; 1233 int err = -EPERM;
1212 1234
1213 if (!thread_group_leader(current))
1214 return -EINVAL;
1215
1216 down(&tty_sem); 1235 down(&tty_sem);
1217 write_lock_irq(&tasklist_lock); 1236 write_lock_irq(&tasklist_lock);
1218 1237
1219 pid = find_pid(PIDTYPE_PGID, current->pid); 1238 pid = find_pid(PIDTYPE_PGID, group_leader->pid);
1220 if (pid) 1239 if (pid)
1221 goto out; 1240 goto out;
1222 1241
1223 current->signal->leader = 1; 1242 group_leader->signal->leader = 1;
1224 __set_special_pids(current->pid, current->pid); 1243 __set_special_pids(group_leader->pid, group_leader->pid);
1225 current->signal->tty = NULL; 1244 group_leader->signal->tty = NULL;
1226 current->signal->tty_old_pgrp = 0; 1245 group_leader->signal->tty_old_pgrp = 0;
1227 err = process_group(current); 1246 err = process_group(group_leader);
1228out: 1247out:
1229 write_unlock_irq(&tasklist_lock); 1248 write_unlock_irq(&tasklist_lock);
1230 up(&tty_sem); 1249 up(&tty_sem);
@@ -1686,7 +1705,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1686 if (unlikely(!p->signal)) 1705 if (unlikely(!p->signal))
1687 return; 1706 return;
1688 1707
1708 utime = stime = cputime_zero;
1709
1689 switch (who) { 1710 switch (who) {
1711 case RUSAGE_BOTH:
1690 case RUSAGE_CHILDREN: 1712 case RUSAGE_CHILDREN:
1691 spin_lock_irqsave(&p->sighand->siglock, flags); 1713 spin_lock_irqsave(&p->sighand->siglock, flags);
1692 utime = p->signal->cutime; 1714 utime = p->signal->cutime;
@@ -1696,22 +1718,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1696 r->ru_minflt = p->signal->cmin_flt; 1718 r->ru_minflt = p->signal->cmin_flt;
1697 r->ru_majflt = p->signal->cmaj_flt; 1719 r->ru_majflt = p->signal->cmaj_flt;
1698 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1720 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1699 cputime_to_timeval(utime, &r->ru_utime); 1721
1700 cputime_to_timeval(stime, &r->ru_stime); 1722 if (who == RUSAGE_CHILDREN)
1701 break; 1723 break;
1724
1702 case RUSAGE_SELF: 1725 case RUSAGE_SELF:
1703 spin_lock_irqsave(&p->sighand->siglock, flags);
1704 utime = stime = cputime_zero;
1705 goto sum_group;
1706 case RUSAGE_BOTH:
1707 spin_lock_irqsave(&p->sighand->siglock, flags);
1708 utime = p->signal->cutime;
1709 stime = p->signal->cstime;
1710 r->ru_nvcsw = p->signal->cnvcsw;
1711 r->ru_nivcsw = p->signal->cnivcsw;
1712 r->ru_minflt = p->signal->cmin_flt;
1713 r->ru_majflt = p->signal->cmaj_flt;
1714 sum_group:
1715 utime = cputime_add(utime, p->signal->utime); 1726 utime = cputime_add(utime, p->signal->utime);
1716 stime = cputime_add(stime, p->signal->stime); 1727 stime = cputime_add(stime, p->signal->stime);
1717 r->ru_nvcsw += p->signal->nvcsw; 1728 r->ru_nvcsw += p->signal->nvcsw;
@@ -1728,13 +1739,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1728 r->ru_majflt += t->maj_flt; 1739 r->ru_majflt += t->maj_flt;
1729 t = next_thread(t); 1740 t = next_thread(t);
1730 } while (t != p); 1741 } while (t != p);
1731 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1732 cputime_to_timeval(utime, &r->ru_utime);
1733 cputime_to_timeval(stime, &r->ru_stime);
1734 break; 1742 break;
1743
1735 default: 1744 default:
1736 BUG(); 1745 BUG();
1737 } 1746 }
1747
1748 cputime_to_timeval(utime, &r->ru_utime);
1749 cputime_to_timeval(stime, &r->ru_stime);
1738} 1750}
1739 1751
1740int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1752int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370e2efa..17313b99e53d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -82,6 +82,28 @@ cond_syscall(compat_sys_socketcall);
82cond_syscall(sys_inotify_init); 82cond_syscall(sys_inotify_init);
83cond_syscall(sys_inotify_add_watch); 83cond_syscall(sys_inotify_add_watch);
84cond_syscall(sys_inotify_rm_watch); 84cond_syscall(sys_inotify_rm_watch);
85cond_syscall(sys_migrate_pages);
86cond_syscall(sys_chown16);
87cond_syscall(sys_fchown16);
88cond_syscall(sys_getegid16);
89cond_syscall(sys_geteuid16);
90cond_syscall(sys_getgid16);
91cond_syscall(sys_getgroups16);
92cond_syscall(sys_getresgid16);
93cond_syscall(sys_getresuid16);
94cond_syscall(sys_getuid16);
95cond_syscall(sys_lchown16);
96cond_syscall(sys_setfsgid16);
97cond_syscall(sys_setfsuid16);
98cond_syscall(sys_setgid16);
99cond_syscall(sys_setgroups16);
100cond_syscall(sys_setregid16);
101cond_syscall(sys_setresgid16);
102cond_syscall(sys_setresuid16);
103cond_syscall(sys_setreuid16);
104cond_syscall(sys_setuid16);
105cond_syscall(sys_vm86old);
106cond_syscall(sys_vm86);
85 107
86/* arch-specific weak syscall entries */ 108/* arch-specific weak syscall entries */
87cond_syscall(sys_pciconfig_read); 109cond_syscall(sys_pciconfig_read);
@@ -90,3 +112,5 @@ cond_syscall(sys_pciconfig_iobase);
90cond_syscall(sys32_ipc); 112cond_syscall(sys32_ipc);
91cond_syscall(sys32_sysctl); 113cond_syscall(sys32_sysctl);
92cond_syscall(ppc_rtas); 114cond_syscall(ppc_rtas);
115cond_syscall(sys_spu_run);
116cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9990e10192e8..cb99a42f8b37 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -25,12 +25,14 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
28#include <linux/capability.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
29#include <linux/utsname.h> 30#include <linux/utsname.h>
30#include <linux/capability.h> 31#include <linux/capability.h>
31#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
32#include <linux/init.h> 33#include <linux/init.h>
33#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/kobject.h>
34#include <linux/net.h> 36#include <linux/net.h>
35#include <linux/sysrq.h> 37#include <linux/sysrq.h>
36#include <linux/highuid.h> 38#include <linux/highuid.h>
@@ -67,6 +69,8 @@ extern int min_free_kbytes;
67extern int printk_ratelimit_jiffies; 69extern int printk_ratelimit_jiffies;
68extern int printk_ratelimit_burst; 70extern int printk_ratelimit_burst;
69extern int pid_max_min, pid_max_max; 71extern int pid_max_min, pid_max_max;
72extern int sysctl_drop_caches;
73extern int percpu_pagelist_fraction;
70 74
71#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 75#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
72int unknown_nmi_panic; 76int unknown_nmi_panic;
@@ -77,15 +81,13 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
77/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
78static int maxolduid = 65535; 82static int maxolduid = 65535;
79static int minolduid; 83static int minolduid;
84static int min_percpu_pagelist_fract = 8;
80 85
81static int ngroups_max = NGROUPS_MAX; 86static int ngroups_max = NGROUPS_MAX;
82 87
83#ifdef CONFIG_KMOD 88#ifdef CONFIG_KMOD
84extern char modprobe_path[]; 89extern char modprobe_path[];
85#endif 90#endif
86#ifdef CONFIG_HOTPLUG
87extern char hotplug_path[];
88#endif
89#ifdef CONFIG_CHR_DEV_SG 91#ifdef CONFIG_CHR_DEV_SG
90extern int sg_big_buff; 92extern int sg_big_buff;
91#endif 93#endif
@@ -110,7 +112,7 @@ extern int pwrsw_enabled;
110extern int unaligned_enabled; 112extern int unaligned_enabled;
111#endif 113#endif
112 114
113#ifdef CONFIG_ARCH_S390 115#ifdef CONFIG_S390
114#ifdef CONFIG_MATHEMU 116#ifdef CONFIG_MATHEMU
115extern int sysctl_ieee_emulation_warnings; 117extern int sysctl_ieee_emulation_warnings;
116#endif 118#endif
@@ -397,8 +399,8 @@ static ctl_table kern_table[] = {
397 { 399 {
398 .ctl_name = KERN_HOTPLUG, 400 .ctl_name = KERN_HOTPLUG,
399 .procname = "hotplug", 401 .procname = "hotplug",
400 .data = &hotplug_path, 402 .data = &uevent_helper,
401 .maxlen = HOTPLUG_PATH_LEN, 403 .maxlen = UEVENT_HELPER_PATH_LEN,
402 .mode = 0644, 404 .mode = 0644,
403 .proc_handler = &proc_dostring, 405 .proc_handler = &proc_dostring,
404 .strategy = &sysctl_string, 406 .strategy = &sysctl_string,
@@ -544,7 +546,7 @@ static ctl_table kern_table[] = {
544 .extra1 = &minolduid, 546 .extra1 = &minolduid,
545 .extra2 = &maxolduid, 547 .extra2 = &maxolduid,
546 }, 548 },
547#ifdef CONFIG_ARCH_S390 549#ifdef CONFIG_S390
548#ifdef CONFIG_MATHEMU 550#ifdef CONFIG_MATHEMU
549 { 551 {
550 .ctl_name = KERN_IEEE_EMULATION_WARNINGS, 552 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
@@ -646,7 +648,7 @@ static ctl_table kern_table[] = {
646 .mode = 0644, 648 .mode = 0644,
647 .proc_handler = &proc_dointvec, 649 .proc_handler = &proc_dointvec,
648 }, 650 },
649#if defined(CONFIG_ARCH_S390) 651#if defined(CONFIG_S390) && defined(CONFIG_SMP)
650 { 652 {
651 .ctl_name = KERN_SPIN_RETRY, 653 .ctl_name = KERN_SPIN_RETRY,
652 .procname = "spin_retry", 654 .procname = "spin_retry",
@@ -777,6 +779,15 @@ static ctl_table vm_table[] = {
777 .strategy = &sysctl_intvec, 779 .strategy = &sysctl_intvec,
778 }, 780 },
779 { 781 {
782 .ctl_name = VM_DROP_PAGECACHE,
783 .procname = "drop_caches",
784 .data = &sysctl_drop_caches,
785 .maxlen = sizeof(int),
786 .mode = 0644,
787 .proc_handler = drop_caches_sysctl_handler,
788 .strategy = &sysctl_intvec,
789 },
790 {
780 .ctl_name = VM_MIN_FREE_KBYTES, 791 .ctl_name = VM_MIN_FREE_KBYTES,
781 .procname = "min_free_kbytes", 792 .procname = "min_free_kbytes",
782 .data = &min_free_kbytes, 793 .data = &min_free_kbytes,
@@ -786,6 +797,16 @@ static ctl_table vm_table[] = {
786 .strategy = &sysctl_intvec, 797 .strategy = &sysctl_intvec,
787 .extra1 = &zero, 798 .extra1 = &zero,
788 }, 799 },
800 {
801 .ctl_name = VM_PERCPU_PAGELIST_FRACTION,
802 .procname = "percpu_pagelist_fraction",
803 .data = &percpu_pagelist_fraction,
804 .maxlen = sizeof(percpu_pagelist_fraction),
805 .mode = 0644,
806 .proc_handler = &percpu_pagelist_fraction_sysctl_handler,
807 .strategy = &sysctl_intvec,
808 .extra1 = &min_percpu_pagelist_fract,
809 },
789#ifdef CONFIG_MMU 810#ifdef CONFIG_MMU
790 { 811 {
791 .ctl_name = VM_MAX_MAP_COUNT, 812 .ctl_name = VM_MAX_MAP_COUNT,
@@ -849,6 +870,17 @@ static ctl_table vm_table[] = {
849 .strategy = &sysctl_jiffies, 870 .strategy = &sysctl_jiffies,
850 }, 871 },
851#endif 872#endif
873#ifdef CONFIG_NUMA
874 {
875 .ctl_name = VM_ZONE_RECLAIM_MODE,
876 .procname = "zone_reclaim_mode",
877 .data = &zone_reclaim_mode,
878 .maxlen = sizeof(zone_reclaim_mode),
879 .mode = 0644,
880 .proc_handler = &proc_dointvec,
881 .strategy = &zero,
882 },
883#endif
852 { .ctl_name = 0 } 884 { .ctl_name = 0 }
853}; 885};
854 886
@@ -2192,29 +2224,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2192 void __user *oldval, size_t __user *oldlenp, 2224 void __user *oldval, size_t __user *oldlenp,
2193 void __user *newval, size_t newlen, void **context) 2225 void __user *newval, size_t newlen, void **context)
2194{ 2226{
2195 size_t l, len;
2196
2197 if (!table->data || !table->maxlen) 2227 if (!table->data || !table->maxlen)
2198 return -ENOTDIR; 2228 return -ENOTDIR;
2199 2229
2200 if (oldval && oldlenp) { 2230 if (oldval && oldlenp) {
2201 if (get_user(len, oldlenp)) 2231 size_t bufsize;
2232 if (get_user(bufsize, oldlenp))
2202 return -EFAULT; 2233 return -EFAULT;
2203 if (len) { 2234 if (bufsize) {
2204 l = strlen(table->data); 2235 size_t len = strlen(table->data), copied;
2205 if (len > l) len = l; 2236
2206 if (len >= table->maxlen) 2237 /* This shouldn't trigger for a well-formed sysctl */
2238 if (len > table->maxlen)
2207 len = table->maxlen; 2239 len = table->maxlen;
2208 if(copy_to_user(oldval, table->data, len)) 2240
2209 return -EFAULT; 2241 /* Copy up to a max of bufsize-1 bytes of the string */
2210 if(put_user(0, ((char __user *) oldval) + len)) 2242 copied = (len >= bufsize) ? bufsize - 1 : len;
2243
2244 if (copy_to_user(oldval, table->data, copied) ||
2245 put_user(0, (char __user *)(oldval + copied)))
2211 return -EFAULT; 2246 return -EFAULT;
2212 if(put_user(len, oldlenp)) 2247 if (put_user(len, oldlenp))
2213 return -EFAULT; 2248 return -EFAULT;
2214 } 2249 }
2215 } 2250 }
2216 if (newval && newlen) { 2251 if (newval && newlen) {
2217 len = newlen; 2252 size_t len = newlen;
2218 if (len > table->maxlen) 2253 if (len > table->maxlen)
2219 len = table->maxlen; 2254 len = table->maxlen;
2220 if(copy_from_user(table->data, newval, len)) 2255 if(copy_from_user(table->data, newval, len))
@@ -2223,7 +2258,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2223 len--; 2258 len--;
2224 ((char *) table->data)[len] = 0; 2259 ((char *) table->data)[len] = 0;
2225 } 2260 }
2226 return 0; 2261 return 1;
2227} 2262}
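
The rewritten copy-out path follows the familiar "truncate to the caller's buffer but report the full length" convention, much like snprintf(). A minimal user-space sketch of just that convention (the function name is illustrative, not a kernel API):

	#include <string.h>

	/* Copy at most bufsize-1 bytes, always NUL-terminate, report the full length. */
	static size_t copy_reporting_full_len(char *dst, size_t bufsize, const char *src)
	{
		size_t len = strlen(src);

		if (bufsize) {
			size_t copied = (len >= bufsize) ? bufsize - 1 : len;

			memcpy(dst, src, copied);
			dst[copied] = '\0';
		}
		return len;	/* a result >= bufsize tells the caller the string was truncated */
	}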
2228 2263
2229/* 2264/*
diff --git a/kernel/time.c b/kernel/time.c
index 245d595a13cb..7477b1d2079e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h>
32#include <linux/errno.h> 33#include <linux/errno.h>
33#include <linux/smp_lock.h> 34#include <linux/smp_lock.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
@@ -154,6 +155,9 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
154 static int firsttime = 1; 155 static int firsttime = 1;
155 int error = 0; 156 int error = 0;
156 157
158 if (!timespec_valid(tv))
159 return -EINVAL;
160
157 error = security_settime(tv, tz); 161 error = security_settime(tv, tz);
158 if (error) 162 if (error)
159 return error; 163 return error;
@@ -561,6 +565,108 @@ void getnstimeofday(struct timespec *tv)
561EXPORT_SYMBOL_GPL(getnstimeofday); 565EXPORT_SYMBOL_GPL(getnstimeofday);
562#endif 566#endif
563 567
568/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
569 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
570 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
571 *
572 * [For the Julian calendar (which was used in Russia before 1917,
573 * Britain & colonies before 1752, anywhere else before 1582,
574 * and is still in use by some communities) leave out the
575 * -year/100+year/400 terms, and add 10.]
576 *
577 * This algorithm was first published by Gauss (I think).
578 *
579 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
580 * machines where long is 32-bit! (However, as time_t is signed, we
581 * will already get problems at other places on 2038-01-19 03:14:08)
582 */
583unsigned long
584mktime(const unsigned int year0, const unsigned int mon0,
585 const unsigned int day, const unsigned int hour,
586 const unsigned int min, const unsigned int sec)
587{
588 unsigned int mon = mon0, year = year0;
589
590 /* 1..12 -> 11,12,1..10 */
591 if (0 >= (int) (mon -= 2)) {
592 mon += 12; /* Puts Feb last since it has leap day */
593 year -= 1;
594 }
595
596 return ((((unsigned long)
597 (year/4 - year/100 + year/400 + 367*mon/12 + day) +
598 year*365 - 719499
599 )*24 + hour /* now have hours */
600 )*60 + min /* now have minutes */
601 )*60 + sec; /* finally seconds */
602}
603
604EXPORT_SYMBOL(mktime);
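
A quick way to sanity-check the formula is to plug in known dates; both results below assume the mktime() above, i.e. UTC with no leap-second handling:

	mktime(1970, 1, 1, 0, 0, 0);	/* == 0, the Unix epoch             */
	mktime(2000, 1, 1, 0, 0, 0);	/* == 946684800, i.e. 10957 * 86400 */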
605
606/**
607 * set_normalized_timespec - set timespec sec and nsec parts and normalize
608 *
609 * @ts: pointer to timespec variable to be set
610 * @sec: seconds to set
611 * @nsec: nanoseconds to set
612 *
613 * Set seconds and nanoseconds field of a timespec variable and
614 * normalize to the timespec storage format
615 *
616 * Note: The tv_nsec part is always in the range of
617 * 0 <= tv_nsec < NSEC_PER_SEC
618 * For negative values, only the tv_sec field is negative!
619 */
620void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
621{
622 while (nsec >= NSEC_PER_SEC) {
623 nsec -= NSEC_PER_SEC;
624 ++sec;
625 }
626 while (nsec < 0) {
627 nsec += NSEC_PER_SEC;
628 --sec;
629 }
630 ts->tv_sec = sec;
631 ts->tv_nsec = nsec;
632}
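
Two illustrative calls showing the normalization performed above:

	struct timespec ts;

	set_normalized_timespec(&ts, 5, 1500000000L);	/* -> tv_sec = 6, tv_nsec = 500000000 */
	set_normalized_timespec(&ts, 5, -1L);		/* -> tv_sec = 4, tv_nsec = 999999999 */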
633
634/**
635 * ns_to_timespec - Convert nanoseconds to timespec
636 * @nsec: the nanoseconds value to be converted
637 *
638 * Returns the timespec representation of the nsec parameter.
639 */
640inline struct timespec ns_to_timespec(const nsec_t nsec)
641{
642 struct timespec ts;
643
644 if (nsec)
645 ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC,
646 &ts.tv_nsec);
647 else
648 ts.tv_sec = ts.tv_nsec = 0;
649
650 return ts;
651}
652
653/**
654 * ns_to_timeval - Convert nanoseconds to timeval
655 * @nsec: the nanoseconds value to be converted
656 *
657 * Returns the timeval representation of the nsec parameter.
658 */
659struct timeval ns_to_timeval(const nsec_t nsec)
660{
661 struct timespec ts = ns_to_timespec(nsec);
662 struct timeval tv;
663
664 tv.tv_sec = ts.tv_sec;
665 tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
666
667 return tv;
668}
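
Illustrative values for the two conversion helpers above:

	ns_to_timespec(1500000000LL);	/* -> tv_sec = 1, tv_nsec = 500000000 */
	ns_to_timeval(1500000000LL);	/* -> tv_sec = 1, tv_usec = 500000    */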
669
564#if (BITS_PER_LONG < 64) 670#if (BITS_PER_LONG < 64)
565u64 get_jiffies_64(void) 671u64 get_jiffies_64(void)
566{ 672{
diff --git a/kernel/timer.c b/kernel/timer.c
index fd74268d8663..4f1cb0ab5251 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -33,6 +33,7 @@
33#include <linux/posix-timers.h> 33#include <linux/posix-timers.h>
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/delay.h>
36 37
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38#include <asm/unistd.h> 39#include <asm/unistd.h>
@@ -857,6 +858,7 @@ static void run_timer_softirq(struct softirq_action *h)
857{ 858{
858 tvec_base_t *base = &__get_cpu_var(tvec_bases); 859 tvec_base_t *base = &__get_cpu_var(tvec_bases);
859 860
861 hrtimer_run_queues();
860 if (time_after_eq(jiffies, base->timer_jiffies)) 862 if (time_after_eq(jiffies, base->timer_jiffies))
861 __run_timers(base); 863 __run_timers(base);
862} 864}
@@ -1118,62 +1120,6 @@ asmlinkage long sys_gettid(void)
1118 return current->pid; 1120 return current->pid;
1119} 1121}
1120 1122
1121static long __sched nanosleep_restart(struct restart_block *restart)
1122{
1123 unsigned long expire = restart->arg0, now = jiffies;
1124 struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
1125 long ret;
1126
1127 /* Did it expire while we handled signals? */
1128 if (!time_after(expire, now))
1129 return 0;
1130
1131 expire = schedule_timeout_interruptible(expire - now);
1132
1133 ret = 0;
1134 if (expire) {
1135 struct timespec t;
1136 jiffies_to_timespec(expire, &t);
1137
1138 ret = -ERESTART_RESTARTBLOCK;
1139 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1140 ret = -EFAULT;
1141 /* The 'restart' block is already filled in */
1142 }
1143 return ret;
1144}
1145
1146asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1147{
1148 struct timespec t;
1149 unsigned long expire;
1150 long ret;
1151
1152 if (copy_from_user(&t, rqtp, sizeof(t)))
1153 return -EFAULT;
1154
1155 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
1156 return -EINVAL;
1157
1158 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1159 expire = schedule_timeout_interruptible(expire);
1160
1161 ret = 0;
1162 if (expire) {
1163 struct restart_block *restart;
1164 jiffies_to_timespec(expire, &t);
1165 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1166 return -EFAULT;
1167
1168 restart = &current_thread_info()->restart_block;
1169 restart->fn = nanosleep_restart;
1170 restart->arg0 = jiffies + expire;
1171 restart->arg1 = (unsigned long) rmtp;
1172 ret = -ERESTART_RESTARTBLOCK;
1173 }
1174 return ret;
1175}
1176
1177/* 1123/*
1178 * sys_sysinfo - fill in sysinfo struct 1124 * sys_sysinfo - fill in sysinfo struct
1179 */ 1125 */
diff --git a/kernel/uid16.c b/kernel/uid16.c
index f669941e8b26..aa25605027c8 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -10,6 +10,7 @@
10#include <linux/notifier.h> 10#include <linux/notifier.h>
11#include <linux/reboot.h> 11#include <linux/reboot.h>
12#include <linux/prctl.h> 12#include <linux/prctl.h>
13#include <linux/capability.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/highuid.h> 15#include <linux/highuid.h>
15#include <linux/security.h> 16#include <linux/security.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42df83d7fad2..b052e2c4c710 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,7 +29,8 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30 30
31/* 31/*
32 * The per-CPU workqueue (if single thread, we always use cpu 0's). 32 * The per-CPU workqueue (if single thread, we always use the first
33 * possible cpu).
33 * 34 *
34 * The sequence counters are for flush_scheduled_work(). It wants to wait 35 * The sequence counters are for flush_scheduled_work(). It wants to wait
35 * until all currently-scheduled works are completed, but it doesn't 36 * until all currently-scheduled works are completed, but it doesn't
@@ -69,6 +70,8 @@ struct workqueue_struct {
69static DEFINE_SPINLOCK(workqueue_lock); 70static DEFINE_SPINLOCK(workqueue_lock);
70static LIST_HEAD(workqueues); 71static LIST_HEAD(workqueues);
71 72
73static int singlethread_cpu;
74
72/* If it's single threaded, it isn't in the list of workqueues. */ 75/* If it's single threaded, it isn't in the list of workqueues. */
73static inline int is_single_threaded(struct workqueue_struct *wq) 76static inline int is_single_threaded(struct workqueue_struct *wq)
74{ 77{
@@ -102,7 +105,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
102 105
103 if (!test_and_set_bit(0, &work->pending)) { 106 if (!test_and_set_bit(0, &work->pending)) {
104 if (unlikely(is_single_threaded(wq))) 107 if (unlikely(is_single_threaded(wq)))
105 cpu = 0; 108 cpu = singlethread_cpu;
106 BUG_ON(!list_empty(&work->entry)); 109 BUG_ON(!list_empty(&work->entry));
107 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 110 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
108 ret = 1; 111 ret = 1;
@@ -118,7 +121,7 @@ static void delayed_work_timer_fn(unsigned long __data)
118 int cpu = smp_processor_id(); 121 int cpu = smp_processor_id();
119 122
120 if (unlikely(is_single_threaded(wq))) 123 if (unlikely(is_single_threaded(wq)))
121 cpu = 0; 124 cpu = singlethread_cpu;
122 125
123 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 126 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
124} 127}
@@ -144,7 +147,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
144 return ret; 147 return ret;
145} 148}
146 149
147static inline void run_workqueue(struct cpu_workqueue_struct *cwq) 150static void run_workqueue(struct cpu_workqueue_struct *cwq)
148{ 151{
149 unsigned long flags; 152 unsigned long flags;
150 153
@@ -266,8 +269,8 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
266 might_sleep(); 269 might_sleep();
267 270
268 if (is_single_threaded(wq)) { 271 if (is_single_threaded(wq)) {
269 /* Always use cpu 0's area. */ 272 /* Always use first cpu's area. */
270 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0)); 273 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
271 } else { 274 } else {
272 int cpu; 275 int cpu;
273 276
@@ -315,12 +318,17 @@ struct workqueue_struct *__create_workqueue(const char *name,
315 return NULL; 318 return NULL;
316 319
317 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 320 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
321 if (!wq->cpu_wq) {
322 kfree(wq);
323 return NULL;
324 }
325
318 wq->name = name; 326 wq->name = name;
319 /* We don't need the distraction of CPUs appearing and vanishing. */ 327 /* We don't need the distraction of CPUs appearing and vanishing. */
320 lock_cpu_hotplug(); 328 lock_cpu_hotplug();
321 if (singlethread) { 329 if (singlethread) {
322 INIT_LIST_HEAD(&wq->list); 330 INIT_LIST_HEAD(&wq->list);
323 p = create_workqueue_thread(wq, 0); 331 p = create_workqueue_thread(wq, singlethread_cpu);
324 if (!p) 332 if (!p)
325 destroy = 1; 333 destroy = 1;
326 else 334 else
@@ -374,7 +382,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
374 /* We don't need the distraction of CPUs appearing and vanishing. */ 382 /* We don't need the distraction of CPUs appearing and vanishing. */
375 lock_cpu_hotplug(); 383 lock_cpu_hotplug();
376 if (is_single_threaded(wq)) 384 if (is_single_threaded(wq))
377 cleanup_workqueue_thread(wq, 0); 385 cleanup_workqueue_thread(wq, singlethread_cpu);
378 else { 386 else {
379 for_each_online_cpu(cpu) 387 for_each_online_cpu(cpu)
380 cleanup_workqueue_thread(wq, cpu); 388 cleanup_workqueue_thread(wq, cpu);
@@ -419,6 +427,25 @@ int schedule_delayed_work_on(int cpu,
419 return ret; 427 return ret;
420} 428}
421 429
430int schedule_on_each_cpu(void (*func) (void *info), void *info)
431{
432 int cpu;
433 struct work_struct *work;
434
435 work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
436
437 if (!work)
438 return -ENOMEM;
439 for_each_online_cpu(cpu) {
440 INIT_WORK(work + cpu, func, info);
441 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
442 work + cpu);
443 }
444 flush_workqueue(keventd_wq);
445 kfree(work);
446 return 0;
447}
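
schedule_on_each_cpu() queues one work item per online CPU on keventd and then waits for all of them via flush_workqueue(), so the callback has run in process context on every online CPU by the time it returns. A hypothetical caller (function names are illustrative, not from this patch):

	static void drain_local_counters(void *unused)
	{
		/* runs once on each online CPU, in keventd's process context */
	}

	static int drain_all_counters(void)
	{
		/* 0 on success, -ENOMEM if the work array could not be allocated */
		return schedule_on_each_cpu(drain_local_counters, NULL);
	}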
448
422void flush_scheduled_work(void) 449void flush_scheduled_work(void)
423{ 450{
424 flush_workqueue(keventd_wq); 451 flush_workqueue(keventd_wq);
@@ -543,6 +570,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
543 570
544void init_workqueues(void) 571void init_workqueues(void)
545{ 572{
573 singlethread_cpu = first_cpu(cpu_possible_map);
546 hotcpu_notifier(workqueue_cpu_callback, 0); 574 hotcpu_notifier(workqueue_cpu_callback, 0);
547 keventd_wq = create_workqueue("events"); 575 keventd_wq = create_workqueue("events");
548 BUG_ON(!keventd_wq); 576 BUG_ON(!keventd_wq);