path: root/kernel
author    Dave Jones <davej@redhat.com>    2006-12-12 17:41:41 -0500
committer Dave Jones <davej@redhat.com>    2006-12-12 17:41:41 -0500
commit    c4366889dda8110247be59ca41fddb82951a8c26 (patch)
tree      705c1a996bed8fd48ce94ff33ec9fd00f9b94875 /kernel
parent    db2fb9db5735cc532fd4fc55e94b9a3c3750378e (diff)
parent    e1036502e5263851259d147771226161e5ccc85a (diff)

Merge ../linus

Conflicts:

	drivers/cpufreq/cpufreq.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz | 20
-rw-r--r--  kernel/acct.c | 29
-rw-r--r--  kernel/audit.c | 1
-rw-r--r--  kernel/auditfilter.c | 3
-rw-r--r--  kernel/auditsc.c | 13
-rw-r--r--  kernel/compat.c | 35
-rw-r--r--  kernel/configs.c | 2
-rw-r--r--  kernel/cpu.c | 44
-rw-r--r--  kernel/cpuset.c | 44
-rw-r--r--  kernel/delayacct.c | 19
-rw-r--r--  kernel/dma.c | 2
-rw-r--r--  kernel/exit.c | 84
-rw-r--r--  kernel/fork.c | 135
-rw-r--r--  kernel/futex.c | 62
-rw-r--r--  kernel/irq/chip.c | 35
-rw-r--r--  kernel/irq/handle.c | 6
-rw-r--r--  kernel/irq/manage.c | 9
-rw-r--r--  kernel/irq/proc.c | 3
-rw-r--r--  kernel/kallsyms.c | 33
-rw-r--r--  kernel/kexec.c | 60
-rw-r--r--  kernel/kmod.c | 26
-rw-r--r--  kernel/kprobes.c | 117
-rw-r--r--  kernel/kthread.c | 13
-rw-r--r--  kernel/latency.c | 1
-rw-r--r--  kernel/lockdep.c | 73
-rw-r--r--  kernel/lockdep_internals.h | 2
-rw-r--r--  kernel/lockdep_proc.c | 6
-rw-r--r--  kernel/module.c | 52
-rw-r--r--  kernel/mutex-debug.c | 5
-rw-r--r--  kernel/mutex.c | 9
-rw-r--r--  kernel/nsproxy.c | 42
-rw-r--r--  kernel/pid.c | 77
-rw-r--r--  kernel/posix-cpu-timers.c | 27
-rw-r--r--  kernel/posix-timers.c | 2
-rw-r--r--  kernel/power/Kconfig | 2
-rw-r--r--  kernel/power/disk.c | 101
-rw-r--r--  kernel/power/main.c | 14
-rw-r--r--  kernel/power/power.h | 32
-rw-r--r--  kernel/power/poweroff.c | 4
-rw-r--r--  kernel/power/process.c | 130
-rw-r--r--  kernel/power/snapshot.c | 860
-rw-r--r--  kernel/power/swap.c | 346
-rw-r--r--  kernel/power/swsusp.c | 98
-rw-r--r--  kernel/power/user.c | 102
-rw-r--r--  kernel/printk.c | 45
-rw-r--r--  kernel/profile.c | 47
-rw-r--r--  kernel/rcupdate.c | 4
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/relay.c | 16
-rw-r--r--  kernel/resource.c | 6
-rw-r--r--  kernel/rtmutex-tester.c | 1
-rw-r--r--  kernel/sched.c | 568
-rw-r--r--  kernel/signal.c | 34
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/spinlock.c | 21
-rw-r--r--  kernel/sys.c | 31
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 447
-rw-r--r--  kernel/taskstats.c | 255
-rw-r--r--  kernel/time/clocksource.c | 8
-rw-r--r--  kernel/time/jiffies.c | 2
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/timer.c | 148
-rw-r--r--  kernel/tsacct.c | 26
-rw-r--r--  kernel/unwind.c | 498
-rw-r--r--  kernel/user.c | 15
-rw-r--r--  kernel/workqueue.c | 220
67 files changed, 3483 insertions, 1699 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
7 default HZ_250 7 default HZ_250
8 help 8 help
9 Allows the configuration of the timer frequency. It is customary 9 Allows the configuration of the timer frequency. It is customary
10 to have the timer interrupt run at 1000 HZ but 100 HZ may be more 10 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
11 beneficial for servers and NUMA systems that do not need to have 11 beneficial for servers and NUMA systems that do not need to have
12 a fast response for user interaction and that may experience bus 12 a fast response for user interaction and that may experience bus
13 contention and cacheline bounces as a result of timer interrupts. 13 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
19 config HZ_100 19 config HZ_100
20 bool "100 HZ" 20 bool "100 HZ"
21 help 21 help
22 100 HZ is a typical choice for servers, SMP and NUMA systems 22 100 Hz is a typical choice for servers, SMP and NUMA systems
23 with lots of processors that may show reduced performance if 23 with lots of processors that may show reduced performance if
24 too many timer interrupts are occurring. 24 too many timer interrupts are occurring.
25 25
26 config HZ_250 26 config HZ_250
27 bool "250 HZ" 27 bool "250 HZ"
28 help 28 help
29 250 HZ is a good compromise choice allowing server performance 29 250 Hz is a good compromise choice allowing server performance
30 while also showing good interactive responsiveness even 30 while also showing good interactive responsiveness even
31 on SMP and NUMA systems. 31 on SMP and NUMA systems. If you are going to be using NTSC video
32 or multimedia, selected 300Hz instead.
33
34 config HZ_300
35 bool "300 HZ"
36 help
37 300 Hz is a good compromise choice allowing server performance
38 while also showing good interactive responsiveness even
39 on SMP and NUMA systems and exactly dividing by both PAL and
40 NTSC frame rates for video and multimedia work.
32 41
33 config HZ_1000 42 config HZ_1000
34 bool "1000 HZ" 43 bool "1000 HZ"
35 help 44 help
36 1000 HZ is the preferred choice for desktop systems and other 45 1000 Hz is the preferred choice for desktop systems and other
37 systems requiring fast interactive responses to events. 46 systems requiring fast interactive responses to events.
38 47
39endchoice 48endchoice
@@ -42,5 +51,6 @@ config HZ
42 int 51 int
43 default 100 if HZ_100 52 default 100 if HZ_100
44 default 250 if HZ_250 53 default 250 if HZ_250
54 default 300 if HZ_300
45 default 1000 if HZ_1000 55 default 1000 if HZ_1000
46 56
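
Aside: the new HZ_300 help text above rests on the fact that 300 divides both nominal video frame rates evenly. A small standalone C sketch (illustrative only, not part of the patch) that prints ticks per frame for each HZ choice:

    #include <stdio.h>

    static const int hz_choices[] = { 100, 250, 300, 1000 };
    static const int frame_rates[] = { 25 /* PAL */, 30 /* NTSC, nominal */ };

    int main(void)
    {
        for (unsigned i = 0; i < sizeof(hz_choices) / sizeof(hz_choices[0]); i++)
            for (unsigned j = 0; j < sizeof(frame_rates) / sizeof(frame_rates[0]); j++) {
                int hz = hz_choices[i], fps = frame_rates[j];
                printf("HZ=%-4d fps=%-2d -> %d ticks/frame%s\n",
                       hz, fps, hz / fps, hz % fps ? " (inexact)" : "");
            }
        return 0;
    }

Only HZ=300 comes out exact for both 25 and 30 fps, which is the rationale the help text gives.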
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..70d0d88e5554 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
89 struct timer_list timer; 89 struct timer_list timer;
90}; 90};
91 91
92static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; 92static struct acct_glbs acct_globals __cacheline_aligned =
93 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
93 94
94/* 95/*
95 * Called whenever the timer says to check the free space. 96 * Called whenever the timer says to check the free space.
@@ -117,7 +118,7 @@ static int check_free_space(struct file *file)
117 spin_unlock(&acct_globals.lock); 118 spin_unlock(&acct_globals.lock);
118 119
119 /* May block */ 120 /* May block */
120 if (vfs_statfs(file->f_dentry, &sbuf)) 121 if (vfs_statfs(file->f_path.dentry, &sbuf))
121 return res; 122 return res;
122 suspend = sbuf.f_blocks * SUSPEND; 123 suspend = sbuf.f_blocks * SUSPEND;
123 resume = sbuf.f_blocks * RESUME; 124 resume = sbuf.f_blocks * RESUME;
@@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file)
193 add_timer(&acct_globals.timer); 194 add_timer(&acct_globals.timer);
194 } 195 }
195 if (old_acct) { 196 if (old_acct) {
196 mnt_unpin(old_acct->f_vfsmnt); 197 mnt_unpin(old_acct->f_path.mnt);
197 spin_unlock(&acct_globals.lock); 198 spin_unlock(&acct_globals.lock);
198 do_acct_process(old_acct); 199 do_acct_process(old_acct);
199 filp_close(old_acct, NULL); 200 filp_close(old_acct, NULL);
@@ -211,7 +212,7 @@ static int acct_on(char *name)
211 if (IS_ERR(file)) 212 if (IS_ERR(file))
212 return PTR_ERR(file); 213 return PTR_ERR(file);
213 214
214 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { 215 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
215 filp_close(file, NULL); 216 filp_close(file, NULL);
216 return -EACCES; 217 return -EACCES;
217 } 218 }
@@ -228,11 +229,11 @@ static int acct_on(char *name)
228 } 229 }
229 230
230 spin_lock(&acct_globals.lock); 231 spin_lock(&acct_globals.lock);
231 mnt_pin(file->f_vfsmnt); 232 mnt_pin(file->f_path.mnt);
232 acct_file_reopen(file); 233 acct_file_reopen(file);
233 spin_unlock(&acct_globals.lock); 234 spin_unlock(&acct_globals.lock);
234 235
235 mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ 236 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
236 237
237 return 0; 238 return 0;
238} 239}
@@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
282void acct_auto_close_mnt(struct vfsmount *m) 283void acct_auto_close_mnt(struct vfsmount *m)
283{ 284{
284 spin_lock(&acct_globals.lock); 285 spin_lock(&acct_globals.lock);
285 if (acct_globals.file && acct_globals.file->f_vfsmnt == m) 286 if (acct_globals.file && acct_globals.file->f_path.mnt == m)
286 acct_file_reopen(NULL); 287 acct_file_reopen(NULL);
287 spin_unlock(&acct_globals.lock); 288 spin_unlock(&acct_globals.lock);
288} 289}
@@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
298{ 299{
299 spin_lock(&acct_globals.lock); 300 spin_lock(&acct_globals.lock);
300 if (acct_globals.file && 301 if (acct_globals.file &&
301 acct_globals.file->f_vfsmnt->mnt_sb == sb) { 302 acct_globals.file->f_path.mnt->mnt_sb == sb) {
302 acct_file_reopen(NULL); 303 acct_file_reopen(NULL);
303 } 304 }
304 spin_unlock(&acct_globals.lock); 305 spin_unlock(&acct_globals.lock);
@@ -427,6 +428,7 @@ static void do_acct_process(struct file *file)
427 u64 elapsed; 428 u64 elapsed;
428 u64 run_time; 429 u64 run_time;
429 struct timespec uptime; 430 struct timespec uptime;
431 struct tty_struct *tty;
430 432
431 /* 433 /*
432 * First check to see if there is enough free_space to continue 434 * First check to see if there is enough free_space to continue
@@ -483,16 +485,9 @@ static void do_acct_process(struct file *file)
483 ac.ac_ppid = current->parent->tgid; 485 ac.ac_ppid = current->parent->tgid;
484#endif 486#endif
485 487
486 mutex_lock(&tty_mutex);
487 /* FIXME: Whoever is responsible for current->signal locking needs
488 to use the same locking all over the kernel and document it */
489 read_lock(&tasklist_lock);
490 ac.ac_tty = current->signal->tty ?
491 old_encode_dev(tty_devnum(current->signal->tty)) : 0;
492 read_unlock(&tasklist_lock);
493 mutex_unlock(&tty_mutex);
494
495 spin_lock_irq(&current->sighand->siglock); 488 spin_lock_irq(&current->sighand->siglock);
489 tty = current->signal->tty;
490 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
496 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 491 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
497 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 492 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
498 ac.ac_flag = pacct->ac_flag; 493 ac.ac_flag = pacct->ac_flag;
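
Aside: the acct.c hunk above swaps the old anonymous {SPIN_LOCK_UNLOCKED} initializer for __SPIN_LOCK_UNLOCKED(name). A minimal sketch of the idiom; the structure here is hypothetical, only the initializer pattern comes from the patch:

    #include <linux/spinlock.h>

    /* Hypothetical example structure -- not from the patch. */
    struct foo_stats {
        spinlock_t lock;
        unsigned long count;
    };

    /*
     * SPIN_LOCK_UNLOCKED gave every statically initialized lock the same
     * identity; __SPIN_LOCK_UNLOCKED(name) names the lock so that lockdep
     * can tell instances apart.
     */
    static struct foo_stats foo_stats = {
        .lock = __SPIN_LOCK_UNLOCKED(foo_stats.lock),
    };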
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h> 59#include <linux/inotify.h>
60#include <linux/freezer.h>
60 61
61#include "audit.h" 62#include "audit.h"
62 63
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..2e896f8ae29e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
636 struct audit_rule *rule; 636 struct audit_rule *rule;
637 int i; 637 int i;
638 638
639 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 639 rule = kzalloc(sizeof(*rule), GFP_KERNEL);
640 if (unlikely(!rule)) 640 if (unlikely(!rule))
641 return NULL; 641 return NULL;
642 memset(rule, 0, sizeof(*rule));
643 642
644 rule->flags = krule->flags | krule->listnr; 643 rule->flags = krule->flags | krule->listnr;
645 rule->action = krule->action; 644 rule->action = krule->action;
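
Aside: the auditfilter change above folds a kmalloc() plus memset() pair into kzalloc(), which returns already-zeroed memory. The general before/after shape, as a fragment with a hypothetical object pointer:

    /* before: allocate, then zero by hand */
    obj = kmalloc(sizeof(*obj), GFP_KERNEL);
    if (unlikely(!obj))
        return NULL;
    memset(obj, 0, sizeof(*obj));

    /* after: one call, same failure handling, memory arrives zeroed */
    obj = kzalloc(sizeof(*obj), GFP_KERNEL);
    if (unlikely(!obj))
        return NULL;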
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 42f2f1179711..298897559ca4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
64#include <linux/tty.h> 64#include <linux/tty.h>
65#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h> 66#include <linux/binfmts.h>
67#include <linux/highmem.h>
67#include <linux/syscalls.h> 68#include <linux/syscalls.h>
68 69
69#include "audit.h" 70#include "audit.h"
@@ -730,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context)
730 printk(KERN_ERR "audit: freed %d contexts\n", count); 731 printk(KERN_ERR "audit: freed %d contexts\n", count);
731} 732}
732 733
733static void audit_log_task_context(struct audit_buffer *ab) 734void audit_log_task_context(struct audit_buffer *ab)
734{ 735{
735 char *ctx = NULL; 736 char *ctx = NULL;
736 ssize_t len = 0; 737 ssize_t len = 0;
@@ -759,6 +760,8 @@ error_path:
759 return; 760 return;
760} 761}
761 762
763EXPORT_SYMBOL(audit_log_task_context);
764
762static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 765static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
763{ 766{
764 char name[sizeof(tsk->comm)]; 767 char name[sizeof(tsk->comm)];
@@ -778,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
778 if ((vma->vm_flags & VM_EXECUTABLE) && 781 if ((vma->vm_flags & VM_EXECUTABLE) &&
779 vma->vm_file) { 782 vma->vm_file) {
780 audit_log_d_path(ab, "exe=", 783 audit_log_d_path(ab, "exe=",
781 vma->vm_file->f_dentry, 784 vma->vm_file->f_path.dentry,
782 vma->vm_file->f_vfsmnt); 785 vma->vm_file->f_path.mnt);
783 break; 786 break;
784 } 787 }
785 vma = vma->vm_next; 788 vma = vma->vm_next;
@@ -823,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
823 context->return_code); 826 context->return_code);
824 827
825 mutex_lock(&tty_mutex); 828 mutex_lock(&tty_mutex);
829 read_lock(&tasklist_lock);
826 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 830 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
827 tty = tsk->signal->tty->name; 831 tty = tsk->signal->tty->name;
828 else 832 else
829 tty = "(none)"; 833 tty = "(none)";
834 read_unlock(&tasklist_lock);
830 audit_log_format(ab, 835 audit_log_format(ab,
831 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 836 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
832 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 837 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1487,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1487 return ctx ? ctx->loginuid : -1; 1492 return ctx ? ctx->loginuid : -1;
1488} 1493}
1489 1494
1495EXPORT_SYMBOL(audit_get_loginuid);
1496
1490/** 1497/**
1491 * __audit_mq_open - record audit data for a POSIX MQ open 1498 * __audit_mq_open - record audit data for a POSIX MQ open
1492 * @oflag: open flag 1499 * @oflag: open flag
diff --git a/kernel/compat.c b/kernel/compat.c
index 75573e5d27b0..6952dd057300 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -678,7 +678,7 @@ int get_compat_sigevent(struct sigevent *event,
678 ? -EFAULT : 0; 678 ? -EFAULT : 0;
679} 679}
680 680
681long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, 681long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
682 unsigned long bitmap_size) 682 unsigned long bitmap_size)
683{ 683{
684 int i, j; 684 int i, j;
@@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
982 } 982 }
983 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); 983 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
984} 984}
985
986asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
987 compat_ulong_t maxnode,
988 const compat_ulong_t __user *old_nodes,
989 const compat_ulong_t __user *new_nodes)
990{
991 unsigned long __user *old = NULL;
992 unsigned long __user *new = NULL;
993 nodemask_t tmp_mask;
994 unsigned long nr_bits;
995 unsigned long size;
996
997 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
998 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
999 if (old_nodes) {
1000 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1001 return -EFAULT;
1002 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1003 if (new_nodes)
1004 new = old + size / sizeof(unsigned long);
1005 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1006 return -EFAULT;
1007 }
1008 if (new_nodes) {
1009 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1010 return -EFAULT;
1011 if (new == NULL)
1012 new = compat_alloc_user_space(size);
1013 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1014 return -EFAULT;
1015 }
1016 return sys_migrate_pages(pid, nr_bits + 1, old, new);
1017}
985#endif 1018#endif
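
Aside: the new compat_sys_migrate_pages() above exists because 32-bit user space packs node masks into 32-bit words while the native syscall expects unsigned long bitmaps. A standalone C sketch of the layout difference (illustrative only):

    #include <stdio.h>

    int main(void)
    {
        /* A node mask containing nodes 1 and 33. */
        unsigned long long native64 = (1ULL << 1) | (1ULL << 33);
        unsigned int compat32[2] = {
            (unsigned int)(native64 & 0xffffffffULL),
            (unsigned int)(native64 >> 32),
        };

        printf("64-bit longs: one word : 0x%016llx\n", native64);
        printf("32-bit longs: two words: 0x%08x 0x%08x\n",
               compat32[0], compat32[1]);
        return 0;
    }

compat_get_bitmap() does this repacking inside the kernel; the new wrapper then copies the native-width masks back to user space with compat_alloc_user_space() before calling sys_migrate_pages(), as the hunk above shows.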
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
75 return count; 75 return count;
76} 76}
77 77
78static struct file_operations ikconfig_file_ops = { 78static const struct file_operations ikconfig_file_ops = {
79 .owner = THIS_MODULE, 79 .owner = THIS_MODULE,
80 .read = ikconfig_read_current, 80 .read = ikconfig_read_current,
81}; 81};
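
Aside: configs.c above is the first of several files in this merge that mark their struct file_operations const; the VFS only reads through the pointer, so the table can live in read-only data. A sketch of the idiom for a minimal single_open()-based file (all names here are hypothetical):

    #include <linux/module.h>
    #include <linux/fs.h>
    #include <linux/seq_file.h>

    static int example_show(struct seq_file *m, void *v)
    {
        seq_printf(m, "hello\n");
        return 0;
    }

    static int example_open(struct inode *inode, struct file *file)
    {
        return single_open(file, example_show, NULL);
    }

    static const struct file_operations example_fops = {
        .owner   = THIS_MODULE,
        .open    = example_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
    };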
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 32c96628463e..9124669f4586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -19,7 +19,7 @@
19static DEFINE_MUTEX(cpu_add_remove_lock); 19static DEFINE_MUTEX(cpu_add_remove_lock);
20static DEFINE_MUTEX(cpu_bitmask_lock); 20static DEFINE_MUTEX(cpu_bitmask_lock);
21 21
22static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
23 23
24/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 24/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
25 * Should always be manipulated under cpu_add_remove_lock 25 * Should always be manipulated under cpu_add_remove_lock
@@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void)
58 recursive_depth--; 58 recursive_depth--;
59 return; 59 return;
60 } 60 }
61 mutex_unlock(&cpu_bitmask_lock);
62 recursive = NULL; 61 recursive = NULL;
62 mutex_unlock(&cpu_bitmask_lock);
63} 63}
64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
65 65
@@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
68/* Need to know about CPUs going up/down? */ 68/* Need to know about CPUs going up/down? */
69int __cpuinit register_cpu_notifier(struct notifier_block *nb) 69int __cpuinit register_cpu_notifier(struct notifier_block *nb)
70{ 70{
71 return blocking_notifier_chain_register(&cpu_chain, nb); 71 int ret;
72 mutex_lock(&cpu_add_remove_lock);
73 ret = raw_notifier_chain_register(&cpu_chain, nb);
74 mutex_unlock(&cpu_add_remove_lock);
75 return ret;
72} 76}
73 77
74#ifdef CONFIG_HOTPLUG_CPU 78#ifdef CONFIG_HOTPLUG_CPU
@@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
77 81
78void unregister_cpu_notifier(struct notifier_block *nb) 82void unregister_cpu_notifier(struct notifier_block *nb)
79{ 83{
80 blocking_notifier_chain_unregister(&cpu_chain, nb); 84 mutex_lock(&cpu_add_remove_lock);
85 raw_notifier_chain_unregister(&cpu_chain, nb);
86 mutex_unlock(&cpu_add_remove_lock);
81} 87}
82EXPORT_SYMBOL(unregister_cpu_notifier); 88EXPORT_SYMBOL(unregister_cpu_notifier);
83 89
@@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu)
126 if (!cpu_online(cpu)) 132 if (!cpu_online(cpu))
127 return -EINVAL; 133 return -EINVAL;
128 134
129 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 135 err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
130 (void *)(long)cpu); 136 (void *)(long)cpu);
131 if (err == NOTIFY_BAD) { 137 if (err == NOTIFY_BAD) {
132 printk("%s: attempt to take down CPU %u failed\n", 138 printk("%s: attempt to take down CPU %u failed\n",
@@ -144,18 +150,18 @@ static int _cpu_down(unsigned int cpu)
144 p = __stop_machine_run(take_cpu_down, NULL, cpu); 150 p = __stop_machine_run(take_cpu_down, NULL, cpu);
145 mutex_unlock(&cpu_bitmask_lock); 151 mutex_unlock(&cpu_bitmask_lock);
146 152
147 if (IS_ERR(p)) { 153 if (IS_ERR(p) || cpu_online(cpu)) {
148 /* CPU didn't die: tell everyone. Can't complain. */ 154 /* CPU didn't die: tell everyone. Can't complain. */
149 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 155 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
150 (void *)(long)cpu) == NOTIFY_BAD) 156 (void *)(long)cpu) == NOTIFY_BAD)
151 BUG(); 157 BUG();
152 158
153 err = PTR_ERR(p); 159 if (IS_ERR(p)) {
154 goto out_allowed; 160 err = PTR_ERR(p);
155 } 161 goto out_allowed;
156 162 }
157 if (cpu_online(cpu))
158 goto out_thread; 163 goto out_thread;
164 }
159 165
160 /* Wait for it to sleep (leaving idle task). */ 166 /* Wait for it to sleep (leaving idle task). */
161 while (!idle_cpu(cpu)) 167 while (!idle_cpu(cpu))
@@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu)
169 put_cpu(); 175 put_cpu();
170 176
171 /* CPU is completely dead: tell everyone. Too late to complain. */ 177 /* CPU is completely dead: tell everyone. Too late to complain. */
172 if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, 178 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD,
173 (void *)(long)cpu) == NOTIFY_BAD) 179 (void *)(long)cpu) == NOTIFY_BAD)
174 BUG(); 180 BUG();
175 181
@@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned int cpu)
206 if (cpu_online(cpu) || !cpu_present(cpu)) 212 if (cpu_online(cpu) || !cpu_present(cpu))
207 return -EINVAL; 213 return -EINVAL;
208 214
209 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 215 ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
210 if (ret == NOTIFY_BAD) { 216 if (ret == NOTIFY_BAD) {
211 printk("%s: attempt to bring up CPU %u failed\n", 217 printk("%s: attempt to bring up CPU %u failed\n",
212 __FUNCTION__, cpu); 218 __FUNCTION__, cpu);
@@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned int cpu)
223 BUG_ON(!cpu_online(cpu)); 229 BUG_ON(!cpu_online(cpu));
224 230
225 /* Now call notifier in preparation. */ 231 /* Now call notifier in preparation. */
226 blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 232 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
227 233
228out_notify: 234out_notify:
229 if (ret != 0) 235 if (ret != 0)
230 blocking_notifier_call_chain(&cpu_chain, 236 raw_notifier_call_chain(&cpu_chain,
231 CPU_UP_CANCELED, hcpu); 237 CPU_UP_CANCELED, hcpu);
232 238
233 return ret; 239 return ret;
@@ -264,11 +270,7 @@ int disable_nonboot_cpus(void)
264 goto out; 270 goto out;
265 } 271 }
266 } 272 }
267 error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); 273
268 if (error) {
269 printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
270 goto out;
271 }
272 /* We take down all of the non-boot CPUs in one shot to avoid races 274 /* We take down all of the non-boot CPUs in one shot to avoid races
273 * with the userspace trying to use the CPU hotplug at the same time 275 * with the userspace trying to use the CPU hotplug at the same time
274 */ 276 */
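
Aside: kernel/cpu.c above switches the hotplug chain from a blocking notifier to a raw notifier serialized by cpu_add_remove_lock. The external interface is unchanged; a hedged sketch of a caller (callback and names are hypothetical):

    #include <linux/kernel.h>
    #include <linux/init.h>
    #include <linux/cpu.h>
    #include <linux/notifier.h>

    static int example_cpu_callback(struct notifier_block *nb,
                                    unsigned long action, void *hcpu)
    {
        unsigned int cpu = (unsigned long)hcpu;

        switch (action) {
        case CPU_ONLINE:
            printk(KERN_INFO "example: CPU%u is up\n", cpu);
            break;
        case CPU_DEAD:
            printk(KERN_INFO "example: CPU%u is down\n", cpu);
            break;
        }
        return NOTIFY_OK;
    }

    static struct notifier_block example_cpu_nb = {
        .notifier_call = example_cpu_callback,
    };

    /* register_cpu_notifier() now takes cpu_add_remove_lock internally
     * (see the hunk above), so registration stays safe against
     * concurrent hotplug operations. */
    static int __init example_init(void)
    {
        return register_cpu_notifier(&example_cpu_nb);
    }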
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..2c3b4431472b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
413 * 413 *
414 * 414 *
415 * When reading/writing to a file: 415 * When reading/writing to a file:
416 * - the cpuset to use in file->f_dentry->d_parent->d_fsdata 416 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
417 * - the 'cftype' of the file is file->f_dentry->d_fsdata 417 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
418 */ 418 */
419 419
420struct cftype { 420struct cftype {
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
729 } 729 }
730 730
731 /* Remaining checks don't apply to root cpuset */ 731 /* Remaining checks don't apply to root cpuset */
732 if ((par = cur->parent) == NULL) 732 if (cur == &top_cpuset)
733 return 0; 733 return 0;
734 734
735 par = cur->parent;
736
735 /* We must be a subset of our parent cpuset */ 737 /* We must be a subset of our parent cpuset */
736 if (!is_cpuset_subset(trial, par)) 738 if (!is_cpuset_subset(trial, par))
737 return -EACCES; 739 return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1060 cpu_exclusive_changed = 1062 cpu_exclusive_changed =
1061 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1063 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
1062 mutex_lock(&callback_mutex); 1064 mutex_lock(&callback_mutex);
1063 if (turning_on) 1065 cs->flags = trialcs.flags;
1064 set_bit(bit, &cs->flags);
1065 else
1066 clear_bit(bit, &cs->flags);
1067 mutex_unlock(&callback_mutex); 1066 mutex_unlock(&callback_mutex);
1068 1067
1069 if (cpu_exclusive_changed) 1068 if (cpu_exclusive_changed)
@@ -1281,18 +1280,19 @@ typedef enum {
1281 FILE_TASKLIST, 1280 FILE_TASKLIST,
1282} cpuset_filetype_t; 1281} cpuset_filetype_t;
1283 1282
1284static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, 1283static ssize_t cpuset_common_file_write(struct file *file,
1284 const char __user *userbuf,
1285 size_t nbytes, loff_t *unused_ppos) 1285 size_t nbytes, loff_t *unused_ppos)
1286{ 1286{
1287 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1287 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1288 struct cftype *cft = __d_cft(file->f_dentry); 1288 struct cftype *cft = __d_cft(file->f_path.dentry);
1289 cpuset_filetype_t type = cft->private; 1289 cpuset_filetype_t type = cft->private;
1290 char *buffer; 1290 char *buffer;
1291 char *pathbuf = NULL; 1291 char *pathbuf = NULL;
1292 int retval = 0; 1292 int retval = 0;
1293 1293
1294 /* Crude upper limit on largest legitimate cpulist user might write. */ 1294 /* Crude upper limit on largest legitimate cpulist user might write. */
1295 if (nbytes > 100 + 6 * NR_CPUS) 1295 if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
1296 return -E2BIG; 1296 return -E2BIG;
1297 1297
1298 /* +1 for nul-terminator */ 1298 /* +1 for nul-terminator */
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1367 size_t nbytes, loff_t *ppos) 1367 size_t nbytes, loff_t *ppos)
1368{ 1368{
1369 ssize_t retval = 0; 1369 ssize_t retval = 0;
1370 struct cftype *cft = __d_cft(file->f_dentry); 1370 struct cftype *cft = __d_cft(file->f_path.dentry);
1371 if (!cft) 1371 if (!cft)
1372 return -ENODEV; 1372 return -ENODEV;
1373 1373
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1418 size_t nbytes, loff_t *ppos) 1418 size_t nbytes, loff_t *ppos)
1419{ 1419{
1420 struct cftype *cft = __d_cft(file->f_dentry); 1420 struct cftype *cft = __d_cft(file->f_path.dentry);
1421 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1421 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1422 cpuset_filetype_t type = cft->private; 1422 cpuset_filetype_t type = cft->private;
1423 char *page; 1423 char *page;
1424 ssize_t retval = 0; 1424 ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
1476 loff_t *ppos) 1476 loff_t *ppos)
1477{ 1477{
1478 ssize_t retval = 0; 1478 ssize_t retval = 0;
1479 struct cftype *cft = __d_cft(file->f_dentry); 1479 struct cftype *cft = __d_cft(file->f_path.dentry);
1480 if (!cft) 1480 if (!cft)
1481 return -ENODEV; 1481 return -ENODEV;
1482 1482
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1498 if (err) 1498 if (err)
1499 return err; 1499 return err;
1500 1500
1501 cft = __d_cft(file->f_dentry); 1501 cft = __d_cft(file->f_path.dentry);
1502 if (!cft) 1502 if (!cft)
1503 return -ENODEV; 1503 return -ENODEV;
1504 if (cft->open) 1504 if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1511 1511
1512static int cpuset_file_release(struct inode *inode, struct file *file) 1512static int cpuset_file_release(struct inode *inode, struct file *file)
1513{ 1513{
1514 struct cftype *cft = __d_cft(file->f_dentry); 1514 struct cftype *cft = __d_cft(file->f_path.dentry);
1515 if (cft->release) 1515 if (cft->release)
1516 return cft->release(inode, file); 1516 return cft->release(inode, file);
1517 return 0; 1517 return 0;
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1533} 1533}
1534 1534
1535static struct file_operations cpuset_file_operations = { 1535static const struct file_operations cpuset_file_operations = {
1536 .read = cpuset_file_read, 1536 .read = cpuset_file_read,
1537 .write = cpuset_file_write, 1537 .write = cpuset_file_write,
1538 .llseek = generic_file_llseek, 1538 .llseek = generic_file_llseek,
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1700 */ 1700 */
1701static int cpuset_tasks_open(struct inode *unused, struct file *file) 1701static int cpuset_tasks_open(struct inode *unused, struct file *file)
1702{ 1702{
1703 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1703 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1704 struct ctr_struct *ctr; 1704 struct ctr_struct *ctr;
1705 pid_t *pidarray; 1705 pid_t *pidarray;
1706 int npids; 1706 int npids;
@@ -2045,7 +2045,6 @@ out:
2045 return err; 2045 return err;
2046} 2046}
2047 2047
2048#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
2049/* 2048/*
2050 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 2049 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
2051 * or memory nodes, we need to walk over the cpuset hierarchy, 2050 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void)
2109 mutex_unlock(&callback_mutex); 2108 mutex_unlock(&callback_mutex);
2110 mutex_unlock(&manage_mutex); 2109 mutex_unlock(&manage_mutex);
2111} 2110}
2112#endif
2113 2111
2114#ifdef CONFIG_HOTPLUG_CPU
2115/* 2112/*
2116 * The top_cpuset tracks what CPUs and Memory Nodes are online, 2113 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2117 * period. This is necessary in order to make cpusets transparent 2114 * period. This is necessary in order to make cpusets transparent
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
2128 common_cpu_mem_hotplug_unplug(); 2125 common_cpu_mem_hotplug_unplug();
2129 return 0; 2126 return 0;
2130} 2127}
2131#endif
2132 2128
2133#ifdef CONFIG_MEMORY_HOTPLUG 2129#ifdef CONFIG_MEMORY_HOTPLUG
2134/* 2130/*
@@ -2610,7 +2606,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
2610 return single_open(file, proc_cpuset_show, pid); 2606 return single_open(file, proc_cpuset_show, pid);
2611} 2607}
2612 2608
2613struct file_operations proc_cpuset_operations = { 2609const struct file_operations proc_cpuset_operations = {
2614 .open = cpuset_open, 2610 .open = cpuset_open,
2615 .read = seq_read, 2611 .read = seq_read,
2616 .llseek = seq_lseek, 2612 .llseek = seq_lseek,
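
Aside: the many f_dentry and f_vfsmnt edits in cpuset.c (and elsewhere in this diff) come from struct file gaining an embedded struct path named f_path. A fragment showing only the field rename; the accessor functions are hypothetical:

    #include <linux/fs.h>

    static inline struct dentry *example_file_dentry(struct file *filp)
    {
        /* was: filp->f_dentry */
        return filp->f_path.dentry;
    }

    static inline struct vfsmount *example_file_mnt(struct file *filp)
    {
        /* was: filp->f_vfsmnt */
        return filp->f_path.mnt;
    }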
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 36752f124c6a..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
20#include <linux/delayacct.h> 20#include <linux/delayacct.h>
21 21
22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache; 23struct kmem_cache *delayacct_cache;
24 24
25static int __init delayacct_setup_disable(char *str) 25static int __init delayacct_setup_disable(char *str)
26{ 26{
@@ -41,7 +41,7 @@ void delayacct_init(void)
41 41
42void __delayacct_tsk_init(struct task_struct *tsk) 42void __delayacct_tsk_init(struct task_struct *tsk)
43{ 43{
44 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); 44 tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
45 if (tsk->delays) 45 if (tsk->delays)
46 spin_lock_init(&tsk->delays->lock); 46 spin_lock_init(&tsk->delays->lock);
47} 47}
@@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
66{ 66{
67 struct timespec ts; 67 struct timespec ts;
68 s64 ns; 68 s64 ns;
69 unsigned long flags;
69 70
70 do_posix_clock_monotonic_gettime(end); 71 do_posix_clock_monotonic_gettime(end);
71 ts = timespec_sub(*end, *start); 72 ts = timespec_sub(*end, *start);
@@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
73 if (ns < 0) 74 if (ns < 0)
74 return; 75 return;
75 76
76 spin_lock(&current->delays->lock); 77 spin_lock_irqsave(&current->delays->lock, flags);
77 *total += ns; 78 *total += ns;
78 (*count)++; 79 (*count)++;
79 spin_unlock(&current->delays->lock); 80 spin_unlock_irqrestore(&current->delays->lock, flags);
80} 81}
81 82
82void __delayacct_blkio_start(void) 83void __delayacct_blkio_start(void)
@@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
104 s64 tmp; 105 s64 tmp;
105 struct timespec ts; 106 struct timespec ts;
106 unsigned long t1,t2,t3; 107 unsigned long t1,t2,t3;
108 unsigned long flags;
107 109
108 /* Though tsk->delays accessed later, early exit avoids 110 /* Though tsk->delays accessed later, early exit avoids
109 * unnecessary returning of other data 111 * unnecessary returning of other data
@@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
136 138
137 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ 139 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
138 140
139 spin_lock(&tsk->delays->lock); 141 spin_lock_irqsave(&tsk->delays->lock, flags);
140 tmp = d->blkio_delay_total + tsk->delays->blkio_delay; 142 tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
141 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; 143 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
142 tmp = d->swapin_delay_total + tsk->delays->swapin_delay; 144 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
143 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; 145 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
144 d->blkio_count += tsk->delays->blkio_count; 146 d->blkio_count += tsk->delays->blkio_count;
145 d->swapin_count += tsk->delays->swapin_count; 147 d->swapin_count += tsk->delays->swapin_count;
146 spin_unlock(&tsk->delays->lock); 148 spin_unlock_irqrestore(&tsk->delays->lock, flags);
147 149
148done: 150done:
149 return 0; 151 return 0;
@@ -152,11 +154,12 @@ done:
152__u64 __delayacct_blkio_ticks(struct task_struct *tsk) 154__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
153{ 155{
154 __u64 ret; 156 __u64 ret;
157 unsigned long flags;
155 158
156 spin_lock(&tsk->delays->lock); 159 spin_lock_irqsave(&tsk->delays->lock, flags);
157 ret = nsec_to_clock_t(tsk->delays->blkio_delay + 160 ret = nsec_to_clock_t(tsk->delays->blkio_delay +
158 tsk->delays->swapin_delay); 161 tsk->delays->swapin_delay);
159 spin_unlock(&tsk->delays->lock); 162 spin_unlock_irqrestore(&tsk->delays->lock, flags);
160 return ret; 163 return ret;
161} 164}
162 165
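
Aside: delayacct.c above converts plain spin_lock() calls to spin_lock_irqsave(). A minimal sketch of the idiom; the lock and counter here are hypothetical:

    #include <linux/types.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);
    static u64 example_total_ns;

    static void example_account(u64 ns)
    {
        unsigned long flags;

        /* The irqsave/irqrestore pair disables local interrupts and
         * remembers their previous state, so the same lock can also be
         * taken from a context where an interrupt must not re-enter it. */
        spin_lock_irqsave(&example_lock, flags);
        example_total_ns += ns;
        spin_unlock_irqrestore(&example_lock, flags);
    }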
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
140 return single_open(file, proc_dma_show, NULL); 140 return single_open(file, proc_dma_show, NULL);
141} 141}
142 142
143static struct file_operations proc_dma_operations = { 143static const struct file_operations proc_dma_operations = {
144 .open = proc_dma_open, 144 .open = proc_dma_open,
145 .read = seq_read, 145 .read = seq_read,
146 .llseek = seq_lseek, 146 .llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index f250a5e3e281..122fadb972fc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/tty.h> 15#include <linux/tty.h>
16#include <linux/namespace.h> 16#include <linux/mnt_namespace.h>
17#include <linux/key.h> 17#include <linux/key.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -22,6 +22,7 @@
22#include <linux/file.h> 22#include <linux/file.h>
23#include <linux/binfmts.h> 23#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h>
25#include <linux/ptrace.h> 26#include <linux/ptrace.h>
26#include <linux/profile.h> 27#include <linux/profile.h>
27#include <linux/mount.h> 28#include <linux/mount.h>
@@ -48,7 +49,6 @@
48#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
49 50
50extern void sem_exit (void); 51extern void sem_exit (void);
51extern struct task_struct *child_reaper;
52 52
53static void exit_mm(struct task_struct * tsk); 53static void exit_mm(struct task_struct * tsk);
54 54
@@ -128,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk)
128 flush_sigqueue(&tsk->pending); 128 flush_sigqueue(&tsk->pending);
129 if (sig) { 129 if (sig) {
130 flush_sigqueue(&sig->shared_pending); 130 flush_sigqueue(&sig->shared_pending);
131 taskstats_tgid_free(sig);
131 __cleanup_signal(sig); 132 __cleanup_signal(sig);
132 } 133 }
133} 134}
@@ -188,21 +189,18 @@ repeat:
188int session_of_pgrp(int pgrp) 189int session_of_pgrp(int pgrp)
189{ 190{
190 struct task_struct *p; 191 struct task_struct *p;
191 int sid = -1; 192 int sid = 0;
192 193
193 read_lock(&tasklist_lock); 194 read_lock(&tasklist_lock);
194 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 195
195 if (p->signal->session > 0) { 196 p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
196 sid = p->signal->session; 197 if (p == NULL)
197 goto out; 198 p = find_task_by_pid(pgrp);
198 } 199 if (p != NULL)
199 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 200 sid = process_session(p);
200 p = find_task_by_pid(pgrp); 201
201 if (p)
202 sid = p->signal->session;
203out:
204 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
205 203
206 return sid; 204 return sid;
207} 205}
208 206
@@ -224,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
224 || p->exit_state 222 || p->exit_state
225 || is_init(p->real_parent)) 223 || is_init(p->real_parent))
226 continue; 224 continue;
227 if (process_group(p->real_parent) != pgrp 225 if (process_group(p->real_parent) != pgrp &&
228 && p->real_parent->signal->session == p->signal->session) { 226 process_session(p->real_parent) == process_session(p)) {
229 ret = 0; 227 ret = 0;
230 break; 228 break;
231 } 229 }
@@ -259,7 +257,8 @@ static int has_stopped_jobs(int pgrp)
259} 257}
260 258
261/** 259/**
262 * reparent_to_init - Reparent the calling kernel thread to the init task. 260 * reparent_to_init - Reparent the calling kernel thread to the init task
261 * of the pid space that the thread belongs to.
263 * 262 *
264 * If a kernel thread is launched as a result of a system call, or if 263 * If a kernel thread is launched as a result of a system call, or if
265 * it ever exits, it should generally reparent itself to init so that 264 * it ever exits, it should generally reparent itself to init so that
@@ -277,8 +276,8 @@ static void reparent_to_init(void)
277 ptrace_unlink(current); 276 ptrace_unlink(current);
278 /* Reparent to init */ 277 /* Reparent to init */
279 remove_parent(current); 278 remove_parent(current);
280 current->parent = child_reaper; 279 current->parent = child_reaper(current);
281 current->real_parent = child_reaper; 280 current->real_parent = child_reaper(current);
282 add_parent(current); 281 add_parent(current);
283 282
284 /* Set the exit signal to SIGCHLD so we signal init on exit */ 283 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -301,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
301{ 300{
302 struct task_struct *curr = current->group_leader; 301 struct task_struct *curr = current->group_leader;
303 302
304 if (curr->signal->session != session) { 303 if (process_session(curr) != session) {
305 detach_pid(curr, PIDTYPE_SID); 304 detach_pid(curr, PIDTYPE_SID);
306 curr->signal->session = session; 305 set_signal_session(curr->signal, session);
307 attach_pid(curr, PIDTYPE_SID, session); 306 attach_pid(curr, PIDTYPE_SID, session);
308 } 307 }
309 if (process_group(curr) != pgrp) { 308 if (process_group(curr) != pgrp) {
@@ -313,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
313 } 312 }
314} 313}
315 314
316void set_special_pids(pid_t session, pid_t pgrp) 315static void set_special_pids(pid_t session, pid_t pgrp)
317{ 316{
318 write_lock_irq(&tasklist_lock); 317 write_lock_irq(&tasklist_lock);
319 __set_special_pids(session, pgrp); 318 __set_special_pids(session, pgrp);
@@ -383,9 +382,7 @@ void daemonize(const char *name, ...)
383 exit_mm(current); 382 exit_mm(current);
384 383
385 set_special_pids(1, 1); 384 set_special_pids(1, 1);
386 mutex_lock(&tty_mutex); 385 proc_clear_tty(current);
387 current->signal->tty = NULL;
388 mutex_unlock(&tty_mutex);
389 386
390 /* Block and flush all signals */ 387 /* Block and flush all signals */
391 sigfillset(&blocked); 388 sigfillset(&blocked);
@@ -428,7 +425,7 @@ static void close_files(struct files_struct * files)
428 for (;;) { 425 for (;;) {
429 unsigned long set; 426 unsigned long set;
430 i = j * __NFDBITS; 427 i = j * __NFDBITS;
431 if (i >= fdt->max_fdset || i >= fdt->max_fds) 428 if (i >= fdt->max_fds)
432 break; 429 break;
433 set = fdt->open_fds->fds_bits[j++]; 430 set = fdt->open_fds->fds_bits[j++];
434 while (set) { 431 while (set) {
@@ -469,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files)
469 * you can free files immediately. 466 * you can free files immediately.
470 */ 467 */
471 fdt = files_fdtable(files); 468 fdt = files_fdtable(files);
472 if (fdt == &files->fdtab) 469 if (fdt != &files->fdtab)
473 fdt->free_files = files;
474 else
475 kmem_cache_free(files_cachep, files); 470 kmem_cache_free(files_cachep, files);
476 free_fdtable(fdt); 471 call_rcu(&fdt->rcu, free_fdtable_rcu);
477 } 472 }
478} 473}
479 474
@@ -648,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
648 * outside, so the child pgrp is now orphaned. 643 * outside, so the child pgrp is now orphaned.
649 */ 644 */
650 if ((process_group(p) != process_group(father)) && 645 if ((process_group(p) != process_group(father)) &&
651 (p->signal->session == father->signal->session)) { 646 (process_session(p) == process_session(father))) {
652 int pgrp = process_group(p); 647 int pgrp = process_group(p);
653 648
654 if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { 649 if (will_become_orphaned_pgrp(pgrp, NULL) &&
650 has_stopped_jobs(pgrp)) {
655 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); 651 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
656 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); 652 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
657 } 653 }
@@ -662,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
662 * When we die, we re-parent all our children. 658 * When we die, we re-parent all our children.
663 * Try to give them to another thread in our thread 659 * Try to give them to another thread in our thread
664 * group, and if no such member exists, give it to 660 * group, and if no such member exists, give it to
665 * the global child reaper process (ie "init") 661 * the child reaper process (ie "init") in our pid
662 * space.
666 */ 663 */
667static void 664static void
668forget_original_parent(struct task_struct *father, struct list_head *to_release) 665forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -673,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
673 do { 670 do {
674 reaper = next_thread(reaper); 671 reaper = next_thread(reaper);
675 if (reaper == father) { 672 if (reaper == father) {
676 reaper = child_reaper; 673 reaper = child_reaper(father);
677 break; 674 break;
678 } 675 }
679 } while (reaper->exit_state); 676 } while (reaper->exit_state);
@@ -785,7 +782,7 @@ static void exit_notify(struct task_struct *tsk)
785 t = tsk->real_parent; 782 t = tsk->real_parent;
786 783
787 if ((process_group(t) != process_group(tsk)) && 784 if ((process_group(t) != process_group(tsk)) &&
788 (t->signal->session == tsk->signal->session) && 785 (process_session(t) == process_session(tsk)) &&
789 will_become_orphaned_pgrp(process_group(tsk), tsk) && 786 will_become_orphaned_pgrp(process_group(tsk), tsk) &&
790 has_stopped_jobs(process_group(tsk))) { 787 has_stopped_jobs(process_group(tsk))) {
791 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); 788 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
@@ -849,9 +846,7 @@ static void exit_notify(struct task_struct *tsk)
849fastcall NORET_TYPE void do_exit(long code) 846fastcall NORET_TYPE void do_exit(long code)
850{ 847{
851 struct task_struct *tsk = current; 848 struct task_struct *tsk = current;
852 struct taskstats *tidstats;
853 int group_dead; 849 int group_dead;
854 unsigned int mycpu;
855 850
856 profile_task_exit(tsk); 851 profile_task_exit(tsk);
857 852
@@ -861,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code)
861 panic("Aiee, killing interrupt handler!"); 856 panic("Aiee, killing interrupt handler!");
862 if (unlikely(!tsk->pid)) 857 if (unlikely(!tsk->pid))
863 panic("Attempted to kill the idle task!"); 858 panic("Attempted to kill the idle task!");
864 if (unlikely(tsk == child_reaper)) 859 if (unlikely(tsk == child_reaper(tsk))) {
865 panic("Attempted to kill init!"); 860 if (tsk->nsproxy->pid_ns != &init_pid_ns)
861 tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
862 else
863 panic("Attempted to kill init!");
864 }
865
866 866
867 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 867 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
868 current->ptrace_message = code; 868 current->ptrace_message = code;
@@ -889,8 +889,6 @@ fastcall NORET_TYPE void do_exit(long code)
889 current->comm, current->pid, 889 current->comm, current->pid,
890 preempt_count()); 890 preempt_count());
891 891
892 taskstats_exit_alloc(&tidstats, &mycpu);
893
894 acct_update_integrals(tsk); 892 acct_update_integrals(tsk);
895 if (tsk->mm) { 893 if (tsk->mm) {
896 update_hiwater_rss(tsk->mm); 894 update_hiwater_rss(tsk->mm);
@@ -910,8 +908,8 @@ fastcall NORET_TYPE void do_exit(long code)
910#endif 908#endif
911 if (unlikely(tsk->audit_context)) 909 if (unlikely(tsk->audit_context))
912 audit_free(tsk); 910 audit_free(tsk);
913 taskstats_exit_send(tsk, tidstats, group_dead, mycpu); 911
914 taskstats_exit_free(tidstats); 912 taskstats_exit(tsk, group_dead);
915 913
916 exit_mm(tsk); 914 exit_mm(tsk);
917 915
diff --git a/kernel/fork.c b/kernel/fork.c
index 7dc6140baac6..d16c566eb645 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/completion.h> 20#include <linux/completion.h>
21#include <linux/namespace.h> 21#include <linux/mnt_namespace.h>
22#include <linux/personality.h> 22#include <linux/personality.h>
23#include <linux/mempolicy.h> 23#include <linux/mempolicy.h>
24#include <linux/sem.h> 24#include <linux/sem.h>
@@ -36,6 +36,7 @@
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/jiffies.h> 37#include <linux/jiffies.h>
38#include <linux/futex.h> 38#include <linux/futex.h>
39#include <linux/task_io_accounting_ops.h>
39#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
40#include <linux/ptrace.h> 41#include <linux/ptrace.h>
41#include <linux/mount.h> 42#include <linux/mount.h>
@@ -82,26 +83,26 @@ int nr_processes(void)
82#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 83#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
83# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 84# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
84# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 85# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
85static kmem_cache_t *task_struct_cachep; 86static struct kmem_cache *task_struct_cachep;
86#endif 87#endif
87 88
88/* SLAB cache for signal_struct structures (tsk->signal) */ 89/* SLAB cache for signal_struct structures (tsk->signal) */
89static kmem_cache_t *signal_cachep; 90static struct kmem_cache *signal_cachep;
90 91
91/* SLAB cache for sighand_struct structures (tsk->sighand) */ 92/* SLAB cache for sighand_struct structures (tsk->sighand) */
92kmem_cache_t *sighand_cachep; 93struct kmem_cache *sighand_cachep;
93 94
94/* SLAB cache for files_struct structures (tsk->files) */ 95/* SLAB cache for files_struct structures (tsk->files) */
95kmem_cache_t *files_cachep; 96struct kmem_cache *files_cachep;
96 97
97/* SLAB cache for fs_struct structures (tsk->fs) */ 98/* SLAB cache for fs_struct structures (tsk->fs) */
98kmem_cache_t *fs_cachep; 99struct kmem_cache *fs_cachep;
99 100
100/* SLAB cache for vm_area_struct structures */ 101/* SLAB cache for vm_area_struct structures */
101kmem_cache_t *vm_area_cachep; 102struct kmem_cache *vm_area_cachep;
102 103
103/* SLAB cache for mm_struct structures (tsk->mm) */ 104/* SLAB cache for mm_struct structures (tsk->mm) */
104static kmem_cache_t *mm_cachep; 105static struct kmem_cache *mm_cachep;
105 106
106void free_task(struct task_struct *tsk) 107void free_task(struct task_struct *tsk)
107{ 108{
@@ -237,7 +238,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
237 goto fail_nomem; 238 goto fail_nomem;
238 charge = len; 239 charge = len;
239 } 240 }
240 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 241 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
241 if (!tmp) 242 if (!tmp)
242 goto fail_nomem; 243 goto fail_nomem;
243 *tmp = *mpnt; 244 *tmp = *mpnt;
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
252 anon_vma_link(tmp); 253 anon_vma_link(tmp);
253 file = tmp->vm_file; 254 file = tmp->vm_file;
254 if (file) { 255 if (file) {
255 struct inode *inode = file->f_dentry->d_inode; 256 struct inode *inode = file->f_path.dentry->d_inode;
256 get_file(file); 257 get_file(file);
257 if (tmp->vm_flags & VM_DENYWRITE) 258 if (tmp->vm_flags & VM_DENYWRITE)
258 atomic_dec(&inode->i_writecount); 259 atomic_dec(&inode->i_writecount);
@@ -319,7 +320,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
319 320
320 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 321 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
321 322
322#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) 323#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
323#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 324#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
324 325
325#include <linux/init_task.h> 326#include <linux/init_task.h>
@@ -448,7 +449,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
448 tsk->vfork_done = NULL; 449 tsk->vfork_done = NULL;
449 complete(vfork_done); 450 complete(vfork_done);
450 } 451 }
451 if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { 452
453 /*
454 * If we're exiting normally, clear a user-space tid field if
455 * requested. We leave this alone when dying by signal, to leave
456 * the value intact in a core dump, and to save the unnecessary
457 * trouble otherwise. Userland only wants this done for a sys_exit.
458 */
459 if (tsk->clear_child_tid
460 && !(tsk->flags & PF_SIGNALED)
461 && atomic_read(&mm->mm_users) > 1) {
452 u32 __user * tidptr = tsk->clear_child_tid; 462 u32 __user * tidptr = tsk->clear_child_tid;
453 tsk->clear_child_tid = NULL; 463 tsk->clear_child_tid = NULL;
454 464
@@ -479,6 +489,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
479 489
480 memcpy(mm, oldmm, sizeof(*mm)); 490 memcpy(mm, oldmm, sizeof(*mm));
481 491
492 /* Initializing for Swap token stuff */
493 mm->token_priority = 0;
494 mm->last_interval = 0;
495
482 if (!mm_init(mm)) 496 if (!mm_init(mm))
483 goto fail_nomem; 497 goto fail_nomem;
484 498
@@ -542,6 +556,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
542 goto fail_nomem; 556 goto fail_nomem;
543 557
544good_mm: 558good_mm:
559 /* Initializing for Swap token stuff */
560 mm->token_priority = 0;
561 mm->last_interval = 0;
562
545 tsk->mm = mm; 563 tsk->mm = mm;
546 tsk->active_mm = mm; 564 tsk->active_mm = mm;
547 return 0; 565 return 0;
@@ -596,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
596 614
597static int count_open_files(struct fdtable *fdt) 615static int count_open_files(struct fdtable *fdt)
598{ 616{
599 int size = fdt->max_fdset; 617 int size = fdt->max_fds;
600 int i; 618 int i;
601 619
602 /* Find the last open fd */ 620 /* Find the last open fd */
@@ -613,7 +631,7 @@ static struct files_struct *alloc_files(void)
613 struct files_struct *newf; 631 struct files_struct *newf;
614 struct fdtable *fdt; 632 struct fdtable *fdt;
615 633
616 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 634 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
617 if (!newf) 635 if (!newf)
618 goto out; 636 goto out;
619 637
@@ -623,12 +641,10 @@ static struct files_struct *alloc_files(void)
623 newf->next_fd = 0; 641 newf->next_fd = 0;
624 fdt = &newf->fdtab; 642 fdt = &newf->fdtab;
625 fdt->max_fds = NR_OPEN_DEFAULT; 643 fdt->max_fds = NR_OPEN_DEFAULT;
626 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
627 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 644 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
628 fdt->open_fds = (fd_set *)&newf->open_fds_init; 645 fdt->open_fds = (fd_set *)&newf->open_fds_init;
629 fdt->fd = &newf->fd_array[0]; 646 fdt->fd = &newf->fd_array[0];
630 INIT_RCU_HEAD(&fdt->rcu); 647 INIT_RCU_HEAD(&fdt->rcu);
631 fdt->free_files = NULL;
632 fdt->next = NULL; 648 fdt->next = NULL;
633 rcu_assign_pointer(newf->fdt, fdt); 649 rcu_assign_pointer(newf->fdt, fdt);
634out: 650out:
@@ -644,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
644{ 660{
645 struct files_struct *newf; 661 struct files_struct *newf;
646 struct file **old_fds, **new_fds; 662 struct file **old_fds, **new_fds;
647 int open_files, size, i, expand; 663 int open_files, size, i;
648 struct fdtable *old_fdt, *new_fdt; 664 struct fdtable *old_fdt, *new_fdt;
649 665
650 *errorp = -ENOMEM; 666 *errorp = -ENOMEM;
@@ -655,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
655 spin_lock(&oldf->file_lock); 671 spin_lock(&oldf->file_lock);
656 old_fdt = files_fdtable(oldf); 672 old_fdt = files_fdtable(oldf);
657 new_fdt = files_fdtable(newf); 673 new_fdt = files_fdtable(newf);
658 size = old_fdt->max_fdset;
659 open_files = count_open_files(old_fdt); 674 open_files = count_open_files(old_fdt);
660 expand = 0;
661 675
662 /* 676 /*
663 * Check whether we need to allocate a larger fd array or fd set. 677 * Check whether we need to allocate a larger fd array and fd set.
664 * Note: we're not a clone task, so the open count won't change. 678 * Note: we're not a clone task, so the open count won't change.
665 */ 679 */
666 if (open_files > new_fdt->max_fdset) {
667 new_fdt->max_fdset = 0;
668 expand = 1;
669 }
670 if (open_files > new_fdt->max_fds) { 680 if (open_files > new_fdt->max_fds) {
671 new_fdt->max_fds = 0; 681 new_fdt->max_fds = 0;
672 expand = 1;
673 }
674
675 /* if the old fdset gets grown now, we'll only copy up to "size" fds */
676 if (expand) {
677 spin_unlock(&oldf->file_lock); 682 spin_unlock(&oldf->file_lock);
678 spin_lock(&newf->file_lock); 683 spin_lock(&newf->file_lock);
679 *errorp = expand_files(newf, open_files-1); 684 *errorp = expand_files(newf, open_files-1);
@@ -693,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
693 old_fds = old_fdt->fd; 698 old_fds = old_fdt->fd;
694 new_fds = new_fdt->fd; 699 new_fds = new_fdt->fd;
695 700
696 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); 701 memcpy(new_fdt->open_fds->fds_bits,
697 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); 702 old_fdt->open_fds->fds_bits, open_files/8);
703 memcpy(new_fdt->close_on_exec->fds_bits,
704 old_fdt->close_on_exec->fds_bits, open_files/8);
698 705
699 for (i = open_files; i != 0; i--) { 706 for (i = open_files; i != 0; i--) {
700 struct file *f = *old_fds++; 707 struct file *f = *old_fds++;
@@ -719,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
719 /* This is long word aligned thus could use a optimized version */ 726 /* This is long word aligned thus could use a optimized version */
720 memset(new_fds, 0, size); 727 memset(new_fds, 0, size);
721 728
722 if (new_fdt->max_fdset > open_files) { 729 if (new_fdt->max_fds > open_files) {
723 int left = (new_fdt->max_fdset-open_files)/8; 730 int left = (new_fdt->max_fds-open_files)/8;
724 int start = open_files / (8 * sizeof(unsigned long)); 731 int start = open_files / (8 * sizeof(unsigned long));
725 732
726 memset(&new_fdt->open_fds->fds_bits[start], 0, left); 733 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
727 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); 734 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
728 } 735 }
729 736
730out:
731 return newf; 737 return newf;
732 738
733out_release: 739out_release:
734 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
735 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
736 free_fd_array(new_fdt->fd, new_fdt->max_fds);
737 kmem_cache_free(files_cachep, newf); 740 kmem_cache_free(files_cachep, newf);
741out:
738 return NULL; 742 return NULL;
739} 743}
740 744
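
The bitmap handling above copies only the first open_files bits of the descriptor sets and clears the rest of the (possibly larger) new table. A small user-space sketch of that arithmetic, assuming open_files is a multiple of the word size as count_open_files() arranges; copy_fd_bitmap() is an illustrative name, not a kernel helper.

#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))

/*
 * Copy the first 'open_files' bits from the old bitmap and clear the
 * remainder of the new one, mirroring the memcpy()/memset() pair in
 * dup_fd().  'open_files' is assumed to be a multiple of BITS_PER_LONG.
 */
static void copy_fd_bitmap(unsigned long *new_bits, const unsigned long *old_bits,
			   unsigned int open_files, unsigned int max_fds)
{
	unsigned int start = open_files / BITS_PER_LONG;	/* first untouched word */
	unsigned int left  = (max_fds - open_files) / 8;	/* bytes left to clear */

	memcpy(new_bits, old_bits, open_files / 8);
	if (max_fds > open_files)
		memset(&new_bits[start], 0, left);
}

int main(void)
{
	unsigned long old_bits[256 / BITS_PER_LONG];
	unsigned long new_bits[256 / BITS_PER_LONG];

	memset(old_bits, 0xff, sizeof(old_bits));	/* every old fd "open" */
	memset(new_bits, 0xaa, sizeof(new_bits));	/* garbage in the new table */

	/* copy 64 descriptors' worth of bits into a 256-descriptor table */
	copy_fd_bitmap(new_bits, old_bits, 64, 256);

	printf("first word %lx, second word %lx\n", new_bits[0], new_bits[1]);
	return 0;
}
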
@@ -830,7 +834,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
830 if (clone_flags & CLONE_THREAD) { 834 if (clone_flags & CLONE_THREAD) {
831 atomic_inc(&current->signal->count); 835 atomic_inc(&current->signal->count);
832 atomic_inc(&current->signal->live); 836 atomic_inc(&current->signal->live);
833 taskstats_tgid_alloc(current->signal);
834 return 0; 837 return 0;
835 } 838 }
836 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 839 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -897,7 +900,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
897void __cleanup_signal(struct signal_struct *sig) 900void __cleanup_signal(struct signal_struct *sig)
898{ 901{
899 exit_thread_group_keys(sig); 902 exit_thread_group_keys(sig);
900 taskstats_tgid_free(sig);
901 kmem_cache_free(signal_cachep, sig); 903 kmem_cache_free(signal_cachep, sig);
902} 904}
903 905
@@ -984,6 +986,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
984 if (!p) 986 if (!p)
985 goto fork_out; 987 goto fork_out;
986 988
989 rt_mutex_init_task(p);
990
987#ifdef CONFIG_TRACE_IRQFLAGS 991#ifdef CONFIG_TRACE_IRQFLAGS
988 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 992 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
989 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 993 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
@@ -1038,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1038 p->wchar = 0; /* I/O counter: bytes written */ 1042 p->wchar = 0; /* I/O counter: bytes written */
1039 p->syscr = 0; /* I/O counter: read syscalls */ 1043 p->syscr = 0; /* I/O counter: read syscalls */
1040 p->syscw = 0; /* I/O counter: write syscalls */ 1044 p->syscw = 0; /* I/O counter: write syscalls */
1045 task_io_accounting_init(p);
1041 acct_clear_integrals(p); 1046 acct_clear_integrals(p);
1042 1047
1043 p->it_virt_expires = cputime_zero; 1048 p->it_virt_expires = cputime_zero;
@@ -1088,8 +1093,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1088 p->lockdep_recursion = 0; 1093 p->lockdep_recursion = 0;
1089#endif 1094#endif
1090 1095
1091 rt_mutex_init_task(p);
1092
1093#ifdef CONFIG_DEBUG_MUTEXES 1096#ifdef CONFIG_DEBUG_MUTEXES
1094 p->blocked_on = NULL; /* not blocked yet */ 1097 p->blocked_on = NULL; /* not blocked yet */
1095#endif 1098#endif
@@ -1244,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1244 if (thread_group_leader(p)) { 1247 if (thread_group_leader(p)) {
1245 p->signal->tty = current->signal->tty; 1248 p->signal->tty = current->signal->tty;
1246 p->signal->pgrp = process_group(current); 1249 p->signal->pgrp = process_group(current);
1247 p->signal->session = current->signal->session; 1250 set_signal_session(p->signal, process_session(current));
1248 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1251 attach_pid(p, PIDTYPE_PGID, process_group(p));
1249 attach_pid(p, PIDTYPE_SID, p->signal->session); 1252 attach_pid(p, PIDTYPE_SID, process_session(p));
1250 1253
1251 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1254 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1252 __get_cpu_var(process_counts)++; 1255 __get_cpu_var(process_counts)++;
@@ -1304,7 +1307,7 @@ fork_out:
1304 return ERR_PTR(retval); 1307 return ERR_PTR(retval);
1305} 1308}
1306 1309
1307struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1310noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1308{ 1311{
1309 memset(regs, 0, sizeof(struct pt_regs)); 1312 memset(regs, 0, sizeof(struct pt_regs));
1310 return regs; 1313 return regs;
@@ -1316,9 +1319,8 @@ struct task_struct * __devinit fork_idle(int cpu)
1316 struct pt_regs regs; 1319 struct pt_regs regs;
1317 1320
1318 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0); 1321 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
1319 if (!task) 1322 if (!IS_ERR(task))
1320 return ERR_PTR(-ENOMEM); 1323 init_idle(task, cpu);
1321 init_idle(task, cpu);
1322 1324
1323 return task; 1325 return task;
1324} 1326}
@@ -1415,7 +1417,7 @@ long do_fork(unsigned long clone_flags,
1415#define ARCH_MIN_MMSTRUCT_ALIGN 0 1417#define ARCH_MIN_MMSTRUCT_ALIGN 0
1416#endif 1418#endif
1417 1419
1418static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 1420static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
1419{ 1421{
1420 struct sighand_struct *sighand = data; 1422 struct sighand_struct *sighand = data;
1421 1423
@@ -1511,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1511} 1513}
1512 1514
1513/* 1515/*
1514 * Unshare the namespace structure if it is being shared 1516 * Unshare the mnt_namespace structure if it is being shared
1515 */ 1517 */
1516static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) 1518static int unshare_mnt_namespace(unsigned long unshare_flags,
1519 struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
1517{ 1520{
1518 struct namespace *ns = current->nsproxy->namespace; 1521 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
1519 1522
1520 if ((unshare_flags & CLONE_NEWNS) && ns) { 1523 if ((unshare_flags & CLONE_NEWNS) && ns) {
1521 if (!capable(CAP_SYS_ADMIN)) 1524 if (!capable(CAP_SYS_ADMIN))
1522 return -EPERM; 1525 return -EPERM;
1523 1526
1524 *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); 1527 *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
1525 if (!*new_nsp) 1528 if (!*new_nsp)
1526 return -ENOMEM; 1529 return -ENOMEM;
1527 } 1530 }
@@ -1530,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
1530} 1533}
1531 1534
1532/* 1535/*
1533 * Unsharing of sighand for tasks created with CLONE_SIGHAND is not 1536 * Unsharing of sighand is not supported yet
1534 * supported yet
1535 */ 1537 */
1536static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) 1538static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1537{ 1539{
1538 struct sighand_struct *sigh = current->sighand; 1540 struct sighand_struct *sigh = current->sighand;
1539 1541
1540 if ((unshare_flags & CLONE_SIGHAND) && 1542 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1541 (sigh && atomic_read(&sigh->count) > 1))
1542 return -EINVAL; 1543 return -EINVAL;
1543 else 1544 else
1544 return 0; 1545 return 0;
@@ -1611,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1611{ 1612{
1612 int err = 0; 1613 int err = 0;
1613 struct fs_struct *fs, *new_fs = NULL; 1614 struct fs_struct *fs, *new_fs = NULL;
1614 struct namespace *ns, *new_ns = NULL; 1615 struct mnt_namespace *ns, *new_ns = NULL;
1615 struct sighand_struct *sigh, *new_sigh = NULL; 1616 struct sighand_struct *new_sigh = NULL;
1616 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1617 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1617 struct files_struct *fd, *new_fd = NULL; 1618 struct files_struct *fd, *new_fd = NULL;
1618 struct sem_undo_list *new_ulist = NULL; 1619 struct sem_undo_list *new_ulist = NULL;
@@ -1633,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1633 goto bad_unshare_out; 1634 goto bad_unshare_out;
1634 if ((err = unshare_fs(unshare_flags, &new_fs))) 1635 if ((err = unshare_fs(unshare_flags, &new_fs)))
1635 goto bad_unshare_cleanup_thread; 1636 goto bad_unshare_cleanup_thread;
1636 if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) 1637 if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
1637 goto bad_unshare_cleanup_fs; 1638 goto bad_unshare_cleanup_fs;
1638 if ((err = unshare_sighand(unshare_flags, &new_sigh))) 1639 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1639 goto bad_unshare_cleanup_ns; 1640 goto bad_unshare_cleanup_ns;
@@ -1657,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1657 } 1658 }
1658 } 1659 }
1659 1660
1660 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || 1661 if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
1661 new_uts || new_ipc) { 1662 new_uts || new_ipc) {
1662 1663
1663 task_lock(current); 1664 task_lock(current);
@@ -1674,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1674 } 1675 }
1675 1676
1676 if (new_ns) { 1677 if (new_ns) {
1677 ns = current->nsproxy->namespace; 1678 ns = current->nsproxy->mnt_ns;
1678 current->nsproxy->namespace = new_ns; 1679 current->nsproxy->mnt_ns = new_ns;
1679 new_ns = ns; 1680 new_ns = ns;
1680 } 1681 }
1681 1682
1682 if (new_sigh) {
1683 sigh = current->sighand;
1684 rcu_assign_pointer(current->sighand, new_sigh);
1685 new_sigh = sigh;
1686 }
1687
1688 if (new_mm) { 1683 if (new_mm) {
1689 mm = current->mm; 1684 mm = current->mm;
1690 active_mm = current->active_mm; 1685 active_mm = current->active_mm;
@@ -1742,7 +1737,7 @@ bad_unshare_cleanup_sigh:
1742 1737
1743bad_unshare_cleanup_ns: 1738bad_unshare_cleanup_ns:
1744 if (new_ns) 1739 if (new_ns)
1745 put_namespace(new_ns); 1740 put_mnt_ns(new_ns);
1746 1741
1747bad_unshare_cleanup_fs: 1742bad_unshare_cleanup_fs:
1748 if (new_fs) 1743 if (new_fs)
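
The fork_idle() hunk above only works because copy_process() reports failure through an ERR_PTR()-encoded pointer rather than NULL, so the old !task test could never fire. A minimal user-space sketch of that convention follows; fake_copy_process() and the re-implemented helpers are illustrative and only loosely mirror <linux/err.h>.

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;			/* encode -errno in the pointer */
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;			/* decode it again */
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *fake_copy_process(int fail)
{
	static int object = 42;

	if (fail)
		return ERR_PTR(-ENOMEM);	/* error path: encoded errno, never NULL */
	return &object;				/* success path: a real pointer */
}

int main(void)
{
	void *p = fake_copy_process(1);

	if (IS_ERR(p))				/* the test fork_idle() now relies on */
		printf("copy_process failed: %ld\n", PTR_ERR(p));
	else
		printf("got task at %p\n", p);
	return 0;
}
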
diff --git a/kernel/futex.c b/kernel/futex.c
index b364e0026191..5a737de857d3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
166/* 166/*
167 * Get parameters which are the keys for a futex. 167 * Get parameters which are the keys for a futex.
168 * 168 *
169 * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, 169 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
170 * offset_within_page). For private mappings, it's (uaddr, current->mm). 170 * offset_within_page). For private mappings, it's (uaddr, current->mm).
171 * We can usually work out the index without swapping in the page. 171 * We can usually work out the index without swapping in the page.
172 * 172 *
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
223 /* 223 /*
224 * Linear file mappings are also simple. 224 * Linear file mappings are also simple.
225 */ 225 */
226 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
282{ 282{
283 int ret; 283 int ret;
284 284
285 inc_preempt_count(); 285 pagefault_disable();
286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
287 dec_preempt_count(); 287 pagefault_enable();
288 288
289 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
290} 290}
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
324 if (likely(current->pi_state_cache)) 324 if (likely(current->pi_state_cache))
325 return 0; 325 return 0;
326 326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); 327 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
328 328
329 if (!pi_state) 329 if (!pi_state)
330 return -ENOMEM; 330 return -ENOMEM;
331 331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list); 332 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */ 333 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL; 334 pi_state->owner = NULL;
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q)
553 * at the end of wake_up_all() does not prevent this store from 552 * at the end of wake_up_all() does not prevent this store from
554 * moving. 553 * moving.
555 */ 554 */
556 wmb(); 555 smp_wmb();
557 q->lock_ptr = NULL; 556 q->lock_ptr = NULL;
558} 557}
559 558
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
585 if (!(uval & FUTEX_OWNER_DIED)) { 584 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid; 585 newval = FUTEX_WAITERS | new_owner->pid;
587 586
588 inc_preempt_count(); 587 pagefault_disable();
589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 588 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
590 dec_preempt_count(); 589 pagefault_enable();
591 if (curval == -EFAULT) 590 if (curval == -EFAULT)
592 return -EFAULT; 591 return -EFAULT;
593 if (curval != uval) 592 if (curval != uval)
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
618 * There is no waiter, so we unlock the futex. The owner died 617 * There is no waiter, so we unlock the futex. The owner died
619 bit does not have to be preserved here. We are the owner: 618 bit does not have to be preserved here. We are the owner:
620 */ 619 */
621 inc_preempt_count(); 620 pagefault_disable();
622 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); 621 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
623 dec_preempt_count(); 622 pagefault_enable();
624 623
625 if (oldval == -EFAULT) 624 if (oldval == -EFAULT)
626 return oldval; 625 return oldval;
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1158 */ 1157 */
1159 newval = current->pid; 1158 newval = current->pid;
1160 1159
1161 inc_preempt_count(); 1160 pagefault_disable();
1162 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); 1161 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1163 dec_preempt_count(); 1162 pagefault_enable();
1164 1163
1165 if (unlikely(curval == -EFAULT)) 1164 if (unlikely(curval == -EFAULT))
1166 goto uaddr_faulted; 1165 goto uaddr_faulted;
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1183 uval = curval; 1182 uval = curval;
1184 newval = uval | FUTEX_WAITERS; 1183 newval = uval | FUTEX_WAITERS;
1185 1184
1186 inc_preempt_count(); 1185 pagefault_disable();
1187 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1186 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1188 dec_preempt_count(); 1187 pagefault_enable();
1189 1188
1190 if (unlikely(curval == -EFAULT)) 1189 if (unlikely(curval == -EFAULT))
1191 goto uaddr_faulted; 1190 goto uaddr_faulted;
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1215 newval = current->pid | 1214 newval = current->pid |
1216 FUTEX_OWNER_DIED | FUTEX_WAITERS; 1215 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1217 1216
1218 inc_preempt_count(); 1217 pagefault_disable();
1219 curval = futex_atomic_cmpxchg_inatomic(uaddr, 1218 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1220 uval, newval); 1219 uval, newval);
1221 dec_preempt_count(); 1220 pagefault_enable();
1222 1221
1223 if (unlikely(curval == -EFAULT)) 1222 if (unlikely(curval == -EFAULT))
1224 goto uaddr_faulted; 1223 goto uaddr_faulted;
@@ -1390,9 +1389,9 @@ retry_locked:
1390 * anyone else up: 1389 * anyone else up:
1391 */ 1390 */
1392 if (!(uval & FUTEX_OWNER_DIED)) { 1391 if (!(uval & FUTEX_OWNER_DIED)) {
1393 inc_preempt_count(); 1392 pagefault_disable();
1394 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1393 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1395 dec_preempt_count(); 1394 pagefault_enable();
1396 } 1395 }
1397 1396
1398 if (unlikely(uval == -EFAULT)) 1397 if (unlikely(uval == -EFAULT))
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
1493 return ret; 1492 return ret;
1494} 1493}
1495 1494
1496static struct file_operations futex_fops = { 1495static const struct file_operations futex_fops = {
1497 .release = futex_close, 1496 .release = futex_close,
1498 .poll = futex_poll, 1497 .poll = futex_poll,
1499}; 1498};
@@ -1507,6 +1506,13 @@ static int futex_fd(u32 __user *uaddr, int signal)
1507 struct futex_q *q; 1506 struct futex_q *q;
1508 struct file *filp; 1507 struct file *filp;
1509 int ret, err; 1508 int ret, err;
1509 static unsigned long printk_interval;
1510
1511 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1512 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1513 "will be removed from the kernel in June 2007\n",
1514 current->comm);
1515 }
1510 1516
1511 ret = -EINVAL; 1517 ret = -EINVAL;
1512 if (!valid_signal(signal)) 1518 if (!valid_signal(signal))
@@ -1522,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
1522 goto out; 1528 goto out;
1523 } 1529 }
1524 filp->f_op = &futex_fops; 1530 filp->f_op = &futex_fops;
1525 filp->f_vfsmnt = mntget(futex_mnt); 1531 filp->f_path.mnt = mntget(futex_mnt);
1526 filp->f_dentry = dget(futex_mnt->mnt_root); 1532 filp->f_path.dentry = dget(futex_mnt->mnt_root);
1527 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 1533 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1528 1534
1529 if (signal) { 1535 if (signal) {
1530 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); 1536 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
@@ -1851,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
1851 1857
1852static int __init init(void) 1858static int __init init(void)
1853{ 1859{
1854 unsigned int i; 1860 int i = register_filesystem(&futex_fs_type);
1861
1862 if (i)
1863 return i;
1855 1864
1856 register_filesystem(&futex_fs_type);
1857 futex_mnt = kern_mount(&futex_fs_type); 1865 futex_mnt = kern_mount(&futex_fs_type);
1866 if (IS_ERR(futex_mnt)) {
1867 unregister_filesystem(&futex_fs_type);
1868 return PTR_ERR(futex_mnt);
1869 }
1858 1870
1859 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 1871 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1860 INIT_LIST_HEAD(&futex_queues[i].chain); 1872 INIT_LIST_HEAD(&futex_queues[i].chain);
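
The repeated inc_preempt_count()/dec_preempt_count() to pagefault_disable()/pagefault_enable() conversions above bracket the same operation each time: an atomic compare-and-swap on the user-space futex word. A user-space sketch of just that word protocol for the PI fast path is below; it models neither the kernel-side queueing nor the pagefault handling, and try_lock_pi() is an illustrative name.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

#define FUTEX_WAITERS	0x80000000u	/* mirrors the flag in include/linux/futex.h */

static uint32_t futex_word;		/* the word shared between tasks and kernel */

static int try_lock_pi(uint32_t tid)
{
	uint32_t curval;

	/* 0 -> tid: uncontended acquisition, no syscall needed */
	curval = __sync_val_compare_and_swap(&futex_word, 0, tid);
	if (curval == 0)
		return 1;

	/*
	 * Contended: set the waiter bit so the current owner has to go
	 * through the kernel to unlock; the real code would then block
	 * in sys_futex(FUTEX_LOCK_PI).
	 */
	__sync_val_compare_and_swap(&futex_word, curval, curval | FUTEX_WAITERS);
	return 0;
}

int main(void)
{
	uint32_t tid = (uint32_t)getpid();

	printf("first try:  %s (word=%#x)\n",
	       try_lock_pi(tid) ? "acquired" : "contended", futex_word);
	printf("second try: %s (word=%#x)\n",
	       try_lock_pi(tid + 1) ? "acquired" : "contended", futex_word);
	return 0;
}
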
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 11c99697acfe..ebfd24a41858 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
233 chip->shutdown = chip->disable; 233 chip->shutdown = chip->disable;
234 if (!chip->name) 234 if (!chip->name)
235 chip->name = chip->typename; 235 chip->name = chip->typename;
236 if (!chip->end)
237 chip->end = dummy_irq_chip.end;
236} 238}
237 239
238static inline void mask_ack_irq(struct irq_desc *desc, int irq) 240static inline void mask_ack_irq(struct irq_desc *desc, int irq)
@@ -499,7 +501,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
499#endif /* CONFIG_SMP */ 501#endif /* CONFIG_SMP */
500 502
501void 503void
502__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained) 504__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
505 const char *name)
503{ 506{
504 struct irq_desc *desc; 507 struct irq_desc *desc;
505 unsigned long flags; 508 unsigned long flags;
@@ -540,6 +543,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained)
540 desc->depth = 1; 543 desc->depth = 1;
541 } 544 }
542 desc->handle_irq = handle; 545 desc->handle_irq = handle;
546 desc->name = name;
543 547
544 if (handle != handle_bad_irq && is_chained) { 548 if (handle != handle_bad_irq && is_chained) {
545 desc->status &= ~IRQ_DISABLED; 549 desc->status &= ~IRQ_DISABLED;
@@ -555,30 +559,13 @@ set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
555 irq_flow_handler_t handle) 559 irq_flow_handler_t handle)
556{ 560{
557 set_irq_chip(irq, chip); 561 set_irq_chip(irq, chip);
558 __set_irq_handler(irq, handle, 0); 562 __set_irq_handler(irq, handle, 0, NULL);
559} 563}
560 564
561/* 565void
562 * Get a descriptive string for the highlevel handler, for 566set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
563 * /proc/interrupts output: 567 irq_flow_handler_t handle, const char *name)
564 */
565const char *
566handle_irq_name(irq_flow_handler_t handle)
567{ 568{
568 if (handle == handle_level_irq) 569 set_irq_chip(irq, chip);
569 return "level "; 570 __set_irq_handler(irq, handle, 0, name);
570 if (handle == handle_fasteoi_irq)
571 return "fasteoi";
572 if (handle == handle_edge_irq)
573 return "edge ";
574 if (handle == handle_simple_irq)
575 return "simple ";
576#ifdef CONFIG_SMP
577 if (handle == handle_percpu_irq)
578 return "percpu ";
579#endif
580 if (handle == handle_bad_irq)
581 return "bad ";
582
583 return NULL;
584} 571}
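
The chip.c change above drops the reverse lookup from flow-handler pointer to string (the removed handle_irq_name()) and instead stores a caller-supplied name next to the handler at registration time. A toy sketch of that design in plain C; struct fake_desc and the handlers are stand-ins, not the kernel's irq_desc.

#include <stdio.h>

struct fake_desc {
	void (*handle)(unsigned int irq);
	const char *name;		/* shown in /proc/interrupts-style output */
};

static void handle_level(unsigned int irq) { printf("level irq %u\n", irq); }
static void handle_edge(unsigned int irq)  { printf("edge irq %u\n", irq); }

static struct fake_desc descs[4];

static void set_handler_name(unsigned int irq,
			     void (*handle)(unsigned int),
			     const char *name)
{
	descs[irq].handle = handle;
	descs[irq].name = name;		/* no pointer-to-string table needed later */
}

int main(void)
{
	set_handler_name(0, handle_level, "level");
	set_handler_name(1, handle_edge,  "edge");

	for (unsigned int irq = 0; irq < 2; irq++) {
		printf("irq %u: %-6s ", irq, descs[irq].name);
		descs[irq].handle(irq);
	}
	return 0;
}
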
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 42aa6f1a3f0f..aff1f0fabb0d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
54 .chip = &no_irq_chip, 54 .chip = &no_irq_chip,
55 .handle_irq = handle_bad_irq, 55 .handle_irq = handle_bad_irq,
56 .depth = 1, 56 .depth = 1,
57 .lock = SPIN_LOCK_UNLOCKED, 57 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
58#ifdef CONFIG_SMP 58#ifdef CONFIG_SMP
59 .affinity = CPU_MASK_ALL 59 .affinity = CPU_MASK_ALL
60#endif 60#endif
@@ -231,10 +231,10 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
231 spin_unlock(&desc->lock); 231 spin_unlock(&desc->lock);
232 232
233 action_ret = handle_IRQ_event(irq, action); 233 action_ret = handle_IRQ_event(irq, action);
234
235 spin_lock(&desc->lock);
236 if (!noirqdebug) 234 if (!noirqdebug)
237 note_interrupt(irq, desc, action_ret); 235 note_interrupt(irq, desc, action_ret);
236
237 spin_lock(&desc->lock);
238 if (likely(!(desc->status & IRQ_PENDING))) 238 if (likely(!(desc->status & IRQ_PENDING)))
239 break; 239 break;
240 desc->status &= ~IRQ_PENDING; 240 desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6879202afe9a..b385878c6e80 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
216{ 216{
217 struct irq_desc *desc = irq_desc + irq; 217 struct irq_desc *desc = irq_desc + irq;
218 struct irqaction *old, **p; 218 struct irqaction *old, **p;
219 const char *old_name = NULL;
219 unsigned long flags; 220 unsigned long flags;
220 int shared = 0; 221 int shared = 0;
221 222
@@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
255 * set the trigger type must match. 256 * set the trigger type must match.
256 */ 257 */
257 if (!((old->flags & new->flags) & IRQF_SHARED) || 258 if (!((old->flags & new->flags) & IRQF_SHARED) ||
258 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) 259 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
260 old_name = old->name;
259 goto mismatch; 261 goto mismatch;
262 }
260 263
261#if defined(CONFIG_IRQ_PER_CPU) 264#if defined(CONFIG_IRQ_PER_CPU)
262 /* All handlers must agree on per-cpuness */ 265 /* All handlers must agree on per-cpuness */
@@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new)
322 return 0; 325 return 0;
323 326
324mismatch: 327mismatch:
325 spin_unlock_irqrestore(&desc->lock, flags);
326 if (!(new->flags & IRQF_PROBE_SHARED)) { 328 if (!(new->flags & IRQF_PROBE_SHARED)) {
327 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); 329 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
330 if (old_name)
331 printk(KERN_ERR "current handler: %s\n", old_name);
328 dump_stack(); 332 dump_stack();
329 } 333 }
334 spin_unlock_irqrestore(&desc->lock, flags);
330 return -EBUSY; 335 return -EBUSY;
331} 336}
332 337
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a352667007c..61f5c717a8f5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
54 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
55 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
56 56
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status))
58 return -EIO; 59 return -EIO;
59 60
60 err = cpumask_parse_user(buffer, count, new_value); 61 err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2b..6f294ff4f9ee 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h>
23 24
24#include <asm/sections.h> 25#include <asm/sections.h>
25 26
@@ -30,14 +31,14 @@
30#endif 31#endif
31 32
32/* These will be re-linked against their real values during the second link stage */ 33/* These will be re-linked against their real values during the second link stage */
33extern unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
34extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); 35extern const unsigned long kallsyms_num_syms __attribute__((weak));
35extern u8 kallsyms_names[] __attribute__((weak)); 36extern const u8 kallsyms_names[] __attribute__((weak));
36 37
37extern u8 kallsyms_token_table[] __attribute__((weak)); 38extern const u8 kallsyms_token_table[] __attribute__((weak));
38extern u16 kallsyms_token_index[] __attribute__((weak)); 39extern const u16 kallsyms_token_index[] __attribute__((weak));
39 40
40extern unsigned long kallsyms_markers[] __attribute__((weak)); 41extern const unsigned long kallsyms_markers[] __attribute__((weak));
41 42
42static inline int is_kernel_inittext(unsigned long addr) 43static inline int is_kernel_inittext(unsigned long addr)
43{ 44{
@@ -83,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
83static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 84static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
84{ 85{
85 int len, skipped_first = 0; 86 int len, skipped_first = 0;
86 u8 *tptr, *data; 87 const u8 *tptr, *data;
87 88
88 /* get the compressed symbol length from the first symbol byte */ 89 /* get the compressed symbol length from the first symbol byte */
89 data = &kallsyms_names[off]; 90 data = &kallsyms_names[off];
@@ -131,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
131 * kallsyms array */ 132 * kallsyms array */
132static unsigned int get_symbol_offset(unsigned long pos) 133static unsigned int get_symbol_offset(unsigned long pos)
133{ 134{
134 u8 *name; 135 const u8 *name;
135 int i; 136 int i;
136 137
137 /* use the closest marker we have. We have markers every 256 positions, 138 /* use the closest marker we have. We have markers every 256 positions,
@@ -301,13 +302,6 @@ struct kallsym_iter
301 char name[KSYM_NAME_LEN+1]; 302 char name[KSYM_NAME_LEN+1];
302}; 303};
303 304
304/* Only label it "global" if it is exported. */
305static void upcase_if_global(struct kallsym_iter *iter)
306{
307 if (is_exported(iter->name, iter->owner))
308 iter->type += 'A' - 'a';
309}
310
311static int get_ksymbol_mod(struct kallsym_iter *iter) 305static int get_ksymbol_mod(struct kallsym_iter *iter)
312{ 306{
313 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 307 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
316 if (iter->owner == NULL) 310 if (iter->owner == NULL)
317 return 0; 311 return 0;
318 312
319 upcase_if_global(iter); 313 /* Label it "global" if it is exported, "local" if not exported. */
314 iter->type = is_exported(iter->name, iter->owner)
315 ? toupper(iter->type) : tolower(iter->type);
316
320 return 1; 317 return 1;
321} 318}
322 319
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
401 return 0; 398 return 0;
402} 399}
403 400
404static struct seq_operations kallsyms_op = { 401static const struct seq_operations kallsyms_op = {
405 .start = s_start, 402 .start = s_start,
406 .next = s_next, 403 .next = s_next,
407 .stop = s_stop, 404 .stop = s_stop,
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
436 return seq_release(inode, file); 433 return seq_release(inode, file);
437} 434}
438 435
439static struct file_operations kallsyms_operations = { 436static const struct file_operations kallsyms_operations = {
440 .open = kallsyms_open, 437 .open = kallsyms_open,
441 .read = seq_read, 438 .read = seq_read,
442 .llseek = seq_lseek, 439 .llseek = seq_lseek,
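
The kallsyms hunk above replaces the one-way 'A' - 'a' arithmetic with toupper()/tolower(), so exported module symbols get an upper-case type letter and everything else is forced lower-case. A tiny sketch of that convention; label_symbol_type() is an illustrative name.

#include <ctype.h>
#include <stdio.h>

static char label_symbol_type(char type, int exported)
{
	/* exported symbols are "global", everything else is "local" */
	return exported ? toupper((unsigned char)type)
			: tolower((unsigned char)type);
}

int main(void)
{
	printf("%c %c\n", label_symbol_type('t', 1),	/* exported text  -> T */
			  label_symbol_type('T', 0));	/* private text   -> t */
	return 0;
}
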
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f4..2a59c8a01ae0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/elf.h>
24#include <linux/elfcore.h>
23 25
24#include <asm/page.h> 26#include <asm/page.h>
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
108 110
109 /* Allocate a controlling structure */ 111 /* Allocate a controlling structure */
110 result = -ENOMEM; 112 result = -ENOMEM;
111 image = kmalloc(sizeof(*image), GFP_KERNEL); 113 image = kzalloc(sizeof(*image), GFP_KERNEL);
112 if (!image) 114 if (!image)
113 goto out; 115 goto out;
114 116
115 memset(image, 0, sizeof(*image));
116 image->head = 0; 117 image->head = 0;
117 image->entry = &image->head; 118 image->entry = &image->head;
118 image->last_entry = &image->head; 119 image->last_entry = &image->head;
@@ -851,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image,
851 memset(ptr + uchunk, 0, mchunk - uchunk); 852 memset(ptr + uchunk, 0, mchunk - uchunk);
852 } 853 }
853 result = copy_from_user(ptr, buf, uchunk); 854 result = copy_from_user(ptr, buf, uchunk);
855 kexec_flush_icache_page(page);
854 kunmap(page); 856 kunmap(page);
855 if (result) { 857 if (result) {
856 result = (result < 0) ? result : -EIO; 858 result = (result < 0) ? result : -EIO;
@@ -1067,6 +1069,60 @@ void crash_kexec(struct pt_regs *regs)
1067 } 1069 }
1068} 1070}
1069 1071
1072static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1073 size_t data_len)
1074{
1075 struct elf_note note;
1076
1077 note.n_namesz = strlen(name) + 1;
1078 note.n_descsz = data_len;
1079 note.n_type = type;
1080 memcpy(buf, &note, sizeof(note));
1081 buf += (sizeof(note) + 3)/4;
1082 memcpy(buf, name, note.n_namesz);
1083 buf += (note.n_namesz + 3)/4;
1084 memcpy(buf, data, note.n_descsz);
1085 buf += (note.n_descsz + 3)/4;
1086
1087 return buf;
1088}
1089
1090static void final_note(u32 *buf)
1091{
1092 struct elf_note note;
1093
1094 note.n_namesz = 0;
1095 note.n_descsz = 0;
1096 note.n_type = 0;
1097 memcpy(buf, &note, sizeof(note));
1098}
1099
1100void crash_save_cpu(struct pt_regs *regs, int cpu)
1101{
1102 struct elf_prstatus prstatus;
1103 u32 *buf;
1104
1105 if ((cpu < 0) || (cpu >= NR_CPUS))
1106 return;
1107
1108 /* Using ELF notes here is opportunistic.
1109 * I need a well defined structure format
1110 * for the data I pass, and I need tags
1111 * on the data to indicate what information I have
1112 * squirrelled away. ELF notes happen to provide
1113 * all of that, so there is no need to invent something new.
1114 */
1115 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1116 if (!buf)
1117 return;
1118 memset(&prstatus, 0, sizeof(prstatus));
1119 prstatus.pr_pid = current->pid;
1120 elf_core_copy_regs(&prstatus.pr_reg, regs);
1121 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
1122 sizeof(prstatus));
1123 final_note(buf);
1124}
1125
1070static int __init crash_notes_memory_init(void) 1126static int __init crash_notes_memory_init(void)
1071{ 1127{
1072 /* Allocate memory for saving cpu registers. */ 1128 /* Allocate memory for saving cpu registers. */
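
crash_save_cpu() above serialises the CPU state as ELF notes: a header, the name, then the payload, each rounded up to 4-byte units, terminated by an empty note. A user-space sketch of that layout, using <elf.h>'s Elf32_Nhdr in place of the kernel's struct elf_note; the buffer size and fake payload are illustrative.

#include <elf.h>
#include <stdio.h>
#include <string.h>

static Elf32_Word *append_note(Elf32_Word *buf, const char *name,
			       Elf32_Word type, const void *data, size_t len)
{
	Elf32_Nhdr note = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = len,
		.n_type   = type,
	};

	memcpy(buf, &note, sizeof(note));
	buf += sizeof(note) / 4;		/* 12-byte header, already aligned */
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3) / 4;		/* pad name to a 4-byte boundary */
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3) / 4;		/* pad payload likewise */
	return buf;
}

int main(void)
{
	Elf32_Word buf[64] = { 0 };
	char payload[10] = "registers";		/* stand-in for elf_prstatus */
	Elf32_Word *end;

	end = append_note(buf, "CORE", NT_PRSTATUS, payload, sizeof(payload));
	memset(end, 0, sizeof(Elf32_Nhdr));	/* final, empty note terminates the list */

	printf("note occupies %zu bytes\n", (size_t)((char *)end - (char *)buf));
	return 0;
}
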
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb4e29d924e4..3a7379aa31ca 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/namespace.h> 28#include <linux/mnt_namespace.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
30#include <linux/file.h> 30#include <linux/file.h>
31#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
114#endif /* CONFIG_KMOD */ 114#endif /* CONFIG_KMOD */
115 115
116struct subprocess_info { 116struct subprocess_info {
117 struct work_struct work;
117 struct completion *complete; 118 struct completion *complete;
118 char *path; 119 char *path;
119 char **argv; 120 char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
221} 222}
222 223
223/* This is run by khelper thread */ 224/* This is run by khelper thread */
224static void __call_usermodehelper(void *data) 225static void __call_usermodehelper(struct work_struct *work)
225{ 226{
226 struct subprocess_info *sub_info = data; 227 struct subprocess_info *sub_info =
228 container_of(work, struct subprocess_info, work);
227 pid_t pid; 229 pid_t pid;
228 int wait = sub_info->wait; 230 int wait = sub_info->wait;
229 231
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
264{ 266{
265 DECLARE_COMPLETION_ONSTACK(done); 267 DECLARE_COMPLETION_ONSTACK(done);
266 struct subprocess_info sub_info = { 268 struct subprocess_info sub_info = {
269 .work = __WORK_INITIALIZER(sub_info.work,
270 __call_usermodehelper),
267 .complete = &done, 271 .complete = &done,
268 .path = path, 272 .path = path,
269 .argv = argv, 273 .argv = argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
272 .wait = wait, 276 .wait = wait,
273 .retval = 0, 277 .retval = 0,
274 }; 278 };
275 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
276 279
277 if (!khelper_wq) 280 if (!khelper_wq)
278 return -EBUSY; 281 return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
280 if (path[0] == '\0') 283 if (path[0] == '\0')
281 return 0; 284 return 0;
282 285
283 queue_work(khelper_wq, &work); 286 queue_work(khelper_wq, &sub_info.work);
284 wait_for_completion(&done); 287 wait_for_completion(&done);
285 return sub_info.retval; 288 return sub_info.retval;
286} 289}
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
291{ 294{
292 DECLARE_COMPLETION(done); 295 DECLARE_COMPLETION(done);
293 struct subprocess_info sub_info = { 296 struct subprocess_info sub_info = {
297 .work = __WORK_INITIALIZER(sub_info.work,
298 __call_usermodehelper),
294 .complete = &done, 299 .complete = &done,
295 .path = path, 300 .path = path,
296 .argv = argv, 301 .argv = argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
298 .retval = 0, 303 .retval = 0,
299 }; 304 };
300 struct file *f; 305 struct file *f;
301 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
302 306
303 if (!khelper_wq) 307 if (!khelper_wq)
304 return -EBUSY; 308 return -EBUSY;
@@ -307,18 +311,18 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
307 return 0; 311 return 0;
308 312
309 f = create_write_pipe(); 313 f = create_write_pipe();
310 if (!f) 314 if (IS_ERR(f))
311 return -ENOMEM; 315 return PTR_ERR(f);
312 *filp = f; 316 *filp = f;
313 317
314 f = create_read_pipe(f); 318 f = create_read_pipe(f);
315 if (!f) { 319 if (IS_ERR(f)) {
316 free_write_pipe(*filp); 320 free_write_pipe(*filp);
317 return -ENOMEM; 321 return PTR_ERR(f);
318 } 322 }
319 sub_info.stdin = f; 323 sub_info.stdin = f;
320 324
321 queue_work(khelper_wq, &work); 325 queue_work(khelper_wq, &sub_info.work);
322 wait_for_completion(&done); 326 wait_for_completion(&done);
323 return sub_info.retval; 327 return sub_info.retval;
324} 328}
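
The kmod.c hunks above are part of the tree-wide workqueue conversion: the work item is embedded in the request structure and the handler recovers the request with container_of() instead of receiving a void * cookie. A self-contained sketch of that pattern; struct fake_work and struct fake_request are stand-ins, and only the container_of() arithmetic mirrors the kernel macro.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_work {
	void (*func)(struct fake_work *work);
};

struct fake_request {
	struct fake_work work;		/* embedded, not pointed to */
	const char *path;
	int retval;
};

static void handler(struct fake_work *work)
{
	/* climb from the member back to the containing request */
	struct fake_request *req = container_of(work, struct fake_request, work);

	printf("running %s\n", req->path);
	req->retval = 0;
}

int main(void)
{
	struct fake_request req = {
		.work = { .func = handler },
		.path = "/sbin/modprobe",
		.retval = -1,
	};

	req.work.func(&req.work);	/* what a worker thread would do after queue_work() */
	printf("retval = %d\n", req.retval);
	return 0;
}
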
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e0..17ec4afb0994 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
41#include <linux/freezer.h>
41#include <asm-generic/sections.h> 42#include <asm-generic/sections.h>
42#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
43#include <asm/errno.h> 44#include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
83 kprobe_opcode_t *insns; /* Page of instruction slots */ 84 kprobe_opcode_t *insns; /* Page of instruction slots */
84 char slot_used[INSNS_PER_PAGE]; 85 char slot_used[INSNS_PER_PAGE];
85 int nused; 86 int nused;
87 int ngarbage;
86}; 88};
87 89
88static struct hlist_head kprobe_insn_pages; 90static struct hlist_head kprobe_insn_pages;
91static int kprobe_garbage_slots;
92static int collect_garbage_slots(void);
93
94static int __kprobes check_safety(void)
95{
96 int ret = 0;
97#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
98 ret = freeze_processes();
99 if (ret == 0) {
100 struct task_struct *p, *q;
101 do_each_thread(p, q) {
102 if (p != current && p->state == TASK_RUNNING &&
103 p->pid != 0) {
104 printk("Check failed: %s is running\n",p->comm);
105 ret = -1;
106 goto loop_end;
107 }
108 } while_each_thread(p, q);
109 }
110loop_end:
111 thaw_processes();
112#else
113 synchronize_sched();
114#endif
115 return ret;
116}
89 117
90/** 118/**
91 * get_insn_slot() - Find a slot on an executable page for an instruction. 119 * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
96 struct kprobe_insn_page *kip; 124 struct kprobe_insn_page *kip;
97 struct hlist_node *pos; 125 struct hlist_node *pos;
98 126
127 retry:
99 hlist_for_each(pos, &kprobe_insn_pages) { 128 hlist_for_each(pos, &kprobe_insn_pages) {
100 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 129 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
101 if (kip->nused < INSNS_PER_PAGE) { 130 if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
112 } 141 }
113 } 142 }
114 143
115 /* All out of space. Need to allocate a new page. Use slot 0.*/ 144 /* If there are any garbage slots, collect them and try again. */
145 if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
146 goto retry;
147 }
148 /* All out of space. Need to allocate a new page. Use slot 0. */
116 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 149 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
117 if (!kip) { 150 if (!kip) {
118 return NULL; 151 return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
133 memset(kip->slot_used, 0, INSNS_PER_PAGE); 166 memset(kip->slot_used, 0, INSNS_PER_PAGE);
134 kip->slot_used[0] = 1; 167 kip->slot_used[0] = 1;
135 kip->nused = 1; 168 kip->nused = 1;
169 kip->ngarbage = 0;
136 return kip->insns; 170 return kip->insns;
137} 171}
138 172
139void __kprobes free_insn_slot(kprobe_opcode_t *slot) 173/* Return 1 if all garbage is collected, otherwise 0. */
174static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
175{
176 kip->slot_used[idx] = 0;
177 kip->nused--;
178 if (kip->nused == 0) {
179 /*
180 * Page is no longer in use. Free it unless
181 * it's the last one. We keep the last one
182 * so as not to have to set it up again the
183 * next time somebody inserts a probe.
184 */
185 hlist_del(&kip->hlist);
186 if (hlist_empty(&kprobe_insn_pages)) {
187 INIT_HLIST_NODE(&kip->hlist);
188 hlist_add_head(&kip->hlist,
189 &kprobe_insn_pages);
190 } else {
191 module_free(NULL, kip->insns);
192 kfree(kip);
193 }
194 return 1;
195 }
196 return 0;
197}
198
199static int __kprobes collect_garbage_slots(void)
200{
201 struct kprobe_insn_page *kip;
202 struct hlist_node *pos, *next;
203
204 /* Ensure no-one is preempted on the garbage slots */
205 if (check_safety() != 0)
206 return -EAGAIN;
207
208 hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
209 int i;
210 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
211 if (kip->ngarbage == 0)
212 continue;
213 kip->ngarbage = 0; /* we will collect all garbage */
214 for (i = 0; i < INSNS_PER_PAGE; i++) {
215 if (kip->slot_used[i] == -1 &&
216 collect_one_slot(kip, i))
217 break;
218 }
219 }
220 kprobe_garbage_slots = 0;
221 return 0;
222}
223
224void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
140{ 225{
141 struct kprobe_insn_page *kip; 226 struct kprobe_insn_page *kip;
142 struct hlist_node *pos; 227 struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
146 if (kip->insns <= slot && 231 if (kip->insns <= slot &&
147 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 232 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
148 int i = (slot - kip->insns) / MAX_INSN_SIZE; 233 int i = (slot - kip->insns) / MAX_INSN_SIZE;
149 kip->slot_used[i] = 0; 234 if (dirty) {
150 kip->nused--; 235 kip->slot_used[i] = -1;
151 if (kip->nused == 0) { 236 kip->ngarbage++;
152 /* 237 } else {
153 * Page is no longer in use. Free it unless 238 collect_one_slot(kip, i);
154 * it's the last one. We keep the last one
155 * so as not to have to set it up again the
156 * next time somebody inserts a probe.
157 */
158 hlist_del(&kip->hlist);
159 if (hlist_empty(&kprobe_insn_pages)) {
160 INIT_HLIST_NODE(&kip->hlist);
161 hlist_add_head(&kip->hlist,
162 &kprobe_insn_pages);
163 } else {
164 module_free(NULL, kip->insns);
165 kfree(kip);
166 }
167 } 239 }
168 return; 240 break;
169 } 241 }
170 } 242 }
243 if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
244 collect_garbage_slots();
245 }
171} 246}
172#endif 247#endif
173 248
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e8..1db8c72d0d38 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
31 /* Result passed back to kthread_create() from keventd. */ 31 /* Result passed back to kthread_create() from keventd. */
32 struct task_struct *result; 32 struct task_struct *result;
33 struct completion done; 33 struct completion done;
34
35 struct work_struct work;
34}; 36};
35 37
36struct kthread_stop_info 38struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
111} 113}
112 114
113/* We are keventd: create a thread. */ 115/* We are keventd: create a thread. */
114static void keventd_create_kthread(void *_create) 116static void keventd_create_kthread(struct work_struct *work)
115{ 117{
116 struct kthread_create_info *create = _create; 118 struct kthread_create_info *create =
119 container_of(work, struct kthread_create_info, work);
117 int pid; 120 int pid;
118 121
119 /* We want our own signal handler (we take no signals by default). */ 122 /* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
154 ...) 157 ...)
155{ 158{
156 struct kthread_create_info create; 159 struct kthread_create_info create;
157 DECLARE_WORK(work, keventd_create_kthread, &create);
158 160
159 create.threadfn = threadfn; 161 create.threadfn = threadfn;
160 create.data = data; 162 create.data = data;
161 init_completion(&create.started); 163 init_completion(&create.started);
162 init_completion(&create.done); 164 init_completion(&create.done);
165 INIT_WORK(&create.work, keventd_create_kthread);
163 166
164 /* 167 /*
165 * The workqueue needs to start up first: 168 * The workqueue needs to start up first:
166 */ 169 */
167 if (!helper_wq) 170 if (!helper_wq)
168 work.func(work.data); 171 create.work.func(&create.work);
169 else { 172 else {
170 queue_work(helper_wq, &work); 173 queue_work(helper_wq, &create.work);
171 wait_for_completion(&create.done); 174 wait_for_completion(&create.done);
172 } 175 }
173 if (!IS_ERR(create.result)) { 176 if (!IS_ERR(create.result)) {
diff --git a/kernel/latency.c b/kernel/latency.c
index 258f2555abbc..e63fcacb61a7 100644
--- a/kernel/latency.c
+++ b/kernel/latency.c
@@ -36,6 +36,7 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/jiffies.h>
39#include <asm/atomic.h> 40#include <asm/atomic.h>
40 41
41struct latency_info { 42struct latency_info {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 805a322a5655..b02032476dc2 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -140,13 +140,6 @@ void lockdep_on(void)
140 140
141EXPORT_SYMBOL(lockdep_on); 141EXPORT_SYMBOL(lockdep_on);
142 142
143int lockdep_internal(void)
144{
145 return current->lockdep_recursion != 0;
146}
147
148EXPORT_SYMBOL(lockdep_internal);
149
150/* 143/*
151 * Debugging switches: 144 * Debugging switches:
152 */ 145 */
@@ -228,17 +221,15 @@ static int save_trace(struct stack_trace *trace)
228 trace->skip = 3; 221 trace->skip = 3;
229 trace->all_contexts = 0; 222 trace->all_contexts = 0;
230 223
231 /* Make sure to not recurse in case the unwinder needs to take locks. */
233 lockdep_off();
234 save_stack_trace(trace, NULL); 224 save_stack_trace(trace, NULL);
235 lockdep_on();
236 225
237 trace->max_entries = trace->nr_entries; 226 trace->max_entries = trace->nr_entries;
238 227
239 nr_stack_trace_entries += trace->nr_entries; 228 nr_stack_trace_entries += trace->nr_entries;
240 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) 229 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) {
230 __raw_spin_unlock(&hash_lock);
241 return 0; 231 return 0;
232 }
242 233
243 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 234 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
244 __raw_spin_unlock(&hash_lock); 235 __raw_spin_unlock(&hash_lock);
@@ -357,7 +348,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
357 348
358static void print_lock_name(struct lock_class *class) 349static void print_lock_name(struct lock_class *class)
359{ 350{
360 char str[128], c1, c2, c3, c4; 351 char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
361 const char *name; 352 const char *name;
362 353
363 get_usage_chars(class, &c1, &c2, &c3, &c4); 354 get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -379,7 +370,7 @@ static void print_lock_name(struct lock_class *class)
379static void print_lockdep_cache(struct lockdep_map *lock) 370static void print_lockdep_cache(struct lockdep_map *lock)
380{ 371{
381 const char *name; 372 const char *name;
382 char str[128]; 373 char str[KSYM_NAME_LEN + 1];
383 374
384 name = lock->name; 375 name = lock->name;
385 if (!name) 376 if (!name)
@@ -449,7 +440,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
449 print_lock_class_header(class, depth); 440 print_lock_class_header(class, depth);
450 441
451 list_for_each_entry(entry, &class->locks_after, entry) { 442 list_for_each_entry(entry, &class->locks_after, entry) {
452 DEBUG_LOCKS_WARN_ON(!entry->class); 443 if (DEBUG_LOCKS_WARN_ON(!entry->class))
444 return;
445
453 print_lock_dependencies(entry->class, depth + 1); 446 print_lock_dependencies(entry->class, depth + 1);
454 447
455 printk("%*s ... acquired at:\n",depth,""); 448 printk("%*s ... acquired at:\n",depth,"");
@@ -474,7 +467,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
474 return 0; 467 return 0;
475 468
476 entry->class = this; 469 entry->class = this;
477 save_trace(&entry->trace); 470 if (!save_trace(&entry->trace))
471 return 0;
478 472
479 /* 473 /*
480 * Since we never remove from the dependency list, the list can 474 * Since we never remove from the dependency list, the list can
@@ -562,8 +556,12 @@ static noinline int print_circular_bug_tail(void)
562 if (debug_locks_silent) 556 if (debug_locks_silent)
563 return 0; 557 return 0;
564 558
559 /* hash_lock unlocked by the header */
560 __raw_spin_lock(&hash_lock);
565 this.class = check_source->class; 561 this.class = check_source->class;
566 save_trace(&this.trace); 562 if (!save_trace(&this.trace))
563 return 0;
564 __raw_spin_unlock(&hash_lock);
567 print_circular_bug_entry(&this, 0); 565 print_circular_bug_entry(&this, 0);
568 566
569 printk("\nother info that might help us debug this:\n\n"); 567 printk("\nother info that might help us debug this:\n\n");
@@ -575,6 +573,8 @@ static noinline int print_circular_bug_tail(void)
575 return 0; 573 return 0;
576} 574}
577 575
576#define RECURSION_LIMIT 40
577
578static int noinline print_infinite_recursion_bug(void) 578static int noinline print_infinite_recursion_bug(void)
579{ 579{
580 __raw_spin_unlock(&hash_lock); 580 __raw_spin_unlock(&hash_lock);
@@ -595,7 +595,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
595 debug_atomic_inc(&nr_cyclic_check_recursions); 595 debug_atomic_inc(&nr_cyclic_check_recursions);
596 if (depth > max_recursion_depth) 596 if (depth > max_recursion_depth)
597 max_recursion_depth = depth; 597 max_recursion_depth = depth;
598 if (depth >= 20) 598 if (depth >= RECURSION_LIMIT)
599 return print_infinite_recursion_bug(); 599 return print_infinite_recursion_bug();
600 /* 600 /*
601 * Check this lock's dependency list: 601 * Check this lock's dependency list:
@@ -645,7 +645,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
645 645
646 if (depth > max_recursion_depth) 646 if (depth > max_recursion_depth)
647 max_recursion_depth = depth; 647 max_recursion_depth = depth;
648 if (depth >= 20) 648 if (depth >= RECURSION_LIMIT)
649 return print_infinite_recursion_bug(); 649 return print_infinite_recursion_bug();
650 650
651 debug_atomic_inc(&nr_find_usage_forwards_checks); 651 debug_atomic_inc(&nr_find_usage_forwards_checks);
@@ -684,7 +684,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
684 684
685 if (depth > max_recursion_depth) 685 if (depth > max_recursion_depth)
686 max_recursion_depth = depth; 686 max_recursion_depth = depth;
687 if (depth >= 20) 687 if (depth >= RECURSION_LIMIT)
688 return print_infinite_recursion_bug(); 688 return print_infinite_recursion_bug();
689 689
690 debug_atomic_inc(&nr_find_usage_backwards_checks); 690 debug_atomic_inc(&nr_find_usage_backwards_checks);
@@ -964,14 +964,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
964 &prev->class->locks_after, next->acquire_ip); 964 &prev->class->locks_after, next->acquire_ip);
965 if (!ret) 965 if (!ret)
966 return 0; 966 return 0;
967 /* 967
968 * Return value of 2 signals 'dependency already added',
969 * in that case we dont have to add the backlink either.
970 */
971 if (ret == 2)
972 return 2;
973 ret = add_lock_to_list(next->class, prev->class, 968 ret = add_lock_to_list(next->class, prev->class,
974 &next->class->locks_before, next->acquire_ip); 969 &next->class->locks_before, next->acquire_ip);
970 if (!ret)
971 return 0;
975 972
976 /* 973 /*
977 * Debugging printouts: 974 * Debugging printouts:
@@ -1023,7 +1020,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1023 * added: 1020 * added:
1024 */ 1021 */
1025 if (hlock->read != 2) { 1022 if (hlock->read != 2) {
1026 check_prev_add(curr, hlock, next); 1023 if (!check_prev_add(curr, hlock, next))
1024 return 0;
1027 /* 1025 /*
1028 * Stop after the first non-trylock entry, 1026 * Stop after the first non-trylock entry,
1029 * as non-trylock entries have added their 1027 * as non-trylock entries have added their
@@ -1079,7 +1077,8 @@ static int static_obj(void *obj)
1079 */ 1077 */
1080 for_each_possible_cpu(i) { 1078 for_each_possible_cpu(i) {
1081 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 1079 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
1082 end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); 1080 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
1081 + per_cpu_offset(i);
1083 1082
1084 if ((addr >= start) && (addr < end)) 1083 if ((addr >= start) && (addr < end))
1085 return 1; 1084 return 1;
@@ -1174,11 +1173,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1174 * itself, so actual lookup of the hash should be once per lock object. 1173 * itself, so actual lookup of the hash should be once per lock object.
1175 */ 1174 */
1176static inline struct lock_class * 1175static inline struct lock_class *
1177register_lock_class(struct lockdep_map *lock, unsigned int subclass) 1176register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1178{ 1177{
1179 struct lockdep_subclass_key *key; 1178 struct lockdep_subclass_key *key;
1180 struct list_head *hash_head; 1179 struct list_head *hash_head;
1181 struct lock_class *class; 1180 struct lock_class *class;
1181 unsigned long flags;
1182 1182
1183 class = look_up_lock_class(lock, subclass); 1183 class = look_up_lock_class(lock, subclass);
1184 if (likely(class)) 1184 if (likely(class))
@@ -1200,6 +1200,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1200 key = lock->key->subkeys + subclass; 1200 key = lock->key->subkeys + subclass;
1201 hash_head = classhashentry(key); 1201 hash_head = classhashentry(key);
1202 1202
1203 raw_local_irq_save(flags);
1203 __raw_spin_lock(&hash_lock); 1204 __raw_spin_lock(&hash_lock);
1204 /* 1205 /*
1205 * We have to do the hash-walk again, to avoid races 1206 * We have to do the hash-walk again, to avoid races
@@ -1214,6 +1215,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1214 */ 1215 */
1215 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { 1216 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1216 __raw_spin_unlock(&hash_lock); 1217 __raw_spin_unlock(&hash_lock);
1218 raw_local_irq_restore(flags);
1217 debug_locks_off(); 1219 debug_locks_off();
1218 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 1220 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1219 printk("turning off the locking correctness validator.\n"); 1221 printk("turning off the locking correctness validator.\n");
@@ -1236,17 +1238,20 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1236 1238
1237 if (verbose(class)) { 1239 if (verbose(class)) {
1238 __raw_spin_unlock(&hash_lock); 1240 __raw_spin_unlock(&hash_lock);
1241 raw_local_irq_restore(flags);
1239 printk("\nnew class %p: %s", class->key, class->name); 1242 printk("\nnew class %p: %s", class->key, class->name);
1240 if (class->name_version > 1) 1243 if (class->name_version > 1)
1241 printk("#%d", class->name_version); 1244 printk("#%d", class->name_version);
1242 printk("\n"); 1245 printk("\n");
1243 dump_stack(); 1246 dump_stack();
1247 raw_local_irq_save(flags);
1244 __raw_spin_lock(&hash_lock); 1248 __raw_spin_lock(&hash_lock);
1245 } 1249 }
1246out_unlock_set: 1250out_unlock_set:
1247 __raw_spin_unlock(&hash_lock); 1251 __raw_spin_unlock(&hash_lock);
1252 raw_local_irq_restore(flags);
1248 1253
1249 if (!subclass) 1254 if (!subclass || force)
1250 lock->class_cache = class; 1255 lock->class_cache = class;
1251 1256
1252 DEBUG_LOCKS_WARN_ON(class->subclass != subclass); 1257 DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
@@ -1725,6 +1730,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1725 debug_atomic_dec(&nr_unused_locks); 1730 debug_atomic_dec(&nr_unused_locks);
1726 break; 1731 break;
1727 default: 1732 default:
1733 __raw_spin_unlock(&hash_lock);
1728 debug_locks_off(); 1734 debug_locks_off();
1729 WARN_ON(1); 1735 WARN_ON(1);
1730 return 0; 1736 return 0;
@@ -1934,7 +1940,7 @@ void trace_softirqs_off(unsigned long ip)
1934 * Initialize a lock instance's lock-class mapping info: 1940 * Initialize a lock instance's lock-class mapping info:
1935 */ 1941 */
1936void lockdep_init_map(struct lockdep_map *lock, const char *name, 1942void lockdep_init_map(struct lockdep_map *lock, const char *name,
1937 struct lock_class_key *key) 1943 struct lock_class_key *key, int subclass)
1938{ 1944{
1939 if (unlikely(!debug_locks)) 1945 if (unlikely(!debug_locks))
1940 return; 1946 return;
@@ -1954,6 +1960,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
1954 lock->name = name; 1960 lock->name = name;
1955 lock->key = key; 1961 lock->key = key;
1956 lock->class_cache = NULL; 1962 lock->class_cache = NULL;
1963 if (subclass)
1964 register_lock_class(lock, subclass, 1);
1957} 1965}
1958 1966
1959EXPORT_SYMBOL_GPL(lockdep_init_map); 1967EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -1992,7 +2000,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
1992 * Not cached yet or subclass? 2000 * Not cached yet or subclass?
1993 */ 2001 */
1994 if (unlikely(!class)) { 2002 if (unlikely(!class)) {
1995 class = register_lock_class(lock, subclass); 2003 class = register_lock_class(lock, subclass, 0);
1996 if (!class) 2004 if (!class)
1997 return 0; 2005 return 0;
1998 } 2006 }
@@ -2640,6 +2648,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2640 } 2648 }
2641 local_irq_restore(flags); 2649 local_irq_restore(flags);
2642} 2650}
2651EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
2643 2652
2644static void print_held_locks_bug(struct task_struct *curr) 2653static void print_held_locks_bug(struct task_struct *curr)
2645{ 2654{
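
Note on the lockdep.c hunks: every __raw_spin_lock(&hash_lock) is now paired with a raw_local_irq_save(), and every exit path, including the MAX_LOCKDEP_KEYS error return and the verbose-print window, restores the saved flags in reverse order. A minimal userspace sketch of that unlock-on-every-exit discipline, with invented names and a pthread mutex standing in for the irq-save plus raw spinlock pair:

#include <pthread.h>
#include <stdio.h>

#define MAX_ENTRIES 8

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static int table[MAX_ENTRIES];
static int nr_entries;

int register_entry(int value)
{
    pthread_mutex_lock(&hash_lock);     /* kernel: raw_local_irq_save() + __raw_spin_lock() */

    if (nr_entries >= MAX_ENTRIES) {
        /* error path: release before reporting, mirroring the
         * raw_local_irq_restore() added ahead of the printk()s */
        pthread_mutex_unlock(&hash_lock);
        fprintf(stderr, "table full, refusing new entry\n");
        return -1;
    }

    table[nr_entries++] = value;

    pthread_mutex_unlock(&hash_lock);   /* kernel: unlock, then restore, in reverse order */
    return 0;
}
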
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb2..8ce09bc4613d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
20#define MAX_LOCKDEP_KEYS_BITS 11 20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) 21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22 22
23#define MAX_LOCKDEP_CHAINS_BITS 13 23#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 25
26/* 26/*
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3fa..b554b40a4aa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
113 return 0; 113 return 0;
114} 114}
115 115
116static struct seq_operations lockdep_ops = { 116static const struct seq_operations lockdep_ops = {
117 .start = l_start, 117 .start = l_start,
118 .next = l_next, 118 .next = l_next,
119 .stop = l_stop, 119 .stop = l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
135 return res; 135 return res;
136} 136}
137 137
138static struct file_operations proc_lockdep_operations = { 138static const struct file_operations proc_lockdep_operations = {
139 .open = lockdep_open, 139 .open = lockdep_open,
140 .read = seq_read, 140 .read = seq_read,
141 .llseek = seq_lseek, 141 .llseek = seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
319 return single_open(file, lockdep_stats_show, NULL); 319 return single_open(file, lockdep_stats_show, NULL);
320} 320}
321 321
322static struct file_operations proc_lockdep_stats_operations = { 322static const struct file_operations proc_lockdep_stats_operations = {
323 .open = lockdep_stats_open, 323 .open = lockdep_stats_open,
324 .read = seq_read, 324 .read = seq_read,
325 .llseek = seq_lseek, 325 .llseek = seq_lseek,
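
The lockdep_proc.c hunks only add const to the seq_operations and file_operations tables, which are plain structs of function pointers that are never written after initialisation. A generic C illustration of why the qualifier is cheap to add (this is not the seq_file API itself):

#include <stdio.h>

struct ops {
    void (*start)(void);
    void (*stop)(void);
};

static void my_start(void) { puts("start"); }
static void my_stop(void)  { puts("stop"); }

/* const: the table can live in read-only data and cannot be
 * overwritten at run time by a stray write */
static const struct ops my_ops = {
    .start = my_start,
    .stop  = my_stop,
};

int main(void)
{
    my_ops.start();
    my_ops.stop();
    return 0;
}
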
diff --git a/kernel/module.c b/kernel/module.c
index 67009bd56c52..d9eae45d0145 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -34,10 +34,10 @@
34#include <linux/err.h> 34#include <linux/err.h>
35#include <linux/vermagic.h> 35#include <linux/vermagic.h>
36#include <linux/notifier.h> 36#include <linux/notifier.h>
37#include <linux/sched.h>
37#include <linux/stop_machine.h> 38#include <linux/stop_machine.h>
38#include <linux/device.h> 39#include <linux/device.h>
39#include <linux/string.h> 40#include <linux/string.h>
40#include <linux/sched.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/unwind.h> 42#include <linux/unwind.h>
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -790,6 +790,19 @@ static struct module_attribute refcnt = {
790 .show = show_refcnt, 790 .show = show_refcnt,
791}; 791};
792 792
793void module_put(struct module *module)
794{
795 if (module) {
796 unsigned int cpu = get_cpu();
797 local_dec(&module->ref[cpu].count);
798 /* Maybe they're waiting for us to drop reference? */
799 if (unlikely(!module_is_live(module)))
800 wake_up_process(module->waiter);
801 put_cpu();
802 }
803}
804EXPORT_SYMBOL(module_put);
805
793#else /* !CONFIG_MODULE_UNLOAD */ 806#else /* !CONFIG_MODULE_UNLOAD */
794static void print_unload_info(struct seq_file *m, struct module *mod) 807static void print_unload_info(struct seq_file *m, struct module *mod)
795{ 808{
@@ -1086,22 +1099,35 @@ static int mod_sysfs_setup(struct module *mod,
1086 goto out; 1099 goto out;
1087 kobj_set_kset_s(&mod->mkobj, module_subsys); 1100 kobj_set_kset_s(&mod->mkobj, module_subsys);
1088 mod->mkobj.mod = mod; 1101 mod->mkobj.mod = mod;
1089 err = kobject_register(&mod->mkobj.kobj); 1102
1103 /* delay uevent until full sysfs population */
1104 kobject_init(&mod->mkobj.kobj);
1105 err = kobject_add(&mod->mkobj.kobj);
1090 if (err) 1106 if (err)
1091 goto out; 1107 goto out;
1092 1108
1109 mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers");
1110 if (!mod->drivers_dir)
1111 goto out_unreg;
1112
1093 err = module_param_sysfs_setup(mod, kparam, num_params); 1113 err = module_param_sysfs_setup(mod, kparam, num_params);
1094 if (err) 1114 if (err)
1095 goto out_unreg; 1115 goto out_unreg_drivers;
1096 1116
1097 err = module_add_modinfo_attrs(mod); 1117 err = module_add_modinfo_attrs(mod);
1098 if (err) 1118 if (err)
1099 goto out_unreg; 1119 goto out_unreg_param;
1100 1120
1121 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1101 return 0; 1122 return 0;
1102 1123
1124out_unreg_drivers:
1125 kobject_unregister(mod->drivers_dir);
1126out_unreg_param:
1127 module_param_sysfs_remove(mod);
1103out_unreg: 1128out_unreg:
1104 kobject_unregister(&mod->mkobj.kobj); 1129 kobject_del(&mod->mkobj.kobj);
1130 kobject_put(&mod->mkobj.kobj);
1105out: 1131out:
1106 return err; 1132 return err;
1107} 1133}
@@ -1110,6 +1136,7 @@ static void mod_kobject_remove(struct module *mod)
1110{ 1136{
1111 module_remove_modinfo_attrs(mod); 1137 module_remove_modinfo_attrs(mod);
1112 module_param_sysfs_remove(mod); 1138 module_param_sysfs_remove(mod);
1139 kobject_unregister(mod->drivers_dir);
1113 1140
1114 kobject_unregister(&mod->mkobj.kobj); 1141 kobject_unregister(&mod->mkobj.kobj);
1115} 1142}
@@ -1342,7 +1369,7 @@ static void set_license(struct module *mod, const char *license)
1342 1369
1343 if (!license_is_gpl_compatible(license)) { 1370 if (!license_is_gpl_compatible(license)) {
1344 if (!(tainted & TAINT_PROPRIETARY_MODULE)) 1371 if (!(tainted & TAINT_PROPRIETARY_MODULE))
1345 printk(KERN_WARNING "%s: module license '%s' taints" 1372 printk(KERN_WARNING "%s: module license '%s' taints "
1346 "kernel.\n", mod->name, license); 1373 "kernel.\n", mod->name, license);
1347 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1374 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1348 } 1375 }
@@ -1718,7 +1745,7 @@ static struct module *load_module(void __user *umod,
1718 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1745 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1719 1746
1720 if (strcmp(mod->name, "ndiswrapper") == 0) 1747 if (strcmp(mod->name, "ndiswrapper") == 0)
1721 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1748 add_taint(TAINT_PROPRIETARY_MODULE);
1722 if (strcmp(mod->name, "driverloader") == 0) 1749 if (strcmp(mod->name, "driverloader") == 0)
1723 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1750 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1724 1751
@@ -2182,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p)
2182 Where refcount is a number or -, and deps is a comma-separated list 2209 Where refcount is a number or -, and deps is a comma-separated list
2183 of depends or -. 2210 of depends or -.
2184*/ 2211*/
2185struct seq_operations modules_op = { 2212const struct seq_operations modules_op = {
2186 .start = m_start, 2213 .start = m_start,
2187 .next = m_next, 2214 .next = m_next,
2188 .stop = m_stop, 2215 .stop = m_stop,
@@ -2275,11 +2302,14 @@ void print_modules(void)
2275 2302
2276void module_add_driver(struct module *mod, struct device_driver *drv) 2303void module_add_driver(struct module *mod, struct device_driver *drv)
2277{ 2304{
2305 int no_warn;
2306
2278 if (!mod || !drv) 2307 if (!mod || !drv)
2279 return; 2308 return;
2280 2309
2281 /* Don't check return code; this call is idempotent */ 2310 /* Don't check return codes; these calls are idempotent */
2282 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); 2311 no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
2312 no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name);
2283} 2313}
2284EXPORT_SYMBOL(module_add_driver); 2314EXPORT_SYMBOL(module_add_driver);
2285 2315
@@ -2288,6 +2318,8 @@ void module_remove_driver(struct device_driver *drv)
2288 if (!drv) 2318 if (!drv)
2289 return; 2319 return;
2290 sysfs_remove_link(&drv->kobj, "module"); 2320 sysfs_remove_link(&drv->kobj, "module");
2321 if (drv->owner && drv->owner->drivers_dir)
2322 sysfs_remove_link(drv->owner->drivers_dir, drv->name);
2291} 2323}
2292EXPORT_SYMBOL(module_remove_driver); 2324EXPORT_SYMBOL(module_remove_driver);
2293 2325
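
The mod_sysfs_setup() rework above delays the KOBJ_ADD uevent until sysfs is fully populated and grows the error path into a goto ladder (out_unreg_drivers, out_unreg_param, out_unreg), so each failure undoes exactly the steps that already succeeded, newest first. A rough, self-contained sketch of that pattern with made-up resources:

#include <stdio.h>
#include <stdlib.h>

static void *step(const char *what)
{
    void *p = malloc(16);       /* stands in for one registration step */

    if (p)
        printf("set up %s\n", what);
    return p;
}

int setup_all(void)
{
    void *kobj, *dir, *params;

    kobj = step("kobject");
    if (!kobj)
        goto out;
    dir = step("drivers dir");
    if (!dir)
        goto out_kobj;
    params = step("params");
    if (!params)
        goto out_dir;

    return 0;           /* success: everything stays registered */

out_dir:
    free(dir);          /* undo only what already succeeded, in reverse order */
out_kobj:
    free(kobj);
out:
    return -1;
}
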
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index e3203c654dda..841539d72c55 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
77 77
78void debug_mutex_unlock(struct mutex *lock) 78void debug_mutex_unlock(struct mutex *lock)
79{ 79{
80 if (unlikely(!debug_locks))
81 return;
82
80 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
81 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 84 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 85 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
@@ -91,7 +94,7 @@ void debug_mutex_init(struct mutex *lock, const char *name,
91 * Make sure we are not reinitializing a held lock: 94 * Make sure we are not reinitializing a held lock:
92 */ 95 */
93 debug_check_no_locks_freed((void *)lock, sizeof(*lock)); 96 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
94 lockdep_init_map(&lock->dep_map, name, key); 97 lockdep_init_map(&lock->dep_map, name, key, 0);
95#endif 98#endif
96 lock->owner = NULL; 99 lock->owner = NULL;
97 lock->magic = lock; 100 lock->magic = lock;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a497..e7cbbb82765b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
206} 206}
207 207
208EXPORT_SYMBOL_GPL(mutex_lock_nested); 208EXPORT_SYMBOL_GPL(mutex_lock_nested);
209
210int __sched
211mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
212{
213 might_sleep();
214 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
215}
216
217EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
209#endif 218#endif
210 219
211/* 220/*
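
mutex_lock_interruptible_nested() combines the existing subclass annotation with an interruptible sleep, so a pending signal can abort the wait instead of blocking forever. As a loose userspace analogue only (the kernel version sleeps on a wait queue rather than polling), a lock helper that gives up with -EINTR once an interruption flag is raised:

#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <unistd.h>

/* set from a signal handler elsewhere; stands in for signal_pending() */
static volatile sig_atomic_t interrupted;

int lock_interruptible(pthread_mutex_t *lock)
{
    while (pthread_mutex_trylock(lock) != 0) {
        if (interrupted)
            return -EINTR;      /* back out instead of sleeping forever */
        usleep(1000);           /* crude back-off; the kernel uses a wait queue */
    }
    return 0;                   /* lock acquired */
}
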
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 6ebdb82a0ce4..e2ce748e96af 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,8 +17,9 @@
17#include <linux/version.h> 17#include <linux/version.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/namespace.h> 20#include <linux/mnt_namespace.h>
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h>
22 23
23struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 24struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
24 25
@@ -44,10 +45,10 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
44{ 45{
45 struct nsproxy *ns; 46 struct nsproxy *ns;
46 47
47 ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); 48 ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
48 if (ns) { 49 if (ns) {
49 memcpy(ns, orig, sizeof(struct nsproxy));
50 atomic_set(&ns->count, 1); 50 atomic_set(&ns->count, 1);
51 ns->id = -1;
51 } 52 }
52 return ns; 53 return ns;
53} 54}
@@ -62,12 +63,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
62 struct nsproxy *ns = clone_namespaces(orig); 63 struct nsproxy *ns = clone_namespaces(orig);
63 64
64 if (ns) { 65 if (ns) {
65 if (ns->namespace) 66 if (ns->mnt_ns)
66 get_namespace(ns->namespace); 67 get_mnt_ns(ns->mnt_ns);
67 if (ns->uts_ns) 68 if (ns->uts_ns)
68 get_uts_ns(ns->uts_ns); 69 get_uts_ns(ns->uts_ns);
69 if (ns->ipc_ns) 70 if (ns->ipc_ns)
70 get_ipc_ns(ns->ipc_ns); 71 get_ipc_ns(ns->ipc_ns);
72 if (ns->pid_ns)
73 get_pid_ns(ns->pid_ns);
71 } 74 }
72 75
73 return ns; 76 return ns;
@@ -99,7 +102,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
99 102
100 tsk->nsproxy = new_ns; 103 tsk->nsproxy = new_ns;
101 104
102 err = copy_namespace(flags, tsk); 105 err = copy_mnt_ns(flags, tsk);
103 if (err) 106 if (err)
104 goto out_ns; 107 goto out_ns;
105 108
@@ -111,16 +114,23 @@ int copy_namespaces(int flags, struct task_struct *tsk)
111 if (err) 114 if (err)
112 goto out_ipc; 115 goto out_ipc;
113 116
117 err = copy_pid_ns(flags, tsk);
118 if (err)
119 goto out_pid;
120
114out: 121out:
115 put_nsproxy(old_ns); 122 put_nsproxy(old_ns);
116 return err; 123 return err;
117 124
125out_pid:
126 if (new_ns->ipc_ns)
127 put_ipc_ns(new_ns->ipc_ns);
118out_ipc: 128out_ipc:
119 if (new_ns->uts_ns) 129 if (new_ns->uts_ns)
120 put_uts_ns(new_ns->uts_ns); 130 put_uts_ns(new_ns->uts_ns);
121out_uts: 131out_uts:
122 if (new_ns->namespace) 132 if (new_ns->mnt_ns)
123 put_namespace(new_ns->namespace); 133 put_mnt_ns(new_ns->mnt_ns);
124out_ns: 134out_ns:
125 tsk->nsproxy = old_ns; 135 tsk->nsproxy = old_ns;
126 kfree(new_ns); 136 kfree(new_ns);
@@ -129,11 +139,13 @@ out_ns:
129 139
130void free_nsproxy(struct nsproxy *ns) 140void free_nsproxy(struct nsproxy *ns)
131{ 141{
132 if (ns->namespace) 142 if (ns->mnt_ns)
133 put_namespace(ns->namespace); 143 put_mnt_ns(ns->mnt_ns);
134 if (ns->uts_ns) 144 if (ns->uts_ns)
135 put_uts_ns(ns->uts_ns); 145 put_uts_ns(ns->uts_ns);
136 if (ns->ipc_ns) 146 if (ns->ipc_ns)
137 put_ipc_ns(ns->ipc_ns); 147 put_ipc_ns(ns->ipc_ns);
138 kfree(ns); 148 if (ns->pid_ns)
149 put_pid_ns(ns->pid_ns);
150 kfree(ns);
139} 151}
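
In clone_namespaces() the open-coded kmalloc() plus memcpy() becomes a single kmemdup() call, after which only the fields that must differ in the copy (the reference count and id) are touched. The same helper and flow in plain userspace C, with an invented struct standing in for nsproxy:

#include <stdlib.h>
#include <string.h>

static void *memdup(const void *src, size_t len)
{
    void *p = malloc(len);

    if (p)
        memcpy(p, src, len);
    return p;
}

struct proxy { int count; int id; };

struct proxy *clone_proxy(const struct proxy *orig)
{
    struct proxy *ns = memdup(orig, sizeof(*ns));

    if (ns) {
        ns->count = 1;      /* fresh reference count for the copy */
        ns->id = -1;        /* not yet assigned */
    }
    return ns;
}
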
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f9..2efe9d8d367b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,12 +26,12 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/hash.h> 28#include <linux/hash.h>
29#include <linux/pspace.h> 29#include <linux/pid_namespace.h>
30 30
31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
32static struct hlist_head *pid_hash; 32static struct hlist_head *pid_hash;
33static int pidhash_shift; 33static int pidhash_shift;
34static kmem_cache_t *pid_cachep; 34static struct kmem_cache *pid_cachep;
35 35
36int pid_max = PID_MAX_DEFAULT; 36int pid_max = PID_MAX_DEFAULT;
37 37
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT;
43#define BITS_PER_PAGE (PAGE_SIZE*8) 43#define BITS_PER_PAGE (PAGE_SIZE*8)
44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
45 45
46static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) 46static inline int mk_pid(struct pid_namespace *pid_ns,
47 struct pidmap *map, int off)
47{ 48{
48 return (map - pspace->pidmap)*BITS_PER_PAGE + off; 49 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
49} 50}
50 51
51#define find_next_offset(map, off) \ 52#define find_next_offset(map, off) \
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
57 * value does not cause lots of bitmaps to be allocated, but 58 * value does not cause lots of bitmaps to be allocated, but
58 * the scheme scales to up to 4 million PIDs, runtime. 59 * the scheme scales to up to 4 million PIDs, runtime.
59 */ 60 */
60struct pspace init_pspace = { 61struct pid_namespace init_pid_ns = {
62 .kref = {
63 .refcount = ATOMIC_INIT(2),
64 },
61 .pidmap = { 65 .pidmap = {
62 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 66 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
63 }, 67 },
64 .last_pid = 0 68 .last_pid = 0,
69 .child_reaper = &init_task
65}; 70};
66 71
67/* 72/*
@@ -80,25 +85,25 @@ struct pspace init_pspace = {
80 85
81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 86static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
82 87
83static fastcall void free_pidmap(struct pspace *pspace, int pid) 88static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
84{ 89{
85 struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; 90 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
86 int offset = pid & BITS_PER_PAGE_MASK; 91 int offset = pid & BITS_PER_PAGE_MASK;
87 92
88 clear_bit(offset, map->page); 93 clear_bit(offset, map->page);
89 atomic_inc(&map->nr_free); 94 atomic_inc(&map->nr_free);
90} 95}
91 96
92static int alloc_pidmap(struct pspace *pspace) 97static int alloc_pidmap(struct pid_namespace *pid_ns)
93{ 98{
94 int i, offset, max_scan, pid, last = pspace->last_pid; 99 int i, offset, max_scan, pid, last = pid_ns->last_pid;
95 struct pidmap *map; 100 struct pidmap *map;
96 101
97 pid = last + 1; 102 pid = last + 1;
98 if (pid >= pid_max) 103 if (pid >= pid_max)
99 pid = RESERVED_PIDS; 104 pid = RESERVED_PIDS;
100 offset = pid & BITS_PER_PAGE_MASK; 105 offset = pid & BITS_PER_PAGE_MASK;
101 map = &pspace->pidmap[pid/BITS_PER_PAGE]; 106 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
102 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 107 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
103 for (i = 0; i <= max_scan; ++i) { 108 for (i = 0; i <= max_scan; ++i) {
104 if (unlikely(!map->page)) { 109 if (unlikely(!map->page)) {
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace)
120 do { 125 do {
121 if (!test_and_set_bit(offset, map->page)) { 126 if (!test_and_set_bit(offset, map->page)) {
122 atomic_dec(&map->nr_free); 127 atomic_dec(&map->nr_free);
123 pspace->last_pid = pid; 128 pid_ns->last_pid = pid;
124 return pid; 129 return pid;
125 } 130 }
126 offset = find_next_offset(map, offset); 131 offset = find_next_offset(map, offset);
127 pid = mk_pid(pspace, map, offset); 132 pid = mk_pid(pid_ns, map, offset);
128 /* 133 /*
129 * find_next_offset() found a bit, the pid from it 134 * find_next_offset() found a bit, the pid from it
130 * is in-bounds, and if we fell back to the last 135 * is in-bounds, and if we fell back to the last
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace)
135 (i != max_scan || pid < last || 140 (i != max_scan || pid < last ||
136 !((last+1) & BITS_PER_PAGE_MASK))); 141 !((last+1) & BITS_PER_PAGE_MASK)));
137 } 142 }
138 if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 143 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
139 ++map; 144 ++map;
140 offset = 0; 145 offset = 0;
141 } else { 146 } else {
142 map = &pspace->pidmap[0]; 147 map = &pid_ns->pidmap[0];
143 offset = RESERVED_PIDS; 148 offset = RESERVED_PIDS;
144 if (unlikely(last == offset)) 149 if (unlikely(last == offset))
145 break; 150 break;
146 } 151 }
147 pid = mk_pid(pspace, map, offset); 152 pid = mk_pid(pid_ns, map, offset);
148 } 153 }
149 return -1; 154 return -1;
150} 155}
151 156
152static int next_pidmap(struct pspace *pspace, int last) 157static int next_pidmap(struct pid_namespace *pid_ns, int last)
153{ 158{
154 int offset; 159 int offset;
155 struct pidmap *map, *end; 160 struct pidmap *map, *end;
156 161
157 offset = (last + 1) & BITS_PER_PAGE_MASK; 162 offset = (last + 1) & BITS_PER_PAGE_MASK;
158 map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; 163 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
159 end = &pspace->pidmap[PIDMAP_ENTRIES]; 164 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
160 for (; map < end; map++, offset = 0) { 165 for (; map < end; map++, offset = 0) {
161 if (unlikely(!map->page)) 166 if (unlikely(!map->page))
162 continue; 167 continue;
163 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); 168 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
164 if (offset < BITS_PER_PAGE) 169 if (offset < BITS_PER_PAGE)
165 return mk_pid(pspace, map, offset); 170 return mk_pid(pid_ns, map, offset);
166 } 171 }
167 return -1; 172 return -1;
168} 173}
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid)
192 hlist_del_rcu(&pid->pid_chain); 197 hlist_del_rcu(&pid->pid_chain);
193 spin_unlock_irqrestore(&pidmap_lock, flags); 198 spin_unlock_irqrestore(&pidmap_lock, flags);
194 199
195 free_pidmap(&init_pspace, pid->nr); 200 free_pidmap(current->nsproxy->pid_ns, pid->nr);
196 call_rcu(&pid->rcu, delayed_put_pid); 201 call_rcu(&pid->rcu, delayed_put_pid);
197} 202}
198 203
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void)
206 if (!pid) 211 if (!pid)
207 goto out; 212 goto out;
208 213
209 nr = alloc_pidmap(&init_pspace); 214 nr = alloc_pidmap(current->nsproxy->pid_ns);
210 if (nr < 0) 215 if (nr < 0)
211 goto out_free; 216 goto out_free;
212 217
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr)
348 pid = find_pid(nr); 353 pid = find_pid(nr);
349 if (pid) 354 if (pid)
350 break; 355 break;
351 nr = next_pidmap(&init_pspace, nr); 356 nr = next_pidmap(current->nsproxy->pid_ns, nr);
352 } while (nr > 0); 357 } while (nr > 0);
353 358
354 return pid; 359 return pid;
355} 360}
356EXPORT_SYMBOL_GPL(find_get_pid); 361EXPORT_SYMBOL_GPL(find_get_pid);
357 362
363int copy_pid_ns(int flags, struct task_struct *tsk)
364{
365 struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
366 int err = 0;
367
368 if (!old_ns)
369 return 0;
370
371 get_pid_ns(old_ns);
372 return err;
373}
374
375void free_pid_ns(struct kref *kref)
376{
377 struct pid_namespace *ns;
378
379 ns = container_of(kref, struct pid_namespace, kref);
380 kfree(ns);
381}
382
358/* 383/*
359 * The pid hash table is scaled according to the amount of memory in the 384 * The pid hash table is scaled according to the amount of memory in the
360 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 385 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -382,10 +407,10 @@ void __init pidhash_init(void)
382 407
383void __init pidmap_init(void) 408void __init pidmap_init(void)
384{ 409{
385 init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 410 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
386 /* Reserve PID 0. We never call free_pidmap(0) */ 411 /* Reserve PID 0. We never call free_pidmap(0) */
387 set_bit(0, init_pspace.pidmap[0].page); 412 set_bit(0, init_pid_ns.pidmap[0].page);
388 atomic_dec(&init_pspace.pidmap[0].nr_free); 413 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
389 414
390 pid_cachep = kmem_cache_create("pid", sizeof(struct pid), 415 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
391 __alignof__(struct pid), 416 __alignof__(struct pid),
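
alloc_pidmap() and mk_pid() above treat each pidmap page as a bitmap: an id is the page index times BITS_PER_PAGE plus the bit offset, and allocation is a scan for a clear bit. A single-threaded toy version of that idea (the kernel uses atomic test_and_set_bit() and per-page free counters, omitted here; sizes are illustrative):

#include <limits.h>

#define BITS_PER_PAGE   4096
#define PAGES           4
#define BITS_PER_WORD   (CHAR_BIT * sizeof(unsigned long))

static unsigned long bitmap[PAGES][BITS_PER_PAGE / BITS_PER_WORD];

static int test_and_set(unsigned long *map, int bit)
{
    unsigned long mask = 1UL << (bit % BITS_PER_WORD);
    unsigned long *word = &map[bit / BITS_PER_WORD];
    int was_set = (*word & mask) != 0;

    *word |= mask;
    return was_set;
}

/* id = page index * BITS_PER_PAGE + bit offset, as in mk_pid() */
int alloc_id(void)
{
    for (int page = 0; page < PAGES; page++)
        for (int off = 0; off < BITS_PER_PAGE; off++)
            if (!test_and_set(bitmap[page], off))
                return page * BITS_PER_PAGE + off;
    return -1;      /* all ids exhausted */
}
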
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 479b16b44f79..7c3e1e6dfb5b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -88,6 +88,19 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
88} 88}
89 89
90/* 90/*
91 * Divide and limit the result to res >= 1
92 *
93 * This is necessary to prevent signal delivery starvation, when the result of
94 * the division would be rounded down to 0.
95 */
96static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
97{
98 cputime_t res = cputime_div(time, div);
99
100 return max_t(cputime_t, res, 1);
101}
102
103/*
91 * Update expiry time from increment, and increase overrun count, 104 * Update expiry time from increment, and increase overrun count,
92 * given the current clock sample. 105 * given the current clock sample.
93 */ 106 */
@@ -483,8 +496,8 @@ static void process_timer_rebalance(struct task_struct *p,
483 BUG(); 496 BUG();
484 break; 497 break;
485 case CPUCLOCK_PROF: 498 case CPUCLOCK_PROF:
486 left = cputime_div(cputime_sub(expires.cpu, val.cpu), 499 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
487 nthreads); 500 nthreads);
488 do { 501 do {
489 if (likely(!(t->flags & PF_EXITING))) { 502 if (likely(!(t->flags & PF_EXITING))) {
490 ticks = cputime_add(prof_ticks(t), left); 503 ticks = cputime_add(prof_ticks(t), left);
@@ -498,8 +511,8 @@ static void process_timer_rebalance(struct task_struct *p,
498 } while (t != p); 511 } while (t != p);
499 break; 512 break;
500 case CPUCLOCK_VIRT: 513 case CPUCLOCK_VIRT:
501 left = cputime_div(cputime_sub(expires.cpu, val.cpu), 514 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
502 nthreads); 515 nthreads);
503 do { 516 do {
504 if (likely(!(t->flags & PF_EXITING))) { 517 if (likely(!(t->flags & PF_EXITING))) {
505 ticks = cputime_add(virt_ticks(t), left); 518 ticks = cputime_add(virt_ticks(t), left);
@@ -515,6 +528,7 @@ static void process_timer_rebalance(struct task_struct *p,
515 case CPUCLOCK_SCHED: 528 case CPUCLOCK_SCHED:
516 nsleft = expires.sched - val.sched; 529 nsleft = expires.sched - val.sched;
517 do_div(nsleft, nthreads); 530 do_div(nsleft, nthreads);
531 nsleft = max_t(unsigned long long, nsleft, 1);
518 do { 532 do {
519 if (likely(!(t->flags & PF_EXITING))) { 533 if (likely(!(t->flags & PF_EXITING))) {
520 ns = t->sched_time + nsleft; 534 ns = t->sched_time + nsleft;
@@ -1159,12 +1173,13 @@ static void check_process_timers(struct task_struct *tsk,
1159 1173
1160 prof_left = cputime_sub(prof_expires, utime); 1174 prof_left = cputime_sub(prof_expires, utime);
1161 prof_left = cputime_sub(prof_left, stime); 1175 prof_left = cputime_sub(prof_left, stime);
1162 prof_left = cputime_div(prof_left, nthreads); 1176 prof_left = cputime_div_non_zero(prof_left, nthreads);
1163 virt_left = cputime_sub(virt_expires, utime); 1177 virt_left = cputime_sub(virt_expires, utime);
1164 virt_left = cputime_div(virt_left, nthreads); 1178 virt_left = cputime_div_non_zero(virt_left, nthreads);
1165 if (sched_expires) { 1179 if (sched_expires) {
1166 sched_left = sched_expires - sched_time; 1180 sched_left = sched_expires - sched_time;
1167 do_div(sched_left, nthreads); 1181 do_div(sched_left, nthreads);
1182 sched_left = max_t(unsigned long long, sched_left, 1);
1168 } else { 1183 } else {
1169 sched_left = 0; 1184 sched_left = 0;
1170 } 1185 }
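
cputime_div_non_zero() is just an integer division clamped so the per-thread share can never round down to zero, which, per the comment in the hunk, would otherwise starve signal delivery. In plain C:

unsigned long long div_non_zero(unsigned long long time, unsigned long div)
{
    unsigned long long res = time / div;

    return res > 0 ? res : 1;   /* never hand out a zero share */
}

Splitting, say, 3 remaining ticks across 8 threads gives 0 with plain division but 1 here, so each thread's expiry still moves forward.
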
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06f..5fe87de10ff0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
70/* 70/*
71 * Lets keep our timers in a slab cache :-) 71 * Lets keep our timers in a slab cache :-)
72 */ 72 */
73static kmem_cache_t *posix_timers_cache; 73static struct kmem_cache *posix_timers_cache;
74static struct idr posix_timers_id; 74static struct idr posix_timers_id;
75static DEFINE_SPINLOCK(idr_lock); 75static DEFINE_SPINLOCK(idr_lock);
76 76
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca3479..710ed084e7c5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED
78 78
79config SOFTWARE_SUSPEND 79config SOFTWARE_SUSPEND
80 bool "Software Suspend" 80 bool "Software Suspend"
81 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) 81 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
82 ---help--- 82 ---help---
83 Enable the possibility of suspending the machine. 83 Enable the possibility of suspending the machine.
84 It doesn't need ACPI or APM. 84 It doesn't need ACPI or APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d3a158a60312..0b00f56c2ad0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -27,6 +28,23 @@
27static int noresume = 0; 28static int noresume = 0;
28char resume_file[256] = CONFIG_PM_STD_PARTITION; 29char resume_file[256] = CONFIG_PM_STD_PARTITION;
29dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block;
32
33/**
34 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails
36 */
37
38static inline int platform_prepare(void)
39{
40 int error = 0;
41
42 if (pm_disk_mode == PM_DISK_PLATFORM) {
43 if (pm_ops && pm_ops->prepare)
44 error = pm_ops->prepare(PM_SUSPEND_DISK);
45 }
46 return error;
47}
30 48
31/** 49/**
32 * power_down - Shut machine down for hibernate. 50 * power_down - Shut machine down for hibernate.
@@ -40,12 +58,10 @@ dev_t swsusp_resume_device;
40 58
41static void power_down(suspend_disk_method_t mode) 59static void power_down(suspend_disk_method_t mode)
42{ 60{
43 int error = 0;
44
45 switch(mode) { 61 switch(mode) {
46 case PM_DISK_PLATFORM: 62 case PM_DISK_PLATFORM:
47 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 63 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
48 error = pm_ops->enter(PM_SUSPEND_DISK); 64 pm_ops->enter(PM_SUSPEND_DISK);
49 break; 65 break;
50 case PM_DISK_SHUTDOWN: 66 case PM_DISK_SHUTDOWN:
51 kernel_power_off(); 67 kernel_power_off();
@@ -71,7 +87,7 @@ static inline void platform_finish(void)
71 87
72static int prepare_processes(void) 88static int prepare_processes(void)
73{ 89{
74 int error; 90 int error = 0;
75 91
76 pm_prepare_console(); 92 pm_prepare_console();
77 93
@@ -84,12 +100,24 @@ static int prepare_processes(void)
84 goto thaw; 100 goto thaw;
85 } 101 }
86 102
103 if (pm_disk_mode == PM_DISK_TESTPROC) {
104 printk("swsusp debug: Waiting for 5 seconds.\n");
105 mdelay(5000);
106 goto thaw;
107 }
108
109 error = platform_prepare();
110 if (error)
111 goto thaw;
112
87 /* Free memory before shutting down devices. */ 113 /* Free memory before shutting down devices. */
88 if (!(error = swsusp_shrink_memory())) 114 if (!(error = swsusp_shrink_memory()))
89 return 0; 115 return 0;
90thaw: 116
117 platform_finish();
118 thaw:
91 thaw_processes(); 119 thaw_processes();
92enable_cpus: 120 enable_cpus:
93 enable_nonboot_cpus(); 121 enable_nonboot_cpus();
94 pm_restore_console(); 122 pm_restore_console();
95 return error; 123 return error;
@@ -120,13 +148,21 @@ int pm_suspend_disk(void)
120 if (error) 148 if (error)
121 return error; 149 return error;
122 150
151 if (pm_disk_mode == PM_DISK_TESTPROC)
152 return 0;
153
123 suspend_console(); 154 suspend_console();
124 error = device_suspend(PMSG_FREEZE); 155 error = device_suspend(PMSG_FREEZE);
125 if (error) { 156 if (error) {
126 resume_console(); 157 resume_console();
127 printk("Some devices failed to suspend\n"); 158 printk("Some devices failed to suspend\n");
128 unprepare_processes(); 159 goto Thaw;
129 return error; 160 }
161
162 if (pm_disk_mode == PM_DISK_TEST) {
163 printk("swsusp debug: Waiting for 5 seconds.\n");
164 mdelay(5000);
165 goto Done;
130 } 166 }
131 167
132 pr_debug("PM: snapshotting memory.\n"); 168 pr_debug("PM: snapshotting memory.\n");
@@ -143,16 +179,17 @@ int pm_suspend_disk(void)
143 power_down(pm_disk_mode); 179 power_down(pm_disk_mode);
144 else { 180 else {
145 swsusp_free(); 181 swsusp_free();
146 unprepare_processes(); 182 goto Thaw;
147 return error;
148 } 183 }
149 } else 184 } else {
150 pr_debug("PM: Image restored successfully.\n"); 185 pr_debug("PM: Image restored successfully.\n");
186 }
151 187
152 swsusp_free(); 188 swsusp_free();
153 Done: 189 Done:
154 device_resume(); 190 device_resume();
155 resume_console(); 191 resume_console();
192 Thaw:
156 unprepare_processes(); 193 unprepare_processes();
157 return error; 194 return error;
158} 195}
@@ -174,10 +211,10 @@ static int software_resume(void)
174{ 211{
175 int error; 212 int error;
176 213
177 down(&pm_sem); 214 mutex_lock(&pm_mutex);
178 if (!swsusp_resume_device) { 215 if (!swsusp_resume_device) {
179 if (!strlen(resume_file)) { 216 if (!strlen(resume_file)) {
180 up(&pm_sem); 217 mutex_unlock(&pm_mutex);
181 return -ENOENT; 218 return -ENOENT;
182 } 219 }
183 swsusp_resume_device = name_to_dev_t(resume_file); 220 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -192,7 +229,7 @@ static int software_resume(void)
192 * FIXME: If noresume is specified, we need to find the partition 229 * FIXME: If noresume is specified, we need to find the partition
193 * and reset it back to normal swap space. 230 * and reset it back to normal swap space.
194 */ 231 */
195 up(&pm_sem); 232 mutex_unlock(&pm_mutex);
196 return 0; 233 return 0;
197 } 234 }
198 235
@@ -236,7 +273,7 @@ static int software_resume(void)
236 unprepare_processes(); 273 unprepare_processes();
237 Done: 274 Done:
238 /* For success case, the suspend path will release the lock */ 275 /* For success case, the suspend path will release the lock */
239 up(&pm_sem); 276 mutex_unlock(&pm_mutex);
240 pr_debug("PM: Resume from disk failed.\n"); 277 pr_debug("PM: Resume from disk failed.\n");
241 return 0; 278 return 0;
242} 279}
@@ -249,6 +286,8 @@ static const char * const pm_disk_modes[] = {
249 [PM_DISK_PLATFORM] = "platform", 286 [PM_DISK_PLATFORM] = "platform",
250 [PM_DISK_SHUTDOWN] = "shutdown", 287 [PM_DISK_SHUTDOWN] = "shutdown",
251 [PM_DISK_REBOOT] = "reboot", 288 [PM_DISK_REBOOT] = "reboot",
289 [PM_DISK_TEST] = "test",
290 [PM_DISK_TESTPROC] = "testproc",
252}; 291};
253 292
254/** 293/**
@@ -295,7 +334,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
295 p = memchr(buf, '\n', n); 334 p = memchr(buf, '\n', n);
296 len = p ? p - buf : n; 335 len = p ? p - buf : n;
297 336
298 down(&pm_sem); 337 mutex_lock(&pm_mutex);
299 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { 338 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
300 if (!strncmp(buf, pm_disk_modes[i], len)) { 339 if (!strncmp(buf, pm_disk_modes[i], len)) {
301 mode = i; 340 mode = i;
@@ -303,21 +342,23 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
303 } 342 }
304 } 343 }
305 if (mode) { 344 if (mode) {
306 if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) 345 if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
346 mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
307 pm_disk_mode = mode; 347 pm_disk_mode = mode;
308 else { 348 } else {
309 if (pm_ops && pm_ops->enter && 349 if (pm_ops && pm_ops->enter &&
310 (mode == pm_ops->pm_disk_mode)) 350 (mode == pm_ops->pm_disk_mode))
311 pm_disk_mode = mode; 351 pm_disk_mode = mode;
312 else 352 else
313 error = -EINVAL; 353 error = -EINVAL;
314 } 354 }
315 } else 355 } else {
316 error = -EINVAL; 356 error = -EINVAL;
357 }
317 358
318 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 359 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
319 pm_disk_modes[mode]); 360 pm_disk_modes[mode]);
320 up(&pm_sem); 361 mutex_unlock(&pm_mutex);
321 return error ? error : n; 362 return error ? error : n;
322} 363}
323 364
@@ -342,14 +383,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
342 if (maj != MAJOR(res) || min != MINOR(res)) 383 if (maj != MAJOR(res) || min != MINOR(res))
343 goto out; 384 goto out;
344 385
345 down(&pm_sem); 386 mutex_lock(&pm_mutex);
346 swsusp_resume_device = res; 387 swsusp_resume_device = res;
347 up(&pm_sem); 388 mutex_unlock(&pm_mutex);
348 printk("Attempting manual resume\n"); 389 printk("Attempting manual resume\n");
349 noresume = 0; 390 noresume = 0;
350 software_resume(); 391 software_resume();
351 ret = n; 392 ret = n;
352out: 393 out:
353 return ret; 394 return ret;
354} 395}
355 396
@@ -404,6 +445,19 @@ static int __init resume_setup(char *str)
404 return 1; 445 return 1;
405} 446}
406 447
448static int __init resume_offset_setup(char *str)
449{
450 unsigned long long offset;
451
452 if (noresume)
453 return 1;
454
455 if (sscanf(str, "%llu", &offset) == 1)
456 swsusp_resume_block = offset;
457
458 return 1;
459}
460
407static int __init noresume_setup(char *str) 461static int __init noresume_setup(char *str)
408{ 462{
409 noresume = 1; 463 noresume = 1;
@@ -411,4 +465,5 @@ static int __init noresume_setup(char *str)
411} 465}
412 466
413__setup("noresume", noresume_setup); 467__setup("noresume", noresume_setup);
468__setup("resume_offset=", resume_offset_setup);
414__setup("resume=", resume_setup); 469__setup("resume=", resume_setup);
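
resume_offset_setup() simply parses the new boot parameter with sscanf() into swsusp_resume_block, leaving it untouched if noresume was given or the string does not parse. A standalone version of the same parse, with invented names:

#include <stdio.h>

static unsigned long long resume_block;     /* stands in for swsusp_resume_block */

int parse_resume_offset(const char *str)
{
    unsigned long long offset;

    if (sscanf(str, "%llu", &offset) == 1) {
        resume_block = offset;
        return 0;
    }
    return -1;      /* bad input: keep the previous value */
}
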
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 873228c71dab..500eb87f643d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
11#include <linux/suspend.h> 12#include <linux/suspend.h>
12#include <linux/kobject.h> 13#include <linux/kobject.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -18,13 +19,14 @@
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/resume-trace.h> 21#include <linux/resume-trace.h>
22#include <linux/freezer.h>
21 23
22#include "power.h" 24#include "power.h"
23 25
24/*This is just an arbitrary number */ 26/*This is just an arbitrary number */
25#define FREE_PAGE_NUMBER (100) 27#define FREE_PAGE_NUMBER (100)
26 28
27DECLARE_MUTEX(pm_sem); 29DEFINE_MUTEX(pm_mutex);
28 30
29struct pm_ops *pm_ops; 31struct pm_ops *pm_ops;
30suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; 32suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
36 38
37void pm_set_ops(struct pm_ops * ops) 39void pm_set_ops(struct pm_ops * ops)
38{ 40{
39 down(&pm_sem); 41 mutex_lock(&pm_mutex);
40 pm_ops = ops; 42 pm_ops = ops;
41 up(&pm_sem); 43 mutex_unlock(&pm_mutex);
42} 44}
43 45
44 46
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state)
182 184
183 if (!valid_state(state)) 185 if (!valid_state(state))
184 return -ENODEV; 186 return -ENODEV;
185 if (down_trylock(&pm_sem)) 187 if (!mutex_trylock(&pm_mutex))
186 return -EBUSY; 188 return -EBUSY;
187 189
188 if (state == PM_SUSPEND_DISK) { 190 if (state == PM_SUSPEND_DISK) {
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state)
200 pr_debug("PM: Finishing wakeup.\n"); 202 pr_debug("PM: Finishing wakeup.\n");
201 suspend_finish(state); 203 suspend_finish(state);
202 Unlock: 204 Unlock:
203 up(&pm_sem); 205 mutex_unlock(&pm_mutex);
204 return error; 206 return error;
205} 207}
206 208
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state)
229 return -EINVAL; 231 return -EINVAL;
230} 232}
231 233
232 234EXPORT_SYMBOL(pm_suspend);
233 235
234decl_subsys(power,NULL,NULL); 236decl_subsys(power,NULL,NULL);
235 237
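
The main.c hunks replace the pm_sem semaphore, which was only ever used as a lock, with a proper pm_mutex; note the inverted trylock convention visible in enter_state(), where if (down_trylock(...)) becomes if (!mutex_trylock(...)). A userspace before-and-after of the same conversion (sem_init(&pm_sem, 0, 1) is assumed to have run at startup; the bodies are placeholders):

#include <pthread.h>
#include <semaphore.h>

/* before: a counting semaphore initialised to 1, used purely for mutual exclusion */
static sem_t pm_sem;

void set_ops_sem(void *ops)
{
    sem_wait(&pm_sem);
    (void)ops;              /* ... update shared state ... */
    sem_post(&pm_sem);
}

/* after: a mutex states the intent directly and can be checked by lock debugging */
static pthread_mutex_t pm_mutex = PTHREAD_MUTEX_INITIALIZER;

void set_ops_mutex(void *ops)
{
    pthread_mutex_lock(&pm_mutex);
    (void)ops;              /* ... update shared state ... */
    pthread_mutex_unlock(&pm_mutex);
}
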
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b272..eb461b816bf4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
22 return -EPERM; 22 return -EPERM;
23} 23}
24#endif 24#endif
25extern struct semaphore pm_sem; 25
26extern struct mutex pm_mutex;
27
26#define power_attr(_name) \ 28#define power_attr(_name) \
27static struct subsys_attribute _name##_attr = { \ 29static struct subsys_attribute _name##_attr = { \
28 .attr = { \ 30 .attr = { \
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end;
42extern unsigned long image_size; 44extern unsigned long image_size;
43extern int in_suspend; 45extern int in_suspend;
44extern dev_t swsusp_resume_device; 46extern dev_t swsusp_resume_device;
47extern sector_t swsusp_resume_block;
45 48
46extern asmlinkage int swsusp_arch_suspend(void); 49extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 50extern asmlinkage int swsusp_arch_resume(void);
@@ -102,8 +105,18 @@ struct snapshot_handle {
102extern unsigned int snapshot_additional_pages(struct zone *zone); 105extern unsigned int snapshot_additional_pages(struct zone *zone);
103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 106extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 107extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
108extern void snapshot_write_finalize(struct snapshot_handle *handle);
105extern int snapshot_image_loaded(struct snapshot_handle *handle); 109extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle); 110
111/*
112 * This structure is used to pass the values needed for the identification
113 * of the resume swap area from a user space to the kernel via the
114 * SNAPSHOT_SET_SWAP_AREA ioctl
115 */
116struct resume_swap_area {
117 loff_t offset;
118 u_int32_t dev;
119} __attribute__((packed));
107 120
108#define SNAPSHOT_IOC_MAGIC '3' 121#define SNAPSHOT_IOC_MAGIC '3'
109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 122#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
117#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) 130#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
118#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) 131#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
119#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) 132#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
120#define SNAPSHOT_IOC_MAXNR 11 133#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
134#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
135 struct resume_swap_area)
136#define SNAPSHOT_IOC_MAXNR 13
137
138#define PMOPS_PREPARE 1
139#define PMOPS_ENTER 2
140#define PMOPS_FINISH 3
121 141
122/** 142/**
123 * The bitmap is used for tracing allocated swap pages 143 * The bitmap is used for tracing allocated swap pages
@@ -141,7 +161,7 @@ struct bitmap_page {
141 161
142extern void free_bitmap(struct bitmap_page *bitmap); 162extern void free_bitmap(struct bitmap_page *bitmap);
143extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); 163extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
144extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); 164extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
145extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); 165extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
146 166
147extern int swsusp_check(void); 167extern int swsusp_check(void);
@@ -153,3 +173,7 @@ extern int swsusp_read(void);
153extern int swsusp_write(void); 173extern int swsusp_write(void);
154extern void swsusp_close(void); 174extern void swsusp_close(void);
155extern int suspend_enter(suspend_state_t state); 175extern int suspend_enter(suspend_state_t state);
176
177struct timeval;
178extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 unsigned int, char *);
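
The new SNAPSHOT_SET_SWAP_AREA ioctl and struct resume_swap_area let a resume tool hand the kernel both the swap device number and the offset of the swap header. A hypothetical userspace caller might look like the sketch below; the fd is assumed to refer to an already opened snapshot device, and in practice the struct and ioctl number would come from the kernel header rather than being redeclared:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/sysmacros.h>

/* repeated from the header above only to keep the sketch self-contained */
struct resume_swap_area {
    loff_t offset;
    u_int32_t dev;
} __attribute__((packed));

#define SNAPSHOT_IOC_MAGIC      '3'
#define SNAPSHOT_SET_SWAP_AREA  _IOW(SNAPSHOT_IOC_MAGIC, 13, struct resume_swap_area)

int set_swap_area(int snapshot_fd, unsigned int maj, unsigned int min,
                  loff_t offset)
{
    struct resume_swap_area swap_area = {
        .offset = offset,           /* location of the swap header */
        .dev = makedev(maj, min),   /* resume device number */
    };

    return ioctl(snapshot_fd, SNAPSHOT_SET_SWAP_AREA, &swap_area);
}
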
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac3164..678ec736076b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
16 * callback we use. 16 * callback we use.
17 */ 17 */
18 18
19static void do_poweroff(void *dummy) 19static void do_poweroff(struct work_struct *dummy)
20{ 20{
21 kernel_power_off(); 21 kernel_power_off();
22} 22}
23 23
24static DECLARE_WORK(poweroff_work, do_poweroff, NULL); 24static DECLARE_WORK(poweroff_work, do_poweroff);
25 25
26static void handle_poweroff(int key, struct tty_struct *tty) 26static void handle_poweroff(int key, struct tty_struct *tty)
27{ 27{
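
do_poweroff() now receives the work item itself instead of an opaque void pointer; callbacks that need per-instance context conventionally recover their containing object with container_of(). A generic, non-kernel illustration of that embedding trick:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct work_item {
    void (*func)(struct work_item *work);
};

struct poweroff_ctx {
    int reason;
    struct work_item work;      /* work item embedded in its context */
};

static void do_poweroff(struct work_item *work)
{
    struct poweroff_ctx *ctx = container_of(work, struct poweroff_ctx, work);

    printf("powering off, reason %d\n", ctx->reason);
}

int main(void)
{
    struct poweroff_ctx ctx = { .reason = 1, .work = { .func = do_poweroff } };

    ctx.work.func(&ctx.work);   /* what the workqueue would do on our behalf */
    return 0;
}
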
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e6..99eeb119b06d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,12 +13,15 @@
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h>
16 17
17/* 18/*
18 * Timeout for stopping processes 19 * Timeout for stopping processes
19 */ 20 */
20#define TIMEOUT (20 * HZ) 21#define TIMEOUT (20 * HZ)
21 22
23#define FREEZER_KERNEL_THREADS 0
24#define FREEZER_USER_SPACE 1
22 25
23static inline int freezeable(struct task_struct * p) 26static inline int freezeable(struct task_struct * p)
24{ 27{
@@ -39,7 +42,6 @@ void refrigerator(void)
39 long save; 42 long save;
40 save = current->state; 43 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 44 pr_debug("%s entered refrigerator\n", current->comm);
42 printk("=");
43 45
44 frozen_process(current); 46 frozen_process(current);
45 spin_lock_irq(&current->sighand->siglock); 47 spin_lock_irq(&current->sighand->siglock);
@@ -79,96 +81,136 @@ static void cancel_freezing(struct task_struct *p)
79 } 81 }
80} 82}
81 83
82/* 0 = success, else # of processes that we failed to stop */ 84static inline int is_user_space(struct task_struct *p)
83int freeze_processes(void) 85{
86 return p->mm && !(p->flags & PF_BORROWED_MM);
87}
88
89static unsigned int try_to_freeze_tasks(int freeze_user_space)
84{ 90{
85 int todo, nr_user, user_frozen;
86 unsigned long start_time;
87 struct task_struct *g, *p; 91 struct task_struct *g, *p;
92 unsigned long end_time;
93 unsigned int todo;
88 94
89 printk( "Stopping tasks: " ); 95 end_time = jiffies + TIMEOUT;
90 start_time = jiffies;
91 user_frozen = 0;
92 do { 96 do {
93 nr_user = todo = 0; 97 todo = 0;
94 read_lock(&tasklist_lock); 98 read_lock(&tasklist_lock);
95 do_each_thread(g, p) { 99 do_each_thread(g, p) {
96 if (!freezeable(p)) 100 if (!freezeable(p))
97 continue; 101 continue;
102
98 if (frozen(p)) 103 if (frozen(p))
99 continue; 104 continue;
100 if (p->state == TASK_TRACED && frozen(p->parent)) { 105
106 if (p->state == TASK_TRACED &&
107 (frozen(p->parent) ||
108 p->parent->state == TASK_STOPPED)) {
101 cancel_freezing(p); 109 cancel_freezing(p);
102 continue; 110 continue;
103 } 111 }
104 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 112 if (is_user_space(p)) {
105 /* The task is a user-space one. 113 if (!freeze_user_space)
106 * Freeze it unless there's a vfork completion 114 continue;
107 * pending 115
116 /* Freeze the task unless there is a vfork
117 * completion pending
108 */ 118 */
109 if (!p->vfork_done) 119 if (!p->vfork_done)
110 freeze_process(p); 120 freeze_process(p);
111 nr_user++;
112 } else { 121 } else {
113 /* Freeze only if the user space is frozen */ 122 if (freeze_user_space)
114 if (user_frozen) 123 continue;
115 freeze_process(p); 124
116 todo++; 125 freeze_process(p);
117 } 126 }
127 todo++;
118 } while_each_thread(g, p); 128 } while_each_thread(g, p);
119 read_unlock(&tasklist_lock); 129 read_unlock(&tasklist_lock);
120 todo += nr_user;
121 if (!user_frozen && !nr_user) {
122 sys_sync();
123 start_time = jiffies;
124 }
125 user_frozen = !nr_user;
126 yield(); /* Yield is okay here */ 130 yield(); /* Yield is okay here */
127 if (todo && time_after(jiffies, start_time + TIMEOUT)) 131 if (todo && time_after(jiffies, end_time))
128 break; 132 break;
129 } while(todo); 133 } while (todo);
130 134
131 /* This does not unfreeze processes that are already frozen
132 * (we have slightly ugly calling convention in that respect,
133 * and caller must call thaw_processes() if something fails),
134 * but it cleans up leftover PF_FREEZE requests.
135 */
136 if (todo) { 135 if (todo) {
137 printk( "\n" ); 136 /* This does not unfreeze processes that are already frozen
138 printk(KERN_ERR " stopping tasks timed out " 137 * (we have slightly ugly calling convention in that respect,
139 "after %d seconds (%d tasks remaining):\n", 138 * and caller must call thaw_processes() if something fails),
140 TIMEOUT / HZ, todo); 139 * but it cleans up leftover PF_FREEZE requests.
140 */
141 printk("\n");
142 printk(KERN_ERR "Stopping %s timed out after %d seconds "
143 "(%d tasks refusing to freeze):\n",
144 freeze_user_space ? "user space processes" :
145 "kernel threads",
146 TIMEOUT / HZ, todo);
141 read_lock(&tasklist_lock); 147 read_lock(&tasklist_lock);
142 do_each_thread(g, p) { 148 do_each_thread(g, p) {
149 if (is_user_space(p) == !freeze_user_space)
150 continue;
151
143 if (freezeable(p) && !frozen(p)) 152 if (freezeable(p) && !frozen(p))
144 printk(KERN_ERR " %s\n", p->comm); 153 printk(KERN_ERR " %s\n", p->comm);
154
145 cancel_freezing(p); 155 cancel_freezing(p);
146 } while_each_thread(g, p); 156 } while_each_thread(g, p);
147 read_unlock(&tasklist_lock); 157 read_unlock(&tasklist_lock);
148 return todo;
149 } 158 }
150 159
151 printk( "|\n" ); 160 return todo;
161}
162
163/**
164 * freeze_processes - tell processes to enter the refrigerator
165 *
166 * Returns 0 on success, or the number of processes that didn't freeze,
167 * although they were told to.
168 */
169int freeze_processes(void)
170{
171 unsigned int nr_unfrozen;
172
173 printk("Stopping tasks ... ");
174 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
175 if (nr_unfrozen)
176 return nr_unfrozen;
177
178 sys_sync();
179 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
180 if (nr_unfrozen)
181 return nr_unfrozen;
182
183 printk("done.\n");
152 BUG_ON(in_atomic()); 184 BUG_ON(in_atomic());
153 return 0; 185 return 0;
154} 186}
155 187
156void thaw_processes(void) 188static void thaw_tasks(int thaw_user_space)
157{ 189{
158 struct task_struct *g, *p; 190 struct task_struct *g, *p;
159 191
160 printk( "Restarting tasks..." );
161 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
162 do_each_thread(g, p) { 193 do_each_thread(g, p) {
163 if (!freezeable(p)) 194 if (!freezeable(p))
164 continue; 195 continue;
196
197 if (is_user_space(p) == !thaw_user_space)
198 continue;
199
165 if (!thaw_process(p)) 200 if (!thaw_process(p))
166 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 201 printk(KERN_WARNING " Strange, %s not stopped\n",
202 p->comm );
167 } while_each_thread(g, p); 203 } while_each_thread(g, p);
168
169 read_unlock(&tasklist_lock); 204 read_unlock(&tasklist_lock);
205}
206
207void thaw_processes(void)
208{
209 printk("Restarting tasks ... ");
210 thaw_tasks(FREEZER_KERNEL_THREADS);
211 thaw_tasks(FREEZER_USER_SPACE);
170 schedule(); 212 schedule();
171 printk( " done\n" ); 213 printk("done.\n");
172} 214}
173 215
174EXPORT_SYMBOL(refrigerator); 216EXPORT_SYMBOL(refrigerator);
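
freeze_processes() is restructured around try_to_freeze_tasks(), which walks the task list repeatedly until everything it is responsible for has frozen or the 20 second timeout expires, and is now run twice: user space first, then, after sys_sync(), kernel threads. A standalone sketch of that bounded retry loop with invented task bookkeeping, not the kernel code:

#include <stdbool.h>
#include <time.h>

#define NTASKS  4
#define TIMEOUT 20          /* seconds, like the kernel's 20 * HZ */

static bool frozen[NTASKS];

static void request_freeze(int i)
{
    frozen[i] = true;       /* pretend every task obliges on the first ask */
}

static unsigned int try_to_freeze_tasks(void)
{
    time_t end_time = time(NULL) + TIMEOUT;
    unsigned int todo;

    do {
        todo = 0;
        for (int i = 0; i < NTASKS; i++) {
            if (frozen[i])
                continue;
            request_freeze(i);
            todo++;         /* counted until actually seen frozen */
        }
        if (todo && time(NULL) > end_time)
            break;          /* give up and report the holdouts */
    } while (todo);

    return todo;            /* 0 on success, number of stragglers otherwise */
}
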
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d6..c024606221c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
1/* 1/*
2 * linux/kernel/power/snapshot.c 2 * linux/kernel/power/snapshot.c
3 * 3 *
4 * This file provide system snapshot/restore functionality. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 * 8 *
8 * This file is released under the GPLv2, and is based on swsusp.c. 9 * This file is released under the GPLv2.
9 * 10 *
10 */ 11 */
11 12
12
13#include <linux/version.h> 13#include <linux/version.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
@@ -34,137 +34,24 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37/* List of PBEs used for creating and restoring the suspend image */ 37/* List of PBEs needed for restoring the pages that were allocated before
38 * the suspend and included in the suspend image, but have also been
39 * allocated by the "resume" kernel, so their contents cannot be written
40 * directly to their "original" page frames.
41 */
38struct pbe *restore_pblist; 42struct pbe *restore_pblist;
39 43
40static unsigned int nr_copy_pages; 44/* Pointer to an auxiliary buffer (1 page) */
41static unsigned int nr_meta_pages;
42static void *buffer; 45static void *buffer;
43 46
44#ifdef CONFIG_HIGHMEM
45unsigned int count_highmem_pages(void)
46{
47 struct zone *zone;
48 unsigned long zone_pfn;
49 unsigned int n = 0;
50
51 for_each_zone (zone)
52 if (is_highmem(zone)) {
53 mark_free_pages(zone);
54 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
55 struct page *page;
56 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 if (PageReserved(page))
61 continue;
62 if (PageNosaveFree(page))
63 continue;
64 n++;
65 }
66 }
67 return n;
68}
69
70struct highmem_page {
71 char *data;
72 struct page *page;
73 struct highmem_page *next;
74};
75
76static struct highmem_page *highmem_copy;
77
78static int save_highmem_zone(struct zone *zone)
79{
80 unsigned long zone_pfn;
81 mark_free_pages(zone);
82 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
83 struct page *page;
84 struct highmem_page *save;
85 void *kaddr;
86 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
87
88 if (!(pfn%10000))
89 printk(".");
90 if (!pfn_valid(pfn))
91 continue;
92 page = pfn_to_page(pfn);
93 /*
94 * This condition results from rvmalloc() sans vmalloc_32()
95 * and architectural memory reservations. This should be
96 * corrected eventually when the cases giving rise to this
97 * are better understood.
98 */
99 if (PageReserved(page))
100 continue;
101 BUG_ON(PageNosave(page));
102 if (PageNosaveFree(page))
103 continue;
104 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
105 if (!save)
106 return -ENOMEM;
107 save->next = highmem_copy;
108 save->page = page;
109 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
110 if (!save->data) {
111 kfree(save);
112 return -ENOMEM;
113 }
114 kaddr = kmap_atomic(page, KM_USER0);
115 memcpy(save->data, kaddr, PAGE_SIZE);
116 kunmap_atomic(kaddr, KM_USER0);
117 highmem_copy = save;
118 }
119 return 0;
120}
121
122int save_highmem(void)
123{
124 struct zone *zone;
125 int res = 0;
126
127 pr_debug("swsusp: Saving Highmem");
128 drain_local_pages();
129 for_each_zone (zone) {
130 if (is_highmem(zone))
131 res = save_highmem_zone(zone);
132 if (res)
133 return res;
134 }
135 printk("\n");
136 return 0;
137}
138
139int restore_highmem(void)
140{
141 printk("swsusp: Restoring Highmem\n");
142 while (highmem_copy) {
143 struct highmem_page *save = highmem_copy;
144 void *kaddr;
145 highmem_copy = save->next;
146
147 kaddr = kmap_atomic(save->page, KM_USER0);
148 memcpy(kaddr, save->data, PAGE_SIZE);
149 kunmap_atomic(kaddr, KM_USER0);
150 free_page((long) save->data);
151 kfree(save);
152 }
153 return 0;
154}
155#else
156static inline unsigned int count_highmem_pages(void) {return 0;}
157static inline int save_highmem(void) {return 0;}
158static inline int restore_highmem(void) {return 0;}
159#endif
160
161/** 47/**
162 * @safe_needed - on resume, for storing the PBE list and the image, 48 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages 49 * we can only use memory pages that do not conflict with the pages
164 * used before suspend. 50 * used before suspend. The unsafe pages have PageNosaveFree set
51 * and we count them using unsafe_pages.
165 * 52 *
166 * The unsafe pages are marked with the PG_nosave_free flag 53 * Each allocated image page is marked as PageNosave and PageNosaveFree
167 * and we count them using unsafe_pages 54 * so that swsusp_free() can release it.
168 */ 55 */
169 56
170#define PG_ANY 0 57#define PG_ANY 0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
174 61
175static unsigned int allocated_unsafe_pages; 62static unsigned int allocated_unsafe_pages;
176 63
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 64static void *get_image_page(gfp_t gfp_mask, int safe_needed)
178{ 65{
179 void *res; 66 void *res;
180 67
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
195 82
196unsigned long get_safe_page(gfp_t gfp_mask) 83unsigned long get_safe_page(gfp_t gfp_mask)
197{ 84{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); 85 return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
86}
87
88static struct page *alloc_image_page(gfp_t gfp_mask)
89{
90 struct page *page;
91
92 page = alloc_page(gfp_mask);
93 if (page) {
94 SetPageNosave(page);
95 SetPageNosaveFree(page);
96 }
97 return page;
199} 98}
200 99
201/** 100/**
202 * free_image_page - free page represented by @addr, allocated with 101 * free_image_page - free page represented by @addr, allocated with
203 * alloc_image_page (page flags set by it must be cleared) 102 * get_image_page (page flags set by it must be cleared)
204 */ 103 */
205 104
206static inline void free_image_page(void *addr, int clear_nosave_free) 105static inline void free_image_page(void *addr, int clear_nosave_free)
207{ 106{
208 ClearPageNosave(virt_to_page(addr)); 107 struct page *page;
108
109 BUG_ON(!virt_addr_valid(addr));
110
111 page = virt_to_page(addr);
112
113 ClearPageNosave(page);
209 if (clear_nosave_free) 114 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr)); 115 ClearPageNosaveFree(page);
211 free_page((unsigned long)addr); 116
117 __free_page(page);
212} 118}
213 119
214/* struct linked_page is used to build chains of pages */ 120/* struct linked_page is used to build chains of pages */
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 175 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
270 struct linked_page *lp; 176 struct linked_page *lp;
271 177
272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); 178 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
273 if (!lp) 179 if (!lp)
274 return NULL; 180 return NULL;
275 181
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
446 352
447 /* Compute the number of zones */ 353 /* Compute the number of zones */
448 nr = 0; 354 nr = 0;
449 for_each_zone (zone) 355 for_each_zone(zone)
450 if (populated_zone(zone) && !is_highmem(zone)) 356 if (populated_zone(zone))
451 nr++; 357 nr++;
452 358
453 /* Allocate the list of zones bitmap objects */ 359 /* Allocate the list of zones bitmap objects */
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
459 } 365 }
460 366
461 /* Initialize the zone bitmap objects */ 367 /* Initialize the zone bitmap objects */
462 for_each_zone (zone) { 368 for_each_zone(zone) {
463 unsigned long pfn; 369 unsigned long pfn;
464 370
465 if (!populated_zone(zone) || is_highmem(zone)) 371 if (!populated_zone(zone))
466 continue; 372 continue;
467 373
468 zone_bm->start_pfn = zone->zone_start_pfn; 374 zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
481 while (bb) { 387 while (bb) {
482 unsigned long *ptr; 388 unsigned long *ptr;
483 389
484 ptr = alloc_image_page(gfp_mask, safe_needed); 390 ptr = get_image_page(gfp_mask, safe_needed);
485 bb->data = ptr; 391 bb->data = ptr;
486 if (!ptr) 392 if (!ptr)
487 goto Free; 393 goto Free;
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
505 memory_bm_position_reset(bm); 411 memory_bm_position_reset(bm);
506 return 0; 412 return 0;
507 413
508Free: 414 Free:
509 bm->p_list = ca.chain; 415 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR); 416 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM; 417 return -ENOMEM;
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
651 memory_bm_position_reset(bm); 557 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP; 558 return BM_END_OF_MAP;
653 559
654Return_pfn: 560 Return_pfn:
655 bm->cur.chunk = chunk; 561 bm->cur.chunk = chunk;
656 bm->cur.bit = bit; 562 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; 563 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone)
669 575
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 576 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 577 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res; 578 return 2 * res;
579}
580
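As a rough worked example of the estimate above (assuming 4 KiB pages and one page of bitmap data per bm_block; BM_BITS_PER_BLOCK itself is defined earlier in this file and not shown in this hunk): a zone spanning 262144 page frames (1 GiB) needs DIV_ROUND_UP(262144, 32768) = 8 pages of bitmap data plus roughly one more page for the bm_block descriptors, so snapshot_additional_pages() would return 2 * (8 + 1) = 18 pages, the doubling presumably covering the two memory bitmaps (orig_bm and copy_bm) used during suspend.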
581#ifdef CONFIG_HIGHMEM
582/**
583 * count_free_highmem_pages - compute the total number of free highmem
584 * pages, system-wide.
585 */
586
587static unsigned int count_free_highmem_pages(void)
588{
589 struct zone *zone;
590 unsigned int cnt = 0;
591
592 for_each_zone(zone)
593 if (populated_zone(zone) && is_highmem(zone))
594 cnt += zone->free_pages;
595
596 return cnt;
597}
598
599/**
600 * saveable_highmem_page - Determine whether a highmem page should be
601 * included in the suspend image.
602 *
603 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
604 * and it isn't a part of a free chunk of pages.
605 */
606
607static struct page *saveable_highmem_page(unsigned long pfn)
608{
609 struct page *page;
610
611 if (!pfn_valid(pfn))
612 return NULL;
613
614 page = pfn_to_page(pfn);
615
616 BUG_ON(!PageHighMem(page));
617
618 if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
619 return NULL;
620
621 return page;
673} 622}
674 623
675/** 624/**
625 * count_highmem_pages - compute the total number of saveable highmem
626 * pages.
627 */
628
629unsigned int count_highmem_pages(void)
630{
631 struct zone *zone;
632 unsigned int n = 0;
633
634 for_each_zone(zone) {
635 unsigned long pfn, max_zone_pfn;
636
637 if (!is_highmem(zone))
638 continue;
639
640 mark_free_pages(zone);
641 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
642 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
643 if (saveable_highmem_page(pfn))
644 n++;
645 }
646 return n;
647}
648#else
649static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
650static inline unsigned int count_highmem_pages(void) { return 0; }
651#endif /* CONFIG_HIGHMEM */
652
653/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section 654 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */ 655 */
678 656
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
684} 662}
685 663
686/** 664/**
687 * saveable - Determine whether a page should be cloned or not. 665 * saveable - Determine whether a non-highmem page should be included in
688 * @pfn: The page 666 * the suspend image.
689 * 667 *
690 * We save a page if it isn't Nosave, and is not in the range of pages 668 * We should save the page if it isn't Nosave, and is not in the range
691 * statically defined as 'unsaveable', and it 669 * of pages statically defined as 'unsaveable', and it isn't a part of
692 * isn't a part of a free chunk of pages. 670 * a free chunk of pages.
693 */ 671 */
694 672
695static struct page *saveable_page(unsigned long pfn) 673static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn)
701 679
702 page = pfn_to_page(pfn); 680 page = pfn_to_page(pfn);
703 681
704 if (PageNosave(page)) 682 BUG_ON(PageHighMem(page));
683
684 if (PageNosave(page) || PageNosaveFree(page))
705 return NULL; 685 return NULL;
686
706 if (PageReserved(page) && pfn_is_nosave(pfn)) 687 if (PageReserved(page) && pfn_is_nosave(pfn))
707 return NULL; 688 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710 689
711 return page; 690 return page;
712} 691}
713 692
693/**
694 * count_data_pages - compute the total number of saveable non-highmem
695 * pages.
696 */
697
714unsigned int count_data_pages(void) 698unsigned int count_data_pages(void)
715{ 699{
716 struct zone *zone; 700 struct zone *zone;
717 unsigned long pfn, max_zone_pfn; 701 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0; 702 unsigned int n = 0;
719 703
720 for_each_zone (zone) { 704 for_each_zone(zone) {
721 if (is_highmem(zone)) 705 if (is_highmem(zone))
722 continue; 706 continue;
707
723 mark_free_pages(zone); 708 mark_free_pages(zone);
724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 709 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 710 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
726 n += !!saveable_page(pfn); 711 if (saveable_page(pfn))
712 n++;
727 } 713 }
728 return n; 714 return n;
729} 715}
730 716
731static inline void copy_data_page(long *dst, long *src) 717/* This is needed, because copy_page and memcpy are not usable for copying
718 * task structs.
719 */
720static inline void do_copy_page(long *dst, long *src)
732{ 721{
733 int n; 722 int n;
734 723
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--) 724 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++; 725 *dst++ = *src++;
738} 726}
739 727
728#ifdef CONFIG_HIGHMEM
729static inline struct page *
730page_is_saveable(struct zone *zone, unsigned long pfn)
731{
732 return is_highmem(zone) ?
733 saveable_highmem_page(pfn) : saveable_page(pfn);
734}
735
736static inline void
737copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
738{
739 struct page *s_page, *d_page;
740 void *src, *dst;
741
742 s_page = pfn_to_page(src_pfn);
743 d_page = pfn_to_page(dst_pfn);
744 if (PageHighMem(s_page)) {
745 src = kmap_atomic(s_page, KM_USER0);
746 dst = kmap_atomic(d_page, KM_USER1);
747 do_copy_page(dst, src);
748 kunmap_atomic(src, KM_USER0);
749 kunmap_atomic(dst, KM_USER1);
750 } else {
751 src = page_address(s_page);
752 if (PageHighMem(d_page)) {
753 /* Page pointed to by src may contain some kernel
754 * data modified by kmap_atomic()
755 */
756 do_copy_page(buffer, src);
757 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
758 memcpy(dst, buffer, PAGE_SIZE);
759 kunmap_atomic(dst, KM_USER0);
760 } else {
761 dst = page_address(d_page);
762 do_copy_page(dst, src);
763 }
764 }
765}
766#else
767#define page_is_saveable(zone, pfn) saveable_page(pfn)
768
769static inline void
770copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
771{
772 do_copy_page(page_address(pfn_to_page(dst_pfn)),
773 page_address(pfn_to_page(src_pfn)));
774}
775#endif /* CONFIG_HIGHMEM */
776
740static void 777static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 778copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{ 779{
743 struct zone *zone; 780 struct zone *zone;
744 unsigned long pfn; 781 unsigned long pfn;
745 782
746 for_each_zone (zone) { 783 for_each_zone(zone) {
747 unsigned long max_zone_pfn; 784 unsigned long max_zone_pfn;
748 785
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone); 786 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn)) 789 if (page_is_saveable(zone, pfn))
756 memory_bm_set_bit(orig_bm, pfn); 790 memory_bm_set_bit(orig_bm, pfn);
757 } 791 }
758 memory_bm_position_reset(orig_bm); 792 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm); 793 memory_bm_position_reset(copy_bm);
760 do { 794 do {
761 pfn = memory_bm_next_pfn(orig_bm); 795 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) { 796 if (likely(pfn != BM_END_OF_MAP))
763 struct page *page; 797 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP); 798 } while (pfn != BM_END_OF_MAP);
772} 799}
773 800
801/* Total number of image pages */
802static unsigned int nr_copy_pages;
803/* Number of pages needed for saving the original pfns of the image pages */
804static unsigned int nr_meta_pages;
805
774/** 806/**
775 * swsusp_free - free pages allocated for the suspend. 807 * swsusp_free - free pages allocated for the suspend.
776 * 808 *
@@ -792,7 +824,7 @@ void swsusp_free(void)
792 if (PageNosave(page) && PageNosaveFree(page)) { 824 if (PageNosave(page) && PageNosaveFree(page)) {
793 ClearPageNosave(page); 825 ClearPageNosave(page);
794 ClearPageNosaveFree(page); 826 ClearPageNosaveFree(page);
795 free_page((long) page_address(page)); 827 __free_page(page);
796 } 828 }
797 } 829 }
798 } 830 }
@@ -802,34 +834,108 @@ void swsusp_free(void)
802 buffer = NULL; 834 buffer = NULL;
803} 835}
804 836
837#ifdef CONFIG_HIGHMEM
838/**
839 * count_pages_for_highmem - compute the number of non-highmem pages
840 * that will be necessary for creating copies of highmem pages.
841 */
842
843static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
844{
845 unsigned int free_highmem = count_free_highmem_pages();
846
847 if (free_highmem >= nr_highmem)
848 nr_highmem = 0;
849 else
850 nr_highmem -= free_highmem;
851
852 return nr_highmem;
853}
854#else
855static unsigned int
856count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
857#endif /* CONFIG_HIGHMEM */
805 858
806/** 859/**
807 * enough_free_mem - Make sure we enough free memory to snapshot. 860 * enough_free_mem - Make sure we have enough free memory for the
808 * 861 * snapshot image.
809 * Returns TRUE or FALSE after checking the number of available
810 * free pages.
811 */ 862 */
812 863
813static int enough_free_mem(unsigned int nr_pages) 864static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
814{ 865{
815 struct zone *zone; 866 struct zone *zone;
816 unsigned int free = 0, meta = 0; 867 unsigned int free = 0, meta = 0;
817 868
818 for_each_zone (zone) 869 for_each_zone(zone) {
819 if (!is_highmem(zone)) { 870 meta += snapshot_additional_pages(zone);
871 if (!is_highmem(zone))
820 free += zone->free_pages; 872 free += zone->free_pages;
821 meta += snapshot_additional_pages(zone); 873 }
822 }
823 874
824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", 875 nr_pages += count_pages_for_highmem(nr_highmem);
876 pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
825 nr_pages, PAGES_FOR_IO, meta, free); 877 nr_pages, PAGES_FOR_IO, meta, free);
826 878
827 return free > nr_pages + PAGES_FOR_IO + meta; 879 return free > nr_pages + PAGES_FOR_IO + meta;
828} 880}
829 881
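A hedged numeric example of the check above, with PAGES_FOR_IO (defined elsewhere) assumed to be 1024 for illustration: with 100000 saveable lowmem pages, 50000 saveable highmem pages, 20000 free highmem frames and 600 pages of bitmap overhead, count_pages_for_highmem() returns 30000, so the test becomes

    free > (100000 + 30000) + 1024 + 600 = 131624

and at least that many free non-highmem pages must be available for swsusp_save() to proceed.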
882#ifdef CONFIG_HIGHMEM
883/**
884 * get_highmem_buffer - if there are some highmem pages in the suspend
885 * image, we may need the buffer to copy them and/or load their data.
886 */
887
888static inline int get_highmem_buffer(int safe_needed)
889{
890 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
891 return buffer ? 0 : -ENOMEM;
892}
893
894/**
895 * alloc_highmem_image_pages - allocate some highmem pages for the image.
896 * Try to allocate as many pages as needed, but if the number of free
897 * highmem pages is less than that, allocate them all.
898 */
899
900static inline unsigned int
901alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
902{
903 unsigned int to_alloc = count_free_highmem_pages();
904
905 if (to_alloc > nr_highmem)
906 to_alloc = nr_highmem;
907
908 nr_highmem -= to_alloc;
909 while (to_alloc-- > 0) {
910 struct page *page;
911
912 page = alloc_image_page(__GFP_HIGHMEM);
913 memory_bm_set_bit(bm, page_to_pfn(page));
914 }
915 return nr_highmem;
916}
917#else
918static inline int get_highmem_buffer(int safe_needed) { return 0; }
919
920static inline unsigned int
921alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
922#endif /* CONFIG_HIGHMEM */
923
924/**
925 * swsusp_alloc - allocate memory for the suspend image
926 *
927 * We first try to allocate as many highmem pages as there are
928 * saveable highmem pages in the system. If that fails, we allocate
929 * non-highmem pages for the copies of the remaining highmem ones.
930 *
931 * In this approach it is likely that the copies of highmem pages will
932 * also be located in the high memory, because of the way in which
933 * copy_data_pages() works.
934 */
935
830static int 936static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 937swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages) 938 unsigned int nr_pages, unsigned int nr_highmem)
833{ 939{
834 int error; 940 int error;
835 941
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
841 if (error) 947 if (error)
842 goto Free; 948 goto Free;
843 949
950 if (nr_highmem > 0) {
951 error = get_highmem_buffer(PG_ANY);
952 if (error)
953 goto Free;
954
955 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
956 }
844 while (nr_pages-- > 0) { 957 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); 958 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
959
846 if (!page) 960 if (!page)
847 goto Free; 961 goto Free;
848 962
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 963 memory_bm_set_bit(copy_bm, page_to_pfn(page));
852 } 964 }
853 return 0; 965 return 0;
854 966
855Free: 967 Free:
856 swsusp_free(); 968 swsusp_free();
857 return -ENOMEM; 969 return -ENOMEM;
858} 970}
859 971
860/* Memory bitmap used for marking saveable pages */ 972/* Memory bitmap used for marking saveable pages (during suspend) or the
973 * suspend image pages (during resume)
974 */
861static struct memory_bitmap orig_bm; 975static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies 976/* Memory bitmap used on suspend for marking allocated pages that will contain
863 * of saveable pages 977 * the copies of saveable pages. During resume it is initially used for
978 * marking the suspend image pages, but then its set bits are duplicated in
979 * @orig_bm and it is released. Next, on systems with high memory, it may be
980 * used for marking "safe" highmem pages, but it has to be reinitialized for
981 * this purpose.
864 */ 982 */
865static struct memory_bitmap copy_bm; 983static struct memory_bitmap copy_bm;
866 984
867asmlinkage int swsusp_save(void) 985asmlinkage int swsusp_save(void)
868{ 986{
869 unsigned int nr_pages; 987 unsigned int nr_pages, nr_highmem;
870 988
871 pr_debug("swsusp: critical section: \n"); 989 printk("swsusp: critical section: \n");
872 990
873 drain_local_pages(); 991 drain_local_pages();
874 nr_pages = count_data_pages(); 992 nr_pages = count_data_pages();
875 printk("swsusp: Need to copy %u pages\n", nr_pages); 993 nr_highmem = count_highmem_pages();
994 printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
876 995
877 if (!enough_free_mem(nr_pages)) { 996 if (!enough_free_mem(nr_pages, nr_highmem)) {
878 printk(KERN_ERR "swsusp: Not enough free memory\n"); 997 printk(KERN_ERR "swsusp: Not enough free memory\n");
879 return -ENOMEM; 998 return -ENOMEM;
880 } 999 }
881 1000
882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages)) 1001 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1002 printk(KERN_ERR "swsusp: Memory allocation failed\n");
883 return -ENOMEM; 1003 return -ENOMEM;
1004 }
884 1005
885 /* During allocating of suspend pagedir, new cold pages may appear. 1006 /* During allocating of suspend pagedir, new cold pages may appear.
886 * Kill them. 1007 * Kill them.
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void)
894 * touch swap space! Except we must write out our image of course. 1015 * touch swap space! Except we must write out our image of course.
895 */ 1016 */
896 1017
1018 nr_pages += nr_highmem;
897 nr_copy_pages = nr_pages; 1019 nr_copy_pages = nr_pages;
898 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1020 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
899 1021
900 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1022 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
1023
901 return 0; 1024 return 0;
902} 1025}
903 1026
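Each metadata page packs PAGE_SIZE / sizeof(long) original pfns, so on a 64-bit kernel with 4 KiB pages (assumed here for illustration) one page holds 512 entries and an image of 131072 copied pages needs DIV_ROUND_UP(131072 * 8, 4096) = 256 metadata pages; the image stream then consists of the header, those pfn pages, and the data pages themselves.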
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
960 1083
961 if (!buffer) { 1084 if (!buffer) {
962 /* This makes the buffer be freed by swsusp_free() */ 1085 /* This makes the buffer be freed by swsusp_free() */
963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1086 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
964 if (!buffer) 1087 if (!buffer)
965 return -ENOMEM; 1088 return -ENOMEM;
966 } 1089 }
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
975 memset(buffer, 0, PAGE_SIZE); 1098 memset(buffer, 0, PAGE_SIZE);
976 pack_pfns(buffer, &orig_bm); 1099 pack_pfns(buffer, &orig_bm);
977 } else { 1100 } else {
978 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1101 struct page *page;
979 1102
980 handle->buffer = page_address(pfn_to_page(pfn)); 1103 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1104 if (PageHighMem(page)) {
1105 /* Highmem pages are copied to the buffer,
1106 * because we can't return with a kmapped
1107 * highmem page (we may not be called again).
1108 */
1109 void *kaddr;
1110
1111 kaddr = kmap_atomic(page, KM_USER0);
1112 memcpy(buffer, kaddr, PAGE_SIZE);
1113 kunmap_atomic(kaddr, KM_USER0);
1114 handle->buffer = buffer;
1115 } else {
1116 handle->buffer = page_address(page);
1117 }
981 } 1118 }
982 handle->prev = handle->cur; 1119 handle->prev = handle->cur;
983 } 1120 }
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1005 unsigned long pfn, max_zone_pfn; 1142 unsigned long pfn, max_zone_pfn;
1006 1143
1007 /* Clear page flags */ 1144 /* Clear page flags */
1008 for_each_zone (zone) { 1145 for_each_zone(zone) {
1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1146 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1147 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1011 if (pfn_valid(pfn)) 1148 if (pfn_valid(pfn))
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1101 } 1238 }
1102} 1239}
1103 1240
1241/* List of "safe" pages that may be used to store data loaded from the suspend
1242 * image
1243 */
1244static struct linked_page *safe_pages_list;
1245
1246#ifdef CONFIG_HIGHMEM
1247/* struct highmem_pbe is used for creating the list of highmem pages that
1248 * should be restored atomically during the resume from disk, because the page
1249 * frames they have occupied before the suspend are in use.
1250 */
1251struct highmem_pbe {
1252 struct page *copy_page; /* data is here now */
1253 struct page *orig_page; /* data was here before the suspend */
1254 struct highmem_pbe *next;
1255};
1256
1257/* List of highmem PBEs needed for restoring the highmem pages that were
1258 * allocated before the suspend and included in the suspend image, but have
1259 * also been allocated by the "resume" kernel, so their contents cannot be
1260 * written directly to their "original" page frames.
1261 */
1262static struct highmem_pbe *highmem_pblist;
1263
1264/**
1265 * count_highmem_image_pages - compute the number of highmem pages in the
1266 * suspend image. The bits in the memory bitmap @bm that correspond to the
1267 * image pages are assumed to be set.
1268 */
1269
1270static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
1271{
1272 unsigned long pfn;
1273 unsigned int cnt = 0;
1274
1275 memory_bm_position_reset(bm);
1276 pfn = memory_bm_next_pfn(bm);
1277 while (pfn != BM_END_OF_MAP) {
1278 if (PageHighMem(pfn_to_page(pfn)))
1279 cnt++;
1280
1281 pfn = memory_bm_next_pfn(bm);
1282 }
1283 return cnt;
1284}
1285
1286/**
1287 * prepare_highmem_image - try to allocate as many highmem pages as
1288 * there are highmem image pages (@nr_highmem_p points to the variable
1289 * containing the number of highmem image pages). The pages that are
1290 * "safe" (ie. will not be overwritten when the suspend image is
1291 * restored) have the corresponding bits set in @bm (it must be
1292 * uninitialized).
1293 *
1294 * NOTE: This function should not be called if there are no highmem
1295 * image pages.
1296 */
1297
1298static unsigned int safe_highmem_pages;
1299
1300static struct memory_bitmap *safe_highmem_bm;
1301
1302static int
1303prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1304{
1305 unsigned int to_alloc;
1306
1307 if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
1308 return -ENOMEM;
1309
1310 if (get_highmem_buffer(PG_SAFE))
1311 return -ENOMEM;
1312
1313 to_alloc = count_free_highmem_pages();
1314 if (to_alloc > *nr_highmem_p)
1315 to_alloc = *nr_highmem_p;
1316 else
1317 *nr_highmem_p = to_alloc;
1318
1319 safe_highmem_pages = 0;
1320 while (to_alloc-- > 0) {
1321 struct page *page;
1322
1323 page = alloc_page(__GFP_HIGHMEM);
1324 if (!PageNosaveFree(page)) {
1325 /* The page is "safe", set its bit in the bitmap */
1326 memory_bm_set_bit(bm, page_to_pfn(page));
1327 safe_highmem_pages++;
1328 }
1329 /* Mark the page as allocated */
1330 SetPageNosave(page);
1331 SetPageNosaveFree(page);
1332 }
1333 memory_bm_position_reset(bm);
1334 safe_highmem_bm = bm;
1335 return 0;
1336}
1337
1338/**
1339 * get_highmem_page_buffer - for given highmem image page find the buffer
1340 * that suspend_write_next() should set for its caller to write to.
1341 *
1342 * If the page is to be saved to its "original" page frame or a copy of
1343 * the page is to be made in the highmem, @buffer is returned. Otherwise,
1344 * the copy of the page is to be made in normal memory, so the address of
1345 * the copy is returned.
1346 *
1347 * If @buffer is returned, the caller of suspend_write_next() will write
1348 * the page's contents to @buffer, so they will have to be copied to the
1349 * right location on the next call to suspend_write_next() and it is done
1350 * with the help of copy_last_highmem_page(). For this purpose, if
1351 * @buffer is returned, @last_highmem page is set to the page to which
1352 * the data will have to be copied from @buffer.
1353 */
1354
1355static struct page *last_highmem_page;
1356
1357static void *
1358get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1359{
1360 struct highmem_pbe *pbe;
1361 void *kaddr;
1362
1363 if (PageNosave(page) && PageNosaveFree(page)) {
1364 /* We have allocated the "original" page frame and we can
1365 * use it directly to store the loaded page.
1366 */
1367 last_highmem_page = page;
1368 return buffer;
1369 }
1370 /* The "original" page frame has not been allocated and we have to
1371 * use a "safe" page frame to store the loaded page.
1372 */
1373 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1374 if (!pbe) {
1375 swsusp_free();
1376 return NULL;
1377 }
1378 pbe->orig_page = page;
1379 if (safe_highmem_pages > 0) {
1380 struct page *tmp;
1381
1382 /* Copy of the page will be stored in high memory */
1383 kaddr = buffer;
1384 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
1385 safe_highmem_pages--;
1386 last_highmem_page = tmp;
1387 pbe->copy_page = tmp;
1388 } else {
1389 /* Copy of the page will be stored in normal memory */
1390 kaddr = safe_pages_list;
1391 safe_pages_list = safe_pages_list->next;
1392 pbe->copy_page = virt_to_page(kaddr);
1393 }
1394 pbe->next = highmem_pblist;
1395 highmem_pblist = pbe;
1396 return kaddr;
1397}
1398
1399/**
1400 * copy_last_highmem_page - copy the contents of a highmem image from
1401 * @buffer, where the caller of snapshot_write_next() has placed them,
1402 * to the right location represented by @last_highmem_page.
1403 */
1404
1405static void copy_last_highmem_page(void)
1406{
1407 if (last_highmem_page) {
1408 void *dst;
1409
1410 dst = kmap_atomic(last_highmem_page, KM_USER0);
1411 memcpy(dst, buffer, PAGE_SIZE);
1412 kunmap_atomic(dst, KM_USER0);
1413 last_highmem_page = NULL;
1414 }
1415}
1416
1417static inline int last_highmem_page_copied(void)
1418{
1419 return !last_highmem_page;
1420}
1421
1422static inline void free_highmem_data(void)
1423{
1424 if (safe_highmem_bm)
1425 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
1426
1427 if (buffer)
1428 free_image_page(buffer, PG_UNSAFE_CLEAR);
1429}
1430#else
1431static inline int get_safe_write_buffer(void) { return 0; }
1432
1433static unsigned int
1434count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
1435
1436static inline int
1437prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1438{
1439 return 0;
1440}
1441
1442static inline void *
1443get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1444{
1445 return NULL;
1446}
1447
1448static inline void copy_last_highmem_page(void) {}
1449static inline int last_highmem_page_copied(void) { return 1; }
1450static inline void free_highmem_data(void) {}
1451#endif /* CONFIG_HIGHMEM */
1452
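A minimal sketch of how a reader of the image is expected to drive this interface; the real caller is load_image() in kernel/power/swap.c, and fill_page() below is a hypothetical stand-in for the actual media read:

    static int load_sketch(struct snapshot_handle *handle)
    {
            int error;

            do {
                    error = snapshot_write_next(handle, PAGE_SIZE);
                    if (error <= 0)
                            break;
                    /* hypothetical helper: read the next PAGE_SIZE bytes of
                     * the image into the buffer the handle points at */
                    error = fill_page(data_of(*handle));
            } while (!error);
            /* copies the pending highmem page, if any, and frees temporaries */
            snapshot_write_finalize(handle);
            if (!error && !snapshot_image_loaded(handle))
                    error = -ENODATA;
            return error;
    }

The point of the buffer/copy_last_highmem_page() dance is that the caller never touches a kmapped highmem page directly: it always fills a lowmem buffer, and the data is moved to its highmem destination on the following call or in snapshot_write_finalize().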
1104/** 1453/**
1105 * prepare_image - use the memory bitmap @bm to mark the pages that will 1454 * prepare_image - use the memory bitmap @bm to mark the pages that will
1106 * be overwritten in the process of restoring the system memory state 1455 * be overwritten in the process of restoring the system memory state
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1110 * The idea is to allocate a new memory bitmap first and then allocate 1459 * The idea is to allocate a new memory bitmap first and then allocate
1111 * as many pages as needed for the image data, but not to assign these 1460 * as many pages as needed for the image data, but not to assign these
1112 * pages to specific tasks initially. Instead, we just mark them as 1461 * pages to specific tasks initially. Instead, we just mark them as
1113 * allocated and create a list of "safe" pages that will be used later. 1462 * allocated and create a list of "safe" pages that will be used
1463 * later. On systems with high memory, a list of "safe" highmem pages is
1464 * also created.
1114 */ 1465 */
1115 1466
1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) 1467#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
1117 1468
1118static struct linked_page *safe_pages_list;
1119
1120static int 1469static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) 1470prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1122{ 1471{
1123 unsigned int nr_pages; 1472 unsigned int nr_pages, nr_highmem;
1124 struct linked_page *sp_list, *lp; 1473 struct linked_page *sp_list, *lp;
1125 int error; 1474 int error;
1126 1475
1476 /* If there is no highmem, the buffer will not be necessary */
1477 free_image_page(buffer, PG_UNSAFE_CLEAR);
1478 buffer = NULL;
1479
1480 nr_highmem = count_highmem_image_pages(bm);
1127 error = mark_unsafe_pages(bm); 1481 error = mark_unsafe_pages(bm);
1128 if (error) 1482 if (error)
1129 goto Free; 1483 goto Free;
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1134 1488
1135 duplicate_memory_bitmap(new_bm, bm); 1489 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP); 1490 memory_bm_free(bm, PG_UNSAFE_KEEP);
1491 if (nr_highmem > 0) {
1492 error = prepare_highmem_image(bm, &nr_highmem);
1493 if (error)
1494 goto Free;
1495 }
1137 /* Reserve some safe pages for potential later use. 1496 /* Reserve some safe pages for potential later use.
1138 * 1497 *
1139 * NOTE: This way we make sure there will be enough safe pages for the 1498 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1142 */ 1501 */
1143 sp_list = NULL; 1502 sp_list = NULL;
1144 /* nr_copy_pages cannot be less than allocated_unsafe_pages */ 1503 /* nr_copy_pages cannot be less than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1504 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 1505 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) { 1506 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); 1507 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) { 1508 if (!lp) {
1150 error = -ENOMEM; 1509 error = -ENOMEM;
1151 goto Free; 1510 goto Free;
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1156 } 1515 }
1157 /* Preallocate memory for the image */ 1516 /* Preallocate memory for the image */
1158 safe_pages_list = NULL; 1517 safe_pages_list = NULL;
1159 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1518 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1160 while (nr_pages > 0) { 1519 while (nr_pages > 0) {
1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 1520 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
1162 if (!lp) { 1521 if (!lp) {
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1181 } 1540 }
1182 return 0; 1541 return 0;
1183 1542
1184Free: 1543 Free:
1185 swsusp_free(); 1544 swsusp_free();
1186 return error; 1545 return error;
1187} 1546}
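For a rough sense of the two passes above, assume 4 KiB pages and a three-pointer struct pbe on a 64-bit kernel, so LINKED_PAGE_DATA_SIZE is about 4088 bytes and PBES_PER_LINKED_PAGE about 170: with nr_copy_pages - nr_highmem - allocated_unsafe_pages = 100000, the first loop reserves DIV_ROUND_UP(100000, 170) = 589 safe pages for the PBE chain and the second preallocates the 100000 data pages themselves, so the chain_alloc() calls made later from get_buffer() still find safe pages to hand out.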
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1196 struct pbe *pbe; 1555 struct pbe *pbe;
1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1556 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
1198 1557
1558 if (PageHighMem(page))
1559 return get_highmem_page_buffer(page, ca);
1560
1199 if (PageNosave(page) && PageNosaveFree(page)) 1561 if (PageNosave(page) && PageNosaveFree(page))
1200 /* We have allocated the "original" page frame and we can 1562 /* We have allocated the "original" page frame and we can
1201 * use it directly to store the loaded page. 1563 * use it directly to store the loaded page.
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1210 swsusp_free(); 1572 swsusp_free();
1211 return NULL; 1573 return NULL;
1212 } 1574 }
1213 pbe->orig_address = (unsigned long)page_address(page); 1575 pbe->orig_address = page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list; 1576 pbe->address = safe_pages_list;
1215 safe_pages_list = safe_pages_list->next; 1577 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist; 1578 pbe->next = restore_pblist;
1217 restore_pblist = pbe; 1579 restore_pblist = pbe;
1218 return (void *)pbe->address; 1580 return pbe->address;
1219} 1581}
1220 1582
1221/** 1583/**
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1611 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1250 return 0; 1612 return 0;
1251 1613
1252 if (!buffer) { 1614 if (handle->offset == 0) {
1253 /* This makes the buffer be freed by swsusp_free() */ 1615 if (!buffer)
1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1616 /* This makes the buffer be freed by swsusp_free() */
1617 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1618
1255 if (!buffer) 1619 if (!buffer)
1256 return -ENOMEM; 1620 return -ENOMEM;
1257 } 1621
1258 if (!handle->offset)
1259 handle->buffer = buffer; 1622 handle->buffer = buffer;
1623 }
1260 handle->sync_read = 1; 1624 handle->sync_read = 1;
1261 if (handle->prev < handle->cur) { 1625 if (handle->prev < handle->cur) {
1262 if (handle->prev == 0) { 1626 if (handle->prev == 0) {
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1284 return -ENOMEM; 1648 return -ENOMEM;
1285 } 1649 }
1286 } else { 1650 } else {
1651 copy_last_highmem_page();
1287 handle->buffer = get_buffer(&orig_bm, &ca); 1652 handle->buffer = get_buffer(&orig_bm, &ca);
1288 handle->sync_read = 0; 1653 if (handle->buffer != buffer)
1654 handle->sync_read = 0;
1289 } 1655 }
1290 handle->prev = handle->cur; 1656 handle->prev = handle->cur;
1291 } 1657 }
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1301 return count; 1667 return count;
1302} 1668}
1303 1669
1670/**
1671 * snapshot_write_finalize - must be called after the last call to
1672 * snapshot_write_next() in case the last page in the image happens
1673 * to be a highmem page and its contents should be stored in
1674 * highmem. Additionally, it releases the memory that will not be
1675 * used any more.
1676 */
1677
1678void snapshot_write_finalize(struct snapshot_handle *handle)
1679{
1680 copy_last_highmem_page();
1681 /* Free only if we have loaded the image entirely */
1682 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
1683 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
1684 free_highmem_data();
1685 }
1686}
1687
1304int snapshot_image_loaded(struct snapshot_handle *handle) 1688int snapshot_image_loaded(struct snapshot_handle *handle)
1305{ 1689{
1306 return !(!nr_copy_pages || 1690 return !(!nr_copy_pages || !last_highmem_page_copied() ||
1307 handle->cur <= nr_meta_pages + nr_copy_pages); 1691 handle->cur <= nr_meta_pages + nr_copy_pages);
1308} 1692}
1309 1693
1310void snapshot_free_unused_memory(struct snapshot_handle *handle) 1694#ifdef CONFIG_HIGHMEM
1695/* Assumes that @buf is ready and points to a "safe" page */
1696static inline void
1697swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
1311{ 1698{
1312 /* Free only if we have loaded the image entirely */ 1699 void *kaddr1, *kaddr2;
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1700
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 1701 kaddr1 = kmap_atomic(p1, KM_USER0);
1702 kaddr2 = kmap_atomic(p2, KM_USER1);
1703 memcpy(buf, kaddr1, PAGE_SIZE);
1704 memcpy(kaddr1, kaddr2, PAGE_SIZE);
1705 memcpy(kaddr2, buf, PAGE_SIZE);
1706 kunmap_atomic(kaddr1, KM_USER0);
1707 kunmap_atomic(kaddr2, KM_USER1);
1708}
1709
1710/**
1711 * restore_highmem - for each highmem page that was allocated before
1712 * the suspend and included in the suspend image, and also has been
1713 * allocated by the "resume" kernel, swap its current (ie. "before
1714 * resume") contents with the previous (ie. "before suspend") one.
1715 *
1716 * If the resume eventually fails, we can call this function once
1717 * again and restore the "before resume" highmem state.
1718 */
1719
1720int restore_highmem(void)
1721{
1722 struct highmem_pbe *pbe = highmem_pblist;
1723 void *buf;
1724
1725 if (!pbe)
1726 return 0;
1727
1728 buf = get_image_page(GFP_ATOMIC, PG_SAFE);
1729 if (!buf)
1730 return -ENOMEM;
1731
1732 while (pbe) {
1733 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
1734 pbe = pbe->next;
1735 }
1736 free_image_page(buf, PG_UNSAFE_CLEAR);
1737 return 0;
1315} 1738}
1739#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 9b2ee5344dee..f133d4a6d817 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,34 +34,123 @@ extern char resume_file[];
34#define SWSUSP_SIG "S1SUSPEND" 34#define SWSUSP_SIG "S1SUSPEND"
35 35
36static struct swsusp_header { 36static struct swsusp_header {
37 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
38 swp_entry_t image; 38 sector_t image;
39 char orig_sig[10]; 39 char orig_sig[10];
40 char sig[10]; 40 char sig[10];
41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
42 42
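With the sizes assumed here only for illustration (a 4096-byte page and an 8-byte sector_t), the reserved field above comes to 4096 - 20 - 8 = 4068 bytes, leaving exactly 28 bytes for the image sector and the two 10-byte signatures, so the whole header is read and written as the single page stored at swsusp_resume_block.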
43/* 43/*
44 * Saving part... 44 * General things
45 */ 45 */
46 46
47static unsigned short root_swap = 0xffff; 47static unsigned short root_swap = 0xffff;
48static struct block_device *resume_bdev;
49
50/**
51 * submit - submit BIO request.
52 * @rw: READ or WRITE.
53 * @off: physical offset of page.
54 * @page: page we're reading or writing.
55 * @bio_chain: list of pending bios (for async reading)
56 *
57 * Straight from the textbook - allocate and initialize the bio.
58 * If we're reading, make sure the page is marked as dirty.
59 * Then submit it and, if @bio_chain == NULL, wait.
60 */
61static int submit(int rw, pgoff_t page_off, struct page *page,
62 struct bio **bio_chain)
63{
64 struct bio *bio;
65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read;
72
73 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
74 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
75 bio_put(bio);
76 return -EFAULT;
77 }
78
79 lock_page(page);
80 bio_get(bio);
48 81
49static int mark_swapfiles(swp_entry_t start) 82 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
84 wait_on_page_locked(page);
85 if (rw == READ)
86 bio_set_pages_dirty(bio);
87 bio_put(bio);
88 } else {
89 if (rw == READ)
90 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain;
92 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
94 }
95 return 0;
96}
97
98static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
99{
100 return submit(READ, page_off, virt_to_page(addr), bio_chain);
101}
102
103static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
104{
105 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
106}
107
108static int wait_on_bio_chain(struct bio **bio_chain)
109{
110 struct bio *bio;
111 struct bio *next_bio;
112 int ret = 0;
113
114 if (bio_chain == NULL)
115 return 0;
116
117 bio = *bio_chain;
118 if (bio == NULL)
119 return 0;
120 while (bio) {
121 struct page *page;
122
123 next_bio = bio->bi_private;
124 page = bio->bi_io_vec[0].bv_page;
125 wait_on_page_locked(page);
126 if (!PageUptodate(page) || PageError(page))
127 ret = -EIO;
128 put_page(page);
129 bio_put(bio);
130 bio = next_bio;
131 }
132 *bio_chain = NULL;
133 return ret;
134}
135
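The intended calling pattern is roughly the following hedged sketch (start, nr_pages and bufs[] are made up for illustration); with 4 KiB pages each page offset maps to eight 512-byte sectors via page_off * (PAGE_SIZE >> 9), and passing a NULL chain makes the I/O synchronous instead:

    struct bio *bio_chain = NULL;
    int i, error = 0;

    for (i = 0; i < nr_pages && !error; i++)
            error = bio_write_page(start + i, bufs[i], &bio_chain);
    if (!error)
            error = wait_on_bio_chain(&bio_chain);  /* reap every queued bio */

Note that a buffer queued this way must stay stable until wait_on_bio_chain() returns; write_page() below sidesteps that by copying the data into a private page before submitting it.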
136/*
137 * Saving part
138 */
139
140static int mark_swapfiles(sector_t start)
50{ 141{
51 int error; 142 int error;
52 143
53 rw_swap_page_sync(READ, swp_entry(root_swap, 0), 144 bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
54 virt_to_page((unsigned long)&swsusp_header), NULL);
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 145 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 146 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 147 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 148 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 149 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), 150 error = bio_write_page(swsusp_resume_block,
61 virt_to_page((unsigned long)&swsusp_header), 151 &swsusp_header, NULL);
62 NULL);
63 } else { 152 } else {
64 pr_debug("swsusp: Partition is not swap space.\n"); 153 printk(KERN_ERR "swsusp: Swap header not found!\n");
65 error = -ENODEV; 154 error = -ENODEV;
66 } 155 }
67 return error; 156 return error;
@@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start)
74 163
75static int swsusp_swap_check(void) /* This is called before saving image */ 164static int swsusp_swap_check(void) /* This is called before saving image */
76{ 165{
77 int res = swap_type_of(swsusp_resume_device); 166 int res;
167
168 res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
169 if (res < 0)
170 return res;
171
172 root_swap = res;
173 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
174 if (IS_ERR(resume_bdev))
175 return PTR_ERR(resume_bdev);
176
177 res = set_blocksize(resume_bdev, PAGE_SIZE);
178 if (res < 0)
179 blkdev_put(resume_bdev);
78 180
79 if (res >= 0) {
80 root_swap = res;
81 return 0;
82 }
83 return res; 181 return res;
84} 182}
85 183
@@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
90 * @bio_chain: Link the next write BIO here 188 * @bio_chain: Link the next write BIO here
91 */ 189 */
92 190
93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) 191static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
94{ 192{
95 swp_entry_t entry; 193 void *src;
96 int error = -ENOSPC; 194
97 195 if (!offset)
98 if (offset) { 196 return -ENOSPC;
99 struct page *page = virt_to_page(buf); 197
100 198 if (bio_chain) {
101 if (bio_chain) { 199 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
102 /* 200 if (src) {
103 * Whether or not we successfully allocated a copy page, 201 memcpy(src, buf, PAGE_SIZE);
104 * we take a ref on the page here. It gets undone in 202 } else {
105 * wait_on_bio_chain(). 203 WARN_ON_ONCE(1);
106 */ 204 bio_chain = NULL; /* Go synchronous */
107 struct page *page_copy; 205 src = buf;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 } 206 }
119 entry = swp_entry(root_swap, offset); 207 } else {
120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain); 208 src = buf;
121 } 209 }
122 return error; 210 return bio_write_page(offset, src, bio_chain);
123} 211}
124 212
125/* 213/*
@@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
137 * at a time. 225 * at a time.
138 */ 226 */
139 227
140#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) 228#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
141 229
142struct swap_map_page { 230struct swap_map_page {
143 unsigned long entries[MAP_PAGE_ENTRIES]; 231 sector_t entries[MAP_PAGE_ENTRIES];
144 unsigned long next_swap; 232 sector_t next_swap;
145}; 233};
146 234
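With a 4096-byte page and an 8-byte sector_t assumed for illustration, MAP_PAGE_ENTRIES is 4096/8 - 1 = 511: each swap_map_page records the on-disk locations of 511 image pages and its last slot, next_swap, points at the following map page, so a 2 GiB image of 524288 pages needs 1027 such map pages in addition to the data itself.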
147/** 235/**
@@ -151,7 +239,7 @@ struct swap_map_page {
151 239
152struct swap_map_handle { 240struct swap_map_handle {
153 struct swap_map_page *cur; 241 struct swap_map_page *cur;
154 unsigned long cur_swap; 242 sector_t cur_swap;
155 struct bitmap_page *bitmap; 243 struct bitmap_page *bitmap;
156 unsigned int k; 244 unsigned int k;
157}; 245};
@@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
166 handle->bitmap = NULL; 254 handle->bitmap = NULL;
167} 255}
168 256
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
189static int get_swap_writer(struct swap_map_handle *handle) 257static int get_swap_writer(struct swap_map_handle *handle)
190{ 258{
191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 259 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
196 release_swap_writer(handle); 264 release_swap_writer(handle);
197 return -ENOMEM; 265 return -ENOMEM;
198 } 266 }
199 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); 267 handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
200 if (!handle->cur_swap) { 268 if (!handle->cur_swap) {
201 release_swap_writer(handle); 269 release_swap_writer(handle);
202 return -ENOSPC; 270 return -ENOSPC;
@@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle)
205 return 0; 273 return 0;
206} 274}
207 275
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf, 276static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain) 277 struct bio **bio_chain)
238{ 278{
239 int error = 0; 279 int error = 0;
240 unsigned long offset; 280 sector_t offset;
241 281
242 if (!handle->cur) 282 if (!handle->cur)
243 return -EINVAL; 283 return -EINVAL;
244 offset = alloc_swap_page(root_swap, handle->bitmap); 284 offset = alloc_swapdev_block(root_swap, handle->bitmap);
245 error = write_page(buf, offset, bio_chain); 285 error = write_page(buf, offset, bio_chain);
246 if (error) 286 if (error)
247 return error; 287 return error;
@@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
250 error = wait_on_bio_chain(bio_chain); 290 error = wait_on_bio_chain(bio_chain);
251 if (error) 291 if (error)
252 goto out; 292 goto out;
253 offset = alloc_swap_page(root_swap, handle->bitmap); 293 offset = alloc_swapdev_block(root_swap, handle->bitmap);
254 if (!offset) 294 if (!offset)
255 return -ENOSPC; 295 return -ENOSPC;
256 handle->cur->next_swap = offset; 296 handle->cur->next_swap = offset;
@@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
261 handle->cur_swap = offset; 301 handle->cur_swap = offset;
262 handle->k = 0; 302 handle->k = 0;
263 } 303 }
264out: 304 out:
265 return error; 305 return error;
266} 306}
267 307
@@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
315 error = err2; 355 error = err2;
316 if (!error) 356 if (!error)
317 printk("\b\b\b\bdone\n"); 357 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote"); 358 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
319 return error; 359 return error;
320} 360}
321 361
@@ -350,99 +390,50 @@ int swsusp_write(void)
350 struct swsusp_info *header; 390 struct swsusp_info *header;
351 int error; 391 int error;
352 392
353 if ((error = swsusp_swap_check())) { 393 error = swsusp_swap_check();
394 if (error) {
354 printk(KERN_ERR "swsusp: Cannot find swap device, try " 395 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n"); 396 "swapon -a.\n");
356 return error; 397 return error;
357 } 398 }
358 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 399 memset(&snapshot, 0, sizeof(struct snapshot_handle));
359 error = snapshot_read_next(&snapshot, PAGE_SIZE); 400 error = snapshot_read_next(&snapshot, PAGE_SIZE);
360 if (error < PAGE_SIZE) 401 if (error < PAGE_SIZE) {
361 return error < 0 ? error : -EFAULT; 402 if (error >= 0)
403 error = -EFAULT;
404
405 goto out;
406 }
362 header = (struct swsusp_info *)data_of(snapshot); 407 header = (struct swsusp_info *)data_of(snapshot);
363 if (!enough_swap(header->pages)) { 408 if (!enough_swap(header->pages)) {
364 printk(KERN_ERR "swsusp: Not enough free swap\n"); 409 printk(KERN_ERR "swsusp: Not enough free swap\n");
365 return -ENOSPC; 410 error = -ENOSPC;
411 goto out;
366 } 412 }
367 error = get_swap_writer(&handle); 413 error = get_swap_writer(&handle);
368 if (!error) { 414 if (!error) {
369 unsigned long start = handle.cur_swap; 415 sector_t start = handle.cur_swap;
416
370 error = swap_write_page(&handle, header, NULL); 417 error = swap_write_page(&handle, header, NULL);
371 if (!error) 418 if (!error)
372 error = save_image(&handle, &snapshot, 419 error = save_image(&handle, &snapshot,
373 header->pages - 1); 420 header->pages - 1);
421
374 if (!error) { 422 if (!error) {
375 flush_swap_writer(&handle); 423 flush_swap_writer(&handle);
376 printk("S"); 424 printk("S");
377 error = mark_swapfiles(swp_entry(root_swap, start)); 425 error = mark_swapfiles(start);
378 printk("|\n"); 426 printk("|\n");
379 } 427 }
380 } 428 }
381 if (error) 429 if (error)
382 free_all_swap_pages(root_swap, handle.bitmap); 430 free_all_swap_pages(root_swap, handle.bitmap);
383 release_swap_writer(&handle); 431 release_swap_writer(&handle);
432 out:
433 swsusp_close();
384 return error; 434 return error;
385} 435}
386 436
387static struct block_device *resume_bdev;
388
389/**
390 * submit - submit BIO request.
391 * @rw: READ or WRITE.
392 * @off physical offset of page.
393 * @page: page we're reading or writing.
394 * @bio_chain: list of pending biod (for async reading)
395 *
396 * Straight from the textbook - allocate and initialize the bio.
397 * If we're reading, make sure the page is marked as dirty.
398 * Then submit it and, if @bio_chain == NULL, wait.
399 */
400static int submit(int rw, pgoff_t page_off, struct page *page,
401 struct bio **bio_chain)
402{
403 struct bio *bio;
404
405 bio = bio_alloc(GFP_ATOMIC, 1);
406 if (!bio)
407 return -ENOMEM;
408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
409 bio->bi_bdev = resume_bdev;
410 bio->bi_end_io = end_swap_bio_read;
411
412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
414 bio_put(bio);
415 return -EFAULT;
416 }
417
418 lock_page(page);
419 bio_get(bio);
420
421 if (bio_chain == NULL) {
422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
423 wait_on_page_locked(page);
424 if (rw == READ)
425 bio_set_pages_dirty(bio);
426 bio_put(bio);
427 } else {
428 get_page(page);
429 bio->bi_private = *bio_chain;
430 *bio_chain = bio;
431 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
432 }
433 return 0;
434}
435
436static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
437{
438 return submit(READ, page_off, virt_to_page(addr), bio_chain);
439}
440
441static int bio_write_page(pgoff_t page_off, void *addr)
442{
443 return submit(WRITE, page_off, virt_to_page(addr), NULL);
444}
445
446/** 437/**
447 * The following functions allow us to read data using a swap map 438 * The following functions allow us to read data using a swap map
448 * in a file-alike way 439 * in a file-alike way
@@ -455,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
455 handle->cur = NULL; 446 handle->cur = NULL;
456} 447}
457 448
458static int get_swap_reader(struct swap_map_handle *handle, 449static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
459 swp_entry_t start)
460{ 450{
461 int error; 451 int error;
462 452
463 if (!swp_offset(start)) 453 if (!start)
464 return -EINVAL; 454 return -EINVAL;
465 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 455
456 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
466 if (!handle->cur) 457 if (!handle->cur)
467 return -ENOMEM; 458 return -ENOMEM;
468 error = bio_read_page(swp_offset(start), handle->cur, NULL); 459
460 error = bio_read_page(start, handle->cur, NULL);
469 if (error) { 461 if (error) {
470 release_swap_reader(handle); 462 release_swap_reader(handle);
471 return error; 463 return error;
@@ -477,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
477static int swap_read_page(struct swap_map_handle *handle, void *buf, 469static int swap_read_page(struct swap_map_handle *handle, void *buf,
478 struct bio **bio_chain) 470 struct bio **bio_chain)
479{ 471{
480 unsigned long offset; 472 sector_t offset;
481 int error; 473 int error;
482 474
483 if (!handle->cur) 475 if (!handle->cur)
@@ -546,11 +538,11 @@ static int load_image(struct swap_map_handle *handle,
546 error = err2; 538 error = err2;
547 if (!error) { 539 if (!error) {
548 printk("\b\b\b\bdone\n"); 540 printk("\b\b\b\bdone\n");
549 snapshot_free_unused_memory(snapshot); 541 snapshot_write_finalize(snapshot);
550 if (!snapshot_image_loaded(snapshot)) 542 if (!snapshot_image_loaded(snapshot))
551 error = -ENODATA; 543 error = -ENODATA;
552 } 544 }
553 show_speed(&start, &stop, nr_to_read, "Read"); 545 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
554 return error; 546 return error;
555} 547}
556 548
@@ -599,12 +591,16 @@ int swsusp_check(void)
599 if (!IS_ERR(resume_bdev)) { 591 if (!IS_ERR(resume_bdev)) {
600 set_blocksize(resume_bdev, PAGE_SIZE); 592 set_blocksize(resume_bdev, PAGE_SIZE);
601 memset(&swsusp_header, 0, sizeof(swsusp_header)); 593 memset(&swsusp_header, 0, sizeof(swsusp_header));
602 if ((error = bio_read_page(0, &swsusp_header, NULL))) 594 error = bio_read_page(swsusp_resume_block,
595 &swsusp_header, NULL);
596 if (error)
603 return error; 597 return error;
598
604 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 599 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
605 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 600 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
606 /* Reset swap signature now */ 601 /* Reset swap signature now */
607 error = bio_write_page(0, &swsusp_header); 602 error = bio_write_page(swsusp_resume_block,
603 &swsusp_header, NULL);
608 } else { 604 } else {
609 return -EINVAL; 605 return -EINVAL;
610 } 606 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc516..31aa0390c777 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
49#include <linux/bootmem.h> 49#include <linux/bootmem.h>
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h>
52 53
53#include "power.h" 54#include "power.h"
54 55
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0;
64 65
65#ifdef CONFIG_HIGHMEM 66#ifdef CONFIG_HIGHMEM
66unsigned int count_highmem_pages(void); 67unsigned int count_highmem_pages(void);
67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static inline int save_highmem(void) { return 0; }
71static inline int restore_highmem(void) { return 0; } 70static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; } 71static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 72#endif
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
134 return 0; 133 return 0;
135} 134}
136 135
137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) 136sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
138{ 137{
139 unsigned long offset; 138 unsigned long offset;
140 139
141 offset = swp_offset(get_swap_page_of_type(swap)); 140 offset = swp_offset(get_swap_page_of_type(swap));
142 if (offset) { 141 if (offset) {
143 if (bitmap_set(bitmap, offset)) { 142 if (bitmap_set(bitmap, offset))
144 swap_free(swp_entry(swap, offset)); 143 swap_free(swp_entry(swap, offset));
145 offset = 0; 144 else
146 } 145 return swapdev_block(swap, offset);
147 } 146 }
148 return offset; 147 return 0;
149} 148}
150 149
151void free_all_swap_pages(int swap, struct bitmap_page *bitmap) 150void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
166} 165}
167 166
168/** 167/**
168 * swsusp_show_speed - print the time elapsed between two events represented by
169 * @start and @stop
170 *
171 * @nr_pages - number of pages processed between @start and @stop
172 * @msg - introductory message to print
173 */
174
175void swsusp_show_speed(struct timeval *start, struct timeval *stop,
176 unsigned nr_pages, char *msg)
177{
178 s64 elapsed_centisecs64;
179 int centisecs;
180 int k;
181 int kps;
182
183 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
184 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
185 centisecs = elapsed_centisecs64;
186 if (centisecs == 0)
187 centisecs = 1; /* avoid div-by-zero */
188 k = nr_pages * (PAGE_SIZE / 1024);
189 kps = (k * 100) / centisecs;
190 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
191 centisecs / 100, centisecs % 100,
192 kps / 1000, (kps % 1000) / 10);
193}
194
195/**
169 * swsusp_shrink_memory - Try to free as much memory as needed 196 * swsusp_shrink_memory - Try to free as much memory as needed
170 * 197 *
171 * ... but do not OOM-kill anyone 198 * ... but do not OOM-kill anyone
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp)
184 211
185int swsusp_shrink_memory(void) 212int swsusp_shrink_memory(void)
186{ 213{
187 long size, tmp; 214 long tmp;
188 struct zone *zone; 215 struct zone *zone;
189 unsigned long pages = 0; 216 unsigned long pages = 0;
190 unsigned int i = 0; 217 unsigned int i = 0;
191 char *p = "-\\|/"; 218 char *p = "-\\|/";
219 struct timeval start, stop;
192 220
193 printk("Shrinking memory... "); 221 printk("Shrinking memory... ");
222 do_gettimeofday(&start);
194 do { 223 do {
195 size = 2 * count_highmem_pages(); 224 long size, highmem_size;
196 size += size / 50 + count_data_pages() + PAGES_FOR_IO; 225
226 highmem_size = count_highmem_pages();
227 size = count_data_pages() + PAGES_FOR_IO;
197 tmp = size; 228 tmp = size;
229 size += highmem_size;
198 for_each_zone (zone) 230 for_each_zone (zone)
199 if (!is_highmem(zone) && populated_zone(zone)) { 231 if (populated_zone(zone)) {
200 tmp -= zone->free_pages; 232 if (is_highmem(zone)) {
201 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 233 highmem_size -= zone->free_pages;
202 tmp += snapshot_additional_pages(zone); 234 } else {
235 tmp -= zone->free_pages;
236 tmp += zone->lowmem_reserve[ZONE_NORMAL];
237 tmp += snapshot_additional_pages(zone);
238 }
203 } 239 }
240
241 if (highmem_size < 0)
242 highmem_size = 0;
243
244 tmp += highmem_size;
204 if (tmp > 0) { 245 if (tmp > 0) {
205 tmp = __shrink_memory(tmp); 246 tmp = __shrink_memory(tmp);
206 if (!tmp) 247 if (!tmp)
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void)
212 } 253 }
213 printk("\b%c", p[i++%4]); 254 printk("\b%c", p[i++%4]);
214 } while (tmp > 0); 255 } while (tmp > 0);
256 do_gettimeofday(&stop);
215 printk("\bdone (%lu pages freed)\n", pages); 257 printk("\bdone (%lu pages freed)\n", pages);
258 swsusp_show_speed(&start, &stop, pages, "Freed");
216 259
217 return 0; 260 return 0;
218} 261}
@@ -223,6 +266,7 @@ int swsusp_suspend(void)
223 266
224 if ((error = arch_prepare_suspend())) 267 if ((error = arch_prepare_suspend()))
225 return error; 268 return error;
269
226 local_irq_disable(); 270 local_irq_disable();
227 /* At this point, device_suspend() has been called, but *not* 271 /* At this point, device_suspend() has been called, but *not*
228 * device_power_down(). We *must* device_power_down() now. 272 * device_power_down(). We *must* device_power_down() now.
@@ -235,23 +279,16 @@ int swsusp_suspend(void)
235 goto Enable_irqs; 279 goto Enable_irqs;
236 } 280 }
237 281
238 if ((error = save_highmem())) {
239 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
240 goto Restore_highmem;
241 }
242
243 save_processor_state(); 282 save_processor_state();
244 if ((error = swsusp_arch_suspend())) 283 if ((error = swsusp_arch_suspend()))
245 printk(KERN_ERR "Error %d suspending\n", error); 284 printk(KERN_ERR "Error %d suspending\n", error);
246 /* Restore control flow magically appears here */ 285 /* Restore control flow magically appears here */
247 restore_processor_state(); 286 restore_processor_state();
248Restore_highmem:
249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices 287 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup. 288 * that suspended with irqs off ... no overall powerup.
252 */ 289 */
253 device_power_up(); 290 device_power_up();
254Enable_irqs: 291 Enable_irqs:
255 local_irq_enable(); 292 local_irq_enable();
256 return error; 293 return error;
257} 294}
@@ -268,18 +305,23 @@ int swsusp_resume(void)
268 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 305 printk(KERN_ERR "Some devices failed to power down, very bad\n");
269 /* We'll ignore saved state, but this gets preempt count (etc) right */ 306 /* We'll ignore saved state, but this gets preempt count (etc) right */
270 save_processor_state(); 307 save_processor_state();
271 error = swsusp_arch_resume(); 308 error = restore_highmem();
272 /* Code below is only ever reached in case of failure. Otherwise 309 if (!error) {
273 * execution continues at place where swsusp_arch_suspend was called 310 error = swsusp_arch_resume();
274 */ 311 /* The code below is only ever reached in case of a failure.
275 BUG_ON(!error); 312 * Otherwise execution continues at the place where
313 * swsusp_arch_suspend() was called
314 */
315 BUG_ON(!error);
316 /* This call to restore_highmem() undoes the previous one */
317 restore_highmem();
318 }
276 /* The only reason why swsusp_arch_resume() can fail is memory being 319 /* The only reason why swsusp_arch_resume() can fail is memory being
277 * very tight, so we have to free it as soon as we can to avoid 320 * very tight, so we have to free it as soon as we can to avoid
278 * subsequent failures 321 * subsequent failures
279 */ 322 */
280 swsusp_free(); 323 swsusp_free();
281 restore_processor_state(); 324 restore_processor_state();
282 restore_highmem();
283 touch_softlockup_watchdog(); 325 touch_softlockup_watchdog();
284 device_power_up(); 326 device_power_up();
285 local_irq_enable(); 327 local_irq_enable();
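
As a quick sanity check of the throughput arithmetic in the swsusp_show_speed() helper added above, here is a worked example with made-up numbers (an illustration only, not part of the patch):

        /* Hypothetical run: 25600 pages of 4 KB each, read in 2.5 seconds. */
        int k = 25600 * (4096 / 1024);     /* 102400 kbytes                    */
        int centisecs = 250;               /* 2.50 s, from the ns difference   */
        int kps = (k * 100) / centisecs;   /* 40960 kbytes per second          */
        /* With msg "Read", the printk() emits:
         * "Read 102400 kbytes in 2.50 seconds (40.96 MB/s)" */
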
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a4..89443b85163b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/device.h> 16#include <linux/device.h>
16#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -21,6 +22,7 @@
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/console.h> 23#include <linux/console.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h>
24 26
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
26 28
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
54 filp->private_data = data; 56 filp->private_data = data;
55 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 57 memset(&data->handle, 0, sizeof(struct snapshot_handle));
56 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 58 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
57 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; 59 data->swap = swsusp_resume_device ?
60 swap_type_of(swsusp_resume_device, 0) : -1;
58 data->mode = O_RDONLY; 61 data->mode = O_RDONLY;
59 } else { 62 } else {
60 data->swap = -1; 63 data->swap = -1;
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
76 free_all_swap_pages(data->swap, data->bitmap); 79 free_all_swap_pages(data->swap, data->bitmap);
77 free_bitmap(data->bitmap); 80 free_bitmap(data->bitmap);
78 if (data->frozen) { 81 if (data->frozen) {
79 down(&pm_sem); 82 mutex_lock(&pm_mutex);
80 thaw_processes(); 83 thaw_processes();
81 enable_nonboot_cpus(); 84 enable_nonboot_cpus();
82 up(&pm_sem); 85 mutex_unlock(&pm_mutex);
83 } 86 }
84 atomic_inc(&device_available); 87 atomic_inc(&device_available);
85 return 0; 88 return 0;
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
124{ 127{
125 int error = 0; 128 int error = 0;
126 struct snapshot_data *data; 129 struct snapshot_data *data;
127 loff_t offset, avail; 130 loff_t avail;
131 sector_t offset;
128 132
129 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) 133 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
130 return -ENOTTY; 134 return -ENOTTY;
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
140 case SNAPSHOT_FREEZE: 144 case SNAPSHOT_FREEZE:
141 if (data->frozen) 145 if (data->frozen)
142 break; 146 break;
143 down(&pm_sem); 147 mutex_lock(&pm_mutex);
144 error = disable_nonboot_cpus(); 148 error = disable_nonboot_cpus();
145 if (!error) { 149 if (!error) {
146 error = freeze_processes(); 150 error = freeze_processes();
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
150 error = -EBUSY; 154 error = -EBUSY;
151 } 155 }
152 } 156 }
153 up(&pm_sem); 157 mutex_unlock(&pm_mutex);
154 if (!error) 158 if (!error)
155 data->frozen = 1; 159 data->frozen = 1;
156 break; 160 break;
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
158 case SNAPSHOT_UNFREEZE: 162 case SNAPSHOT_UNFREEZE:
159 if (!data->frozen) 163 if (!data->frozen)
160 break; 164 break;
161 down(&pm_sem); 165 mutex_lock(&pm_mutex);
162 thaw_processes(); 166 thaw_processes();
163 enable_nonboot_cpus(); 167 enable_nonboot_cpus();
164 up(&pm_sem); 168 mutex_unlock(&pm_mutex);
165 data->frozen = 0; 169 data->frozen = 0;
166 break; 170 break;
167 171
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
170 error = -EPERM; 174 error = -EPERM;
171 break; 175 break;
172 } 176 }
173 down(&pm_sem); 177 mutex_lock(&pm_mutex);
174 /* Free memory before shutting down devices. */ 178 /* Free memory before shutting down devices. */
175 error = swsusp_shrink_memory(); 179 error = swsusp_shrink_memory();
176 if (!error) { 180 if (!error) {
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
183 } 187 }
184 resume_console(); 188 resume_console();
185 } 189 }
186 up(&pm_sem); 190 mutex_unlock(&pm_mutex);
187 if (!error) 191 if (!error)
188 error = put_user(in_suspend, (unsigned int __user *)arg); 192 error = put_user(in_suspend, (unsigned int __user *)arg);
189 if (!error) 193 if (!error)
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
191 break; 195 break;
192 196
193 case SNAPSHOT_ATOMIC_RESTORE: 197 case SNAPSHOT_ATOMIC_RESTORE:
198 snapshot_write_finalize(&data->handle);
194 if (data->mode != O_WRONLY || !data->frozen || 199 if (data->mode != O_WRONLY || !data->frozen ||
195 !snapshot_image_loaded(&data->handle)) { 200 !snapshot_image_loaded(&data->handle)) {
196 error = -EPERM; 201 error = -EPERM;
197 break; 202 break;
198 } 203 }
199 snapshot_free_unused_memory(&data->handle); 204 mutex_lock(&pm_mutex);
200 down(&pm_sem);
201 pm_prepare_console(); 205 pm_prepare_console();
202 suspend_console(); 206 suspend_console();
203 error = device_suspend(PMSG_PRETHAW); 207 error = device_suspend(PMSG_PRETHAW);
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
207 } 211 }
208 resume_console(); 212 resume_console();
209 pm_restore_console(); 213 pm_restore_console();
210 up(&pm_sem); 214 mutex_unlock(&pm_mutex);
211 break; 215 break;
212 216
213 case SNAPSHOT_FREE: 217 case SNAPSHOT_FREE:
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
238 break; 242 break;
239 } 243 }
240 } 244 }
241 offset = alloc_swap_page(data->swap, data->bitmap); 245 offset = alloc_swapdev_block(data->swap, data->bitmap);
242 if (offset) { 246 if (offset) {
243 offset <<= PAGE_SHIFT; 247 offset <<= PAGE_SHIFT;
244 error = put_user(offset, (loff_t __user *)arg); 248 error = put_user(offset, (sector_t __user *)arg);
245 } else { 249 } else {
246 error = -ENOSPC; 250 error = -ENOSPC;
247 } 251 }
@@ -264,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
264 * so we need to recode them 268 * so we need to recode them
265 */ 269 */
266 if (old_decode_dev(arg)) { 270 if (old_decode_dev(arg)) {
267 data->swap = swap_type_of(old_decode_dev(arg)); 271 data->swap = swap_type_of(old_decode_dev(arg), 0);
268 if (data->swap < 0) 272 if (data->swap < 0)
269 error = -ENODEV; 273 error = -ENODEV;
270 } else { 274 } else {
@@ -282,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
282 break; 286 break;
283 } 287 }
284 288
285 if (down_trylock(&pm_sem)) { 289 if (!mutex_trylock(&pm_mutex)) {
286 error = -EBUSY; 290 error = -EBUSY;
287 break; 291 break;
288 } 292 }
@@ -309,8 +313,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
309 if (pm_ops->finish) 313 if (pm_ops->finish)
310 pm_ops->finish(PM_SUSPEND_MEM); 314 pm_ops->finish(PM_SUSPEND_MEM);
311 315
312OutS3: 316 OutS3:
313 up(&pm_sem); 317 mutex_unlock(&pm_mutex);
318 break;
319
320 case SNAPSHOT_PMOPS:
321 switch (arg) {
322
323 case PMOPS_PREPARE:
324 if (pm_ops->prepare) {
325 error = pm_ops->prepare(PM_SUSPEND_DISK);
326 }
327 break;
328
329 case PMOPS_ENTER:
330 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
331 error = pm_ops->enter(PM_SUSPEND_DISK);
332 break;
333
334 case PMOPS_FINISH:
335 if (pm_ops && pm_ops->finish) {
336 pm_ops->finish(PM_SUSPEND_DISK);
337 }
338 break;
339
340 default:
341 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
342 error = -EINVAL;
343
344 }
345 break;
346
347 case SNAPSHOT_SET_SWAP_AREA:
348 if (data->bitmap) {
349 error = -EPERM;
350 } else {
351 struct resume_swap_area swap_area;
352 dev_t swdev;
353
354 error = copy_from_user(&swap_area, (void __user *)arg,
355 sizeof(struct resume_swap_area));
356 if (error) {
357 error = -EFAULT;
358 break;
359 }
360
361 /*
362 * User space encodes device types as two-byte values,
363 * so we need to recode them
364 */
365 swdev = old_decode_dev(swap_area.dev);
366 if (swdev) {
367 offset = swap_area.offset;
368 data->swap = swap_type_of(swdev, offset);
369 if (data->swap < 0)
370 error = -ENODEV;
371 } else {
372 data->swap = -1;
373 error = -EINVAL;
374 }
375 }
314 break; 376 break;
315 377
316 default: 378 default:
@@ -321,7 +383,7 @@ OutS3:
321 return error; 383 return error;
322} 384}
323 385
324static struct file_operations snapshot_fops = { 386static const struct file_operations snapshot_fops = {
325 .open = snapshot_open, 387 .open = snapshot_open,
326 .release = snapshot_release, 388 .release = snapshot_release,
327 .read = snapshot_read, 389 .read = snapshot_read,
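
For context, a rough userspace sketch of how the new SNAPSHOT_SET_SWAP_AREA ioctl might be driven (illustrative only: set_swap_area() is a hypothetical helper, struct resume_swap_area and the SNAPSHOT_SET_SWAP_AREA definition are assumed to be taken from the kernel's power headers, and error handling is trimmed):

        #include <sys/ioctl.h>
        #include <sys/stat.h>
        #include <sys/sysmacros.h>

        static int set_swap_area(int snapshot_fd, const char *swap_path,
                                 unsigned long long offset)
        {
                struct resume_swap_area area;
                struct stat st;

                if (stat(swap_path, &st) < 0)
                        return -1;
                /* The kernel re-decodes .dev with old_decode_dev(), so pack
                 * major/minor into the traditional two-byte encoding. */
                area.dev = (major(st.st_rdev) << 8) | minor(st.st_rdev);
                area.offset = offset;   /* commonly 0 for a whole swap partition */
                return ioctl(snapshot_fd, SNAPSHOT_SET_SWAP_AREA, &area);
        }

On success the kernel resolves the pair with swap_type_of(swdev, offset), exactly as in the hunk above.
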
diff --git a/kernel/printk.c b/kernel/printk.c
index f7d427ef5038..185bb45eacf7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/bootmem.h> 32#include <linux/bootmem.h>
33#include <linux/syscalls.h> 33#include <linux/syscalls.h>
34#include <linux/jiffies.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
@@ -52,8 +53,6 @@ int console_printk[4] = {
52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
53}; 54};
54 55
55EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
56
57/* 56/*
58 * Low level drivers may need that to know if they can schedule in 57
59 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
@@ -334,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
334 } 333 }
335} 334}
336 335
336static int __read_mostly ignore_loglevel;
337
338int __init ignore_loglevel_setup(char *str)
339{
340 ignore_loglevel = 1;
341 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
342
343 return 1;
344}
345
346__setup("ignore_loglevel", ignore_loglevel_setup);
347
337/* 348/*
338 * Write out chars from start to end - 1 inclusive 349 * Write out chars from start to end - 1 inclusive
339 */ 350 */
340static void _call_console_drivers(unsigned long start, 351static void _call_console_drivers(unsigned long start,
341 unsigned long end, int msg_log_level) 352 unsigned long end, int msg_log_level)
342{ 353{
343 if (msg_log_level < console_loglevel && 354 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
344 console_drivers && start != end) { 355 console_drivers && start != end) {
345 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 356 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
346 /* wrapped write */ 357 /* wrapped write */
@@ -630,12 +641,7 @@ EXPORT_SYMBOL(vprintk);
630 641
631asmlinkage long sys_syslog(int type, char __user *buf, int len) 642asmlinkage long sys_syslog(int type, char __user *buf, int len)
632{ 643{
633 return 0; 644 return -ENOSYS;
634}
635
636int do_syslog(int type, char __user *buf, int len)
637{
638 return 0;
639} 645}
640 646
641static void call_console_drivers(unsigned long start, unsigned long end) 647static void call_console_drivers(unsigned long start, unsigned long end)
@@ -776,7 +782,6 @@ int is_console_locked(void)
776{ 782{
777 return console_locked; 783 return console_locked;
778} 784}
779EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
780 785
781/** 786/**
782 * release_console_sem - unlock the console system 787 * release_console_sem - unlock the console system
@@ -1101,3 +1106,23 @@ int printk_ratelimit(void)
1101 printk_ratelimit_burst); 1106 printk_ratelimit_burst);
1102} 1107}
1103EXPORT_SYMBOL(printk_ratelimit); 1108EXPORT_SYMBOL(printk_ratelimit);
1109
1110/**
1111 * printk_timed_ratelimit - caller-controlled printk ratelimiting
1112 * @caller_jiffies: pointer to caller's state
1113 * @interval_msecs: minimum interval between prints
1114 *
1115 * printk_timed_ratelimit() returns true if more than @interval_msecs
1116 * milliseconds have elapsed since the last time printk_timed_ratelimit()
1117 * returned true.
1118 */
1119bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1120 unsigned int interval_msecs)
1121{
1122 if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
1123 *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
1124 return true;
1125 }
1126 return false;
1127}
1128EXPORT_SYMBOL(printk_timed_ratelimit);
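
A minimal usage sketch for the printk_timed_ratelimit() helper added above (illustrative; the "mydrv" message and the 5-second interval are made up):

        static void report_ring_full(void)
        {
                /* Persists across calls; starting at 0 means the very
                 * first call is allowed to print. */
                static unsigned long last_warned;

                if (printk_timed_ratelimit(&last_warned, 5000 /* ms */))
                        printk(KERN_WARNING "mydrv: ring full, dropping data\n");
        }

Unlike printk_ratelimit(), the state lives in the caller, so separate call sites do not throttle each other.
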
diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec9..fb5e03d57e9d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
40 40
41static atomic_t *prof_buffer; 41static atomic_t *prof_buffer;
42static unsigned long prof_len, prof_shift; 42static unsigned long prof_len, prof_shift;
43static int prof_on __read_mostly; 43int prof_on __read_mostly;
44static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 44static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
51static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
52{ 52{
53 static char __initdata schedstr[] = "schedule"; 53 static char __initdata schedstr[] = "schedule";
54 static char __initdata sleepstr[] = "sleep";
54 int par; 55 int par;
55 56
56 if (!strncmp(str, schedstr, strlen(schedstr))) { 57 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
58 prof_on = SLEEP_PROFILING;
59 if (str[strlen(sleepstr)] == ',')
60 str += strlen(sleepstr) + 1;
61 if (get_option(&str, &par))
62 prof_shift = par;
63 printk(KERN_INFO
64 "kernel sleep profiling enabled (shift: %ld)\n",
65 prof_shift);
66 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
57 prof_on = SCHED_PROFILING; 67 prof_on = SCHED_PROFILING;
58 if (str[strlen(schedstr)] == ',') 68 if (str[strlen(schedstr)] == ',')
59 str += strlen(schedstr) + 1; 69 str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
204 * positions to which hits are accounted during short intervals (e.g. 214 * positions to which hits are accounted during short intervals (e.g.
205 * several seconds) is usually very small. Exclusion from buffer 215 * several seconds) is usually very small. Exclusion from buffer
206 * flipping is provided by interrupt disablement (note that for 216 * flipping is provided by interrupt disablement (note that for
207 * SCHED_PROFILING profile_hit() may be called from process context). 217 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
218 * process context).
208 * The hash function is meant to be lightweight as opposed to strong, 219 * The hash function is meant to be lightweight as opposed to strong,
209 * and was vaguely inspired by ppc64 firmware-supported inverted 220 * and was vaguely inspired by ppc64 firmware-supported inverted
210 * pagetable hash functions, but uses a full hashtable full of finite 221 * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
257 mutex_unlock(&profile_flip_mutex); 268 mutex_unlock(&profile_flip_mutex);
258} 269}
259 270
260void profile_hit(int type, void *__pc) 271void profile_hits(int type, void *__pc, unsigned int nr_hits)
261{ 272{
262 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 273 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
263 int i, j, cpu; 274 int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
274 put_cpu(); 285 put_cpu();
275 return; 286 return;
276 } 287 }
288 /*
289 * We buffer the global profiler buffer into a per-CPU
290 * queue and thus reduce the number of global (and possibly
291 * NUMA-alien) accesses. The write-queue is self-coalescing:
292 */
277 local_irq_save(flags); 293 local_irq_save(flags);
278 do { 294 do {
279 for (j = 0; j < PROFILE_GRPSZ; ++j) { 295 for (j = 0; j < PROFILE_GRPSZ; ++j) {
280 if (hits[i + j].pc == pc) { 296 if (hits[i + j].pc == pc) {
281 hits[i + j].hits++; 297 hits[i + j].hits += nr_hits;
282 goto out; 298 goto out;
283 } else if (!hits[i + j].hits) { 299 } else if (!hits[i + j].hits) {
284 hits[i + j].pc = pc; 300 hits[i + j].pc = pc;
285 hits[i + j].hits = 1; 301 hits[i + j].hits = nr_hits;
286 goto out; 302 goto out;
287 } 303 }
288 } 304 }
289 i = (i + secondary) & (NR_PROFILE_HIT - 1); 305 i = (i + secondary) & (NR_PROFILE_HIT - 1);
290 } while (i != primary); 306 } while (i != primary);
291 atomic_inc(&prof_buffer[pc]); 307
308 /*
309 * Add the current hit(s) and flush the write-queue out
310 * to the global buffer:
311 */
312 atomic_add(nr_hits, &prof_buffer[pc]);
292 for (i = 0; i < NR_PROFILE_HIT; ++i) { 313 for (i = 0; i < NR_PROFILE_HIT; ++i) {
293 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); 314 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
294 hits[i].pc = hits[i].hits = 0; 315 hits[i].pc = hits[i].hits = 0;
@@ -298,7 +319,6 @@ out:
298 put_cpu(); 319 put_cpu();
299} 320}
300 321
301#ifdef CONFIG_HOTPLUG_CPU
302static int __devinit profile_cpu_callback(struct notifier_block *info, 322static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 323 unsigned long action, void *__cpu)
304{ 324{
@@ -351,19 +371,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
351 } 371 }
352 return NOTIFY_OK; 372 return NOTIFY_OK;
353} 373}
354#endif /* CONFIG_HOTPLUG_CPU */
355#else /* !CONFIG_SMP */ 374#else /* !CONFIG_SMP */
356#define profile_flip_buffers() do { } while (0) 375#define profile_flip_buffers() do { } while (0)
357#define profile_discard_flip_buffers() do { } while (0) 376#define profile_discard_flip_buffers() do { } while (0)
377#define profile_cpu_callback NULL
358 378
359void profile_hit(int type, void *__pc) 379void profile_hits(int type, void *__pc, unsigned int nr_hits)
360{ 380{
361 unsigned long pc; 381 unsigned long pc;
362 382
363 if (prof_on != type || !prof_buffer) 383 if (prof_on != type || !prof_buffer)
364 return; 384 return;
365 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 385 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
366 atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); 386 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
367} 387}
368#endif /* !CONFIG_SMP */ 388#endif /* !CONFIG_SMP */
369 389
@@ -442,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
442 read = 0; 462 read = 0;
443 463
444 while (p < sizeof(unsigned int) && count > 0) { 464 while (p < sizeof(unsigned int) && count > 0) {
445 put_user(*((char *)(&sample_step)+p),buf); 465 if (put_user(*((char *)(&sample_step)+p),buf))
466 return -EFAULT;
446 buf++; p++; count--; read++; 467 buf++; p++; count--; read++;
447 } 468 }
448 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 469 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
@@ -480,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
480 return count; 501 return count;
481} 502}
482 503
483static struct file_operations proc_profile_operations = { 504static const struct file_operations proc_profile_operations = {
484 .read = read_profile, 505 .read = read_profile,
485 .write = write_profile, 506 .write = write_profile,
486}; 507};
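
To make the per-CPU write-queue comment in profile_hits() above more concrete, here is a stripped-down model of the coalescing scheme (illustration only; the real code keeps two flip buffers per CPU, hashes pc to pick where to probe instead of scanning linearly, and runs with interrupts disabled):

        #define QUEUE_SLOTS   4
        #define BUFFER_SLOTS  1024               /* stand-in for prof_len */

        struct hit { unsigned long pc; unsigned int hits; };

        static struct hit queue[QUEUE_SLOTS];            /* per-CPU in the real code     */
        static unsigned int global_buffer[BUFFER_SLOTS]; /* shared, possibly NUMA-remote */

        static void record_hit(unsigned long pc, unsigned int nr_hits)
        {
                int i;

                for (i = 0; i < QUEUE_SLOTS; i++) {
                        if (queue[i].pc == pc) {         /* coalesce with existing entry */
                                queue[i].hits += nr_hits;
                                return;
                        }
                        if (!queue[i].hits) {            /* claim a free slot */
                                queue[i].pc = pc;
                                queue[i].hits = nr_hits;
                                return;
                        }
                }
                /* Queue full: account this hit directly, then flush the local
                 * queue to the shared buffer so future hits coalesce again. */
                global_buffer[pc % BUFFER_SLOTS] += nr_hits;
                for (i = 0; i < QUEUE_SLOTS; i++) {
                        global_buffer[queue[i].pc % BUFFER_SLOTS] += queue[i].hits;
                        queue[i].pc = 0;
                        queue[i].hits = 0;
                }
        }
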
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef1..3554b76da84c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
235 235
236 list = rdp->donelist; 236 list = rdp->donelist;
237 while (list) { 237 while (list) {
238 next = rdp->donelist = list->next; 238 next = list->next;
239 prefetch(next);
239 list->func(list); 240 list->func(list);
240 list = next; 241 list = next;
241 if (++count >= rdp->blimit) 242 if (++count >= rdp->blimit)
242 break; 243 break;
243 } 244 }
245 rdp->donelist = list;
244 246
245 local_irq_disable(); 247 local_irq_disable();
246 rdp->qlen -= count; 248 rdp->qlen -= count;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f42..c52f981ea008 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
401 cleanup_srcu_struct(&srcu_ctl); 401 cleanup_srcu_struct(&srcu_ctl);
402} 402}
403 403
404static int srcu_torture_read_lock(void) 404static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
405{ 405{
406 return srcu_read_lock(&srcu_ctl); 406 return srcu_read_lock(&srcu_ctl);
407} 407}
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
419 schedule_timeout_interruptible(longdelay); 419 schedule_timeout_interruptible(longdelay);
420} 420}
421 421
422static void srcu_torture_read_unlock(int idx) 422static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
423{ 423{
424 srcu_read_unlock(&srcu_ctl, idx); 424 srcu_read_unlock(&srcu_ctl, idx);
425} 425}
diff --git a/kernel/relay.c b/kernel/relay.c
index f04bbdb56ac2..818e514729cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = {
308 * reason waking is deferred is that calling directly from write 308 * reason waking is deferred is that calling directly from write
309 * causes problems if you're writing from say the scheduler. 309 * causes problems if you're writing from say the scheduler.
310 */ 310 */
311static void wakeup_readers(void *private) 311static void wakeup_readers(struct work_struct *work)
312{ 312{
313 struct rchan_buf *buf = private; 313 struct rchan_buf *buf =
314 container_of(work, struct rchan_buf, wake_readers.work);
314 wake_up_interruptible(&buf->read_wait); 315 wake_up_interruptible(&buf->read_wait);
315} 316}
316 317
@@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
328 if (init) { 329 if (init) {
329 init_waitqueue_head(&buf->read_wait); 330 init_waitqueue_head(&buf->read_wait);
330 kref_init(&buf->kref); 331 kref_init(&buf->kref);
331 INIT_WORK(&buf->wake_readers, NULL, NULL); 332 INIT_DELAYED_WORK(&buf->wake_readers, NULL);
332 } else { 333 } else {
333 cancel_delayed_work(&buf->wake_readers); 334 cancel_delayed_work(&buf->wake_readers);
334 flush_scheduled_work(); 335 flush_scheduled_work();
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
549 buf->padding[old_subbuf]; 550 buf->padding[old_subbuf];
550 smp_mb(); 551 smp_mb();
551 if (waitqueue_active(&buf->read_wait)) { 552 if (waitqueue_active(&buf->read_wait)) {
552 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); 553 PREPARE_DELAYED_WORK(&buf->wake_readers,
554 wakeup_readers);
553 schedule_delayed_work(&buf->wake_readers, 1); 555 schedule_delayed_work(&buf->wake_readers, 1);
554 } 556 }
555 } 557 }
@@ -957,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
957 if (!desc->count) 959 if (!desc->count)
958 return 0; 960 return 0;
959 961
960 mutex_lock(&filp->f_dentry->d_inode->i_mutex); 962 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
961 do { 963 do {
962 if (!relay_file_read_avail(buf, *ppos)) 964 if (!relay_file_read_avail(buf, *ppos))
963 break; 965 break;
@@ -977,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
977 *ppos = relay_file_read_end_pos(buf, read_start, ret); 979 *ppos = relay_file_read_end_pos(buf, read_start, ret);
978 } 980 }
979 } while (desc->count && ret); 981 } while (desc->count && ret);
980 mutex_unlock(&filp->f_dentry->d_inode->i_mutex); 982 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
981 983
982 return desc->written; 984 return desc->written;
983} 985}
@@ -1011,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
1011 actor, &desc); 1013 actor, &desc);
1012} 1014}
1013 1015
1014struct file_operations relay_file_operations = { 1016const struct file_operations relay_file_operations = {
1015 .open = relay_file_open, 1017 .open = relay_file_open,
1016 .poll = relay_file_poll, 1018 .poll = relay_file_poll,
1017 .mmap = relay_file_mmap, 1019 .mmap = relay_file_mmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143e..7b9a497419d9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
88 return 0; 88 return 0;
89} 89}
90 90
91static struct seq_operations resource_op = { 91static const struct seq_operations resource_op = {
92 .start = r_start, 92 .start = r_start,
93 .next = r_next, 93 .next = r_next,
94 .stop = r_stop, 94 .stop = r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
115 return res; 115 return res;
116} 116}
117 117
118static struct file_operations proc_ioports_operations = { 118static const struct file_operations proc_ioports_operations = {
119 .open = ioports_open, 119 .open = ioports_open,
120 .read = seq_read, 120 .read = seq_read,
121 .llseek = seq_lseek, 121 .llseek = seq_lseek,
122 .release = seq_release, 122 .release = seq_release,
123}; 123};
124 124
125static struct file_operations proc_iomem_operations = { 125static const struct file_operations proc_iomem_operations = {
126 .open = iomem_open, 126 .open = iomem_open,
127 .read = seq_read, 127 .read = seq_read,
128 .llseek = seq_lseek, 128 .llseek = seq_lseek,
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c94..015fc633c96c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/sysdev.h> 14#include <linux/sysdev.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h>
16 17
17#include "rtmutex.h" 18#include "rtmutex.h"
18 19
diff --git a/kernel/sched.c b/kernel/sched.c
index 094b5687eef6..8a0afb97af71 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/profile.h> 36#include <linux/profile.h>
37#include <linux/suspend.h> 37#include <linux/freezer.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
@@ -160,15 +160,6 @@
160#define TASK_PREEMPTS_CURR(p, rq) \ 160#define TASK_PREEMPTS_CURR(p, rq) \
161 ((p)->prio < (rq)->curr->prio) 161 ((p)->prio < (rq)->curr->prio)
162 162
163/*
164 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
165 * to time slice values: [800ms ... 100ms ... 5ms]
166 *
167 * The higher a thread's priority, the bigger timeslices
168 * it gets during one round of execution. But even the lowest
169 * priority thread gets MIN_TIMESLICE worth of execution time.
170 */
171
172#define SCALE_PRIO(x, prio) \ 163#define SCALE_PRIO(x, prio) \
173 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) 164 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
174 165
@@ -180,6 +171,15 @@ static unsigned int static_prio_timeslice(int static_prio)
180 return SCALE_PRIO(DEF_TIMESLICE, static_prio); 171 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
181} 172}
182 173
174/*
175 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
176 * to time slice values: [800ms ... 100ms ... 5ms]
177 *
178 * The higher a thread's priority, the bigger timeslices
179 * it gets during one round of execution. But even the lowest
180 * priority thread gets MIN_TIMESLICE worth of execution time.
181 */
182
183static inline unsigned int task_timeslice(struct task_struct *p) 183static inline unsigned int task_timeslice(struct task_struct *p)
184{ 184{
185 return static_prio_timeslice(p->static_prio); 185 return static_prio_timeslice(p->static_prio);
@@ -225,8 +225,10 @@ struct rq {
225 unsigned long nr_uninterruptible; 225 unsigned long nr_uninterruptible;
226 226
227 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
228 unsigned long long timestamp_last_tick; 228 /* Cached timestamp set by update_cpu_clock() */
229 unsigned long long most_recent_timestamp;
229 struct task_struct *curr, *idle; 230 struct task_struct *curr, *idle;
231 unsigned long next_balance;
230 struct mm_struct *prev_mm; 232 struct mm_struct *prev_mm;
231 struct prio_array *active, *expired, arrays[2]; 233 struct prio_array *active, *expired, arrays[2];
232 int best_expired_prio; 234 int best_expired_prio;
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
426 * bump this up when changing the output format or the meaning of an existing 428 * bump this up when changing the output format or the meaning of an existing
427 * format, so that tools can adapt (or abort) 429 * format, so that tools can adapt (or abort)
428 */ 430 */
429#define SCHEDSTAT_VERSION 12 431#define SCHEDSTAT_VERSION 14
430 432
431static int show_schedstat(struct seq_file *seq, void *v) 433static int show_schedstat(struct seq_file *seq, void *v)
432{ 434{
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
464 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 466 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
465 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 467 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
466 itype++) { 468 itype++) {
467 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 469 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
470 "%lu",
468 sd->lb_cnt[itype], 471 sd->lb_cnt[itype],
469 sd->lb_balanced[itype], 472 sd->lb_balanced[itype],
470 sd->lb_failed[itype], 473 sd->lb_failed[itype],
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
474 sd->lb_nobusyq[itype], 477 sd->lb_nobusyq[itype],
475 sd->lb_nobusyg[itype]); 478 sd->lb_nobusyg[itype]);
476 } 479 }
477 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", 480 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
481 " %lu %lu %lu\n",
478 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 482 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
479 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 483 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
480 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 484 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
481 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 485 sd->ttwu_wake_remote, sd->ttwu_move_affine,
486 sd->ttwu_move_balance);
482 } 487 }
483 preempt_enable(); 488 preempt_enable();
484#endif 489#endif
@@ -505,7 +510,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
505 return res; 510 return res;
506} 511}
507 512
508struct file_operations proc_schedstat_operations = { 513const struct file_operations proc_schedstat_operations = {
509 .open = schedstat_open, 514 .open = schedstat_open,
510 .read = seq_read, 515 .read = seq_read,
511 .llseek = seq_lseek, 516 .llseek = seq_lseek,
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
547#endif 552#endif
548 553
549/* 554/*
550 * rq_lock - lock a given runqueue and disable interrupts. 555 * this_rq_lock - lock this runqueue and disable interrupts.
551 */ 556 */
552static inline struct rq *this_rq_lock(void) 557static inline struct rq *this_rq_lock(void)
553 __acquires(rq->lock) 558 __acquires(rq->lock)
@@ -938,18 +943,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
938{ 943{
939 unsigned long long now; 944 unsigned long long now;
940 945
946 if (rt_task(p))
947 goto out;
948
941 now = sched_clock(); 949 now = sched_clock();
942#ifdef CONFIG_SMP 950#ifdef CONFIG_SMP
943 if (!local) { 951 if (!local) {
944 /* Compensate for drifting sched_clock */ 952 /* Compensate for drifting sched_clock */
945 struct rq *this_rq = this_rq(); 953 struct rq *this_rq = this_rq();
946 now = (now - this_rq->timestamp_last_tick) 954 now = (now - this_rq->most_recent_timestamp)
947 + rq->timestamp_last_tick; 955 + rq->most_recent_timestamp;
948 } 956 }
949#endif 957#endif
950 958
951 if (!rt_task(p)) 959 /*
952 p->prio = recalc_task_prio(p, now); 960 * Sleep time is in units of nanosecs, so shift by 20 to get a
961 * milliseconds-range estimation of the amount of time that the task
962 * spent sleeping:
963 */
964 if (unlikely(prof_on == SLEEP_PROFILING)) {
965 if (p->state == TASK_UNINTERRUPTIBLE)
966 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
967 (now - p->timestamp) >> 20);
968 }
969
970 p->prio = recalc_task_prio(p, now);
953 971
954 /* 972 /*
955 * This checks to make sure it's not an uninterruptible task 973 * This checks to make sure it's not an uninterruptible task
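
A note on the (now - p->timestamp) >> 20 conversion above (a worked example, not part of the patch): shifting right by 20 divides by 2^20 = 1048576, so a nanosecond delta becomes a millisecond-range count that is roughly 5% smaller than a true division by 10^6. For instance, a 50 ms uninterruptible sleep is 50000000 ns, and 50000000 >> 20 = 47, so profile_hits() credits 47 hits to the task's wait channel.
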
@@ -974,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
974 } 992 }
975 } 993 }
976 p->timestamp = now; 994 p->timestamp = now;
977 995out:
978 __activate_task(p, rq); 996 __activate_task(p, rq);
979} 997}
980 998
@@ -1439,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1439 1457
1440 if (this_sd->flags & SD_WAKE_AFFINE) { 1458 if (this_sd->flags & SD_WAKE_AFFINE) {
1441 unsigned long tl = this_load; 1459 unsigned long tl = this_load;
1442 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); 1460 unsigned long tl_per_task;
1461
1462 tl_per_task = cpu_avg_load_per_task(this_cpu);
1443 1463
1444 /* 1464 /*
1445 * If sync wakeup then subtract the (maximum possible) 1465 * If sync wakeup then subtract the (maximum possible)
@@ -1677,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1677 * Not the local CPU - must adjust timestamp. This should 1697 * Not the local CPU - must adjust timestamp. This should
1678 * get optimised away in the !CONFIG_SMP case. 1698 * get optimised away in the !CONFIG_SMP case.
1679 */ 1699 */
1680 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1700 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1681 + rq->timestamp_last_tick; 1701 + rq->most_recent_timestamp;
1682 __activate_task(p, rq); 1702 __activate_task(p, rq);
1683 if (TASK_PREEMPTS_CURR(p, rq)) 1703 if (TASK_PREEMPTS_CURR(p, rq))
1684 resched_task(rq->curr); 1704 resched_task(rq->curr);
@@ -1941,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1941 __acquires(rq1->lock) 1961 __acquires(rq1->lock)
1942 __acquires(rq2->lock) 1962 __acquires(rq2->lock)
1943{ 1963{
1964 BUG_ON(!irqs_disabled());
1944 if (rq1 == rq2) { 1965 if (rq1 == rq2) {
1945 spin_lock(&rq1->lock); 1966 spin_lock(&rq1->lock);
1946 __acquire(rq2->lock); /* Fake it out ;) */ 1967 __acquire(rq2->lock); /* Fake it out ;) */
@@ -1980,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1980 __acquires(busiest->lock) 2001 __acquires(busiest->lock)
1981 __acquires(this_rq->lock) 2002 __acquires(this_rq->lock)
1982{ 2003{
2004 if (unlikely(!irqs_disabled())) {
2005 /* printk() doesn't work well under rq->lock */
2006 spin_unlock(&this_rq->lock);
2007 BUG_ON(1);
2008 }
1983 if (unlikely(!spin_trylock(&busiest->lock))) { 2009 if (unlikely(!spin_trylock(&busiest->lock))) {
1984 if (busiest < this_rq) { 2010 if (busiest < this_rq) {
1985 spin_unlock(&this_rq->lock); 2011 spin_unlock(&this_rq->lock);
@@ -2050,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2050 set_task_cpu(p, this_cpu); 2076 set_task_cpu(p, this_cpu);
2051 inc_nr_running(p, this_rq); 2077 inc_nr_running(p, this_rq);
2052 enqueue_task(p, this_array); 2078 enqueue_task(p, this_array);
2053 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2079 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2054 + this_rq->timestamp_last_tick; 2080 + this_rq->most_recent_timestamp;
2055 /* 2081 /*
2056 * Note that idle threads have a prio of MAX_PRIO, for this test 2082 * Note that idle threads have a prio of MAX_PRIO, for this test
2057 * to be always true for them. 2083 * to be always true for them.
@@ -2087,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2087 * 2) too many balance attempts have failed. 2113 * 2) too many balance attempts have failed.
2088 */ 2114 */
2089 2115
2090 if (sd->nr_balance_failed > sd->cache_nice_tries) 2116 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2117#ifdef CONFIG_SCHEDSTATS
2118 if (task_hot(p, rq->most_recent_timestamp, sd))
2119 schedstat_inc(sd, lb_hot_gained[idle]);
2120#endif
2091 return 1; 2121 return 1;
2122 }
2092 2123
2093 if (task_hot(p, rq->timestamp_last_tick, sd)) 2124 if (task_hot(p, rq->most_recent_timestamp, sd))
2094 return 0; 2125 return 0;
2095 return 1; 2126 return 1;
2096} 2127}
@@ -2188,11 +2219,6 @@ skip_queue:
2188 goto skip_bitmap; 2219 goto skip_bitmap;
2189 } 2220 }
2190 2221
2191#ifdef CONFIG_SCHEDSTATS
2192 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
2193 schedstat_inc(sd, lb_hot_gained[idle]);
2194#endif
2195
2196 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2222 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2197 pulled++; 2223 pulled++;
2198 rem_load_move -= tmp->load_weight; 2224 rem_load_move -= tmp->load_weight;
@@ -2230,7 +2256,7 @@ out:
2230static struct sched_group * 2256static struct sched_group *
2231find_busiest_group(struct sched_domain *sd, int this_cpu, 2257find_busiest_group(struct sched_domain *sd, int this_cpu,
2232 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2258 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2233 cpumask_t *cpus) 2259 cpumask_t *cpus, int *balance)
2234{ 2260{
2235 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2261 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2236 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2262 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2259,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2259 unsigned long load, group_capacity; 2285 unsigned long load, group_capacity;
2260 int local_group; 2286 int local_group;
2261 int i; 2287 int i;
2288 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2262 unsigned long sum_nr_running, sum_weighted_load; 2289 unsigned long sum_nr_running, sum_weighted_load;
2263 2290
2264 local_group = cpu_isset(this_cpu, group->cpumask); 2291 local_group = cpu_isset(this_cpu, group->cpumask);
2265 2292
2293 if (local_group)
2294 balance_cpu = first_cpu(group->cpumask);
2295
2266 /* Tally up the load of all CPUs in the group */ 2296 /* Tally up the load of all CPUs in the group */
2267 sum_weighted_load = sum_nr_running = avg_load = 0; 2297 sum_weighted_load = sum_nr_running = avg_load = 0;
2268 2298
@@ -2278,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2278 *sd_idle = 0; 2308 *sd_idle = 0;
2279 2309
2280 /* Bias balancing toward cpus of our domain */ 2310 /* Bias balancing toward cpus of our domain */
2281 if (local_group) 2311 if (local_group) {
2312 if (idle_cpu(i) && !first_idle_cpu) {
2313 first_idle_cpu = 1;
2314 balance_cpu = i;
2315 }
2316
2282 load = target_load(i, load_idx); 2317 load = target_load(i, load_idx);
2283 else 2318 } else
2284 load = source_load(i, load_idx); 2319 load = source_load(i, load_idx);
2285 2320
2286 avg_load += load; 2321 avg_load += load;
@@ -2288,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2288 sum_weighted_load += rq->raw_weighted_load; 2323 sum_weighted_load += rq->raw_weighted_load;
2289 } 2324 }
2290 2325
2326 /*
2327 * First idle cpu or the first cpu(busiest) in this sched group
2328 * is eligible for doing load balancing at this and above
2329 * domains.
2330 */
2331 if (local_group && balance_cpu != this_cpu && balance) {
2332 *balance = 0;
2333 goto ret;
2334 }
2335
2291 total_load += avg_load; 2336 total_load += avg_load;
2292 total_pwr += group->cpu_power; 2337 total_pwr += group->cpu_power;
2293 2338
@@ -2447,18 +2492,21 @@ small_imbalance:
2447 pwr_now /= SCHED_LOAD_SCALE; 2492 pwr_now /= SCHED_LOAD_SCALE;
2448 2493
2449 /* Amount of load we'd subtract */ 2494 /* Amount of load we'd subtract */
2450 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; 2495 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2496 busiest->cpu_power;
2451 if (max_load > tmp) 2497 if (max_load > tmp)
2452 pwr_move += busiest->cpu_power * 2498 pwr_move += busiest->cpu_power *
2453 min(busiest_load_per_task, max_load - tmp); 2499 min(busiest_load_per_task, max_load - tmp);
2454 2500
2455 /* Amount of load we'd add */ 2501 /* Amount of load we'd add */
2456 if (max_load*busiest->cpu_power < 2502 if (max_load * busiest->cpu_power <
2457 busiest_load_per_task*SCHED_LOAD_SCALE) 2503 busiest_load_per_task * SCHED_LOAD_SCALE)
2458 tmp = max_load*busiest->cpu_power/this->cpu_power; 2504 tmp = max_load * busiest->cpu_power / this->cpu_power;
2459 else 2505 else
2460 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; 2506 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2461 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); 2507 this->cpu_power;
2508 pwr_move += this->cpu_power *
2509 min(this_load_per_task, this_load + tmp);
2462 pwr_move /= SCHED_LOAD_SCALE; 2510 pwr_move /= SCHED_LOAD_SCALE;
2463 2511
2464 /* Move if we gain throughput */ 2512 /* Move if we gain throughput */
@@ -2479,8 +2527,8 @@ out_balanced:
2479 *imbalance = min_load_per_task; 2527 *imbalance = min_load_per_task;
2480 return group_min; 2528 return group_min;
2481 } 2529 }
2482ret:
2483#endif 2530#endif
2531ret:
2484 *imbalance = 0; 2532 *imbalance = 0;
2485 return NULL; 2533 return NULL;
2486} 2534}
@@ -2529,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2529/* 2577/*
2530 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2578 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2531 * tasks if there is an imbalance. 2579 * tasks if there is an imbalance.
2532 *
2533 * Called with this_rq unlocked.
2534 */ 2580 */
2535static int load_balance(int this_cpu, struct rq *this_rq, 2581static int load_balance(int this_cpu, struct rq *this_rq,
2536 struct sched_domain *sd, enum idle_type idle) 2582 struct sched_domain *sd, enum idle_type idle,
2583 int *balance)
2537{ 2584{
2538 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2585 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2539 struct sched_group *group; 2586 struct sched_group *group;
2540 unsigned long imbalance; 2587 unsigned long imbalance;
2541 struct rq *busiest; 2588 struct rq *busiest;
2542 cpumask_t cpus = CPU_MASK_ALL; 2589 cpumask_t cpus = CPU_MASK_ALL;
2590 unsigned long flags;
2543 2591
2544 /* 2592 /*
2545 * When power savings policy is enabled for the parent domain, idle 2593 * When power savings policy is enabled for the parent domain, idle
@@ -2555,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2555 2603
2556redo: 2604redo:
2557 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2605 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2558 &cpus); 2606 &cpus, balance);
2607
2608 if (*balance == 0)
2609 goto out_balanced;
2610
2559 if (!group) { 2611 if (!group) {
2560 schedstat_inc(sd, lb_nobusyg[idle]); 2612 schedstat_inc(sd, lb_nobusyg[idle]);
2561 goto out_balanced; 2613 goto out_balanced;
@@ -2579,11 +2631,13 @@ redo:
2579 * still unbalanced. nr_moved simply stays zero, so it is 2631 * still unbalanced. nr_moved simply stays zero, so it is
2580 * correctly treated as an imbalance. 2632 * correctly treated as an imbalance.
2581 */ 2633 */
2634 local_irq_save(flags);
2582 double_rq_lock(this_rq, busiest); 2635 double_rq_lock(this_rq, busiest);
2583 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2636 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2584 minus_1_or_zero(busiest->nr_running), 2637 minus_1_or_zero(busiest->nr_running),
2585 imbalance, sd, idle, &all_pinned); 2638 imbalance, sd, idle, &all_pinned);
2586 double_rq_unlock(this_rq, busiest); 2639 double_rq_unlock(this_rq, busiest);
2640 local_irq_restore(flags);
2587 2641
2588 /* All tasks on this runqueue were pinned by CPU affinity */ 2642 /* All tasks on this runqueue were pinned by CPU affinity */
2589 if (unlikely(all_pinned)) { 2643 if (unlikely(all_pinned)) {
@@ -2600,13 +2654,13 @@ redo:
2600 2654
2601 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2655 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2602 2656
2603 spin_lock(&busiest->lock); 2657 spin_lock_irqsave(&busiest->lock, flags);
2604 2658
2605 /* don't kick the migration_thread, if the curr 2659 /* don't kick the migration_thread, if the curr
2606 * task on busiest cpu can't be moved to this_cpu 2660 * task on busiest cpu can't be moved to this_cpu
2607 */ 2661 */
2608 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2662 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2609 spin_unlock(&busiest->lock); 2663 spin_unlock_irqrestore(&busiest->lock, flags);
2610 all_pinned = 1; 2664 all_pinned = 1;
2611 goto out_one_pinned; 2665 goto out_one_pinned;
2612 } 2666 }
@@ -2616,7 +2670,7 @@ redo:
2616 busiest->push_cpu = this_cpu; 2670 busiest->push_cpu = this_cpu;
2617 active_balance = 1; 2671 active_balance = 1;
2618 } 2672 }
2619 spin_unlock(&busiest->lock); 2673 spin_unlock_irqrestore(&busiest->lock, flags);
2620 if (active_balance) 2674 if (active_balance)
2621 wake_up_process(busiest->migration_thread); 2675 wake_up_process(busiest->migration_thread);
2622 2676
@@ -2695,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2695 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2749 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2696redo: 2750redo:
2697 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2751 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2698 &sd_idle, &cpus); 2752 &sd_idle, &cpus, NULL);
2699 if (!group) { 2753 if (!group) {
2700 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2754 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2701 goto out_balanced; 2755 goto out_balanced;
@@ -2755,14 +2809,28 @@ out_balanced:
2755static void idle_balance(int this_cpu, struct rq *this_rq) 2809static void idle_balance(int this_cpu, struct rq *this_rq)
2756{ 2810{
2757 struct sched_domain *sd; 2811 struct sched_domain *sd;
2812 int pulled_task = 0;
2813 unsigned long next_balance = jiffies + 60 * HZ;
2758 2814
2759 for_each_domain(this_cpu, sd) { 2815 for_each_domain(this_cpu, sd) {
2760 if (sd->flags & SD_BALANCE_NEWIDLE) { 2816 if (sd->flags & SD_BALANCE_NEWIDLE) {
2761 /* If we've pulled tasks over stop searching: */ 2817 /* If we've pulled tasks over stop searching: */
2762 if (load_balance_newidle(this_cpu, this_rq, sd)) 2818 pulled_task = load_balance_newidle(this_cpu,
2819 this_rq, sd);
2820 if (time_after(next_balance,
2821 sd->last_balance + sd->balance_interval))
2822 next_balance = sd->last_balance
2823 + sd->balance_interval;
2824 if (pulled_task)
2763 break; 2825 break;
2764 } 2826 }
2765 } 2827 }
2828 if (!pulled_task)
2829 /*
2830 * We are going idle. next_balance may be set based on
2831 * a busy processor. So reset next_balance.
2832 */
2833 this_rq->next_balance = next_balance;
2766} 2834}
2767 2835
2768/* 2836/*
@@ -2815,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2815 spin_unlock(&target_rq->lock); 2883 spin_unlock(&target_rq->lock);
2816} 2884}
2817 2885
2818/* 2886static void update_load(struct rq *this_rq)
2819 * rebalance_tick will get called every timer tick, on every CPU.
2820 *
2821 * It checks each scheduling domain to see if it is due to be balanced,
2822 * and initiates a balancing operation if so.
2823 *
2824 * Balancing parameters are set up in arch_init_sched_domains.
2825 */
2826
2827/* Don't have all balancing operations going off at once: */
2828static inline unsigned long cpu_offset(int cpu)
2829{
2830 return jiffies + cpu * HZ / NR_CPUS;
2831}
2832
2833static void
2834rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2835{ 2887{
2836 unsigned long this_load, interval, j = cpu_offset(this_cpu); 2888 unsigned long this_load;
2837 struct sched_domain *sd;
2838 int i, scale; 2889 int i, scale;
2839 2890
2840 this_load = this_rq->raw_weighted_load; 2891 this_load = this_rq->raw_weighted_load;
@@ -2854,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2854 new_load += scale-1; 2905 new_load += scale-1;
2855 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2906 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2856 } 2907 }
2908}
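The loop above, now split out of rebalance_tick() into update_load(), maintains several exponentially decaying load averages: entry i folds the new sample in with weight 1/2^i. A standalone sketch of that update rule follows; the names are hypothetical, and the conditional round-up is an assumption based on the scale-1 adjustment visible in the hunk, meant to keep a rising load from getting stuck just below its target.

#include <stdio.h>

#define NR_LOAD_IDX 5

static unsigned long cpu_load[NR_LOAD_IDX];

static void update_load(unsigned long this_load)
{
	int i, scale;

	for (i = 0, scale = 1; i < NR_LOAD_IDX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;

		/* Round a rising sample up so the average can actually reach it. */
		if (new_load > old_load)
			new_load += scale - 1;
		cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
	}
}

int main(void)
{
	update_load(2048);
	update_load(0);
	/* index 0 tracks the latest sample; higher indexes decay more slowly */
	printf("%lu %lu\n", cpu_load[0], cpu_load[4]);
	return 0;
}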
2909
2910/*
2911 * run_rebalance_domains is triggered when needed from the scheduler tick.
2912 *
2913 * It checks each scheduling domain to see if it is due to be balanced,
2914 * and initiates a balancing operation if so.
2915 *
2916 * Balancing parameters are set up in arch_init_sched_domains.
2917 */
2918static DEFINE_SPINLOCK(balancing);
2919
2920static void run_rebalance_domains(struct softirq_action *h)
2921{
2922 int this_cpu = smp_processor_id(), balance = 1;
2923 struct rq *this_rq = cpu_rq(this_cpu);
2924 unsigned long interval;
2925 struct sched_domain *sd;
2926 /*
2927 * We are idle if there are no processes running. This
2928 * is valid even if we are the idle process (SMT).
2929 */
2930 enum idle_type idle = !this_rq->nr_running ?
2931 SCHED_IDLE : NOT_IDLE;
2932 /* Earliest time when we have to call run_rebalance_domains again */
2933 unsigned long next_balance = jiffies + 60*HZ;
2857 2934
2858 for_each_domain(this_cpu, sd) { 2935 for_each_domain(this_cpu, sd) {
2859 if (!(sd->flags & SD_LOAD_BALANCE)) 2936 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2868,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2868 if (unlikely(!interval)) 2945 if (unlikely(!interval))
2869 interval = 1; 2946 interval = 1;
2870 2947
2871 if (j - sd->last_balance >= interval) { 2948 if (sd->flags & SD_SERIALIZE) {
2872 if (load_balance(this_cpu, this_rq, sd, idle)) { 2949 if (!spin_trylock(&balancing))
2950 goto out;
2951 }
2952
2953 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2954 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
2873 /* 2955 /*
2874 * We've pulled tasks over so either we're no 2956 * We've pulled tasks over so either we're no
2875 * longer idle, or one of our SMT siblings is 2957 * longer idle, or one of our SMT siblings is
@@ -2877,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2877 */ 2959 */
2878 idle = NOT_IDLE; 2960 idle = NOT_IDLE;
2879 } 2961 }
2880 sd->last_balance += interval; 2962 sd->last_balance = jiffies;
2881 } 2963 }
2964 if (sd->flags & SD_SERIALIZE)
2965 spin_unlock(&balancing);
2966out:
2967 if (time_after(next_balance, sd->last_balance + interval))
2968 next_balance = sd->last_balance + interval;
2969
2970 /*
2971 * Stop the load balance at this level. There is another
2972 * CPU in our sched group which is doing load balancing more
2973 * actively.
2974 */
2975 if (!balance)
2976 break;
2882 } 2977 }
2978 this_rq->next_balance = next_balance;
2883} 2979}
2884#else 2980#else
2885/* 2981/*
2886 * on UP we do not need to balance between CPUs: 2982 * on UP we do not need to balance between CPUs:
2887 */ 2983 */
2888static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2889{
2890}
2891static inline void idle_balance(int cpu, struct rq *rq) 2984static inline void idle_balance(int cpu, struct rq *rq)
2892{ 2985{
2893} 2986}
2894#endif 2987#endif
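In run_rebalance_domains() above, domains flagged SD_SERIALIZE contend on a single spinlock with spin_trylock(): a CPU that loses the race skips that balancing level rather than waiting for it. Below is a userspace sketch of the same skip-if-busy pattern, using POSIX spinlocks as a stand-in (build with -lpthread; names are hypothetical).

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t balancing;

static void rebalance_serialized(int cpu, int serialize)
{
	if (serialize && pthread_spin_trylock(&balancing) != 0) {
		/* Someone else is already balancing this level: skip, don't spin. */
		printf("cpu%d: skipped\n", cpu);
		return;
	}
	printf("cpu%d: balancing\n", cpu);
	if (serialize)
		pthread_spin_unlock(&balancing);
}

int main(void)
{
	pthread_spin_init(&balancing, PTHREAD_PROCESS_PRIVATE);

	pthread_spin_lock(&balancing);		/* pretend another CPU holds it */
	rebalance_serialized(1, 1);		/* -> skipped */
	pthread_spin_unlock(&balancing);

	rebalance_serialized(1, 1);		/* -> balancing */
	pthread_spin_destroy(&balancing);
	return 0;
}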
2895 2988
2896static inline int wake_priority_sleeper(struct rq *rq) 2989static inline void wake_priority_sleeper(struct rq *rq)
2897{ 2990{
2898 int ret = 0;
2899
2900#ifdef CONFIG_SCHED_SMT 2991#ifdef CONFIG_SCHED_SMT
2992 if (!rq->nr_running)
2993 return;
2994
2901 spin_lock(&rq->lock); 2995 spin_lock(&rq->lock);
2902 /* 2996 /*
2903 * If an SMT sibling task has been put to sleep for priority 2997 * If an SMT sibling task has been put to sleep for priority
2904 * reasons reschedule the idle task to see if it can now run. 2998 * reasons reschedule the idle task to see if it can now run.
2905 */ 2999 */
2906 if (rq->nr_running) { 3000 if (rq->nr_running)
2907 resched_task(rq->idle); 3001 resched_task(rq->idle);
2908 ret = 1;
2909 }
2910 spin_unlock(&rq->lock); 3002 spin_unlock(&rq->lock);
2911#endif 3003#endif
2912 return ret;
2913} 3004}
2914 3005
2915DEFINE_PER_CPU(struct kernel_stat, kstat); 3006DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -2923,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2923static inline void 3014static inline void
2924update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) 3015update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2925{ 3016{
2926 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); 3017 p->sched_time += now - p->last_ran;
3018 p->last_ran = rq->most_recent_timestamp = now;
2927} 3019}
2928 3020
2929/* 3021/*
@@ -2936,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
2936 unsigned long flags; 3028 unsigned long flags;
2937 3029
2938 local_irq_save(flags); 3030 local_irq_save(flags);
2939 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); 3031 ns = p->sched_time + sched_clock() - p->last_ran;
2940 ns = p->sched_time + sched_clock() - ns;
2941 local_irq_restore(flags); 3032 local_irq_restore(flags);
2942 3033
2943 return ns; 3034 return ns;
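update_cpu_clock() and current_sched_time() above stop relying on the runqueue-wide timestamp_last_tick and instead keep a per-task last_ran timestamp: each tick accumulates "now minus last_ran", and a read between ticks adds the remainder on the fly. A standalone sketch of that bookkeeping (hypothetical names, a plain integer as the clock):

#include <stdio.h>

struct task_clock {
	unsigned long long sched_time;	/* accumulated time */
	unsigned long long last_ran;	/* clock value at the last update */
};

static void tick(struct task_clock *t, unsigned long long now)
{
	t->sched_time += now - t->last_ran;
	t->last_ran = now;
}

/* A read between ticks adds whatever has elapsed since last_ran. */
static unsigned long long current_time(const struct task_clock *t,
				       unsigned long long now)
{
	return t->sched_time + (now - t->last_ran);
}

int main(void)
{
	struct task_clock t = { 0, 100 };

	tick(&t, 250);
	printf("%llu\n", current_time(&t, 300));	/* 150 + 50 = 200 */
	return 0;
}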
@@ -3037,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3037 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3128 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3038} 3129}
3039 3130
3040/* 3131static void task_running_tick(struct rq *rq, struct task_struct *p)
3041 * This function gets called by the timer code, with HZ frequency.
3042 * We call it with interrupts disabled.
3043 *
3044 * It also gets called by the fork code, when changing the parent's
3045 * timeslices.
3046 */
3047void scheduler_tick(void)
3048{ 3132{
3049 unsigned long long now = sched_clock();
3050 struct task_struct *p = current;
3051 int cpu = smp_processor_id();
3052 struct rq *rq = cpu_rq(cpu);
3053
3054 update_cpu_clock(p, rq, now);
3055
3056 rq->timestamp_last_tick = now;
3057
3058 if (p == rq->idle) {
3059 if (wake_priority_sleeper(rq))
3060 goto out;
3061 rebalance_tick(cpu, rq, SCHED_IDLE);
3062 return;
3063 }
3064
3065 /* Task might have expired already, but not scheduled off yet */
3066 if (p->array != rq->active) { 3133 if (p->array != rq->active) {
3134 /* Task has expired but was not scheduled yet */
3067 set_tsk_need_resched(p); 3135 set_tsk_need_resched(p);
3068 goto out; 3136 return;
3069 } 3137 }
3070 spin_lock(&rq->lock); 3138 spin_lock(&rq->lock);
3071 /* 3139 /*
@@ -3133,8 +3201,34 @@ void scheduler_tick(void)
3133 } 3201 }
3134out_unlock: 3202out_unlock:
3135 spin_unlock(&rq->lock); 3203 spin_unlock(&rq->lock);
3136out: 3204}
3137 rebalance_tick(cpu, rq, NOT_IDLE); 3205
3206/*
3207 * This function gets called by the timer code, with HZ frequency.
3208 * We call it with interrupts disabled.
3209 *
3210 * It also gets called by the fork code, when changing the parent's
3211 * timeslices.
3212 */
3213void scheduler_tick(void)
3214{
3215 unsigned long long now = sched_clock();
3216 struct task_struct *p = current;
3217 int cpu = smp_processor_id();
3218 struct rq *rq = cpu_rq(cpu);
3219
3220 update_cpu_clock(p, rq, now);
3221
3222 if (p == rq->idle)
3223 /* Task on the idle queue */
3224 wake_priority_sleeper(rq);
3225 else
3226 task_running_tick(rq, p);
3227#ifdef CONFIG_SMP
3228 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance))
3230 raise_softirq(SCHED_SOFTIRQ);
3231#endif
3138} 3232}
3139 3233
3140#ifdef CONFIG_SCHED_SMT 3234#ifdef CONFIG_SCHED_SMT
@@ -3280,7 +3374,8 @@ void fastcall add_preempt_count(int val)
3280 /* 3374 /*
3281 * Spinlock count overflowing soon? 3375 * Spinlock count overflowing soon?
3282 */ 3376 */
3283 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3377 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3378 PREEMPT_MASK - 10);
3284} 3379}
3285EXPORT_SYMBOL(add_preempt_count); 3380EXPORT_SYMBOL(add_preempt_count);
3286 3381
@@ -3333,6 +3428,7 @@ asmlinkage void __sched schedule(void)
3333 printk(KERN_ERR "BUG: scheduling while atomic: " 3428 printk(KERN_ERR "BUG: scheduling while atomic: "
3334 "%s/0x%08x/%d\n", 3429 "%s/0x%08x/%d\n",
3335 current->comm, preempt_count(), current->pid); 3430 current->comm, preempt_count(), current->pid);
3431 debug_show_held_locks(current);
3336 dump_stack(); 3432 dump_stack();
3337 } 3433 }
3338 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3434 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4804,18 +4900,18 @@ static void show_task(struct task_struct *p)
4804 show_stack(p, NULL); 4900 show_stack(p, NULL);
4805} 4901}
4806 4902
4807void show_state(void) 4903void show_state_filter(unsigned long state_filter)
4808{ 4904{
4809 struct task_struct *g, *p; 4905 struct task_struct *g, *p;
4810 4906
4811#if (BITS_PER_LONG == 32) 4907#if (BITS_PER_LONG == 32)
4812 printk("\n" 4908 printk("\n"
4813 " sibling\n"); 4909 " free sibling\n");
4814 printk(" task PC pid father child younger older\n"); 4910 printk(" task PC stack pid father child younger older\n");
4815#else 4911#else
4816 printk("\n" 4912 printk("\n"
4817 " sibling\n"); 4913 " free sibling\n");
4818 printk(" task PC pid father child younger older\n"); 4914 printk(" task PC stack pid father child younger older\n");
4819#endif 4915#endif
4820 read_lock(&tasklist_lock); 4916 read_lock(&tasklist_lock);
4821 do_each_thread(g, p) { 4917 do_each_thread(g, p) {
@@ -4824,11 +4920,16 @@ void show_state(void)
 4824 * console might take a lot of time: 4920 * console might take a lot of time:

4825 */ 4921 */
4826 touch_nmi_watchdog(); 4922 touch_nmi_watchdog();
4827 show_task(p); 4923 if (p->state & state_filter)
4924 show_task(p);
4828 } while_each_thread(g, p); 4925 } while_each_thread(g, p);
4829 4926
4830 read_unlock(&tasklist_lock); 4927 read_unlock(&tasklist_lock);
4831 debug_show_all_locks(); 4928 /*
4929 * Only show locks if all tasks are dumped:
4930 */
4931 if (state_filter == -1)
4932 debug_show_all_locks();
4832} 4933}
4833 4934
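show_state_filter() above dumps only the tasks whose ->state intersects the caller's bitmask, and prints lock state only when everything (-1, all bits) was requested. A tiny sketch of that filtering with hypothetical state bits; note that in this scheme a state of 0 never matches a mask, which is worth keeping in mind when choosing the bit values.

#include <stdio.h>

#define ST_INTERRUPTIBLE	0x1
#define ST_UNINTERRUPTIBLE	0x2

struct task { const char *comm; unsigned long state; };

static void show_state_filter(const struct task *t, int n, unsigned long filter)
{
	int i;

	for (i = 0; i < n; i++)
		if (t[i].state & filter)
			printf("%s\n", t[i].comm);
}

int main(void)
{
	struct task tasks[] = {
		{ "loop",  0 },			/* running: matches no mask */
		{ "sleep", ST_INTERRUPTIBLE },
		{ "nfsd",  ST_UNINTERRUPTIBLE },
	};

	/* e.g. only tasks blocked uninterruptibly, as a hung-task style dump */
	show_state_filter(tasks, 3, ST_UNINTERRUPTIBLE);
	return 0;
}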
4834/** 4935/**
@@ -4973,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4973 * afterwards, and pretending it was a local activate. 5074 * afterwards, and pretending it was a local activate.
4974 * This way is cleaner and logically correct. 5075 * This way is cleaner and logically correct.
4975 */ 5076 */
4976 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 5077 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
4977 + rq_dest->timestamp_last_tick; 5078 + rq_dest->most_recent_timestamp;
4978 deactivate_task(p, rq_src); 5079 deactivate_task(p, rq_src);
4979 __activate_task(p, rq_dest); 5080 __activate_task(p, rq_dest);
4980 if (TASK_PREEMPTS_CURR(p, rq_dest)) 5081 if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5050,7 +5151,10 @@ wait_to_die:
5050} 5151}
5051 5152
5052#ifdef CONFIG_HOTPLUG_CPU 5153#ifdef CONFIG_HOTPLUG_CPU
5053/* Figure out where task on dead CPU should go, use force if necessary. */ 5154/*
 5155 * Figure out where task on dead CPU should go, use force if necessary.
5156 * NOTE: interrupts should be disabled by the caller
5157 */
5054static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5158static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5055{ 5159{
5056 unsigned long flags; 5160 unsigned long flags;
@@ -5170,6 +5274,7 @@ void idle_task_exit(void)
5170 mmdrop(mm); 5274 mmdrop(mm);
5171} 5275}
5172 5276
5277/* called under rq->lock with disabled interrupts */
5173static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5278static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5174{ 5279{
5175 struct rq *rq = cpu_rq(dead_cpu); 5280 struct rq *rq = cpu_rq(dead_cpu);
@@ -5186,10 +5291,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5186 * Drop lock around migration; if someone else moves it, 5291 * Drop lock around migration; if someone else moves it,
5187 * that's OK. No task can be added to this CPU, so iteration is 5292 * that's OK. No task can be added to this CPU, so iteration is
5188 * fine. 5293 * fine.
5294 * NOTE: interrupts should be left disabled --dev@
5189 */ 5295 */
5190 spin_unlock_irq(&rq->lock); 5296 spin_unlock(&rq->lock);
5191 move_task_off_dead_cpu(dead_cpu, p); 5297 move_task_off_dead_cpu(dead_cpu, p);
5192 spin_lock_irq(&rq->lock); 5298 spin_lock(&rq->lock);
5193 5299
5194 put_task_struct(p); 5300 put_task_struct(p);
5195} 5301}
@@ -5342,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5342 if (!(sd->flags & SD_LOAD_BALANCE)) { 5448 if (!(sd->flags & SD_LOAD_BALANCE)) {
5343 printk("does not load-balance\n"); 5449 printk("does not load-balance\n");
5344 if (sd->parent) 5450 if (sd->parent)
5345 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 5451 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5452 " has parent");
5346 break; 5453 break;
5347 } 5454 }
5348 5455
5349 printk("span %s\n", str); 5456 printk("span %s\n", str);
5350 5457
5351 if (!cpu_isset(cpu, sd->span)) 5458 if (!cpu_isset(cpu, sd->span))
5352 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 5459 printk(KERN_ERR "ERROR: domain->span does not contain "
5460 "CPU%d\n", cpu);
5353 if (!cpu_isset(cpu, group->cpumask)) 5461 if (!cpu_isset(cpu, group->cpumask))
5354 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 5462 printk(KERN_ERR "ERROR: domain->groups does not contain"
5463 " CPU%d\n", cpu);
5355 5464
5356 printk(KERN_DEBUG); 5465 printk(KERN_DEBUG);
5357 for (i = 0; i < level + 2; i++) 5466 for (i = 0; i < level + 2; i++)
@@ -5366,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5366 5475
5367 if (!group->cpu_power) { 5476 if (!group->cpu_power) {
5368 printk("\n"); 5477 printk("\n");
5369 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 5478 printk(KERN_ERR "ERROR: domain->cpu_power not "
5479 "set\n");
5370 } 5480 }
5371 5481
5372 if (!cpus_weight(group->cpumask)) { 5482 if (!cpus_weight(group->cpumask)) {
@@ -5389,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5389 printk("\n"); 5499 printk("\n");
5390 5500
5391 if (!cpus_equal(sd->span, groupmask)) 5501 if (!cpus_equal(sd->span, groupmask))
5392 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5502 printk(KERN_ERR "ERROR: groups don't span "
5503 "domain->span\n");
5393 5504
5394 level++; 5505 level++;
5395 sd = sd->parent; 5506 sd = sd->parent;
5507 if (!sd)
5508 continue;
5396 5509
5397 if (sd) { 5510 if (!cpus_subset(groupmask, sd->span))
5398 if (!cpus_subset(groupmask, sd->span)) 5511 printk(KERN_ERR "ERROR: parent span is not a superset "
5399 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 5512 "of domain->span\n");
5400 }
5401 5513
5402 } while (sd); 5514 } while (sd);
5403} 5515}
@@ -5511,28 +5623,27 @@ static int __init isolated_cpu_setup(char *str)
5511__setup ("isolcpus=", isolated_cpu_setup); 5623__setup ("isolcpus=", isolated_cpu_setup);
5512 5624
5513/* 5625/*
5514 * init_sched_build_groups takes an array of groups, the cpumask we wish 5626 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
 5515 * to span, and a pointer to a function which identifies what group a CPU 5627 * to a function which identifies what group (along with its sched group) a CPU
 5516 * belongs to. The return value of group_fn must be a valid index into the 5628 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5517 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 5629 * (due to the fact that we keep track of groups covered with a cpumask_t).
5518 * keep track of groups covered with a cpumask_t).
5519 * 5630 *
5520 * init_sched_build_groups will build a circular linked list of the groups 5631 * init_sched_build_groups will build a circular linked list of the groups
5521 * covered by the given span, and will set each group's ->cpumask correctly, 5632 * covered by the given span, and will set each group's ->cpumask correctly,
5522 * and ->cpu_power to 0. 5633 * and ->cpu_power to 0.
5523 */ 5634 */
5524static void 5635static void
5525init_sched_build_groups(struct sched_group groups[], cpumask_t span, 5636init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5526 const cpumask_t *cpu_map, 5637 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5527 int (*group_fn)(int cpu, const cpumask_t *cpu_map)) 5638 struct sched_group **sg))
5528{ 5639{
5529 struct sched_group *first = NULL, *last = NULL; 5640 struct sched_group *first = NULL, *last = NULL;
5530 cpumask_t covered = CPU_MASK_NONE; 5641 cpumask_t covered = CPU_MASK_NONE;
5531 int i; 5642 int i;
5532 5643
5533 for_each_cpu_mask(i, span) { 5644 for_each_cpu_mask(i, span) {
5534 int group = group_fn(i, cpu_map); 5645 struct sched_group *sg;
5535 struct sched_group *sg = &groups[group]; 5646 int group = group_fn(i, cpu_map, &sg);
5536 int j; 5647 int j;
5537 5648
5538 if (cpu_isset(i, covered)) 5649 if (cpu_isset(i, covered))
@@ -5542,7 +5653,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5542 sg->cpu_power = 0; 5653 sg->cpu_power = 0;
5543 5654
5544 for_each_cpu_mask(j, span) { 5655 for_each_cpu_mask(j, span) {
5545 if (group_fn(j, cpu_map) != group) 5656 if (group_fn(j, cpu_map, NULL) != group)
5546 continue; 5657 continue;
5547 5658
5548 cpu_set(j, covered); 5659 cpu_set(j, covered);
@@ -5716,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size);
5716 */ 5827 */
5717static void touch_cache(void *__cache, unsigned long __size) 5828static void touch_cache(void *__cache, unsigned long __size)
5718{ 5829{
5719 unsigned long size = __size/sizeof(long), chunk1 = size/3, 5830 unsigned long size = __size / sizeof(long);
5720 chunk2 = 2*size/3; 5831 unsigned long chunk1 = size / 3;
5832 unsigned long chunk2 = 2 * size / 3;
5721 unsigned long *cache = __cache; 5833 unsigned long *cache = __cache;
5722 int i; 5834 int i;
5723 5835
@@ -5826,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5826 */ 5938 */
5827 measure_one(cache, size, cpu1, cpu2); 5939 measure_one(cache, size, cpu1, cpu2);
5828 for (i = 0; i < ITERATIONS; i++) 5940 for (i = 0; i < ITERATIONS; i++)
5829 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); 5941 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
5830 5942
5831 measure_one(cache, size, cpu2, cpu1); 5943 measure_one(cache, size, cpu2, cpu1);
5832 for (i = 0; i < ITERATIONS; i++) 5944 for (i = 0; i < ITERATIONS; i++)
5833 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); 5945 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
5834 5946
5835 /* 5947 /*
5836 * (We measure the non-migrating [cached] cost on both 5948 * (We measure the non-migrating [cached] cost on both
@@ -5840,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5840 5952
5841 measure_one(cache, size, cpu1, cpu1); 5953 measure_one(cache, size, cpu1, cpu1);
5842 for (i = 0; i < ITERATIONS; i++) 5954 for (i = 0; i < ITERATIONS; i++)
5843 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); 5955 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
5844 5956
5845 measure_one(cache, size, cpu2, cpu2); 5957 measure_one(cache, size, cpu2, cpu2);
5846 for (i = 0; i < ITERATIONS; i++) 5958 for (i = 0; i < ITERATIONS; i++)
5847 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); 5959 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
5848 5960
5849 /* 5961 /*
5850 * Get the per-iteration migration cost: 5962 * Get the per-iteration migration cost:
5851 */ 5963 */
5852 do_div(cost1, 2*ITERATIONS); 5964 do_div(cost1, 2 * ITERATIONS);
5853 do_div(cost2, 2*ITERATIONS); 5965 do_div(cost2, 2 * ITERATIONS);
5854 5966
5855 return cost1 - cost2; 5967 return cost1 - cost2;
5856} 5968}
@@ -5888,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5888 */ 6000 */
5889 cache = vmalloc(max_size); 6001 cache = vmalloc(max_size);
5890 if (!cache) { 6002 if (!cache) {
5891 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 6003 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
5892 return 1000000; /* return 1 msec on very small boxen */ 6004 return 1000000; /* return 1 msec on very small boxen */
5893 } 6005 }
5894 6006
@@ -5913,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5913 avg_fluct = (avg_fluct + fluct)/2; 6025 avg_fluct = (avg_fluct + fluct)/2;
5914 6026
5915 if (migration_debug) 6027 if (migration_debug)
5916 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", 6028 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6029 "(%8Ld %8Ld)\n",
5917 cpu1, cpu2, size, 6030 cpu1, cpu2, size,
5918 (long)cost / 1000000, 6031 (long)cost / 1000000,
5919 ((long)cost / 100000) % 10, 6032 ((long)cost / 100000) % 10,
@@ -6008,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
6008 -1 6121 -1
6009#endif 6122#endif
6010 ); 6123 );
6011 if (system_state == SYSTEM_BOOTING) { 6124 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6012 if (num_online_cpus() > 1) { 6125 printk("migration_cost=");
6013 printk("migration_cost="); 6126 for (distance = 0; distance <= max_distance; distance++) {
6014 for (distance = 0; distance <= max_distance; distance++) { 6127 if (distance)
6015 if (distance) 6128 printk(",");
6016 printk(","); 6129 printk("%ld", (long)migration_cost[distance] / 1000);
6017 printk("%ld", (long)migration_cost[distance] / 1000);
6018 }
6019 printk("\n");
6020 } 6130 }
6131 printk("\n");
6021 } 6132 }
6022 j1 = jiffies; 6133 j1 = jiffies;
6023 if (migration_debug) 6134 if (migration_debug)
6024 printk("migration: %ld seconds\n", (j1-j0)/HZ); 6135 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6025 6136
6026 /* 6137 /*
6027 * Move back to the original CPU. NUMA-Q gets confused 6138 * Move back to the original CPU. NUMA-Q gets confused
@@ -6118,10 +6229,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6118 */ 6229 */
6119#ifdef CONFIG_SCHED_SMT 6230#ifdef CONFIG_SCHED_SMT
6120static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6231static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6121static struct sched_group sched_group_cpus[NR_CPUS]; 6232static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6122 6233
6123static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) 6234static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
6235 struct sched_group **sg)
6124{ 6236{
6237 if (sg)
6238 *sg = &per_cpu(sched_group_cpus, cpu);
6125 return cpu; 6239 return cpu;
6126} 6240}
6127#endif 6241#endif
@@ -6131,39 +6245,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
6131 */ 6245 */
6132#ifdef CONFIG_SCHED_MC 6246#ifdef CONFIG_SCHED_MC
6133static DEFINE_PER_CPU(struct sched_domain, core_domains); 6247static DEFINE_PER_CPU(struct sched_domain, core_domains);
6134static struct sched_group sched_group_core[NR_CPUS]; 6248static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6135#endif 6249#endif
6136 6250
6137#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6251#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6138static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6252static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6253 struct sched_group **sg)
6139{ 6254{
6255 int group;
6140 cpumask_t mask = cpu_sibling_map[cpu]; 6256 cpumask_t mask = cpu_sibling_map[cpu];
6141 cpus_and(mask, mask, *cpu_map); 6257 cpus_and(mask, mask, *cpu_map);
6142 return first_cpu(mask); 6258 group = first_cpu(mask);
6259 if (sg)
6260 *sg = &per_cpu(sched_group_core, group);
6261 return group;
6143} 6262}
6144#elif defined(CONFIG_SCHED_MC) 6263#elif defined(CONFIG_SCHED_MC)
6145static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6264static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6265 struct sched_group **sg)
6146{ 6266{
6267 if (sg)
6268 *sg = &per_cpu(sched_group_core, cpu);
6147 return cpu; 6269 return cpu;
6148} 6270}
6149#endif 6271#endif
6150 6272
6151static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6273static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6152static struct sched_group sched_group_phys[NR_CPUS]; 6274static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6153 6275
6154static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) 6276static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
6277 struct sched_group **sg)
6155{ 6278{
6279 int group;
6156#ifdef CONFIG_SCHED_MC 6280#ifdef CONFIG_SCHED_MC
6157 cpumask_t mask = cpu_coregroup_map(cpu); 6281 cpumask_t mask = cpu_coregroup_map(cpu);
6158 cpus_and(mask, mask, *cpu_map); 6282 cpus_and(mask, mask, *cpu_map);
6159 return first_cpu(mask); 6283 group = first_cpu(mask);
6160#elif defined(CONFIG_SCHED_SMT) 6284#elif defined(CONFIG_SCHED_SMT)
6161 cpumask_t mask = cpu_sibling_map[cpu]; 6285 cpumask_t mask = cpu_sibling_map[cpu];
6162 cpus_and(mask, mask, *cpu_map); 6286 cpus_and(mask, mask, *cpu_map);
6163 return first_cpu(mask); 6287 group = first_cpu(mask);
6164#else 6288#else
6165 return cpu; 6289 group = cpu;
6166#endif 6290#endif
6291 if (sg)
6292 *sg = &per_cpu(sched_group_phys, group);
6293 return group;
6167} 6294}
6168 6295
6169#ifdef CONFIG_NUMA 6296#ifdef CONFIG_NUMA
@@ -6176,12 +6303,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
6176static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6303static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6177 6304
6178static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6305static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6179static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 6306static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6180 6307
6181static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) 6308static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6309 struct sched_group **sg)
6182{ 6310{
6183 return cpu_to_node(cpu); 6311 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6312 int group;
6313
6314 cpus_and(nodemask, nodemask, *cpu_map);
6315 group = first_cpu(nodemask);
6316
6317 if (sg)
6318 *sg = &per_cpu(sched_group_allnodes, group);
6319 return group;
6184} 6320}
6321
6185static void init_numa_sched_groups_power(struct sched_group *group_head) 6322static void init_numa_sched_groups_power(struct sched_group *group_head)
6186{ 6323{
6187 struct sched_group *sg = group_head; 6324 struct sched_group *sg = group_head;
@@ -6217,16 +6354,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6217 int cpu, i; 6354 int cpu, i;
6218 6355
6219 for_each_cpu_mask(cpu, *cpu_map) { 6356 for_each_cpu_mask(cpu, *cpu_map) {
6220 struct sched_group *sched_group_allnodes
6221 = sched_group_allnodes_bycpu[cpu];
6222 struct sched_group **sched_group_nodes 6357 struct sched_group **sched_group_nodes
6223 = sched_group_nodes_bycpu[cpu]; 6358 = sched_group_nodes_bycpu[cpu];
6224 6359
6225 if (sched_group_allnodes) {
6226 kfree(sched_group_allnodes);
6227 sched_group_allnodes_bycpu[cpu] = NULL;
6228 }
6229
6230 if (!sched_group_nodes) 6360 if (!sched_group_nodes)
6231 continue; 6361 continue;
6232 6362
@@ -6320,7 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6320 struct sched_domain *sd; 6450 struct sched_domain *sd;
6321#ifdef CONFIG_NUMA 6451#ifdef CONFIG_NUMA
6322 struct sched_group **sched_group_nodes = NULL; 6452 struct sched_group **sched_group_nodes = NULL;
6323 struct sched_group *sched_group_allnodes = NULL; 6453 int sd_allnodes = 0;
6324 6454
6325 /* 6455 /*
6326 * Allocate the per-node list of sched groups 6456 * Allocate the per-node list of sched groups
@@ -6338,7 +6468,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6338 * Set up domains for cpus specified by the cpu_map. 6468 * Set up domains for cpus specified by the cpu_map.
6339 */ 6469 */
6340 for_each_cpu_mask(i, *cpu_map) { 6470 for_each_cpu_mask(i, *cpu_map) {
6341 int group;
6342 struct sched_domain *sd = NULL, *p; 6471 struct sched_domain *sd = NULL, *p;
6343 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6472 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6344 6473
@@ -6347,26 +6476,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6347#ifdef CONFIG_NUMA 6476#ifdef CONFIG_NUMA
6348 if (cpus_weight(*cpu_map) 6477 if (cpus_weight(*cpu_map)
6349 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6478 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6350 if (!sched_group_allnodes) {
6351 sched_group_allnodes
6352 = kmalloc_node(sizeof(struct sched_group)
6353 * MAX_NUMNODES,
6354 GFP_KERNEL,
6355 cpu_to_node(i));
6356 if (!sched_group_allnodes) {
6357 printk(KERN_WARNING
6358 "Can not alloc allnodes sched group\n");
6359 goto error;
6360 }
6361 sched_group_allnodes_bycpu[i]
6362 = sched_group_allnodes;
6363 }
6364 sd = &per_cpu(allnodes_domains, i); 6479 sd = &per_cpu(allnodes_domains, i);
6365 *sd = SD_ALLNODES_INIT; 6480 *sd = SD_ALLNODES_INIT;
6366 sd->span = *cpu_map; 6481 sd->span = *cpu_map;
6367 group = cpu_to_allnodes_group(i, cpu_map); 6482 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6368 sd->groups = &sched_group_allnodes[group];
6369 p = sd; 6483 p = sd;
6484 sd_allnodes = 1;
6370 } else 6485 } else
6371 p = NULL; 6486 p = NULL;
6372 6487
@@ -6381,36 +6496,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6381 6496
6382 p = sd; 6497 p = sd;
6383 sd = &per_cpu(phys_domains, i); 6498 sd = &per_cpu(phys_domains, i);
6384 group = cpu_to_phys_group(i, cpu_map);
6385 *sd = SD_CPU_INIT; 6499 *sd = SD_CPU_INIT;
6386 sd->span = nodemask; 6500 sd->span = nodemask;
6387 sd->parent = p; 6501 sd->parent = p;
6388 if (p) 6502 if (p)
6389 p->child = sd; 6503 p->child = sd;
6390 sd->groups = &sched_group_phys[group]; 6504 cpu_to_phys_group(i, cpu_map, &sd->groups);
6391 6505
6392#ifdef CONFIG_SCHED_MC 6506#ifdef CONFIG_SCHED_MC
6393 p = sd; 6507 p = sd;
6394 sd = &per_cpu(core_domains, i); 6508 sd = &per_cpu(core_domains, i);
6395 group = cpu_to_core_group(i, cpu_map);
6396 *sd = SD_MC_INIT; 6509 *sd = SD_MC_INIT;
6397 sd->span = cpu_coregroup_map(i); 6510 sd->span = cpu_coregroup_map(i);
6398 cpus_and(sd->span, sd->span, *cpu_map); 6511 cpus_and(sd->span, sd->span, *cpu_map);
6399 sd->parent = p; 6512 sd->parent = p;
6400 p->child = sd; 6513 p->child = sd;
6401 sd->groups = &sched_group_core[group]; 6514 cpu_to_core_group(i, cpu_map, &sd->groups);
6402#endif 6515#endif
6403 6516
6404#ifdef CONFIG_SCHED_SMT 6517#ifdef CONFIG_SCHED_SMT
6405 p = sd; 6518 p = sd;
6406 sd = &per_cpu(cpu_domains, i); 6519 sd = &per_cpu(cpu_domains, i);
6407 group = cpu_to_cpu_group(i, cpu_map);
6408 *sd = SD_SIBLING_INIT; 6520 *sd = SD_SIBLING_INIT;
6409 sd->span = cpu_sibling_map[i]; 6521 sd->span = cpu_sibling_map[i];
6410 cpus_and(sd->span, sd->span, *cpu_map); 6522 cpus_and(sd->span, sd->span, *cpu_map);
6411 sd->parent = p; 6523 sd->parent = p;
6412 p->child = sd; 6524 p->child = sd;
6413 sd->groups = &sched_group_cpus[group]; 6525 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6414#endif 6526#endif
6415 } 6527 }
6416 6528
@@ -6422,8 +6534,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6422 if (i != first_cpu(this_sibling_map)) 6534 if (i != first_cpu(this_sibling_map))
6423 continue; 6535 continue;
6424 6536
6425 init_sched_build_groups(sched_group_cpus, this_sibling_map, 6537 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6426 cpu_map, &cpu_to_cpu_group);
6427 } 6538 }
6428#endif 6539#endif
6429 6540
@@ -6434,8 +6545,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6434 cpus_and(this_core_map, this_core_map, *cpu_map); 6545 cpus_and(this_core_map, this_core_map, *cpu_map);
6435 if (i != first_cpu(this_core_map)) 6546 if (i != first_cpu(this_core_map))
6436 continue; 6547 continue;
6437 init_sched_build_groups(sched_group_core, this_core_map, 6548 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6438 cpu_map, &cpu_to_core_group);
6439 } 6549 }
6440#endif 6550#endif
6441 6551
@@ -6448,15 +6558,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6448 if (cpus_empty(nodemask)) 6558 if (cpus_empty(nodemask))
6449 continue; 6559 continue;
6450 6560
6451 init_sched_build_groups(sched_group_phys, nodemask, 6561 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6452 cpu_map, &cpu_to_phys_group);
6453 } 6562 }
6454 6563
6455#ifdef CONFIG_NUMA 6564#ifdef CONFIG_NUMA
6456 /* Set up node groups */ 6565 /* Set up node groups */
6457 if (sched_group_allnodes) 6566 if (sd_allnodes)
6458 init_sched_build_groups(sched_group_allnodes, *cpu_map, 6567 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6459 cpu_map, &cpu_to_allnodes_group);
6460 6568
6461 for (i = 0; i < MAX_NUMNODES; i++) { 6569 for (i = 0; i < MAX_NUMNODES; i++) {
6462 /* Set up node groups */ 6570 /* Set up node groups */
@@ -6548,10 +6656,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6548 for (i = 0; i < MAX_NUMNODES; i++) 6656 for (i = 0; i < MAX_NUMNODES; i++)
6549 init_numa_sched_groups_power(sched_group_nodes[i]); 6657 init_numa_sched_groups_power(sched_group_nodes[i]);
6550 6658
6551 if (sched_group_allnodes) { 6659 if (sd_allnodes) {
6552 int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); 6660 struct sched_group *sg;
6553 struct sched_group *sg = &sched_group_allnodes[group];
6554 6661
6662 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6555 init_numa_sched_groups_power(sg); 6663 init_numa_sched_groups_power(sg);
6556 } 6664 }
6557#endif 6665#endif
@@ -6723,8 +6831,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6723 sched_smt_power_savings_store); 6831 sched_smt_power_savings_store);
6724#endif 6832#endif
6725 6833
6726
6727#ifdef CONFIG_HOTPLUG_CPU
6728/* 6834/*
6729 * Force a reinitialization of the sched domains hierarchy. The domains 6835 * Force a reinitialization of the sched domains hierarchy. The domains
6730 * and groups cannot be updated in place without racing with the balancing 6836 * and groups cannot be updated in place without racing with the balancing
@@ -6757,7 +6863,6 @@ static int update_sched_domains(struct notifier_block *nfb,
6757 6863
6758 return NOTIFY_OK; 6864 return NOTIFY_OK;
6759} 6865}
6760#endif
6761 6866
6762void __init sched_init_smp(void) 6867void __init sched_init_smp(void)
6763{ 6868{
@@ -6833,6 +6938,10 @@ void __init sched_init(void)
6833 6938
6834 set_load_weight(&init_task); 6939 set_load_weight(&init_task);
6835 6940
6941#ifdef CONFIG_SMP
6942 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6943#endif
6944
6836#ifdef CONFIG_RT_MUTEXES 6945#ifdef CONFIG_RT_MUTEXES
6837 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6946 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6838#endif 6947#endif
@@ -6867,6 +6976,7 @@ void __might_sleep(char *file, int line)
6867 " context at %s:%d\n", file, line); 6976 " context at %s:%d\n", file, line);
6868 printk("in_atomic():%d, irqs_disabled():%d\n", 6977 printk("in_atomic():%d, irqs_disabled():%d\n",
6869 in_atomic(), irqs_disabled()); 6978 in_atomic(), irqs_disabled());
6979 debug_show_held_locks(current);
6870 dump_stack(); 6980 dump_stack();
6871 } 6981 }
6872#endif 6982#endif
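A recurring shape in the domain-setup hunks above: the cpu_to_*_group() helpers now return a group index and, when the caller passes a non-NULL struct sched_group **, also hand back the group object itself (which has moved into per-CPU data), letting build_sched_domains() drop the intermediate group variable and the global arrays. A standalone sketch of that optional out-parameter idiom, with hypothetical names and a plain array standing in for per-CPU storage:

#include <stdio.h>

#define NR_CPUS 4

struct sched_group { int id; };

static struct sched_group group_cpus[NR_CPUS] = { {0}, {1}, {2}, {3} };

/* Return the group index; fill *sg only if the caller asked for it. */
static int cpu_to_cpu_group(int cpu, struct sched_group **sg)
{
	if (sg)
		*sg = &group_cpus[cpu];
	return cpu;
}

int main(void)
{
	struct sched_group *sg;
	int group = cpu_to_cpu_group(2, &sg);

	printf("group %d -> id %d\n", group, sg->id);
	printf("index only: %d\n", cpu_to_cpu_group(3, NULL));
	return 0;
}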
diff --git a/kernel/signal.c b/kernel/signal.c
index 7ed8d5304bec..1921ffdc5e77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,10 @@
23#include <linux/ptrace.h> 23#include <linux/ptrace.h>
24#include <linux/signal.h> 24#include <linux/signal.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/freezer.h>
27#include <linux/pid_namespace.h>
28#include <linux/nsproxy.h>
29
26#include <asm/param.h> 30#include <asm/param.h>
27#include <asm/uaccess.h> 31#include <asm/uaccess.h>
28#include <asm/unistd.h> 32#include <asm/unistd.h>
@@ -33,7 +37,7 @@
33 * SLAB caches for signal bits. 37 * SLAB caches for signal bits.
34 */ 38 */
35 39
36static kmem_cache_t *sigqueue_cachep; 40static struct kmem_cache *sigqueue_cachep;
37 41
38/* 42/*
39 * In POSIX a signal is sent either to a specific thread (Linux task) 43 * In POSIX a signal is sent either to a specific thread (Linux task)
@@ -267,18 +271,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
267 int override_rlimit) 271 int override_rlimit)
268{ 272{
269 struct sigqueue *q = NULL; 273 struct sigqueue *q = NULL;
274 struct user_struct *user;
270 275
271 atomic_inc(&t->user->sigpending); 276 /*
277 * In order to avoid problems with "switch_user()", we want to make
278 * sure that the compiler doesn't re-load "t->user"
279 */
280 user = t->user;
281 barrier();
282 atomic_inc(&user->sigpending);
272 if (override_rlimit || 283 if (override_rlimit ||
273 atomic_read(&t->user->sigpending) <= 284 atomic_read(&user->sigpending) <=
274 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 285 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
275 q = kmem_cache_alloc(sigqueue_cachep, flags); 286 q = kmem_cache_alloc(sigqueue_cachep, flags);
276 if (unlikely(q == NULL)) { 287 if (unlikely(q == NULL)) {
277 atomic_dec(&t->user->sigpending); 288 atomic_dec(&user->sigpending);
278 } else { 289 } else {
279 INIT_LIST_HEAD(&q->list); 290 INIT_LIST_HEAD(&q->list);
280 q->flags = 0; 291 q->flags = 0;
281 q->user = get_uid(t->user); 292 q->user = get_uid(user);
282 } 293 }
283 return(q); 294 return(q);
284} 295}
@@ -575,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
575 error = -EPERM; 586 error = -EPERM;
576 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 587 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
577 && ((sig != SIGCONT) || 588 && ((sig != SIGCONT) ||
578 (current->signal->session != t->signal->session)) 589 (process_session(current) != process_session(t)))
579 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 590 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
580 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 591 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
581 && !capable(CAP_KILL)) 592 && !capable(CAP_KILL))
@@ -1126,8 +1137,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1126 return error; 1137 return error;
1127} 1138}
1128 1139
1129int 1140static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1130kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1131{ 1141{
1132 int error; 1142 int error;
1133 rcu_read_lock(); 1143 rcu_read_lock();
@@ -1870,8 +1880,12 @@ relock:
1870 if (sig_kernel_ignore(signr)) /* Default is nothing. */ 1880 if (sig_kernel_ignore(signr)) /* Default is nothing. */
1871 continue; 1881 continue;
1872 1882
1873 /* Init gets no signals it doesn't want. */ 1883 /*
1874 if (current == child_reaper) 1884 * Init of a pid space gets no signals it doesn't want from
1885 * within that pid space. It can of course get signals from
1886 * its parent pid space.
1887 */
1888 if (current == child_reaper(current))
1875 continue; 1889 continue;
1876 1890
1877 if (sig_kernel_stop(signr)) { 1891 if (sig_kernel_stop(signr)) {
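The __sigqueue_alloc() hunk above snapshots t->user into a local and adds a compiler barrier, so the pending-signal increment, the rlimit check and any later decrement all charge the same user_struct even if t->user is switched concurrently. A userspace sketch of that load-once pattern; the barrier macro is the usual GCC-style memory clobber and everything else is a stand-in.

#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

struct user { int sigpending; };
struct task { struct user *user; };

static int charge_pending(struct task *t, int limit)
{
	struct user *user = t->user;	/* one snapshot of the pointer */

	barrier();			/* keep the compiler from reloading t->user */
	user->sigpending++;
	if (user->sigpending > limit) {
		user->sigpending--;	/* undo against the same accounting */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct user u = { 0 };
	struct task t = { &u };
	int ret = charge_pending(&t, 1);

	printf("ret=%d pending=%d\n", ret, u.sigpending);
	return 0;
}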
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce16..918e52df090e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
574 574
575 switch (action) { 575 switch (action) {
576 case CPU_UP_PREPARE: 576 case CPU_UP_PREPARE:
577 BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
578 BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
579 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 577 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
580 if (IS_ERR(p)) { 578 if (IS_ERR(p)) {
581 printk("ksoftirqd for %i failed\n", hotcpu); 579 printk("ksoftirqd for %i failed\n", hotcpu);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 476c3741511b..2c6c2bf85514 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -293,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
293} 293}
294 294
295EXPORT_SYMBOL(_spin_lock_nested); 295EXPORT_SYMBOL(_spin_lock_nested);
296unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
297{
298 unsigned long flags;
299
300 local_irq_save(flags);
301 preempt_disable();
302 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
303 /*
 304 * With lockdep we don't want the hand-coded irq-enable of
305 * _raw_spin_lock_flags() code, because lockdep assumes
306 * that interrupts are not re-enabled during lock-acquire:
307 */
308#ifdef CONFIG_PROVE_SPIN_LOCKING
309 _raw_spin_lock(lock);
310#else
311 _raw_spin_lock_flags(lock, &flags);
312#endif
313 return flags;
314}
315
316EXPORT_SYMBOL(_spin_lock_irqsave_nested);
296 317
297#endif 318#endif
298 319
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801b..c7675c1bfdf2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
880 return 0; 880 return 0;
881} 881}
882 882
883static void deferred_cad(void *dummy) 883static void deferred_cad(struct work_struct *dummy)
884{ 884{
885 kernel_restart(NULL); 885 kernel_restart(NULL);
886} 886}
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
892 */ 892 */
893void ctrl_alt_del(void) 893void ctrl_alt_del(void)
894{ 894{
895 static DECLARE_WORK(cad_work, deferred_cad, NULL); 895 static DECLARE_WORK(cad_work, deferred_cad);
896 896
897 if (C_A_D) 897 if (C_A_D)
898 schedule_work(&cad_work); 898 schedule_work(&cad_work);
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1102asmlinkage long sys_setuid(uid_t uid) 1102asmlinkage long sys_setuid(uid_t uid)
1103{ 1103{
1104 int old_euid = current->euid; 1104 int old_euid = current->euid;
1105 int old_ruid, old_suid, new_ruid, new_suid; 1105 int old_ruid, old_suid, new_suid;
1106 int retval; 1106 int retval;
1107 1107
1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
1109 if (retval) 1109 if (retval)
1110 return retval; 1110 return retval;
1111 1111
1112 old_ruid = new_ruid = current->uid; 1112 old_ruid = current->uid;
1113 old_suid = current->suid; 1113 old_suid = current->suid;
1114 new_suid = old_suid; 1114 new_suid = old_suid;
1115 1115
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1381 1381
1382 if (p->real_parent == group_leader) { 1382 if (p->real_parent == group_leader) {
1383 err = -EPERM; 1383 err = -EPERM;
1384 if (p->signal->session != group_leader->signal->session) 1384 if (process_session(p) != process_session(group_leader))
1385 goto out; 1385 goto out;
1386 err = -EACCES; 1386 err = -EACCES;
1387 if (p->did_exec) 1387 if (p->did_exec)
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1397 goto out; 1397 goto out;
1398 1398
1399 if (pgid != pid) { 1399 if (pgid != pid) {
1400 struct task_struct *p; 1400 struct task_struct *g =
1401 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1401 1402
1402 do_each_task_pid(pgid, PIDTYPE_PGID, p) { 1403 if (!g || process_session(g) != process_session(group_leader))
1403 if (p->signal->session == group_leader->signal->session) 1404 goto out;
1404 goto ok_pgid;
1405 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
1406 goto out;
1407 } 1405 }
1408 1406
1409ok_pgid:
1410 err = security_task_setpgid(p, pgid); 1407 err = security_task_setpgid(p, pgid);
1411 if (err) 1408 if (err)
1412 goto out; 1409 goto out;
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void)
1459asmlinkage long sys_getsid(pid_t pid) 1456asmlinkage long sys_getsid(pid_t pid)
1460{ 1457{
1461 if (!pid) 1458 if (!pid)
1462 return current->signal->session; 1459 return process_session(current);
1463 else { 1460 else {
1464 int retval; 1461 int retval;
1465 struct task_struct *p; 1462 struct task_struct *p;
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid)
1471 if (p) { 1468 if (p) {
1472 retval = security_task_getsid(p); 1469 retval = security_task_getsid(p);
1473 if (!retval) 1470 if (!retval)
1474 retval = p->signal->session; 1471 retval = process_session(p);
1475 } 1472 }
1476 read_unlock(&tasklist_lock); 1473 read_unlock(&tasklist_lock);
1477 return retval; 1474 return retval;
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void)
1484 pid_t session; 1481 pid_t session;
1485 int err = -EPERM; 1482 int err = -EPERM;
1486 1483
1487 mutex_lock(&tty_mutex);
1488 write_lock_irq(&tasklist_lock); 1484 write_lock_irq(&tasklist_lock);
1489 1485
1490 /* Fail if I am already a session leader */ 1486 /* Fail if I am already a session leader */
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void)
1504 1500
1505 group_leader->signal->leader = 1; 1501 group_leader->signal->leader = 1;
1506 __set_special_pids(session, session); 1502 __set_special_pids(session, session);
1503
1504 spin_lock(&group_leader->sighand->siglock);
1507 group_leader->signal->tty = NULL; 1505 group_leader->signal->tty = NULL;
1508 group_leader->signal->tty_old_pgrp = 0; 1506 group_leader->signal->tty_old_pgrp = 0;
1507 spin_unlock(&group_leader->sighand->siglock);
1508
1509 err = process_group(group_leader); 1509 err = process_group(group_leader);
1510out: 1510out:
1511 write_unlock_irq(&tasklist_lock); 1511 write_unlock_irq(&tasklist_lock);
1512 mutex_unlock(&tty_mutex);
1513 return err; 1512 return err;
1514} 1513}
1515 1514
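Several hunks above (check_kill_permission(), sys_setpgid(), sys_getsid()) stop reading ->signal->session directly and go through a process_session() helper, giving the session lookup a single definition that can later be made pid-namespace aware. A trivial sketch of hiding the field behind such an accessor (stand-in types, hypothetical names):

#include <stdio.h>

struct signal_struct { int session; };
struct task { struct signal_struct *signal; };

static int process_session(const struct task *p)
{
	return p->signal->session;
}

int main(void)
{
	struct signal_struct s1 = { 42 }, s2 = { 42 };
	struct task a = { &s1 }, b = { &s2 };

	printf("same session: %d\n", process_session(&a) == process_session(&b));
	return 0;
}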
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7a3b2e75f040..d7306d0f3dfc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(compat_sys_get_robust_list);
49cond_syscall(sys_epoll_create); 49cond_syscall(sys_epoll_create);
50cond_syscall(sys_epoll_ctl); 50cond_syscall(sys_epoll_ctl);
51cond_syscall(sys_epoll_wait); 51cond_syscall(sys_epoll_wait);
52cond_syscall(sys_epoll_pwait);
52cond_syscall(sys_semget); 53cond_syscall(sys_semget);
53cond_syscall(sys_semop); 54cond_syscall(sys_semop);
54cond_syscall(sys_semtimedop); 55cond_syscall(sys_semtimedop);
@@ -134,6 +135,7 @@ cond_syscall(sys_madvise);
134cond_syscall(sys_mremap); 135cond_syscall(sys_mremap);
135cond_syscall(sys_remap_file_pages); 136cond_syscall(sys_remap_file_pages);
136cond_syscall(compat_sys_move_pages); 137cond_syscall(compat_sys_move_pages);
138cond_syscall(compat_sys_migrate_pages);
137 139
138/* block-layer dependent */ 140/* block-layer dependent */
139cond_syscall(sys_bdflush); 141cond_syscall(sys_bdflush);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8020fb273c4f..130c5ec9ee0b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
54 54
55#ifdef CONFIG_X86 55#ifdef CONFIG_X86
56#include <asm/nmi.h> 56#include <asm/nmi.h>
57#include <asm/stacktrace.h>
57#endif 58#endif
58 59
59#if defined(CONFIG_SYSCTL) 60#if defined(CONFIG_SYSCTL)
@@ -91,7 +92,9 @@ extern char modprobe_path[];
91extern int sg_big_buff; 92extern int sg_big_buff;
92#endif 93#endif
93#ifdef CONFIG_SYSVIPC 94#ifdef CONFIG_SYSVIPC
94static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 95static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
96 void __user *buffer, size_t *lenp, loff_t *ppos);
97static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos); 98 void __user *buffer, size_t *lenp, loff_t *ppos);
96#endif 99#endif
97 100
@@ -130,14 +133,26 @@ extern int max_lock_depth;
130 133
131#ifdef CONFIG_SYSCTL_SYSCALL 134#ifdef CONFIG_SYSCTL_SYSCALL
132static int parse_table(int __user *, int, void __user *, size_t __user *, 135static int parse_table(int __user *, int, void __user *, size_t __user *,
133 void __user *, size_t, ctl_table *, void **); 136 void __user *, size_t, ctl_table *);
134#endif 137#endif
135 138
136static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 139static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
137 void __user *buffer, size_t *lenp, loff_t *ppos); 140 void __user *buffer, size_t *lenp, loff_t *ppos);
138 141
142static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
143 void __user *oldval, size_t __user *oldlenp,
144 void __user *newval, size_t newlen);
145
146#ifdef CONFIG_SYSVIPC
147static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
148 void __user *oldval, size_t __user *oldlenp,
149 void __user *newval, size_t newlen);
150#endif
151
152#ifdef CONFIG_PROC_SYSCTL
139static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 153static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
140 void __user *buffer, size_t *lenp, loff_t *ppos); 154 void __user *buffer, size_t *lenp, loff_t *ppos);
155#endif
141 156
142static ctl_table root_table[]; 157static ctl_table root_table[];
143static struct ctl_table_header root_table_header = 158static struct ctl_table_header root_table_header =
@@ -160,6 +175,40 @@ extern ctl_table inotify_table[];
160int sysctl_legacy_va_layout; 175int sysctl_legacy_va_layout;
161#endif 176#endif
162 177
178static void *get_uts(ctl_table *table, int write)
179{
180 char *which = table->data;
181#ifdef CONFIG_UTS_NS
182 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
183 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
184#endif
185 if (!write)
186 down_read(&uts_sem);
187 else
188 down_write(&uts_sem);
189 return which;
190}
191
192static void put_uts(ctl_table *table, int write, void *which)
193{
194 if (!write)
195 up_read(&uts_sem);
196 else
197 up_write(&uts_sem);
198}
199
200#ifdef CONFIG_SYSVIPC
201static void *get_ipc(ctl_table *table, int write)
202{
203 char *which = table->data;
204 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
205 which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
206 return which;
207}
208#else
209#define get_ipc(T,W) ((T)->data)
210#endif
211
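get_uts() and get_ipc() above rebase a pointer that was declared against init_uts_ns / init_ipc_ns onto the calling task's namespace instance by reusing the field's byte offset: which = (which - (char *)&init_ns) + (char *)ns. A standalone sketch of that offset-preserving rebase, with hypothetical structures:

#include <stdio.h>
#include <string.h>

struct ns { char hostname[16]; int some_ctl; };

static struct ns init_ns = { "init", 1 };

/* Map a pointer to a field of init_ns onto the same field of another instance. */
static void *rebase(void *init_field, struct ns *cur)
{
	char *which = init_field;

	return (which - (char *)&init_ns) + (char *)cur;
}

int main(void)
{
	struct ns other = init_ns;

	strcpy(other.hostname, "container");
	printf("%s\n", (char *)rebase(init_ns.hostname, &other));
	return 0;
}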
163/* /proc declarations: */ 212/* /proc declarations: */
164 213
165#ifdef CONFIG_PROC_SYSCTL 214#ifdef CONFIG_PROC_SYSCTL
@@ -168,7 +217,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
168static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 217static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
169static int proc_opensys(struct inode *, struct file *); 218static int proc_opensys(struct inode *, struct file *);
170 219
171struct file_operations proc_sys_file_operations = { 220const struct file_operations proc_sys_file_operations = {
172 .open = proc_opensys, 221 .open = proc_opensys,
173 .read = proc_readsys, 222 .read = proc_readsys,
174 .write = proc_writesys, 223 .write = proc_writesys,
@@ -226,7 +275,6 @@ static ctl_table root_table[] = {
226}; 275};
227 276
228static ctl_table kern_table[] = { 277static ctl_table kern_table[] = {
229#ifndef CONFIG_UTS_NS
230 { 278 {
231 .ctl_name = KERN_OSTYPE, 279 .ctl_name = KERN_OSTYPE,
232 .procname = "ostype", 280 .procname = "ostype",
@@ -234,7 +282,7 @@ static ctl_table kern_table[] = {
234 .maxlen = sizeof(init_uts_ns.name.sysname), 282 .maxlen = sizeof(init_uts_ns.name.sysname),
235 .mode = 0444, 283 .mode = 0444,
236 .proc_handler = &proc_do_uts_string, 284 .proc_handler = &proc_do_uts_string,
237 .strategy = &sysctl_string, 285 .strategy = &sysctl_uts_string,
238 }, 286 },
239 { 287 {
240 .ctl_name = KERN_OSRELEASE, 288 .ctl_name = KERN_OSRELEASE,
@@ -243,7 +291,7 @@ static ctl_table kern_table[] = {
243 .maxlen = sizeof(init_uts_ns.name.release), 291 .maxlen = sizeof(init_uts_ns.name.release),
244 .mode = 0444, 292 .mode = 0444,
245 .proc_handler = &proc_do_uts_string, 293 .proc_handler = &proc_do_uts_string,
246 .strategy = &sysctl_string, 294 .strategy = &sysctl_uts_string,
247 }, 295 },
248 { 296 {
249 .ctl_name = KERN_VERSION, 297 .ctl_name = KERN_VERSION,
@@ -252,7 +300,7 @@ static ctl_table kern_table[] = {
252 .maxlen = sizeof(init_uts_ns.name.version), 300 .maxlen = sizeof(init_uts_ns.name.version),
253 .mode = 0444, 301 .mode = 0444,
254 .proc_handler = &proc_do_uts_string, 302 .proc_handler = &proc_do_uts_string,
255 .strategy = &sysctl_string, 303 .strategy = &sysctl_uts_string,
256 }, 304 },
257 { 305 {
258 .ctl_name = KERN_NODENAME, 306 .ctl_name = KERN_NODENAME,
@@ -261,7 +309,7 @@ static ctl_table kern_table[] = {
261 .maxlen = sizeof(init_uts_ns.name.nodename), 309 .maxlen = sizeof(init_uts_ns.name.nodename),
262 .mode = 0644, 310 .mode = 0644,
263 .proc_handler = &proc_do_uts_string, 311 .proc_handler = &proc_do_uts_string,
264 .strategy = &sysctl_string, 312 .strategy = &sysctl_uts_string,
265 }, 313 },
266 { 314 {
267 .ctl_name = KERN_DOMAINNAME, 315 .ctl_name = KERN_DOMAINNAME,
@@ -270,56 +318,8 @@ static ctl_table kern_table[] = {
270 .maxlen = sizeof(init_uts_ns.name.domainname), 318 .maxlen = sizeof(init_uts_ns.name.domainname),
271 .mode = 0644, 319 .mode = 0644,
272 .proc_handler = &proc_do_uts_string, 320 .proc_handler = &proc_do_uts_string,
273 .strategy = &sysctl_string, 321 .strategy = &sysctl_uts_string,
274 },
275#else /* !CONFIG_UTS_NS */
276 {
277 .ctl_name = KERN_OSTYPE,
278 .procname = "ostype",
279 .data = NULL,
280 /* could maybe use __NEW_UTS_LEN here? */
281 .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
282 .mode = 0444,
283 .proc_handler = &proc_do_uts_string,
284 .strategy = &sysctl_string,
285 },
286 {
287 .ctl_name = KERN_OSRELEASE,
288 .procname = "osrelease",
289 .data = NULL,
290 .maxlen = FIELD_SIZEOF(struct new_utsname, release),
291 .mode = 0444,
292 .proc_handler = &proc_do_uts_string,
293 .strategy = &sysctl_string,
294 },
295 {
296 .ctl_name = KERN_VERSION,
297 .procname = "version",
298 .data = NULL,
299 .maxlen = FIELD_SIZEOF(struct new_utsname, version),
300 .mode = 0444,
301 .proc_handler = &proc_do_uts_string,
302 .strategy = &sysctl_string,
303 },
304 {
305 .ctl_name = KERN_NODENAME,
306 .procname = "hostname",
307 .data = NULL,
308 .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
309 .mode = 0644,
310 .proc_handler = &proc_do_uts_string,
311 .strategy = &sysctl_string,
312 },
313 {
314 .ctl_name = KERN_DOMAINNAME,
315 .procname = "domainname",
316 .data = NULL,
317 .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
318 .mode = 0644,
319 .proc_handler = &proc_do_uts_string,
320 .strategy = &sysctl_string,
321 }, 322 },
322#endif /* !CONFIG_UTS_NS */
323 { 323 {
324 .ctl_name = KERN_PANIC, 324 .ctl_name = KERN_PANIC,
325 .procname = "panic", 325 .procname = "panic",
@@ -478,58 +478,65 @@ static ctl_table kern_table[] = {
478 { 478 {
479 .ctl_name = KERN_SHMMAX, 479 .ctl_name = KERN_SHMMAX,
480 .procname = "shmmax", 480 .procname = "shmmax",
481 .data = NULL, 481 .data = &init_ipc_ns.shm_ctlmax,
482 .maxlen = sizeof (size_t), 482 .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
483 .mode = 0644, 483 .mode = 0644,
484 .proc_handler = &proc_do_ipc_string, 484 .proc_handler = &proc_ipc_doulongvec_minmax,
485 .strategy = sysctl_ipc_data,
485 }, 486 },
486 { 487 {
487 .ctl_name = KERN_SHMALL, 488 .ctl_name = KERN_SHMALL,
488 .procname = "shmall", 489 .procname = "shmall",
489 .data = NULL, 490 .data = &init_ipc_ns.shm_ctlall,
490 .maxlen = sizeof (size_t), 491 .maxlen = sizeof (init_ipc_ns.shm_ctlall),
491 .mode = 0644, 492 .mode = 0644,
492 .proc_handler = &proc_do_ipc_string, 493 .proc_handler = &proc_ipc_doulongvec_minmax,
494 .strategy = sysctl_ipc_data,
493 }, 495 },
494 { 496 {
495 .ctl_name = KERN_SHMMNI, 497 .ctl_name = KERN_SHMMNI,
496 .procname = "shmmni", 498 .procname = "shmmni",
497 .data = NULL, 499 .data = &init_ipc_ns.shm_ctlmni,
498 .maxlen = sizeof (int), 500 .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
499 .mode = 0644, 501 .mode = 0644,
500 .proc_handler = &proc_do_ipc_string, 502 .proc_handler = &proc_ipc_dointvec,
503 .strategy = sysctl_ipc_data,
501 }, 504 },
502 { 505 {
503 .ctl_name = KERN_MSGMAX, 506 .ctl_name = KERN_MSGMAX,
504 .procname = "msgmax", 507 .procname = "msgmax",
505 .data = NULL, 508 .data = &init_ipc_ns.msg_ctlmax,
506 .maxlen = sizeof (int), 509 .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
507 .mode = 0644, 510 .mode = 0644,
508 .proc_handler = &proc_do_ipc_string, 511 .proc_handler = &proc_ipc_dointvec,
512 .strategy = sysctl_ipc_data,
509 }, 513 },
510 { 514 {
511 .ctl_name = KERN_MSGMNI, 515 .ctl_name = KERN_MSGMNI,
512 .procname = "msgmni", 516 .procname = "msgmni",
513 .data = NULL, 517 .data = &init_ipc_ns.msg_ctlmni,
514 .maxlen = sizeof (int), 518 .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
515 .mode = 0644, 519 .mode = 0644,
516 .proc_handler = &proc_do_ipc_string, 520 .proc_handler = &proc_ipc_dointvec,
521 .strategy = sysctl_ipc_data,
517 }, 522 },
518 { 523 {
519 .ctl_name = KERN_MSGMNB, 524 .ctl_name = KERN_MSGMNB,
520 .procname = "msgmnb", 525 .procname = "msgmnb",
521 .data = NULL, 526 .data = &init_ipc_ns.msg_ctlmnb,
522 .maxlen = sizeof (int), 527 .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
523 .mode = 0644, 528 .mode = 0644,
524 .proc_handler = &proc_do_ipc_string, 529 .proc_handler = &proc_ipc_dointvec,
530 .strategy = sysctl_ipc_data,
525 }, 531 },
526 { 532 {
527 .ctl_name = KERN_SEM, 533 .ctl_name = KERN_SEM,
528 .procname = "sem", 534 .procname = "sem",
529 .data = NULL, 535 .data = &init_ipc_ns.sem_ctls,
530 .maxlen = 4*sizeof (int), 536 .maxlen = 4*sizeof (int),
531 .mode = 0644, 537 .mode = 0644,
532 .proc_handler = &proc_do_ipc_string, 538 .proc_handler = &proc_ipc_dointvec,
539 .strategy = sysctl_ipc_data,
533 }, 540 },
534#endif 541#endif
535#ifdef CONFIG_MAGIC_SYSRQ 542#ifdef CONFIG_MAGIC_SYSRQ
@@ -542,6 +549,7 @@ static ctl_table kern_table[] = {
542 .proc_handler = &proc_dointvec, 549 .proc_handler = &proc_dointvec,
543 }, 550 },
544#endif 551#endif
552#ifdef CONFIG_PROC_SYSCTL
545 { 553 {
546 .ctl_name = KERN_CADPID, 554 .ctl_name = KERN_CADPID,
547 .procname = "cad_pid", 555 .procname = "cad_pid",
@@ -550,6 +558,7 @@ static ctl_table kern_table[] = {
550 .mode = 0600, 558 .mode = 0600,
551 .proc_handler = &proc_do_cad_pid, 559 .proc_handler = &proc_do_cad_pid,
552 }, 560 },
561#endif
553 { 562 {
554 .ctl_name = KERN_MAX_THREADS, 563 .ctl_name = KERN_MAX_THREADS,
555 .procname = "threads-max", 564 .procname = "threads-max",
@@ -703,6 +712,14 @@ static ctl_table kern_table[] = {
703 .mode = 0444, 712 .mode = 0444,
704 .proc_handler = &proc_dointvec, 713 .proc_handler = &proc_dointvec,
705 }, 714 },
715 {
716 .ctl_name = CTL_UNNUMBERED,
717 .procname = "kstack_depth_to_print",
718 .data = &kstack_depth_to_print,
719 .maxlen = sizeof(int),
720 .mode = 0644,
721 .proc_handler = &proc_dointvec,
722 },
706#endif 723#endif
707#if defined(CONFIG_MMU) 724#if defined(CONFIG_MMU)
708 { 725 {
@@ -973,17 +990,6 @@ static ctl_table vm_table[] = {
973 .extra1 = &zero, 990 .extra1 = &zero,
974 }, 991 },
975#endif 992#endif
976#ifdef CONFIG_SWAP
977 {
978 .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
979 .procname = "swap_token_timeout",
980 .data = &swap_token_default_timeout,
981 .maxlen = sizeof(swap_token_default_timeout),
982 .mode = 0644,
983 .proc_handler = &proc_dointvec_jiffies,
984 .strategy = &sysctl_jiffies,
985 },
986#endif
987#ifdef CONFIG_NUMA 993#ifdef CONFIG_NUMA
988 { 994 {
989 .ctl_name = VM_ZONE_RECLAIM_MODE, 995 .ctl_name = VM_ZONE_RECLAIM_MODE,
@@ -1237,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1237 do { 1243 do {
1238 struct ctl_table_header *head = 1244 struct ctl_table_header *head =
1239 list_entry(tmp, struct ctl_table_header, ctl_entry); 1245 list_entry(tmp, struct ctl_table_header, ctl_entry);
1240 void *context = NULL;
1241 1246
1242 if (!use_table(head)) 1247 if (!use_table(head))
1243 continue; 1248 continue;
@@ -1245,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1245 spin_unlock(&sysctl_lock); 1250 spin_unlock(&sysctl_lock);
1246 1251
1247 error = parse_table(name, nlen, oldval, oldlenp, 1252 error = parse_table(name, nlen, oldval, oldlenp,
1248 newval, newlen, head->ctl_table, 1253 newval, newlen, head->ctl_table);
1249 &context);
1250 kfree(context);
1251 1254
1252 spin_lock(&sysctl_lock); 1255 spin_lock(&sysctl_lock);
1253 unuse_table(head); 1256 unuse_table(head);
@@ -1303,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op)
1303static int parse_table(int __user *name, int nlen, 1306static int parse_table(int __user *name, int nlen,
1304 void __user *oldval, size_t __user *oldlenp, 1307 void __user *oldval, size_t __user *oldlenp,
1305 void __user *newval, size_t newlen, 1308 void __user *newval, size_t newlen,
1306 ctl_table *table, void **context) 1309 ctl_table *table)
1307{ 1310{
1308 int n; 1311 int n;
1309repeat: 1312repeat:
@@ -1311,7 +1314,9 @@ repeat:
1311 return -ENOTDIR; 1314 return -ENOTDIR;
1312 if (get_user(n, name)) 1315 if (get_user(n, name))
1313 return -EFAULT; 1316 return -EFAULT;
1314 for ( ; table->ctl_name; table++) { 1317 for ( ; table->ctl_name || table->procname; table++) {
1318 if (!table->ctl_name)
1319 continue;
1315 if (n == table->ctl_name || table->ctl_name == CTL_ANY) { 1320 if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
1316 int error; 1321 int error;
1317 if (table->child) { 1322 if (table->child) {
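The new loop condition above (stop only when both ctl_name and procname are zero, and skip entries that have no binary ctl_name) is what lets CTL_UNNUMBERED entries such as kstack_depth_to_print appear under /proc/sys while staying invisible to the binary sys_sysctl() interface. A minimal standalone sketch of that sentinel convention, using made-up entry names and numbers rather than the real kernel tables:

#include <stdio.h>

struct entry {
        int         ctl_name;   /* 0: unnumbered, /proc-only entry        */
        const char *procname;   /* NULL only in the terminating sentinel  */
};

/* Hypothetical table: one numbered entry, one /proc-only entry, sentinel. */
static const struct entry table[] = {
        { 42, "threads-max" },
        {  0, "kstack_depth_to_print" },   /* no binary id */
        {  0, NULL }                       /* sentinel: both fields zero  */
};

static const struct entry *lookup_by_number(int n)
{
        const struct entry *t;

        /* Walk until the all-zero sentinel, as the new condition does. */
        for (t = table; t->ctl_name || t->procname; t++) {
                if (!t->ctl_name)          /* skip unnumbered entries     */
                        continue;
                if (t->ctl_name == n)
                        return t;
        }
        return NULL;
}

int main(void)
{
        const struct entry *t;

        /* The /proc walker sees every named entry, numbered or not. */
        for (t = table; t->ctl_name || t->procname; t++)
                printf("/proc/sys/kernel/%s\n", t->procname);

        printf("lookup(42): %s\n", lookup_by_number(42) ? "found" : "missing");
        printf("lookup(99): %s\n", lookup_by_number(99) ? "found" : "missing");
        return 0;
}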
@@ -1321,7 +1326,7 @@ repeat:
1321 error = table->strategy( 1326 error = table->strategy(
1322 table, name, nlen, 1327 table, name, nlen,
1323 oldval, oldlenp, 1328 oldval, oldlenp,
1324 newval, newlen, context); 1329 newval, newlen);
1325 if (error) 1330 if (error)
1326 return error; 1331 return error;
1327 } 1332 }
@@ -1332,7 +1337,7 @@ repeat:
1332 } 1337 }
1333 error = do_sysctl_strategy(table, name, nlen, 1338 error = do_sysctl_strategy(table, name, nlen,
1334 oldval, oldlenp, 1339 oldval, oldlenp,
1335 newval, newlen, context); 1340 newval, newlen);
1336 return error; 1341 return error;
1337 } 1342 }
1338 } 1343 }
@@ -1343,7 +1348,7 @@ repeat:
1343int do_sysctl_strategy (ctl_table *table, 1348int do_sysctl_strategy (ctl_table *table,
1344 int __user *name, int nlen, 1349 int __user *name, int nlen,
1345 void __user *oldval, size_t __user *oldlenp, 1350 void __user *oldval, size_t __user *oldlenp,
1346 void __user *newval, size_t newlen, void **context) 1351 void __user *newval, size_t newlen)
1347{ 1352{
1348 int op = 0, rc; 1353 int op = 0, rc;
1349 size_t len; 1354 size_t len;
@@ -1357,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table,
1357 1362
1358 if (table->strategy) { 1363 if (table->strategy) {
1359 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1364 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1360 newval, newlen, context); 1365 newval, newlen);
1361 if (rc < 0) 1366 if (rc < 0)
1362 return rc; 1367 return rc;
1363 if (rc > 0) 1368 if (rc > 0)
@@ -1528,7 +1533,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
1528 int len; 1533 int len;
1529 mode_t mode; 1534 mode_t mode;
1530 1535
1531 for (; table->ctl_name; table++) { 1536 for (; table->ctl_name || table->procname; table++) {
1532 /* Can't do anything without a proc name. */ 1537 /* Can't do anything without a proc name. */
1533 if (!table->procname) 1538 if (!table->procname)
1534 continue; 1539 continue;
@@ -1575,7 +1580,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
1575static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) 1580static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
1576{ 1581{
1577 struct proc_dir_entry *de; 1582 struct proc_dir_entry *de;
1578 for (; table->ctl_name; table++) { 1583 for (; table->ctl_name || table->procname; table++) {
1579 if (!(de = table->de)) 1584 if (!(de = table->de))
1580 continue; 1585 continue;
1581 if (de->mode & S_IFDIR) { 1586 if (de->mode & S_IFDIR) {
@@ -1610,7 +1615,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1610 size_t count, loff_t *ppos) 1615 size_t count, loff_t *ppos)
1611{ 1616{
1612 int op; 1617 int op;
1613 struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); 1618 struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
1614 struct ctl_table *table; 1619 struct ctl_table *table;
1615 size_t res; 1620 size_t res;
1616 ssize_t error = -ENOTDIR; 1621 ssize_t error = -ENOTDIR;
@@ -1749,66 +1754,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1749 * Special case of dostring for the UTS structure. This has locks 1754 * Special case of dostring for the UTS structure. This has locks
1750 * to observe. Should this be in kernel/sys.c ???? 1755 * to observe. Should this be in kernel/sys.c ????
1751 */ 1756 */
1752
1753#ifndef CONFIG_UTS_NS
1754static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1755 void __user *buffer, size_t *lenp, loff_t *ppos)
1756{
1757 int r;
1758 1757
1759 if (!write) {
1760 down_read(&uts_sem);
1761 r=proc_dostring(table,0,filp,buffer,lenp, ppos);
1762 up_read(&uts_sem);
1763 } else {
1764 down_write(&uts_sem);
1765 r=proc_dostring(table,1,filp,buffer,lenp, ppos);
1766 up_write(&uts_sem);
1767 }
1768 return r;
1769}
1770#else /* !CONFIG_UTS_NS */
1771static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 1758static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1772 void __user *buffer, size_t *lenp, loff_t *ppos) 1759 void __user *buffer, size_t *lenp, loff_t *ppos)
1773{ 1760{
1774 int r; 1761 int r;
1775 struct uts_namespace* uts_ns = current->nsproxy->uts_ns; 1762 void *which;
1776 char* which; 1763 which = get_uts(table, write);
1777 1764 r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
1778 switch (table->ctl_name) { 1765 put_uts(table, write, which);
1779 case KERN_OSTYPE:
1780 which = uts_ns->name.sysname;
1781 break;
1782 case KERN_NODENAME:
1783 which = uts_ns->name.nodename;
1784 break;
1785 case KERN_OSRELEASE:
1786 which = uts_ns->name.release;
1787 break;
1788 case KERN_VERSION:
1789 which = uts_ns->name.version;
1790 break;
1791 case KERN_DOMAINNAME:
1792 which = uts_ns->name.domainname;
1793 break;
1794 default:
1795 r = -EINVAL;
1796 goto out;
1797 }
1798
1799 if (!write) {
1800 down_read(&uts_sem);
1801 r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
1802 up_read(&uts_sem);
1803 } else {
1804 down_write(&uts_sem);
1805 r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
1806 up_write(&uts_sem);
1807 }
1808 out:
1809 return r; 1766 return r;
1810} 1767}
1811#endif /* !CONFIG_UTS_NS */
1812 1768
1813static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 1769static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1814 int *valp, 1770 int *valp,
@@ -1880,7 +1836,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1880 p = buf; 1836 p = buf;
1881 if (*p == '-' && left > 1) { 1837 if (*p == '-' && left > 1) {
1882 neg = 1; 1838 neg = 1;
1883 left--, p++; 1839 p++;
1884 } 1840 }
1885 if (*p < '0' || *p > '9') 1841 if (*p < '0' || *p > '9')
1886 break; 1842 break;
@@ -1972,9 +1928,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
1972 1928
1973#define OP_SET 0 1929#define OP_SET 0
1974#define OP_AND 1 1930#define OP_AND 1
1975#define OP_OR 2
1976#define OP_MAX 3
1977#define OP_MIN 4
1978 1931
1979static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, 1932static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1980 int *valp, 1933 int *valp,
@@ -1986,13 +1939,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1986 switch(op) { 1939 switch(op) {
1987 case OP_SET: *valp = val; break; 1940 case OP_SET: *valp = val; break;
1988 case OP_AND: *valp &= val; break; 1941 case OP_AND: *valp &= val; break;
1989 case OP_OR: *valp |= val; break;
1990 case OP_MAX: if(*valp < val)
1991 *valp = val;
1992 break;
1993 case OP_MIN: if(*valp > val)
1994 *valp = val;
1995 break;
1996 } 1942 }
1997 } else { 1943 } else {
1998 int val = *valp; 1944 int val = *valp;
@@ -2131,7 +2077,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2131 p = buf; 2077 p = buf;
2132 if (*p == '-' && left > 1) { 2078 if (*p == '-' && left > 1) {
2133 neg = 1; 2079 neg = 1;
2134 left--, p++; 2080 p++;
2135 } 2081 }
2136 if (*p < '0' || *p > '9') 2082 if (*p < '0' || *p > '9')
2137 break; 2083 break;
@@ -2387,46 +2333,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2387} 2333}
2388 2334
2389#ifdef CONFIG_SYSVIPC 2335#ifdef CONFIG_SYSVIPC
2390static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 2336static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2391 void __user *buffer, size_t *lenp, loff_t *ppos) 2337 void __user *buffer, size_t *lenp, loff_t *ppos)
2392{ 2338{
2393 void *data; 2339 void *which;
2394 struct ipc_namespace *ns; 2340 which = get_ipc(table, write);
2395 2341 return __do_proc_dointvec(which, table, write, filp, buffer,
2396 ns = current->nsproxy->ipc_ns;
2397
2398 switch (table->ctl_name) {
2399 case KERN_SHMMAX:
2400 data = &ns->shm_ctlmax;
2401 goto proc_minmax;
2402 case KERN_SHMALL:
2403 data = &ns->shm_ctlall;
2404 goto proc_minmax;
2405 case KERN_SHMMNI:
2406 data = &ns->shm_ctlmni;
2407 break;
2408 case KERN_MSGMAX:
2409 data = &ns->msg_ctlmax;
2410 break;
2411 case KERN_MSGMNI:
2412 data = &ns->msg_ctlmni;
2413 break;
2414 case KERN_MSGMNB:
2415 data = &ns->msg_ctlmnb;
2416 break;
2417 case KERN_SEM:
2418 data = &ns->sem_ctls;
2419 break;
2420 default:
2421 return -EINVAL;
2422 }
2423
2424 return __do_proc_dointvec(data, table, write, filp, buffer,
2425 lenp, ppos, NULL, NULL); 2342 lenp, ppos, NULL, NULL);
2426proc_minmax: 2343}
2427 return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, 2344
2345static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2346 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
2347{
2348 void *which;
2349 which = get_ipc(table, write);
2350 return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
2428 lenp, ppos, 1l, 1l); 2351 lenp, ppos, 1l, 1l);
2429} 2352}
2353
2430#endif 2354#endif
2431 2355
2432static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2356static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -2471,6 +2395,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2471{ 2395{
2472 return -ENOSYS; 2396 return -ENOSYS;
2473} 2397}
2398static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2399 void __user *buffer, size_t *lenp, loff_t *ppos)
2400{
2401 return -ENOSYS;
2402}
2403static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2404 struct file *filp, void __user *buffer,
2405 size_t *lenp, loff_t *ppos)
2406{
2407 return -ENOSYS;
2408}
2474#endif 2409#endif
2475 2410
2476int proc_dointvec(ctl_table *table, int write, struct file *filp, 2411int proc_dointvec(ctl_table *table, int write, struct file *filp,
@@ -2535,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2535/* The generic string strategy routine: */ 2470/* The generic string strategy routine: */
2536int sysctl_string(ctl_table *table, int __user *name, int nlen, 2471int sysctl_string(ctl_table *table, int __user *name, int nlen,
2537 void __user *oldval, size_t __user *oldlenp, 2472 void __user *oldval, size_t __user *oldlenp,
2538 void __user *newval, size_t newlen, void **context) 2473 void __user *newval, size_t newlen)
2539{ 2474{
2540 if (!table->data || !table->maxlen) 2475 if (!table->data || !table->maxlen)
2541 return -ENOTDIR; 2476 return -ENOTDIR;
@@ -2581,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2581 */ 2516 */
2582int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2517int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2583 void __user *oldval, size_t __user *oldlenp, 2518 void __user *oldval, size_t __user *oldlenp,
2584 void __user *newval, size_t newlen, void **context) 2519 void __user *newval, size_t newlen)
2585{ 2520{
2586 2521
2587 if (newval && newlen) { 2522 if (newval && newlen) {
@@ -2617,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2617/* Strategy function to convert jiffies to seconds */ 2552/* Strategy function to convert jiffies to seconds */
2618int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2553int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2619 void __user *oldval, size_t __user *oldlenp, 2554 void __user *oldval, size_t __user *oldlenp,
2620 void __user *newval, size_t newlen, void **context) 2555 void __user *newval, size_t newlen)
2621{ 2556{
2622 if (oldval) { 2557 if (oldval) {
2623 size_t olen; 2558 size_t olen;
@@ -2645,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2645/* Strategy function to convert jiffies to seconds */ 2580/* Strategy function to convert jiffies to seconds */
2646int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2581int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2647 void __user *oldval, size_t __user *oldlenp, 2582 void __user *oldval, size_t __user *oldlenp,
2648 void __user *newval, size_t newlen, void **context) 2583 void __user *newval, size_t newlen)
2649{ 2584{
2650 if (oldval) { 2585 if (oldval) {
2651 size_t olen; 2586 size_t olen;
@@ -2670,50 +2605,140 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2670 return 1; 2605 return 1;
2671} 2606}
2672 2607
2608
2609/* The generic string strategy routine: */
2610static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2611 void __user *oldval, size_t __user *oldlenp,
2612 void __user *newval, size_t newlen)
2613{
2614 struct ctl_table uts_table;
2615 int r, write;
2616 write = newval && newlen;
2617 memcpy(&uts_table, table, sizeof(uts_table));
2618 uts_table.data = get_uts(table, write);
2619 r = sysctl_string(&uts_table, name, nlen,
2620 oldval, oldlenp, newval, newlen);
2621 put_uts(table, write, uts_table.data);
2622 return r;
2623}
2624
2625#ifdef CONFIG_SYSVIPC
2626/* The generic sysctl ipc data routine. */
2627static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2628 void __user *oldval, size_t __user *oldlenp,
2629 void __user *newval, size_t newlen)
2630{
2631 size_t len;
2632 void *data;
2633
2634 /* Get out of I don't have a variable */
2635 if (!table->data || !table->maxlen)
2636 return -ENOTDIR;
2637
2638 data = get_ipc(table, 1);
2639 if (!data)
2640 return -ENOTDIR;
2641
2642 if (oldval && oldlenp) {
2643 if (get_user(len, oldlenp))
2644 return -EFAULT;
2645 if (len) {
2646 if (len > table->maxlen)
2647 len = table->maxlen;
2648 if (copy_to_user(oldval, data, len))
2649 return -EFAULT;
2650 if (put_user(len, oldlenp))
2651 return -EFAULT;
2652 }
2653 }
2654
2655 if (newval && newlen) {
2656 if (newlen > table->maxlen)
2657 newlen = table->maxlen;
2658
2659 if (copy_from_user(data, newval, newlen))
2660 return -EFAULT;
2661 }
2662 return 1;
2663}
2664#endif
2665
2673#else /* CONFIG_SYSCTL_SYSCALL */ 2666#else /* CONFIG_SYSCTL_SYSCALL */
2674 2667
2675 2668
2676asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 2669asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2677{ 2670{
2678 static int msg_count; 2671 static int msg_count;
2672 struct __sysctl_args tmp;
2673 int name[CTL_MAXNAME];
2674 int i;
2675
2676 /* Read in the sysctl name for better debug message logging */
2677 if (copy_from_user(&tmp, args, sizeof(tmp)))
2678 return -EFAULT;
2679 if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME)
2680 return -ENOTDIR;
2681 for (i = 0; i < tmp.nlen; i++)
2682 if (get_user(name[i], tmp.name + i))
2683 return -EFAULT;
2684
2685 /* Ignore accesses to kernel.version */
2686 if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
2687 goto out;
2679 2688
2680 if (msg_count < 5) { 2689 if (msg_count < 5) {
2681 msg_count++; 2690 msg_count++;
2682 printk(KERN_INFO 2691 printk(KERN_INFO
2683 "warning: process `%s' used the removed sysctl " 2692 "warning: process `%s' used the removed sysctl "
2684 "system call\n", current->comm); 2693 "system call with ", current->comm);
2694 for (i = 0; i < tmp.nlen; i++)
2695 printk("%d.", name[i]);
2696 printk("\n");
2685 } 2697 }
2698out:
2686 return -ENOSYS; 2699 return -ENOSYS;
2687} 2700}
2688 2701
2689int sysctl_string(ctl_table *table, int __user *name, int nlen, 2702int sysctl_string(ctl_table *table, int __user *name, int nlen,
2690 void __user *oldval, size_t __user *oldlenp, 2703 void __user *oldval, size_t __user *oldlenp,
2691 void __user *newval, size_t newlen, void **context) 2704 void __user *newval, size_t newlen)
2692{ 2705{
2693 return -ENOSYS; 2706 return -ENOSYS;
2694} 2707}
2695 2708
2696int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2709int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2697 void __user *oldval, size_t __user *oldlenp, 2710 void __user *oldval, size_t __user *oldlenp,
2698 void __user *newval, size_t newlen, void **context) 2711 void __user *newval, size_t newlen)
2699{ 2712{
2700 return -ENOSYS; 2713 return -ENOSYS;
2701} 2714}
2702 2715
2703int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2716int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2704 void __user *oldval, size_t __user *oldlenp, 2717 void __user *oldval, size_t __user *oldlenp,
2705 void __user *newval, size_t newlen, void **context) 2718 void __user *newval, size_t newlen)
2706{ 2719{
2707 return -ENOSYS; 2720 return -ENOSYS;
2708} 2721}
2709 2722
2710int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2723int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2711 void __user *oldval, size_t __user *oldlenp, 2724 void __user *oldval, size_t __user *oldlenp,
2712 void __user *newval, size_t newlen, void **context) 2725 void __user *newval, size_t newlen)
2713{ 2726{
2714 return -ENOSYS; 2727 return -ENOSYS;
2715} 2728}
2716 2729
2730static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2731 void __user *oldval, size_t __user *oldlenp,
2732 void __user *newval, size_t newlen)
2733{
2734 return -ENOSYS;
2735}
2736static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2737 void __user *oldval, size_t __user *oldlenp,
2738 void __user *newval, size_t newlen)
2739{
2740 return -ENOSYS;
2741}
2717#endif /* CONFIG_SYSCTL_SYSCALL */ 2742#endif /* CONFIG_SYSCTL_SYSCALL */
2718 2743
2719/* 2744/*
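The sysctl_uts_string() strategy and the proc_ipc_* handlers introduced above all follow one pattern: resolve the caller's namespace, point a copy of the table descriptor (or the handler's data pointer) at that namespace's variable, then reuse the existing generic routine. A rough userspace analogy of that shadow-descriptor idea, with invented struct and function names, not kernel code:

#include <stdio.h>

/* Minimal stand-ins for the structures involved. */
struct table {
        const char *name;
        void       *data;       /* normally points at a single global    */
};

struct ns {                     /* one "namespace" worth of values       */
        char hostname[65];
};

/* Generic handler: knows nothing about namespaces, just uses ->data. */
static void show_string(const struct table *t)
{
        printf("%s = %s\n", t->name, (const char *)t->data);
}

/*
 * Namespace-aware wrapper: copy the descriptor, repoint ->data at the
 * caller's namespace, then call the generic handler, the same shape as
 * sysctl_uts_string() building its local uts_table above.
 */
static void show_string_ns(const struct table *t, struct ns *ns)
{
        struct table shadow = *t;

        shadow.data = ns->hostname;
        show_string(&shadow);
}

int main(void)
{
        struct ns a = { .hostname = "init-ns" };
        struct ns b = { .hostname = "child-ns" };
        struct table hostname_tbl = { "hostname", NULL };

        show_string_ns(&hostname_tbl, &a);
        show_string_ns(&hostname_tbl, &b);
        return 0;
}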
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 5d6a8c54ee85..4c3476fa058d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
34 34
35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
36static int family_registered; 36static int family_registered;
37kmem_cache_t *taskstats_cache; 37struct kmem_cache *taskstats_cache;
38 38
39static struct genl_family family = { 39static struct genl_family family = {
40 .id = GENL_ID_GENERATE, 40 .id = GENL_ID_GENERATE,
@@ -69,7 +69,7 @@ enum actions {
69}; 69};
70 70
71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
72 void **replyp, size_t size) 72 size_t size)
73{ 73{
74 struct sk_buff *skb; 74 struct sk_buff *skb;
75 void *reply; 75 void *reply;
@@ -77,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
77 /* 77 /*
78 * If new attributes are added, please revisit this allocation 78 * If new attributes are added, please revisit this allocation
79 */ 79 */
80 skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL); 80 skb = genlmsg_new(size, GFP_KERNEL);
81 if (!skb) 81 if (!skb)
82 return -ENOMEM; 82 return -ENOMEM;
83 83
@@ -85,20 +85,15 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
85 int seq = get_cpu_var(taskstats_seqnum)++; 85 int seq = get_cpu_var(taskstats_seqnum)++;
86 put_cpu_var(taskstats_seqnum); 86 put_cpu_var(taskstats_seqnum);
87 87
88 reply = genlmsg_put(skb, 0, seq, 88 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
89 family.id, 0, 0,
90 cmd, family.version);
91 } else 89 } else
92 reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, 90 reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
93 family.id, 0, 0,
94 cmd, family.version);
95 if (reply == NULL) { 91 if (reply == NULL) {
96 nlmsg_free(skb); 92 nlmsg_free(skb);
97 return -EINVAL; 93 return -EINVAL;
98 } 94 }
99 95
100 *skbp = skb; 96 *skbp = skb;
101 *replyp = reply;
102 return 0; 97 return 0;
103} 98}
104 99
@@ -123,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
123/* 118/*
124 * Send taskstats data in @skb to listeners registered for @cpu's exit data 119 * Send taskstats data in @skb to listeners registered for @cpu's exit data
125 */ 120 */
126static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 121static void send_cpu_listeners(struct sk_buff *skb,
122 struct listener_list *listeners)
127{ 123{
128 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
129 struct listener_list *listeners;
130 struct listener *s, *tmp; 125 struct listener *s, *tmp;
131 struct sk_buff *skb_next, *skb_cur = skb; 126 struct sk_buff *skb_next, *skb_cur = skb;
132 void *reply = genlmsg_data(genlhdr); 127 void *reply = genlmsg_data(genlhdr);
@@ -139,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
139 } 134 }
140 135
141 rc = 0; 136 rc = 0;
142 listeners = &per_cpu(listener_array, cpu);
143 down_read(&listeners->sem); 137 down_read(&listeners->sem);
144 list_for_each_entry(s, &listeners->list, list) { 138 list_for_each_entry(s, &listeners->list, list) {
145 skb_next = NULL; 139 skb_next = NULL;
@@ -174,24 +168,23 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
174 up_write(&listeners->sem); 168 up_write(&listeners->sem);
175} 169}
176 170
177static int fill_pid(pid_t pid, struct task_struct *pidtsk, 171static int fill_pid(pid_t pid, struct task_struct *tsk,
178 struct taskstats *stats) 172 struct taskstats *stats)
179{ 173{
180 int rc = 0; 174 int rc = 0;
181 struct task_struct *tsk = pidtsk;
182 175
183 if (!pidtsk) { 176 if (!tsk) {
184 read_lock(&tasklist_lock); 177 rcu_read_lock();
185 tsk = find_task_by_pid(pid); 178 tsk = find_task_by_pid(pid);
186 if (!tsk) { 179 if (tsk)
187 read_unlock(&tasklist_lock); 180 get_task_struct(tsk);
181 rcu_read_unlock();
182 if (!tsk)
188 return -ESRCH; 183 return -ESRCH;
189 }
190 get_task_struct(tsk);
191 read_unlock(&tasklist_lock);
192 } else 184 } else
193 get_task_struct(tsk); 185 get_task_struct(tsk);
194 186
187 memset(stats, 0, sizeof(*stats));
195 /* 188 /*
196 * Each accounting subsystem adds calls to its functions to 189 * Each accounting subsystem adds calls to its functions to
197 * fill in relevant parts of struct taskstsats as follows 190 * fill in relevant parts of struct taskstsats as follows
@@ -214,39 +207,32 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
214 207
215} 208}
216 209
217static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, 210static int fill_tgid(pid_t tgid, struct task_struct *first,
218 struct taskstats *stats) 211 struct taskstats *stats)
219{ 212{
220 struct task_struct *tsk, *first; 213 struct task_struct *tsk;
221 unsigned long flags; 214 unsigned long flags;
215 int rc = -ESRCH;
222 216
223 /* 217 /*
224 * Add additional stats from live tasks except zombie thread group 218 * Add additional stats from live tasks except zombie thread group
225 * leaders who are already counted with the dead tasks 219 * leaders who are already counted with the dead tasks
226 */ 220 */
227 first = tgidtsk; 221 rcu_read_lock();
228 if (!first) { 222 if (!first)
229 read_lock(&tasklist_lock);
230 first = find_task_by_pid(tgid); 223 first = find_task_by_pid(tgid);
231 if (!first) {
232 read_unlock(&tasklist_lock);
233 return -ESRCH;
234 }
235 get_task_struct(first);
236 read_unlock(&tasklist_lock);
237 } else
238 get_task_struct(first);
239 224
240 /* Start with stats from dead tasks */ 225 if (!first || !lock_task_sighand(first, &flags))
241 spin_lock_irqsave(&first->signal->stats_lock, flags); 226 goto out;
227
242 if (first->signal->stats) 228 if (first->signal->stats)
243 memcpy(stats, first->signal->stats, sizeof(*stats)); 229 memcpy(stats, first->signal->stats, sizeof(*stats));
244 spin_unlock_irqrestore(&first->signal->stats_lock, flags); 230 else
231 memset(stats, 0, sizeof(*stats));
245 232
246 tsk = first; 233 tsk = first;
247 read_lock(&tasklist_lock);
248 do { 234 do {
249 if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) 235 if (tsk->exit_state)
250 continue; 236 continue;
251 /* 237 /*
252 * Accounting subsystem can call its functions here to 238 * Accounting subsystem can call its functions here to
@@ -257,15 +243,18 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
257 delayacct_add_tsk(stats, tsk); 243 delayacct_add_tsk(stats, tsk);
258 244
259 } while_each_thread(first, tsk); 245 } while_each_thread(first, tsk);
260 read_unlock(&tasklist_lock);
261 stats->version = TASKSTATS_VERSION;
262 246
247 unlock_task_sighand(first, &flags);
248 rc = 0;
249out:
250 rcu_read_unlock();
251
252 stats->version = TASKSTATS_VERSION;
263 /* 253 /*
264 * Accounting subsytems can also add calls here to modify 254 * Accounting subsytems can also add calls here to modify
265 * fields of taskstats. 255 * fields of taskstats.
266 */ 256 */
267 257 return rc;
268 return 0;
269} 258}
270 259
271 260
@@ -273,7 +262,7 @@ static void fill_tgid_exit(struct task_struct *tsk)
273{ 262{
274 unsigned long flags; 263 unsigned long flags;
275 264
276 spin_lock_irqsave(&tsk->signal->stats_lock, flags); 265 spin_lock_irqsave(&tsk->sighand->siglock, flags);
277 if (!tsk->signal->stats) 266 if (!tsk->signal->stats)
278 goto ret; 267 goto ret;
279 268
@@ -285,7 +274,7 @@ static void fill_tgid_exit(struct task_struct *tsk)
285 */ 274 */
286 delayacct_add_tsk(tsk->signal->stats, tsk); 275 delayacct_add_tsk(tsk->signal->stats, tsk);
287ret: 276ret:
288 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); 277 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
289 return; 278 return;
290} 279}
291 280
@@ -356,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask)
356 return ret; 345 return ret;
357} 346}
358 347
348static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
349{
350 struct nlattr *na, *ret;
351 int aggr;
352
353 aggr = (type == TASKSTATS_TYPE_PID)
354 ? TASKSTATS_TYPE_AGGR_PID
355 : TASKSTATS_TYPE_AGGR_TGID;
356
357 na = nla_nest_start(skb, aggr);
358 if (!na)
359 goto err;
360 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
361 goto err;
362 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
363 if (!ret)
364 goto err;
365 nla_nest_end(skb, na);
366
367 return nla_data(ret);
368err:
369 return NULL;
370}
371
359static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 372static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
360{ 373{
361 int rc = 0; 374 int rc = 0;
362 struct sk_buff *rep_skb; 375 struct sk_buff *rep_skb;
363 struct taskstats stats; 376 struct taskstats *stats;
364 void *reply;
365 size_t size; 377 size_t size;
366 struct nlattr *na;
367 cpumask_t mask; 378 cpumask_t mask;
368 379
369 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 380 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
@@ -384,146 +395,122 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
384 size = nla_total_size(sizeof(u32)) + 395 size = nla_total_size(sizeof(u32)) +
385 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 396 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
386 397
387 memset(&stats, 0, sizeof(stats)); 398 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
388 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
389 if (rc < 0) 399 if (rc < 0)
390 return rc; 400 return rc;
391 401
402 rc = -EINVAL;
392 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 403 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
393 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 404 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
394 rc = fill_pid(pid, NULL, &stats); 405 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
395 if (rc < 0) 406 if (!stats)
396 goto err; 407 goto err;
397 408
398 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 409 rc = fill_pid(pid, NULL, stats);
399 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); 410 if (rc < 0)
400 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 411 goto err;
401 stats);
402 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 412 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
403 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 413 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
404 rc = fill_tgid(tgid, NULL, &stats); 414 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
405 if (rc < 0) 415 if (!stats)
406 goto err; 416 goto err;
407 417
408 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 418 rc = fill_tgid(tgid, NULL, stats);
409 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); 419 if (rc < 0)
410 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 420 goto err;
411 stats); 421 } else
412 } else {
413 rc = -EINVAL;
414 goto err; 422 goto err;
415 }
416
417 nla_nest_end(rep_skb, na);
418 423
419 return send_reply(rep_skb, info->snd_pid); 424 return send_reply(rep_skb, info->snd_pid);
420
421nla_put_failure:
422 return genlmsg_cancel(rep_skb, reply);
423err: 425err:
424 nlmsg_free(rep_skb); 426 nlmsg_free(rep_skb);
425 return rc; 427 return rc;
426} 428}
427 429
428void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 430static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
429{ 431{
430 struct listener_list *listeners; 432 struct signal_struct *sig = tsk->signal;
431 struct taskstats *tmp; 433 struct taskstats *stats;
432 /*
433 * This is the cpu on which the task is exiting currently and will
434 * be the one for which the exit event is sent, even if the cpu
435 * on which this function is running changes later.
436 */
437 *mycpu = raw_smp_processor_id();
438 434
439 *ptidstats = NULL; 435 if (sig->stats || thread_group_empty(tsk))
440 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 436 goto ret;
441 if (!tmp)
442 return;
443 437
444 listeners = &per_cpu(listener_array, *mycpu); 438 /* No problem if kmem_cache_zalloc() fails */
445 down_read(&listeners->sem); 439 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
446 if (!list_empty(&listeners->list)) { 440
447 *ptidstats = tmp; 441 spin_lock_irq(&tsk->sighand->siglock);
448 tmp = NULL; 442 if (!sig->stats) {
443 sig->stats = stats;
444 stats = NULL;
449 } 445 }
450 up_read(&listeners->sem); 446 spin_unlock_irq(&tsk->sighand->siglock);
451 kfree(tmp); 447
448 if (stats)
449 kmem_cache_free(taskstats_cache, stats);
450ret:
451 return sig->stats;
452} 452}
453 453
454/* Send pid data out on exit */ 454/* Send pid data out on exit */
455void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 455void taskstats_exit(struct task_struct *tsk, int group_dead)
456 int group_dead, unsigned int mycpu)
457{ 456{
458 int rc; 457 int rc;
458 struct listener_list *listeners;
459 struct taskstats *stats;
459 struct sk_buff *rep_skb; 460 struct sk_buff *rep_skb;
460 void *reply;
461 size_t size; 461 size_t size;
462 int is_thread_group; 462 int is_thread_group;
463 struct nlattr *na;
464 unsigned long flags;
465 463
466 if (!family_registered || !tidstats) 464 if (!family_registered)
467 return; 465 return;
468 466
469 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
470 is_thread_group = tsk->signal->stats ? 1 : 0;
471 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
472
473 rc = 0;
474 /* 467 /*
475 * Size includes space for nested attributes 468 * Size includes space for nested attributes
476 */ 469 */
477 size = nla_total_size(sizeof(u32)) + 470 size = nla_total_size(sizeof(u32)) +
478 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 471 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
479 472
480 if (is_thread_group) 473 is_thread_group = !!taskstats_tgid_alloc(tsk);
481 size = 2 * size; /* PID + STATS + TGID + STATS */ 474 if (is_thread_group) {
475 /* PID + STATS + TGID + STATS */
476 size = 2 * size;
477 /* fill the tsk->signal->stats structure */
478 fill_tgid_exit(tsk);
479 }
482 480
483 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); 481 listeners = &__raw_get_cpu_var(listener_array);
484 if (rc < 0) 482 if (list_empty(&listeners->list))
485 goto ret; 483 return;
486 484
487 rc = fill_pid(tsk->pid, tsk, tidstats); 485 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
488 if (rc < 0) 486 if (rc < 0)
489 goto err_skb; 487 return;
490 488
491 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 489 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
492 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); 490 if (!stats)
493 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 491 goto err;
494 *tidstats);
495 nla_nest_end(rep_skb, na);
496 492
497 if (!is_thread_group) 493 rc = fill_pid(tsk->pid, tsk, stats);
498 goto send; 494 if (rc < 0)
495 goto err;
499 496
500 /* 497 /*
501 * tsk has/had a thread group so fill the tsk->signal->stats structure
502 * Doesn't matter if tsk is the leader or the last group member leaving 498 * Doesn't matter if tsk is the leader or the last group member leaving
503 */ 499 */
504 500 if (!is_thread_group || !group_dead)
505 fill_tgid_exit(tsk);
506 if (!group_dead)
507 goto send; 501 goto send;
508 502
509 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 503 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
510 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); 504 if (!stats)
511 /* No locking needed for tsk->signal->stats since group is dead */ 505 goto err;
512 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 506
513 *tsk->signal->stats); 507 memcpy(stats, tsk->signal->stats, sizeof(*stats));
514 nla_nest_end(rep_skb, na);
515 508
516send: 509send:
517 send_cpu_listeners(rep_skb, mycpu); 510 send_cpu_listeners(rep_skb, listeners);
518 return; 511 return;
519 512err:
520nla_put_failure:
521 genlmsg_cancel(rep_skb, reply);
522 goto ret;
523err_skb:
524 nlmsg_free(rep_skb); 513 nlmsg_free(rep_skb);
525ret:
526 return;
527} 514}
528 515
529static struct genl_ops taskstats_ops = { 516static struct genl_ops taskstats_ops = {
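taskstats_tgid_alloc() above replaces the old per-exit allocation with a publish-once scheme: allocate speculatively, install the buffer under the sighand lock only if no other thread got there first, and free the losing copy. A small pthread sketch of the same race-tolerant pattern; names such as group_stats_alloc are invented, and unlike the kernel version this sketch does its duplicate check entirely under the mutex:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct group_stats { long exited; };

static struct group_stats *group_stats;       /* plays the role of sig->stats */
static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

static struct group_stats *group_stats_alloc(void)
{
        struct group_stats *stats, *ret;

        /* Speculative allocation; NULL is tolerated, as in the kernel code. */
        stats = calloc(1, sizeof(*stats));

        pthread_mutex_lock(&group_lock);
        if (!group_stats)
                group_stats = stats;          /* we won: publish our copy     */
        else
                free(stats);                  /* we lost: discard our copy    */
        ret = group_stats;
        pthread_mutex_unlock(&group_lock);

        return ret;
}

static void *worker(void *arg)
{
        (void)arg;
        return group_stats_alloc();
}

int main(void)
{
        pthread_t t[4];
        int i;

        for (i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (i = 0; i < 4; i++)
                pthread_join(t[i], NULL);

        printf("allocated once: %p\n", (void *)group_stats);
        return 0;
}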
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 74eca5939bd9..22504afc0d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c)
156 /* check if clocksource is already registered */ 156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) { 157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. " 158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name); 159 "Already registered!", c->name);
160 ret = -EBUSY; 160 ret = -EBUSY;
161 } else { 161 } else {
162 /* register it */ 162 /* register it */
@@ -186,6 +186,7 @@ void clocksource_reselect(void)
186} 186}
187EXPORT_SYMBOL(clocksource_reselect); 187EXPORT_SYMBOL(clocksource_reselect);
188 188
189#ifdef CONFIG_SYSFS
189/** 190/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource 191 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused 192 * @dev: unused
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
275 * Sysfs setup bits: 276 * Sysfs setup bits:
276 */ 277 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, 278static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource); 279 sysfs_override_clocksource);
279 280
280static SYSDEV_ATTR(available_clocksource, 0600, 281static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL); 282 sysfs_show_available_clocksources, NULL);
282 283
283static struct sysdev_class clocksource_sysclass = { 284static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"), 285 set_kset_name("clocksource"),
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void)
307} 308}
308 309
309device_initcall(init_clocksource_sysfs); 310device_initcall(init_clocksource_sysfs);
311#endif /* CONFIG_SYSFS */
310 312
311/** 313/**
312 * boot_override_clocksource - boot clock override 314 * boot_override_clocksource - boot clock override
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 126bb30c4afe..a99b2a6e6a07 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -57,7 +57,7 @@ static cycle_t jiffies_read(void)
57 57
58struct clocksource clocksource_jiffies = { 58struct clocksource clocksource_jiffies = {
59 .name = "jiffies", 59 .name = "jiffies",
60 .rating = 0, /* lowest rating*/ 60 .rating = 1, /* lowest valid rating*/
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 47195fa0ec4f..3afeaa3a73f9 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -161,9 +161,9 @@ void second_overflow(void)
161 time_adjust += MAX_TICKADJ; 161 time_adjust += MAX_TICKADJ;
162 tick_length -= MAX_TICKADJ_SCALED; 162 tick_length -= MAX_TICKADJ_SCALED;
163 } else { 163 } else {
164 time_adjust = 0;
165 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 164 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
166 HZ) << TICK_LENGTH_SHIFT; 165 HZ) << TICK_LENGTH_SHIFT;
166 time_adjust = 0;
167 } 167 }
168 } 168 }
169} 169}
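The ntp.c hunk above is a pure ordering fix: the old code zeroed time_adjust before folding the remaining adjustment into tick_length, so the final partial adjustment was silently dropped. A trivial standalone demonstration of why the statement order matters:

#include <stdio.h>

int main(void)
{
        long tick_length, time_adjust;

        /* Old ordering: the remainder is cleared before it is used. */
        tick_length = 0;
        time_adjust = 7;
        time_adjust = 0;
        tick_length += time_adjust;           /* adds 0, remainder lost */
        printf("clear first: tick_length = %ld\n", tick_length);

        /* New ordering: apply the remainder, then clear it. */
        tick_length = 0;
        time_adjust = 7;
        tick_length += time_adjust;           /* adds 7 */
        time_adjust = 0;
        printf("apply first: tick_length = %ld\n", tick_length);

        return 0;
}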
diff --git a/kernel/timer.c b/kernel/timer.c
index c1c7fbcffec1..0256ab443d8a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases;
80EXPORT_SYMBOL(boot_tvec_bases); 80EXPORT_SYMBOL(boot_tvec_bases);
81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
82 82
83/**
84 * __round_jiffies - function to round jiffies to a full second
85 * @j: the time in (absolute) jiffies that should be rounded
86 * @cpu: the processor number on which the timeout will happen
87 *
88 * __round_jiffies rounds an absolute time in the future (in jiffies)
89 * up or down to (approximately) full seconds. This is useful for timers
90 * for which the exact time they fire does not matter too much, as long as
91 * they fire approximately every X seconds.
92 *
93 * By rounding these timers to whole seconds, all such timers will fire
94 * at the same time, rather than at various times spread out. The goal
95 * of this is to have the CPU wake up less, which saves power.
96 *
97 * The exact rounding is skewed for each processor to avoid all
98 * processors firing at the exact same time, which could lead
99 * to lock contention or spurious cache line bouncing.
100 *
101 * The return value is the rounded version of the "j" parameter.
102 */
103unsigned long __round_jiffies(unsigned long j, int cpu)
104{
105 int rem;
106 unsigned long original = j;
107
108 /*
109 * We don't want all cpus firing their timers at once hitting the
110 * same lock or cachelines, so we skew each extra cpu with an extra
111 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
112 * already did this.
113 * The skew is done by adding 3*cpunr, then round, then subtract this
114 * extra offset again.
115 */
116 j += cpu * 3;
117
118 rem = j % HZ;
119
120 /*
121 * If the target jiffie is just after a whole second (which can happen
122 * due to delays of the timer irq, long irq off times etc etc) then
123 * we should round down to the whole second, not up. Use 1/4th second
124 * as cutoff for this rounding as an extreme upper bound for this.
125 */
126 if (rem < HZ/4) /* round down */
127 j = j - rem;
128 else /* round up */
129 j = j - rem + HZ;
130
131 /* now that we have rounded, subtract the extra skew again */
132 j -= cpu * 3;
133
134 if (j <= jiffies) /* rounding ate our timeout entirely; */
135 return original;
136 return j;
137}
138EXPORT_SYMBOL_GPL(__round_jiffies);
139
140/**
141 * __round_jiffies_relative - function to round jiffies to a full second
142 * @j: the time in (relative) jiffies that should be rounded
143 * @cpu: the processor number on which the timeout will happen
144 *
145 * __round_jiffies_relative rounds a time delta in the future (in jiffies)
146 * up or down to (approximately) full seconds. This is useful for timers
147 * for which the exact time they fire does not matter too much, as long as
148 * they fire approximately every X seconds.
149 *
150 * By rounding these timers to whole seconds, all such timers will fire
151 * at the same time, rather than at various times spread out. The goal
152 * of this is to have the CPU wake up less, which saves power.
153 *
154 * The exact rounding is skewed for each processor to avoid all
155 * processors firing at the exact same time, which could lead
156 * to lock contention or spurious cache line bouncing.
157 *
158 * The return value is the rounded version of the "j" parameter.
159 */
160unsigned long __round_jiffies_relative(unsigned long j, int cpu)
161{
162 /*
163 * In theory the following code can skip a jiffy in case jiffies
164 * increments right between the addition and the later subtraction.
165 * However since the entire point of this function is to use approximate
166 * timeouts, it's entirely ok to not handle that.
167 */
168 return __round_jiffies(j + jiffies, cpu) - jiffies;
169}
170EXPORT_SYMBOL_GPL(__round_jiffies_relative);
171
172/**
173 * round_jiffies - function to round jiffies to a full second
174 * @j: the time in (absolute) jiffies that should be rounded
175 *
176 * round_jiffies rounds an absolute time in the future (in jiffies)
177 * up or down to (approximately) full seconds. This is useful for timers
178 * for which the exact time they fire does not matter too much, as long as
179 * they fire approximately every X seconds.
180 *
181 * By rounding these timers to whole seconds, all such timers will fire
182 * at the same time, rather than at various times spread out. The goal
183 * of this is to have the CPU wake up less, which saves power.
184 *
185 * The return value is the rounded version of the "j" parameter.
186 */
187unsigned long round_jiffies(unsigned long j)
188{
189 return __round_jiffies(j, raw_smp_processor_id());
190}
191EXPORT_SYMBOL_GPL(round_jiffies);
192
193/**
194 * round_jiffies_relative - function to round jiffies to a full second
195 * @j: the time in (relative) jiffies that should be rounded
196 *
197 * round_jiffies_relative rounds a time delta in the future (in jiffies)
198 * up or down to (approximately) full seconds. This is useful for timers
199 * for which the exact time they fire does not matter too much, as long as
200 * they fire approximately every X seconds.
201 *
202 * By rounding these timers to whole seconds, all such timers will fire
203 * at the same time, rather than at various times spread out. The goal
204 * of this is to have the CPU wake up less, which saves power.
205 *
206 * The return value is the rounded version of the "j" parameter.
207 */
208unsigned long round_jiffies_relative(unsigned long j)
209{
210 return __round_jiffies_relative(j, raw_smp_processor_id());
211}
212EXPORT_SYMBOL_GPL(round_jiffies_relative);
213
214
83static inline void set_running_timer(tvec_base_t *base, 215static inline void set_running_timer(tvec_base_t *base,
84 struct timer_list *timer) 216 struct timer_list *timer)
85{ 217{
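The new __round_jiffies() helpers above round timer expiries to whole seconds, skew each CPU by three ticks so that all CPUs do not fire on the same jiffy, and fall back to the original value if rounding would move the expiry into the past. A userspace restatement of that arithmetic, assuming HZ=250 and a made-up "now" value standing in for the jiffies counter:

#include <stdio.h>

#define HZ 250

static unsigned long round_to_second(unsigned long j, int cpu,
                                     unsigned long now)
{
        unsigned long original = j;
        int rem;

        j += cpu * 3;           /* skew each cpu by 3 ticks before rounding */
        rem = j % HZ;
        if (rem < HZ / 4)       /* just past a second boundary: round down  */
                j -= rem;
        else                    /* otherwise round up to the next second    */
                j = j - rem + HZ;
        j -= cpu * 3;           /* undo the skew                            */

        return j <= now ? original : j;   /* never round into the past      */
}

int main(void)
{
        unsigned long now = 100000;
        unsigned long samples[] = { now + 10, now + 70, now + 400 };
        unsigned i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("cpu0: %lu -> %lu   cpu1: %lu -> %lu\n",
                       samples[i], round_to_second(samples[i], 0, now),
                       samples[i], round_to_second(samples[i], 1, now));
        return 0;
}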
@@ -714,7 +846,7 @@ static int change_clocksource(void)
714 clock = new; 846 clock = new;
715 clock->cycle_last = now; 847 clock->cycle_last = now;
716 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 848 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
717 clock->name); 849 clock->name);
718 return 1; 850 return 1;
719 } else if (clock->update_callback) { 851 } else if (clock->update_callback) {
720 return clock->update_callback(); 852 return clock->update_callback();
@@ -722,7 +854,10 @@ static int change_clocksource(void)
722 return 0; 854 return 0;
723} 855}
724#else 856#else
725#define change_clocksource() (0) 857static inline int change_clocksource(void)
858{
859 return 0;
860}
726#endif 861#endif
727 862
728/** 863/**
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device);
820 * If the error is already larger, we look ahead even further 955 * If the error is already larger, we look ahead even further
821 * to compensate for late or lost adjustments. 956 * to compensate for late or lost adjustments.
822 */ 957 */
823static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) 958static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
959 s64 *offset)
824{ 960{
825 s64 tick_error, i; 961 s64 tick_error, i;
826 u32 look_ahead, adj; 962 u32 look_ahead, adj;
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *
844 * Now calculate the error in (1 << look_ahead) ticks, but first 980 * Now calculate the error in (1 << look_ahead) ticks, but first
845 * remove the single look ahead already included in the error. 981 * remove the single look ahead already included in the error.
846 */ 982 */
847 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); 983 tick_error = current_tick_length() >>
984 (TICK_LENGTH_SHIFT - clock->shift + 1);
848 tick_error -= clock->xtime_interval >> 1; 985 tick_error -= clock->xtime_interval >> 1;
849 error = ((error - tick_error) >> look_ahead) + tick_error; 986 error = ((error - tick_error) >> look_ahead) + tick_error;
850 987
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
896 clock->mult += adj; 1033 clock->mult += adj;
897 clock->xtime_interval += interval; 1034 clock->xtime_interval += interval;
898 clock->xtime_nsec -= offset; 1035 clock->xtime_nsec -= offset;
899 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); 1036 clock->error -= (interval - offset) <<
1037 (TICK_LENGTH_SHIFT - clock->shift);
900} 1038}
901 1039
902/** 1040/**
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index db443221ba5b..baacc3691415 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -36,7 +36,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
36 36
37 /* calculate task elapsed time in timespec */ 37 /* calculate task elapsed time in timespec */
38 do_posix_clock_monotonic_gettime(&uptime); 38 do_posix_clock_monotonic_gettime(&uptime);
39 ts = timespec_sub(uptime, current->group_leader->start_time); 39 ts = timespec_sub(uptime, tsk->start_time);
40 /* rebase elapsed time to usec */ 40 /* rebase elapsed time to usec */
41 ac_etime = timespec_to_ns(&ts); 41 ac_etime = timespec_to_ns(&ts);
42 do_div(ac_etime, NSEC_PER_USEC); 42 do_div(ac_etime, NSEC_PER_USEC);
@@ -58,7 +58,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
58 stats->ac_uid = tsk->uid; 58 stats->ac_uid = tsk->uid;
59 stats->ac_gid = tsk->gid; 59 stats->ac_gid = tsk->gid;
60 stats->ac_pid = tsk->pid; 60 stats->ac_pid = tsk->pid;
61 stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0; 61 rcu_read_lock();
62 stats->ac_ppid = pid_alive(tsk) ?
63 rcu_dereference(tsk->real_parent)->tgid : 0;
64 rcu_read_unlock();
62 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 65 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
63 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 66 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
64 stats->ac_minflt = tsk->min_flt; 67 stats->ac_minflt = tsk->min_flt;
@@ -77,18 +80,31 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
77 */ 80 */
78void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) 81void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
79{ 82{
83 struct mm_struct *mm;
84
80 /* convert pages-jiffies to Mbyte-usec */ 85 /* convert pages-jiffies to Mbyte-usec */
81 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; 86 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
82 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; 87 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
83 if (p->mm) { 88 mm = get_task_mm(p);
89 if (mm) {
84 /* adjust to KB unit */ 90 /* adjust to KB unit */
85 stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; 91 stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB;
86 stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; 92 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB;
93 mmput(mm);
87 } 94 }
88 stats->read_char = p->rchar; 95 stats->read_char = p->rchar;
89 stats->write_char = p->wchar; 96 stats->write_char = p->wchar;
90 stats->read_syscalls = p->syscr; 97 stats->read_syscalls = p->syscr;
91 stats->write_syscalls = p->syscw; 98 stats->write_syscalls = p->syscw;
99#ifdef CONFIG_TASK_IO_ACCOUNTING
100 stats->read_bytes = p->ioac.read_bytes;
101 stats->write_bytes = p->ioac.write_bytes;
102 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
103#else
104 stats->read_bytes = 0;
105 stats->write_bytes = 0;
106 stats->cancelled_write_bytes = 0;
107#endif
92} 108}
93#undef KB 109#undef KB
94#undef MB 110#undef MB
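bacct_add_tsk() above now measures elapsed time from the exiting task's own start_time and rebases the nanosecond difference to microseconds. The same timespec-difference-and-rebase step, as a self-contained userspace snippet using clock_gettime(); the constants and helper name are local to the example:

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC  1000000000LL
#define NSEC_PER_USEC 1000LL

/* Difference of two monotonic timestamps, rebased to microseconds. */
static long long elapsed_usec(struct timespec start, struct timespec end)
{
        long long ns = (end.tv_sec - start.tv_sec) * NSEC_PER_SEC
                     + (end.tv_nsec - start.tv_nsec);
        return ns / NSEC_PER_USEC;
}

int main(void)
{
        struct timespec a, b;

        clock_gettime(CLOCK_MONOTONIC, &a);   /* stands in for start_time */
        clock_gettime(CLOCK_MONOTONIC, &b);   /* stands in for "now"      */
        printf("elapsed: %lld usec\n", elapsed_usec(a, b));
        return 0;
}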
diff --git a/kernel/unwind.c b/kernel/unwind.c
index 2e2368607aab..09c261329249 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -11,13 +11,16 @@
11 11
12#include <linux/unwind.h> 12#include <linux/unwind.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/delay.h> 14#include <linux/bootmem.h>
15#include <linux/sort.h>
15#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
17#include <linux/uaccess.h>
16#include <asm/sections.h> 18#include <asm/sections.h>
17#include <asm/uaccess.h> 19#include <asm/uaccess.h>
18#include <asm/unaligned.h> 20#include <asm/unaligned.h>
19 21
20extern char __start_unwind[], __end_unwind[]; 22extern const char __start_unwind[], __end_unwind[];
23extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
21 24
22#define MAX_STACK_DEPTH 8 25#define MAX_STACK_DEPTH 8
23 26
@@ -92,6 +95,7 @@ static const struct {
92 95
93typedef unsigned long uleb128_t; 96typedef unsigned long uleb128_t;
94typedef signed long sleb128_t; 97typedef signed long sleb128_t;
98#define sleb128abs __builtin_labs
95 99
96static struct unwind_table { 100static struct unwind_table {
97 struct { 101 struct {
@@ -100,6 +104,8 @@ static struct unwind_table {
100 } core, init; 104 } core, init;
101 const void *address; 105 const void *address;
102 unsigned long size; 106 unsigned long size;
107 const unsigned char *header;
108 unsigned long hdrsz;
103 struct unwind_table *link; 109 struct unwind_table *link;
104 const char *name; 110 const char *name;
105} root_table; 111} root_table;
@@ -131,6 +137,17 @@ struct unwind_state {
131 137
132static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; 138static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
133 139
140static unsigned unwind_debug;
141static int __init unwind_debug_setup(char *s)
142{
143 unwind_debug = simple_strtoul(s, NULL, 0);
144 return 1;
145}
146__setup("unwind_debug=", unwind_debug_setup);
147#define dprintk(lvl, fmt, args...) \
148 ((void)(lvl > unwind_debug \
149 || printk(KERN_DEBUG "unwind: " fmt "\n", ##args)))
150
134static struct unwind_table *find_table(unsigned long pc) 151static struct unwind_table *find_table(unsigned long pc)
135{ 152{
136 struct unwind_table *table; 153 struct unwind_table *table;
@@ -145,6 +162,12 @@ static struct unwind_table *find_table(unsigned long pc)
145 return table; 162 return table;
146} 163}
147 164
165static unsigned long read_pointer(const u8 **pLoc,
166 const void *end,
167 signed ptrType,
168 unsigned long text_base,
169 unsigned long data_base);
170
148static void init_unwind_table(struct unwind_table *table, 171static void init_unwind_table(struct unwind_table *table,
149 const char *name, 172 const char *name,
150 const void *core_start, 173 const void *core_start,
@@ -152,14 +175,33 @@ static void init_unwind_table(struct unwind_table *table,
152 const void *init_start, 175 const void *init_start,
153 unsigned long init_size, 176 unsigned long init_size,
154 const void *table_start, 177 const void *table_start,
155 unsigned long table_size) 178 unsigned long table_size,
179 const u8 *header_start,
180 unsigned long header_size)
156{ 181{
182 const u8 *ptr = header_start + 4;
183 const u8 *end = header_start + header_size;
184
157 table->core.pc = (unsigned long)core_start; 185 table->core.pc = (unsigned long)core_start;
158 table->core.range = core_size; 186 table->core.range = core_size;
159 table->init.pc = (unsigned long)init_start; 187 table->init.pc = (unsigned long)init_start;
160 table->init.range = init_size; 188 table->init.range = init_size;
161 table->address = table_start; 189 table->address = table_start;
162 table->size = table_size; 190 table->size = table_size;
191 /* See if the linker provided table looks valid. */
192 if (header_size <= 4
193 || header_start[0] != 1
194 || (void *)read_pointer(&ptr, end, header_start[1], 0, 0)
195 != table_start
196 || !read_pointer(&ptr, end, header_start[2], 0, 0)
197 || !read_pointer(&ptr, end, header_start[3], 0,
198 (unsigned long)header_start)
199 || !read_pointer(&ptr, end, header_start[3], 0,
200 (unsigned long)header_start))
201 header_start = NULL;
202 table->hdrsz = header_size;
203 smp_wmb();
204 table->header = header_start;
163 table->link = NULL; 205 table->link = NULL;
164 table->name = name; 206 table->name = name;
165} 207}
@@ -169,7 +211,144 @@ void __init unwind_init(void)
169 init_unwind_table(&root_table, "kernel", 211 init_unwind_table(&root_table, "kernel",
170 _text, _end - _text, 212 _text, _end - _text,
171 NULL, 0, 213 NULL, 0,
172 __start_unwind, __end_unwind - __start_unwind); 214 __start_unwind, __end_unwind - __start_unwind,
215 __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr);
216}
217
218static const u32 bad_cie, not_fde;
219static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *);
220static signed fde_pointer_type(const u32 *cie);
221
222struct eh_frame_hdr_table_entry {
223 unsigned long start, fde;
224};
225
226static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2)
227{
228 const struct eh_frame_hdr_table_entry *e1 = p1;
229 const struct eh_frame_hdr_table_entry *e2 = p2;
230
231 return (e1->start > e2->start) - (e1->start < e2->start);
232}
233
234static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size)
235{
236 struct eh_frame_hdr_table_entry *e1 = p1;
237 struct eh_frame_hdr_table_entry *e2 = p2;
238 unsigned long v;
239
240 v = e1->start;
241 e1->start = e2->start;
242 e2->start = v;
243 v = e1->fde;
244 e1->fde = e2->fde;
245 e2->fde = v;
246}
247
248static void __init setup_unwind_table(struct unwind_table *table,
249 void *(*alloc)(unsigned long))
250{
251 const u8 *ptr;
252 unsigned long tableSize = table->size, hdrSize;
253 unsigned n;
254 const u32 *fde;
255 struct {
256 u8 version;
257 u8 eh_frame_ptr_enc;
258 u8 fde_count_enc;
259 u8 table_enc;
260 unsigned long eh_frame_ptr;
261 unsigned int fde_count;
262 struct eh_frame_hdr_table_entry table[];
263 } __attribute__((__packed__)) *header;
264
265 if (table->header)
266 return;
267
268 if (table->hdrsz)
269 printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n",
270 table->name);
271
272 if (tableSize & (sizeof(*fde) - 1))
273 return;
274
275 for (fde = table->address, n = 0;
276 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
277 tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
278 const u32 *cie = cie_for_fde(fde, table);
279 signed ptrType;
280
281 if (cie == &not_fde)
282 continue;
283 if (cie == NULL
284 || cie == &bad_cie
285 || (ptrType = fde_pointer_type(cie)) < 0)
286 return;
287 ptr = (const u8 *)(fde + 2);
288 if (!read_pointer(&ptr,
289 (const u8 *)(fde + 1) + *fde,
290 ptrType, 0, 0))
291 return;
292 ++n;
293 }
294
295 if (tableSize || !n)
296 return;
297
298 hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
299 + 2 * n * sizeof(unsigned long);
300 dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize);
301 header = alloc(hdrSize);
302 if (!header)
303 return;
304 header->version = 1;
305 header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native;
306 header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4;
307 header->table_enc = DW_EH_PE_abs|DW_EH_PE_native;
308 put_unaligned((unsigned long)table->address, &header->eh_frame_ptr);
309 BUILD_BUG_ON(offsetof(typeof(*header), fde_count)
310 % __alignof(typeof(header->fde_count)));
311 header->fde_count = n;
312
313 BUILD_BUG_ON(offsetof(typeof(*header), table)
314 % __alignof(typeof(*header->table)));
315 for (fde = table->address, tableSize = table->size, n = 0;
316 tableSize;
317 tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
318 const u32 *cie = fde + 1 - fde[1] / sizeof(*fde);
319
320 if (!fde[1])
321 continue; /* this is a CIE */
322 ptr = (const u8 *)(fde + 2);
323 header->table[n].start = read_pointer(&ptr,
324 (const u8 *)(fde + 1) + *fde,
325 fde_pointer_type(cie), 0, 0);
326 header->table[n].fde = (unsigned long)fde;
327 ++n;
328 }
329 WARN_ON(n != header->fde_count);
330
331 sort(header->table,
332 n,
333 sizeof(*header->table),
334 cmp_eh_frame_hdr_table_entries,
335 swap_eh_frame_hdr_table_entries);
336
337 table->hdrsz = hdrSize;
338 smp_wmb();
339 table->header = (const void *)header;
340}
341
342static void *__init balloc(unsigned long sz)
343{
344 return __alloc_bootmem_nopanic(sz,
345 sizeof(unsigned int),
346 __pa(MAX_DMA_ADDRESS));
347}
348
349void __init unwind_setup(void)
350{
351 setup_unwind_table(&root_table, balloc);
173} 352}
174 353
175#ifdef CONFIG_MODULES 354#ifdef CONFIG_MODULES
@@ -193,7 +372,8 @@ void *unwind_add_table(struct module *module,
193 init_unwind_table(table, module->name, 372 init_unwind_table(table, module->name,
194 module->module_core, module->core_size, 373 module->module_core, module->core_size,
195 module->module_init, module->init_size, 374 module->module_init, module->init_size,
196 table_start, table_size); 375 table_start, table_size,
376 NULL, 0);
197 377
198 if (last_table) 378 if (last_table)
199 last_table->link = table; 379 last_table->link = table;
@@ -303,9 +483,31 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
303 return value; 483 return value;
304} 484}
305 485
486static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
487{
488 const u32 *cie;
489
490 if (!*fde || (*fde & (sizeof(*fde) - 1)))
491 return &bad_cie;
492 if (!fde[1])
493 return &not_fde; /* this is a CIE */
494 if ((fde[1] & (sizeof(*fde) - 1))
495 || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address)
496 return NULL; /* this is not a valid FDE */
497 cie = fde + 1 - fde[1] / sizeof(*fde);
498 if (*cie <= sizeof(*cie) + 4
499 || *cie >= fde[1] - sizeof(*fde)
500 || (*cie & (sizeof(*cie) - 1))
501 || cie[1])
502 return NULL; /* this is not a (valid) CIE */
503 return cie;
504}
505
306static unsigned long read_pointer(const u8 **pLoc, 506static unsigned long read_pointer(const u8 **pLoc,
307 const void *end, 507 const void *end,
308 signed ptrType) 508 signed ptrType,
509 unsigned long text_base,
510 unsigned long data_base)
309{ 511{
310 unsigned long value = 0; 512 unsigned long value = 0;
311 union { 513 union {
@@ -317,13 +519,17 @@ static unsigned long read_pointer(const u8 **pLoc,
317 const unsigned long *pul; 519 const unsigned long *pul;
318 } ptr; 520 } ptr;
319 521
320 if (ptrType < 0 || ptrType == DW_EH_PE_omit) 522 if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
523 dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end);
321 return 0; 524 return 0;
525 }
322 ptr.p8 = *pLoc; 526 ptr.p8 = *pLoc;
323 switch(ptrType & DW_EH_PE_FORM) { 527 switch(ptrType & DW_EH_PE_FORM) {
324 case DW_EH_PE_data2: 528 case DW_EH_PE_data2:
325 if (end < (const void *)(ptr.p16u + 1)) 529 if (end < (const void *)(ptr.p16u + 1)) {
530 dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end);
326 return 0; 531 return 0;
532 }
327 if(ptrType & DW_EH_PE_signed) 533 if(ptrType & DW_EH_PE_signed)
328 value = get_unaligned(ptr.p16s++); 534 value = get_unaligned(ptr.p16s++);
329 else 535 else
@@ -331,8 +537,10 @@ static unsigned long read_pointer(const u8 **pLoc,
331 break; 537 break;
332 case DW_EH_PE_data4: 538 case DW_EH_PE_data4:
333#ifdef CONFIG_64BIT 539#ifdef CONFIG_64BIT
334 if (end < (const void *)(ptr.p32u + 1)) 540 if (end < (const void *)(ptr.p32u + 1)) {
541 dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end);
335 return 0; 542 return 0;
543 }
336 if(ptrType & DW_EH_PE_signed) 544 if(ptrType & DW_EH_PE_signed)
337 value = get_unaligned(ptr.p32s++); 545 value = get_unaligned(ptr.p32s++);
338 else 546 else
@@ -344,8 +552,10 @@ static unsigned long read_pointer(const u8 **pLoc,
344 BUILD_BUG_ON(sizeof(u32) != sizeof(value)); 552 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
345#endif 553#endif
346 case DW_EH_PE_native: 554 case DW_EH_PE_native:
347 if (end < (const void *)(ptr.pul + 1)) 555 if (end < (const void *)(ptr.pul + 1)) {
556 dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end);
348 return 0; 557 return 0;
558 }
349 value = get_unaligned(ptr.pul++); 559 value = get_unaligned(ptr.pul++);
350 break; 560 break;
351 case DW_EH_PE_leb128: 561 case DW_EH_PE_leb128:
@@ -353,10 +563,14 @@ static unsigned long read_pointer(const u8 **pLoc,
353 value = ptrType & DW_EH_PE_signed 563 value = ptrType & DW_EH_PE_signed
354 ? get_sleb128(&ptr.p8, end) 564 ? get_sleb128(&ptr.p8, end)
355 : get_uleb128(&ptr.p8, end); 565 : get_uleb128(&ptr.p8, end);
356 if ((const void *)ptr.p8 > end) 566 if ((const void *)ptr.p8 > end) {
567 dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end);
357 return 0; 568 return 0;
569 }
358 break; 570 break;
359 default: 571 default:
572 dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
573 ptrType, ptr.p8, end);
360 return 0; 574 return 0;
361 } 575 }
362 switch(ptrType & DW_EH_PE_ADJUST) { 576 switch(ptrType & DW_EH_PE_ADJUST) {
@@ -365,12 +579,33 @@ static unsigned long read_pointer(const u8 **pLoc,
365 case DW_EH_PE_pcrel: 579 case DW_EH_PE_pcrel:
366 value += (unsigned long)*pLoc; 580 value += (unsigned long)*pLoc;
367 break; 581 break;
582 case DW_EH_PE_textrel:
583 if (likely(text_base)) {
584 value += text_base;
585 break;
586 }
587 dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
588 ptrType, *pLoc, end);
589 return 0;
590 case DW_EH_PE_datarel:
591 if (likely(data_base)) {
592 value += data_base;
593 break;
594 }
595 dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
596 ptrType, *pLoc, end);
597 return 0;
368 default: 598 default:
599 dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
600 ptrType, *pLoc, end);
369 return 0; 601 return 0;
370 } 602 }
371 if ((ptrType & DW_EH_PE_indirect) 603 if ((ptrType & DW_EH_PE_indirect)
372 && __get_user(value, (unsigned long *)value)) 604 && probe_kernel_address((unsigned long *)value, value)) {
605 dprintk(1, "Cannot read indirect value %lx (%p,%p).",
606 value, *pLoc, end);
373 return 0; 607 return 0;
608 }
374 *pLoc = ptr.p8; 609 *pLoc = ptr.p8;
375 610
376 return value; 611 return value;
@@ -413,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie)
413 case 'P': { 648 case 'P': {
414 signed ptrType = *ptr++; 649 signed ptrType = *ptr++;
415 650
416 if (!read_pointer(&ptr, end, ptrType) || ptr > end) 651 if (!read_pointer(&ptr, end, ptrType, 0, 0)
652 || ptr > end)
417 return -1; 653 return -1;
418 } 654 }
419 break; 655 break;
@@ -473,7 +709,8 @@ static int processCFI(const u8 *start,
473 case DW_CFA_nop: 709 case DW_CFA_nop:
474 break; 710 break;
475 case DW_CFA_set_loc: 711 case DW_CFA_set_loc:
476 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) 712 state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0);
713 if (state->loc == 0)
477 result = 0; 714 result = 0;
478 break; 715 break;
479 case DW_CFA_advance_loc1: 716 case DW_CFA_advance_loc1:
@@ -519,8 +756,10 @@ static int processCFI(const u8 *start,
519 state->label = NULL; 756 state->label = NULL;
520 return 1; 757 return 1;
521 } 758 }
522 if (state->stackDepth >= MAX_STACK_DEPTH) 759 if (state->stackDepth >= MAX_STACK_DEPTH) {
760 dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end);
523 return 0; 761 return 0;
762 }
524 state->stack[state->stackDepth++] = ptr.p8; 763 state->stack[state->stackDepth++] = ptr.p8;
525 break; 764 break;
526 case DW_CFA_restore_state: 765 case DW_CFA_restore_state:
@@ -535,8 +774,10 @@ static int processCFI(const u8 *start,
535 result = processCFI(start, end, 0, ptrType, state); 774 result = processCFI(start, end, 0, ptrType, state);
536 state->loc = loc; 775 state->loc = loc;
537 state->label = label; 776 state->label = label;
538 } else 777 } else {
778 dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end);
539 return 0; 779 return 0;
780 }
540 break; 781 break;
541 case DW_CFA_def_cfa: 782 case DW_CFA_def_cfa:
542 state->cfa.reg = get_uleb128(&ptr.p8, end); 783 state->cfa.reg = get_uleb128(&ptr.p8, end);
@@ -568,6 +809,7 @@ static int processCFI(const u8 *start,
568 break; 809 break;
569 case DW_CFA_GNU_window_save: 810 case DW_CFA_GNU_window_save:
570 default: 811 default:
812 dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end);
571 result = 0; 813 result = 0;
572 break; 814 break;
573 } 815 }
@@ -583,12 +825,17 @@ static int processCFI(const u8 *start,
583 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); 825 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
584 break; 826 break;
585 } 827 }
586 if (ptr.p8 > end) 828 if (ptr.p8 > end) {
829 dprintk(1, "Data overrun (%p,%p).", ptr.p8, end);
587 result = 0; 830 result = 0;
831 }
588 if (result && targetLoc != 0 && targetLoc < state->loc) 832 if (result && targetLoc != 0 && targetLoc < state->loc)
589 return 1; 833 return 1;
590 } 834 }
591 835
836 if (result && ptr.p8 < end)
837 dprintk(1, "Data underrun (%p,%p).", ptr.p8, end);
838
592 return result 839 return result
593 && ptr.p8 == end 840 && ptr.p8 == end
594 && (targetLoc == 0 841 && (targetLoc == 0
@@ -605,54 +852,122 @@ int unwind(struct unwind_frame_info *frame)
605#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) 852#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
606 const u32 *fde = NULL, *cie = NULL; 853 const u32 *fde = NULL, *cie = NULL;
607 const u8 *ptr = NULL, *end = NULL; 854 const u8 *ptr = NULL, *end = NULL;
608 unsigned long pc = UNW_PC(frame) - frame->call_frame; 855 unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
609 unsigned long startLoc = 0, endLoc = 0, cfa; 856 unsigned long startLoc = 0, endLoc = 0, cfa;
610 unsigned i; 857 unsigned i;
611 signed ptrType = -1; 858 signed ptrType = -1;
612 uleb128_t retAddrReg = 0; 859 uleb128_t retAddrReg = 0;
613 struct unwind_table *table; 860 const struct unwind_table *table;
614 struct unwind_state state; 861 struct unwind_state state;
615 862
616 if (UNW_PC(frame) == 0) 863 if (UNW_PC(frame) == 0)
617 return -EINVAL; 864 return -EINVAL;
618 if ((table = find_table(pc)) != NULL 865 if ((table = find_table(pc)) != NULL
619 && !(table->size & (sizeof(*fde) - 1))) { 866 && !(table->size & (sizeof(*fde) - 1))) {
620 unsigned long tableSize = table->size; 867 const u8 *hdr = table->header;
621 868 unsigned long tableSize;
622 for (fde = table->address; 869
623 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; 870 smp_rmb();
624 tableSize -= sizeof(*fde) + *fde, 871 if (hdr && hdr[0] == 1) {
625 fde += 1 + *fde / sizeof(*fde)) { 872 switch(hdr[3] & DW_EH_PE_FORM) {
626 if (!*fde || (*fde & (sizeof(*fde) - 1))) 873 case DW_EH_PE_native: tableSize = sizeof(unsigned long); break;
627 break; 874 case DW_EH_PE_data2: tableSize = 2; break;
628 if (!fde[1]) 875 case DW_EH_PE_data4: tableSize = 4; break;
629 continue; /* this is a CIE */ 876 case DW_EH_PE_data8: tableSize = 8; break;
630 if ((fde[1] & (sizeof(*fde) - 1)) 877 default: tableSize = 0; break;
631 || fde[1] > (unsigned long)(fde + 1) 878 }
632 - (unsigned long)table->address) 879 ptr = hdr + 4;
633 continue; /* this is not a valid FDE */ 880 end = hdr + table->hdrsz;
634 cie = fde + 1 - fde[1] / sizeof(*fde); 881 if (tableSize
635 if (*cie <= sizeof(*cie) + 4 882 && read_pointer(&ptr, end, hdr[1], 0, 0)
636 || *cie >= fde[1] - sizeof(*fde) 883 == (unsigned long)table->address
637 || (*cie & (sizeof(*cie) - 1)) 884 && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0
638 || cie[1] 885 && i == (end - ptr) / (2 * tableSize)
639 || (ptrType = fde_pointer_type(cie)) < 0) { 886 && !((end - ptr) % (2 * tableSize))) {
640 cie = NULL; /* this is not a (valid) CIE */ 887 do {
641 continue; 888 const u8 *cur = ptr + (i / 2) * (2 * tableSize);
889
890 startLoc = read_pointer(&cur,
891 cur + tableSize,
892 hdr[3], 0,
893 (unsigned long)hdr);
894 if (pc < startLoc)
895 i /= 2;
896 else {
897 ptr = cur - tableSize;
898 i = (i + 1) / 2;
899 }
900 } while (startLoc && i > 1);
901 if (i == 1
902 && (startLoc = read_pointer(&ptr,
903 ptr + tableSize,
904 hdr[3], 0,
905 (unsigned long)hdr)) != 0
906 && pc >= startLoc)
907 fde = (void *)read_pointer(&ptr,
908 ptr + tableSize,
909 hdr[3], 0,
910 (unsigned long)hdr);
642 } 911 }
912 }
913 if(hdr && !fde)
914 dprintk(3, "Binary lookup for %lx failed.", pc);
915
916 if (fde != NULL) {
917 cie = cie_for_fde(fde, table);
643 ptr = (const u8 *)(fde + 2); 918 ptr = (const u8 *)(fde + 2);
644 startLoc = read_pointer(&ptr, 919 if(cie != NULL
645 (const u8 *)(fde + 1) + *fde, 920 && cie != &bad_cie
646 ptrType); 921 && cie != &not_fde
647 endLoc = startLoc 922 && (ptrType = fde_pointer_type(cie)) >= 0
648 + read_pointer(&ptr, 923 && read_pointer(&ptr,
649 (const u8 *)(fde + 1) + *fde, 924 (const u8 *)(fde + 1) + *fde,
650 ptrType & DW_EH_PE_indirect 925 ptrType, 0, 0) == startLoc) {
651 ? ptrType 926 if (!(ptrType & DW_EH_PE_indirect))
652 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); 927 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
653 if (pc >= startLoc && pc < endLoc) 928 endLoc = startLoc
654 break; 929 + read_pointer(&ptr,
655 cie = NULL; 930 (const u8 *)(fde + 1) + *fde,
931 ptrType, 0, 0);
932 if(pc >= endLoc)
933 fde = NULL;
934 } else
935 fde = NULL;
936 if(!fde)
937 dprintk(1, "Binary lookup result for %lx discarded.", pc);
938 }
939 if (fde == NULL) {
940 for (fde = table->address, tableSize = table->size;
941 cie = NULL, tableSize > sizeof(*fde)
942 && tableSize - sizeof(*fde) >= *fde;
943 tableSize -= sizeof(*fde) + *fde,
944 fde += 1 + *fde / sizeof(*fde)) {
945 cie = cie_for_fde(fde, table);
946 if (cie == &bad_cie) {
947 cie = NULL;
948 break;
949 }
950 if (cie == NULL
951 || cie == &not_fde
952 || (ptrType = fde_pointer_type(cie)) < 0)
953 continue;
954 ptr = (const u8 *)(fde + 2);
955 startLoc = read_pointer(&ptr,
956 (const u8 *)(fde + 1) + *fde,
957 ptrType, 0, 0);
958 if (!startLoc)
959 continue;
960 if (!(ptrType & DW_EH_PE_indirect))
961 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
962 endLoc = startLoc
963 + read_pointer(&ptr,
964 (const u8 *)(fde + 1) + *fde,
965 ptrType, 0, 0);
966 if (pc >= startLoc && pc < endLoc)
967 break;
968 }
969 if(!fde)
970 dprintk(3, "Linear lookup for %lx failed.", pc);
656 } 971 }
657 } 972 }
658 if (cie != NULL) { 973 if (cie != NULL) {
@@ -686,6 +1001,8 @@ int unwind(struct unwind_frame_info *frame)
686 if (ptr >= end || *ptr) 1001 if (ptr >= end || *ptr)
687 cie = NULL; 1002 cie = NULL;
688 } 1003 }
1004 if(!cie)
1005 dprintk(1, "CIE unusable (%p,%p).", ptr, end);
689 ++ptr; 1006 ++ptr;
690 } 1007 }
691 if (cie != NULL) { 1008 if (cie != NULL) {
@@ -695,17 +1012,27 @@ int unwind(struct unwind_frame_info *frame)
695 state.dataAlign = get_sleb128(&ptr, end); 1012 state.dataAlign = get_sleb128(&ptr, end);
696 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) 1013 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
697 cie = NULL; 1014 cie = NULL;
698 else { 1015 else if (UNW_PC(frame) % state.codeAlign
1016 || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
1017 dprintk(1, "Input pointer(s) misaligned (%lx,%lx).",
1018 UNW_PC(frame), UNW_SP(frame));
1019 return -EPERM;
1020 } else {
699 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); 1021 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
700 /* skip augmentation */ 1022 /* skip augmentation */
701 if (((const char *)(cie + 2))[1] == 'z') 1023 if (((const char *)(cie + 2))[1] == 'z') {
702 ptr += get_uleb128(&ptr, end); 1024 uleb128_t augSize = get_uleb128(&ptr, end);
1025
1026 ptr += augSize;
1027 }
703 if (ptr > end 1028 if (ptr > end
704 || retAddrReg >= ARRAY_SIZE(reg_info) 1029 || retAddrReg >= ARRAY_SIZE(reg_info)
705 || REG_INVALID(retAddrReg) 1030 || REG_INVALID(retAddrReg)
706 || reg_info[retAddrReg].width != sizeof(unsigned long)) 1031 || reg_info[retAddrReg].width != sizeof(unsigned long))
707 cie = NULL; 1032 cie = NULL;
708 } 1033 }
1034 if(!cie)
1035 dprintk(1, "CIE validation failed (%p,%p).", ptr, end);
709 } 1036 }
710 if (cie != NULL) { 1037 if (cie != NULL) {
711 state.cieStart = ptr; 1038 state.cieStart = ptr;
@@ -719,13 +1046,15 @@ int unwind(struct unwind_frame_info *frame)
719 if ((ptr += augSize) > end) 1046 if ((ptr += augSize) > end)
720 fde = NULL; 1047 fde = NULL;
721 } 1048 }
1049 if(!fde)
1050 dprintk(1, "FDE validation failed (%p,%p).", ptr, end);
722 } 1051 }
723 if (cie == NULL || fde == NULL) { 1052 if (cie == NULL || fde == NULL) {
724#ifdef CONFIG_FRAME_POINTER 1053#ifdef CONFIG_FRAME_POINTER
725 unsigned long top, bottom; 1054 unsigned long top, bottom;
726#endif
727 1055
728#ifdef CONFIG_FRAME_POINTER 1056 if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long))
1057 return -EPERM;
729 top = STACK_TOP(frame->task); 1058 top = STACK_TOP(frame->task);
730 bottom = STACK_BOTTOM(frame->task); 1059 bottom = STACK_BOTTOM(frame->task);
731# if FRAME_RETADDR_OFFSET < 0 1060# if FRAME_RETADDR_OFFSET < 0
@@ -741,18 +1070,19 @@ int unwind(struct unwind_frame_info *frame)
741 & (sizeof(unsigned long) - 1))) { 1070 & (sizeof(unsigned long) - 1))) {
742 unsigned long link; 1071 unsigned long link;
743 1072
744 if (!__get_user(link, 1073 if (!probe_kernel_address(
745 (unsigned long *)(UNW_FP(frame) 1074 (unsigned long *)(UNW_FP(frame)
746 + FRAME_LINK_OFFSET)) 1075 + FRAME_LINK_OFFSET),
1076 link)
747# if FRAME_RETADDR_OFFSET < 0 1077# if FRAME_RETADDR_OFFSET < 0
748 && link > bottom && link < UNW_FP(frame) 1078 && link > bottom && link < UNW_FP(frame)
749# else 1079# else
750 && link > UNW_FP(frame) && link < bottom 1080 && link > UNW_FP(frame) && link < bottom
751# endif 1081# endif
752 && !(link & (sizeof(link) - 1)) 1082 && !(link & (sizeof(link) - 1))
753 && !__get_user(UNW_PC(frame), 1083 && !probe_kernel_address(
754 (unsigned long *)(UNW_FP(frame) 1084 (unsigned long *)(UNW_FP(frame)
755 + FRAME_RETADDR_OFFSET))) { 1085 + FRAME_RETADDR_OFFSET), UNW_PC(frame))) {
756 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET 1086 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
757# if FRAME_RETADDR_OFFSET < 0 1087# if FRAME_RETADDR_OFFSET < 0
758 - 1088 -
@@ -775,8 +1105,11 @@ int unwind(struct unwind_frame_info *frame)
775 || state.regs[retAddrReg].where == Nowhere 1105 || state.regs[retAddrReg].where == Nowhere
776 || state.cfa.reg >= ARRAY_SIZE(reg_info) 1106 || state.cfa.reg >= ARRAY_SIZE(reg_info)
777 || reg_info[state.cfa.reg].width != sizeof(unsigned long) 1107 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
778 || state.cfa.offs % sizeof(unsigned long)) 1108 || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
1109 || state.cfa.offs % sizeof(unsigned long)) {
1110 dprintk(1, "Unusable unwind info (%p,%p).", ptr, end);
779 return -EIO; 1111 return -EIO;
1112 }
780 /* update frame */ 1113 /* update frame */
781#ifndef CONFIG_AS_CFI_SIGNAL_FRAME 1114#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
782 if(frame->call_frame 1115 if(frame->call_frame
@@ -795,10 +1128,14 @@ int unwind(struct unwind_frame_info *frame)
795#else 1128#else
796# define CASES CASE(8); CASE(16); CASE(32); CASE(64) 1129# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
797#endif 1130#endif
1131 pc = UNW_PC(frame);
1132 sp = UNW_SP(frame);
798 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { 1133 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
799 if (REG_INVALID(i)) { 1134 if (REG_INVALID(i)) {
800 if (state.regs[i].where == Nowhere) 1135 if (state.regs[i].where == Nowhere)
801 continue; 1136 continue;
1137 dprintk(1, "Cannot restore register %u (%d).",
1138 i, state.regs[i].where);
802 return -EIO; 1139 return -EIO;
803 } 1140 }
804 switch(state.regs[i].where) { 1141 switch(state.regs[i].where) {
@@ -807,8 +1144,11 @@ int unwind(struct unwind_frame_info *frame)
807 case Register: 1144 case Register:
808 if (state.regs[i].value >= ARRAY_SIZE(reg_info) 1145 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
809 || REG_INVALID(state.regs[i].value) 1146 || REG_INVALID(state.regs[i].value)
810 || reg_info[i].width > reg_info[state.regs[i].value].width) 1147 || reg_info[i].width > reg_info[state.regs[i].value].width) {
1148 dprintk(1, "Cannot restore register %u from register %lu.",
1149 i, state.regs[i].value);
811 return -EIO; 1150 return -EIO;
1151 }
812 switch(reg_info[state.regs[i].value].width) { 1152 switch(reg_info[state.regs[i].value].width) {
813#define CASE(n) \ 1153#define CASE(n) \
814 case sizeof(u##n): \ 1154 case sizeof(u##n): \
@@ -818,6 +1158,9 @@ int unwind(struct unwind_frame_info *frame)
818 CASES; 1158 CASES;
819#undef CASE 1159#undef CASE
820 default: 1160 default:
1161 dprintk(1, "Unsupported register size %u (%lu).",
1162 reg_info[state.regs[i].value].width,
1163 state.regs[i].value);
821 return -EIO; 1164 return -EIO;
822 } 1165 }
823 break; 1166 break;
@@ -842,12 +1185,17 @@ int unwind(struct unwind_frame_info *frame)
842 CASES; 1185 CASES;
843#undef CASE 1186#undef CASE
844 default: 1187 default:
1188 dprintk(1, "Unsupported register size %u (%u).",
1189 reg_info[i].width, i);
845 return -EIO; 1190 return -EIO;
846 } 1191 }
847 break; 1192 break;
848 case Value: 1193 case Value:
849 if (reg_info[i].width != sizeof(unsigned long)) 1194 if (reg_info[i].width != sizeof(unsigned long)) {
1195 dprintk(1, "Unsupported value size %u (%u).",
1196 reg_info[i].width, i);
850 return -EIO; 1197 return -EIO;
1198 }
851 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value 1199 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
852 * state.dataAlign; 1200 * state.dataAlign;
853 break; 1201 break;
@@ -859,15 +1207,20 @@ int unwind(struct unwind_frame_info *frame)
859 % sizeof(unsigned long) 1207 % sizeof(unsigned long)
860 || addr < startLoc 1208 || addr < startLoc
861 || addr + sizeof(unsigned long) < addr 1209 || addr + sizeof(unsigned long) < addr
862 || addr + sizeof(unsigned long) > endLoc) 1210 || addr + sizeof(unsigned long) > endLoc) {
1211 dprintk(1, "Bad memory location %lx (%lx).",
1212 addr, state.regs[i].value);
863 return -EIO; 1213 return -EIO;
1214 }
864 switch(reg_info[i].width) { 1215 switch(reg_info[i].width) {
865#define CASE(n) case sizeof(u##n): \ 1216#define CASE(n) case sizeof(u##n): \
866 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ 1217 probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
867 break 1218 break
868 CASES; 1219 CASES;
869#undef CASE 1220#undef CASE
870 default: 1221 default:
1222 dprintk(1, "Unsupported memory size %u (%u).",
1223 reg_info[i].width, i);
871 return -EIO; 1224 return -EIO;
872 } 1225 }
873 } 1226 }
@@ -875,6 +1228,17 @@ int unwind(struct unwind_frame_info *frame)
875 } 1228 }
876 } 1229 }
877 1230
1231 if (UNW_PC(frame) % state.codeAlign
1232 || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
1233 dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
1234 UNW_PC(frame), UNW_SP(frame));
1235 return -EIO;
1236 }
1237 if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) {
1238 dprintk(1, "No progress (%lx,%lx).", pc, sp);
1239 return -EIO;
1240 }
1241
878 return 0; 1242 return 0;
879#undef CASES 1243#undef CASES
880#undef FRAME_REG 1244#undef FRAME_REG
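For reference, the unwind.c hunks above add an .eh_frame_hdr style index: a small header (a version byte, three DW_EH_PE_* encodings, a pointer to the FDE table and an FDE count) followed by (initial location, FDE address) pairs sorted by start address, which unwind() then binary-searches instead of walking every FDE linearly. The sketch below only illustrates that lookup idea; the names hdr_entry and lookup_fde are invented for the example, and the real code decodes each field through read_pointer() with the encodings stored in the header, as shown in the hunks above.

#include <stddef.h>

/* Illustrative only: a flattened, pre-decoded view of the sorted lookup table. */
struct hdr_entry {
        unsigned long start;    /* initial location covered by the FDE */
        unsigned long fde;      /* address of the FDE itself */
};

/* Return the FDE of the last entry whose start is <= pc (0 if pc precedes the table). */
static unsigned long lookup_fde(const struct hdr_entry *tbl, size_t n, unsigned long pc)
{
        size_t lo = 0, hi = n;

        while (lo < hi) {
                size_t mid = lo + (hi - lo) / 2;

                if (tbl[mid].start <= pc)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        /* The caller still has to check pc against the FDE's end address. */
        return lo ? tbl[lo - 1].fde : 0;
}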
diff --git a/kernel/user.c b/kernel/user.c
index 6408c0424291..4869563080e9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
28 28
29static kmem_cache_t *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ]; 30static struct list_head uidhash_table[UIDHASH_SZ];
31 31
32/* 32/*
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
132 if (!up) { 132 if (!up) {
133 struct user_struct *new; 133 struct user_struct *new;
134 134
135 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); 135 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
136 if (!new) 136 if (!new)
137 return NULL; 137 return NULL;
138 new->uid = uid; 138 new->uid = uid;
@@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user)
187 atomic_dec(&old_user->processes); 187 atomic_dec(&old_user->processes);
188 switch_uid_keyring(new_user); 188 switch_uid_keyring(new_user);
189 current->user = new_user; 189 current->user = new_user;
190
191 /*
192 * We need to synchronize with __sigqueue_alloc()
193 * doing a get_uid(p->user).. If that saw the old
194 * user value, we need to wait until it has exited
195 * its critical region before we can free the old
196 * structure.
197 */
198 smp_mb();
199 spin_unlock_wait(&current->sighand->siglock);
200
190 free_uid(old_user); 201 free_uid(old_user);
191 suid_keys(current); 202 suid_keys(current);
192} 203}
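The user.c hunk above is part of the tree-wide move from the kmem_cache_t typedef and the SLAB_* allocation flags to struct kmem_cache and plain GFP_* flags. A minimal sketch of the resulting idiom, with invented names (thing, thing_cachep) and the cache assumed to be created elsewhere with kmem_cache_create():

#include <linux/slab.h>

struct thing {
        int id;
};

/* Assumed to be initialised elsewhere via kmem_cache_create(). */
static struct kmem_cache *thing_cachep;

static struct thing *thing_alloc(void)
{
        /* gfp_t flags (GFP_KERNEL, GFP_ATOMIC, ...) replace the old SLAB_* aliases. */
        return kmem_cache_alloc(thing_cachep, GFP_KERNEL);
}

static void thing_free(struct thing *t)
{
        kmem_cache_free(thing_cachep, t);
}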
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3df9bfc7ff78..db49886bfae1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,9 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h> 30#include <linux/hardirq.h>
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/freezer.h>
33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h>
32 35
33/* 36/*
34 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct {
55 struct task_struct *thread; 58 struct task_struct *thread;
56 59
57 int run_depth; /* Detect run_workqueue() recursion depth */ 60 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
58} ____cacheline_aligned; 63} ____cacheline_aligned;
59 64
60/* 65/*
@@ -80,6 +85,99 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
80 return list_empty(&wq->list); 85 return list_empty(&wq->list);
81} 86}
82 87
88/*
89 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set
91 */
92static inline void set_wq_data(struct work_struct *work, void *wq)
93{
94 unsigned long new;
95
96 BUG_ON(!work_pending(work));
97
98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
99 new |= work->management & WORK_STRUCT_FLAG_MASK;
100 work->management = new;
101}
102
103static inline void *get_wq_data(struct work_struct *work)
104{
105 return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
106}
107
108static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
109{
110 int ret = 0;
111 unsigned long flags;
112
113 spin_lock_irqsave(&cwq->lock, flags);
114 /*
115 * We need to re-validate the work info after we've gotten
116 * the cpu_workqueue lock. We can run the work now iff:
117 *
118 * - the wq_data still matches the cpu_workqueue_struct
119 * - AND the work is still marked pending
120 * - AND the work is still on a list (which will be this
121 * workqueue_struct list)
122 *
123 * All these conditions are important, because we
124 * need to protect against the work being run right
125 * now on another CPU (all but the last one might be
126 * true if it's currently running and has not been
127 * released yet, for example).
128 */
129 if (get_wq_data(work) == cwq
130 && work_pending(work)
131 && !list_empty(&work->entry)) {
132 work_func_t f = work->func;
133 list_del_init(&work->entry);
134 spin_unlock_irqrestore(&cwq->lock, flags);
135
136 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
137 work_release(work);
138 f(work);
139
140 spin_lock_irqsave(&cwq->lock, flags);
141 cwq->remove_sequence++;
142 wake_up(&cwq->work_done);
143 ret = 1;
144 }
145 spin_unlock_irqrestore(&cwq->lock, flags);
146 return ret;
147}
148
149/**
150 * run_scheduled_work - run scheduled work synchronously
151 * @work: work to run
152 *
153 * This checks if the work was pending, and runs it
154 * synchronously if so. It returns a boolean to indicate
155 * whether it had any scheduled work to run or not.
156 *
157 * NOTE! This _only_ works for normal work_structs. You
158 * CANNOT use this for delayed work, because the wq data
159 * for delayed work will not point properly to the per-
160 * CPU workqueue struct, but will change!
161 */
162int fastcall run_scheduled_work(struct work_struct *work)
163{
164 for (;;) {
165 struct cpu_workqueue_struct *cwq;
166
167 if (!work_pending(work))
168 return 0;
169 if (list_empty(&work->entry))
170 return 0;
171 /* NOTE! This depends intimately on __queue_work! */
172 cwq = get_wq_data(work);
173 if (!cwq)
174 return 0;
175 if (__run_work(cwq, work))
176 return 1;
177 }
178}
179EXPORT_SYMBOL(run_scheduled_work);
180
83/* Preempt must be disabled. */ 181/* Preempt must be disabled. */
84static void __queue_work(struct cpu_workqueue_struct *cwq, 182static void __queue_work(struct cpu_workqueue_struct *cwq,
85 struct work_struct *work) 183 struct work_struct *work)
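run_scheduled_work(), added in the hunk above, runs an already queued work item synchronously in the caller's context if it is still pending and on a list; per its comment it must not be used for delayed work. A minimal usage sketch, assuming the prototype is exported via linux/workqueue.h alongside this change and the two-argument DECLARE_WORK() initializer matches the reworked INIT_WORK(); the names flush_events and flush_work are invented:

#include <linux/workqueue.h>

static void flush_events(struct work_struct *work)
{
        /* ... drain whatever was queued for this work item ... */
}

/* Two-argument initializer, matching the reworked INIT_WORK(work, func). */
static DECLARE_WORK(flush_work, flush_events);

static void kick_and_run(void)
{
        schedule_work(&flush_work);        /* mark it pending and queue it on keventd */
        run_scheduled_work(&flush_work);   /* returns 1 if it ran the work here, 0 otherwise */
}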
@@ -87,7 +185,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
87 unsigned long flags; 185 unsigned long flags;
88 186
89 spin_lock_irqsave(&cwq->lock, flags); 187 spin_lock_irqsave(&cwq->lock, flags);
90 work->wq_data = cwq; 188 set_wq_data(work, cwq);
91 list_add_tail(&work->entry, &cwq->worklist); 189 list_add_tail(&work->entry, &cwq->worklist);
92 cwq->insert_sequence++; 190 cwq->insert_sequence++;
93 wake_up(&cwq->more_work); 191 wake_up(&cwq->more_work);
@@ -99,7 +197,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
99 * @wq: workqueue to use 197 * @wq: workqueue to use
100 * @work: work to queue 198 * @work: work to queue
101 * 199 *
102 * Returns non-zero if it was successfully added. 200 * Returns 0 if @work was already on a queue, non-zero otherwise.
103 * 201 *
104 * We queue the work to the CPU it was submitted, but there is no 202 * We queue the work to the CPU it was submitted, but there is no
105 * guarantee that it will be processed by that CPU. 203 * guarantee that it will be processed by that CPU.
@@ -108,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
108{ 206{
109 int ret = 0, cpu = get_cpu(); 207 int ret = 0, cpu = get_cpu();
110 208
111 if (!test_and_set_bit(0, &work->pending)) { 209 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
112 if (unlikely(is_single_threaded(wq))) 210 if (unlikely(is_single_threaded(wq)))
113 cpu = singlethread_cpu; 211 cpu = singlethread_cpu;
114 BUG_ON(!list_empty(&work->entry)); 212 BUG_ON(!list_empty(&work->entry));
@@ -122,38 +220,42 @@ EXPORT_SYMBOL_GPL(queue_work);
122 220
123static void delayed_work_timer_fn(unsigned long __data) 221static void delayed_work_timer_fn(unsigned long __data)
124{ 222{
125 struct work_struct *work = (struct work_struct *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
126 struct workqueue_struct *wq = work->wq_data; 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
127 int cpu = smp_processor_id(); 225 int cpu = smp_processor_id();
128 226
129 if (unlikely(is_single_threaded(wq))) 227 if (unlikely(is_single_threaded(wq)))
130 cpu = singlethread_cpu; 228 cpu = singlethread_cpu;
131 229
132 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 230 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
133} 231}
134 232
135/** 233/**
136 * queue_delayed_work - queue work on a workqueue after delay 234 * queue_delayed_work - queue work on a workqueue after delay
137 * @wq: workqueue to use 235 * @wq: workqueue to use
138 * @work: work to queue 236 * @work: delayable work to queue
139 * @delay: number of jiffies to wait before queueing 237 * @delay: number of jiffies to wait before queueing
140 * 238 *
141 * Returns non-zero if it was successfully added. 239 * Returns 0 if @work was already on a queue, non-zero otherwise.
142 */ 240 */
143int fastcall queue_delayed_work(struct workqueue_struct *wq, 241int fastcall queue_delayed_work(struct workqueue_struct *wq,
144 struct work_struct *work, unsigned long delay) 242 struct delayed_work *dwork, unsigned long delay)
145{ 243{
146 int ret = 0; 244 int ret = 0;
147 struct timer_list *timer = &work->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work;
247
248 if (delay == 0)
249 return queue_work(wq, work);
148 250
149 if (!test_and_set_bit(0, &work->pending)) { 251 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
150 BUG_ON(timer_pending(timer)); 252 BUG_ON(timer_pending(timer));
151 BUG_ON(!list_empty(&work->entry)); 253 BUG_ON(!list_empty(&work->entry));
152 254
153 /* This stores wq for the moment, for the timer_fn */ 255 /* This stores wq for the moment, for the timer_fn */
154 work->wq_data = wq; 256 set_wq_data(work, wq);
155 timer->expires = jiffies + delay; 257 timer->expires = jiffies + delay;
156 timer->data = (unsigned long)work; 258 timer->data = (unsigned long)dwork;
157 timer->function = delayed_work_timer_fn; 259 timer->function = delayed_work_timer_fn;
158 add_timer(timer); 260 add_timer(timer);
159 ret = 1; 261 ret = 1;
@@ -169,22 +271,23 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
169 * @work: work to queue 271 * @work: work to queue
170 * @delay: number of jiffies to wait before queueing 272 * @delay: number of jiffies to wait before queueing
171 * 273 *
172 * Returns non-zero if it was successfully added. 274 * Returns 0 if @work was already on a queue, non-zero otherwise.
173 */ 275 */
174int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 276int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
175 struct work_struct *work, unsigned long delay) 277 struct delayed_work *dwork, unsigned long delay)
176{ 278{
177 int ret = 0; 279 int ret = 0;
178 struct timer_list *timer = &work->timer; 280 struct timer_list *timer = &dwork->timer;
281 struct work_struct *work = &dwork->work;
179 282
180 if (!test_and_set_bit(0, &work->pending)) { 283 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
181 BUG_ON(timer_pending(timer)); 284 BUG_ON(timer_pending(timer));
182 BUG_ON(!list_empty(&work->entry)); 285 BUG_ON(!list_empty(&work->entry));
183 286
184 /* This stores wq for the moment, for the timer_fn */ 287 /* This stores wq for the moment, for the timer_fn */
185 work->wq_data = wq; 288 set_wq_data(work, wq);
186 timer->expires = jiffies + delay; 289 timer->expires = jiffies + delay;
187 timer->data = (unsigned long)work; 290 timer->data = (unsigned long)dwork;
188 timer->function = delayed_work_timer_fn; 291 timer->function = delayed_work_timer_fn;
189 add_timer_on(timer, cpu); 292 add_timer_on(timer, cpu);
190 ret = 1; 293 ret = 1;
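With the two hunks above, delayed work becomes its own type: struct delayed_work wraps the work_struct and its timer, and queue_delayed_work()/queue_delayed_work_on() take that instead of a bare work_struct. A minimal sketch of a self-rearming poll under the new convention, assuming the DECLARE_DELAYED_WORK() initializer that accompanies this API change; the names poll_fn and poll_work are invented:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void poll_fn(struct work_struct *work);

/* Initialises both the embedded work_struct and its timer. */
static DECLARE_DELAYED_WORK(poll_work, poll_fn);

static void poll_fn(struct work_struct *work)
{
        /* ... poll the hardware ..., then rearm ourselves one second out. */
        schedule_delayed_work(&poll_work, HZ);
}

static void poll_start(void)
{
        schedule_delayed_work(&poll_work, HZ);
}

static void poll_stop(void)
{
        /* The handler rearms itself, so a plain cancel can race; use the rearming helper. */
        cancel_rearming_delayed_work(&poll_work);
}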
@@ -212,15 +315,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
212 while (!list_empty(&cwq->worklist)) { 315 while (!list_empty(&cwq->worklist)) {
213 struct work_struct *work = list_entry(cwq->worklist.next, 316 struct work_struct *work = list_entry(cwq->worklist.next,
214 struct work_struct, entry); 317 struct work_struct, entry);
215 void (*f) (void *) = work->func; 318 work_func_t f = work->func;
216 void *data = work->data;
217 319
218 list_del_init(cwq->worklist.next); 320 list_del_init(cwq->worklist.next);
219 spin_unlock_irqrestore(&cwq->lock, flags); 321 spin_unlock_irqrestore(&cwq->lock, flags);
220 322
221 BUG_ON(work->wq_data != cwq); 323 BUG_ON(get_wq_data(work) != cwq);
222 clear_bit(0, &work->pending); 324 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
223 f(data); 325 work_release(work);
326 f(work);
327
328 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
329 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
330 "%s/0x%08x/%d\n",
331 current->comm, preempt_count(),
332 current->pid);
333 printk(KERN_ERR " last function: ");
334 print_symbol("%s\n", (unsigned long)f);
335 debug_show_held_locks(current);
336 dump_stack();
337 }
224 338
225 spin_lock_irqsave(&cwq->lock, flags); 339 spin_lock_irqsave(&cwq->lock, flags);
226 cwq->remove_sequence++; 340 cwq->remove_sequence++;
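The run_workqueue() hunk above is the heart of the interface change: the callback is now a work_func_t that receives the work_struct itself (f(work)) rather than a separate data pointer, so handlers recover their per-object context with the standard container_of(). A minimal sketch of that pattern with invented names (frob_dev, frob_reset):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct frob_dev {
        struct work_struct reset_work;
        int error_count;
};

static void frob_reset(struct work_struct *work)
{
        /* The embedded work_struct leads back to the owning object. */
        struct frob_dev *dev = container_of(work, struct frob_dev, reset_work);

        dev->error_count = 0;
}

static struct frob_dev *frob_alloc(void)
{
        struct frob_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

        if (!dev)
                return NULL;
        INIT_WORK(&dev->reset_work, frob_reset);   /* no third "data" argument any more */
        return dev;
}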
@@ -237,7 +351,8 @@ static int worker_thread(void *__cwq)
237 struct k_sigaction sa; 351 struct k_sigaction sa;
238 sigset_t blocked; 352 sigset_t blocked;
239 353
240 current->flags |= PF_NOFREEZE; 354 if (!cwq->freezeable)
355 current->flags |= PF_NOFREEZE;
241 356
242 set_user_nice(current, -5); 357 set_user_nice(current, -5);
243 358
@@ -260,6 +375,9 @@ static int worker_thread(void *__cwq)
260 375
261 set_current_state(TASK_INTERRUPTIBLE); 376 set_current_state(TASK_INTERRUPTIBLE);
262 while (!kthread_should_stop()) { 377 while (!kthread_should_stop()) {
378 if (cwq->freezeable)
379 try_to_freeze();
380
263 add_wait_queue(&cwq->more_work, &wait); 381 add_wait_queue(&cwq->more_work, &wait);
264 if (list_empty(&cwq->worklist)) 382 if (list_empty(&cwq->worklist))
265 schedule(); 383 schedule();
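The freezeable flag threaded through the hunks above lets a workqueue's worker threads participate in the freezer (they call try_to_freeze() and park during suspend) instead of setting PF_NOFREEZE. A minimal sketch that calls __create_workqueue() directly with the (name, singlethread, freezeable) signature shown further down in this diff; real callers would normally go through the create_*workqueue() wrapper macros, which are assumed here to forward these arguments, and the names io_wq and example_* are invented:

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *io_wq;

static int __init example_init(void)
{
        /* singlethread = 0, freezeable = 1: workers stop cleanly across suspend/resume. */
        io_wq = __create_workqueue("example_io", 0, 1);
        return io_wq ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
        destroy_workqueue(io_wq);
}

module_init(example_init);
module_exit(example_exit);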
@@ -336,7 +454,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
336EXPORT_SYMBOL_GPL(flush_workqueue); 454EXPORT_SYMBOL_GPL(flush_workqueue);
337 455
338static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 456static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
339 int cpu) 457 int cpu, int freezeable)
340{ 458{
341 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 459 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
342 struct task_struct *p; 460 struct task_struct *p;
@@ -346,6 +464,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
346 cwq->thread = NULL; 464 cwq->thread = NULL;
347 cwq->insert_sequence = 0; 465 cwq->insert_sequence = 0;
348 cwq->remove_sequence = 0; 466 cwq->remove_sequence = 0;
467 cwq->freezeable = freezeable;
349 INIT_LIST_HEAD(&cwq->worklist); 468 INIT_LIST_HEAD(&cwq->worklist);
350 init_waitqueue_head(&cwq->more_work); 469 init_waitqueue_head(&cwq->more_work);
351 init_waitqueue_head(&cwq->work_done); 470 init_waitqueue_head(&cwq->work_done);
@@ -361,7 +480,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
361} 480}
362 481
363struct workqueue_struct *__create_workqueue(const char *name, 482struct workqueue_struct *__create_workqueue(const char *name,
364 int singlethread) 483 int singlethread, int freezeable)
365{ 484{
366 int cpu, destroy = 0; 485 int cpu, destroy = 0;
367 struct workqueue_struct *wq; 486 struct workqueue_struct *wq;
@@ -381,7 +500,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
381 mutex_lock(&workqueue_mutex); 500 mutex_lock(&workqueue_mutex);
382 if (singlethread) { 501 if (singlethread) {
383 INIT_LIST_HEAD(&wq->list); 502 INIT_LIST_HEAD(&wq->list);
384 p = create_workqueue_thread(wq, singlethread_cpu); 503 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
385 if (!p) 504 if (!p)
386 destroy = 1; 505 destroy = 1;
387 else 506 else
@@ -389,7 +508,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
389 } else { 508 } else {
390 list_add(&wq->list, &workqueues); 509 list_add(&wq->list, &workqueues);
391 for_each_online_cpu(cpu) { 510 for_each_online_cpu(cpu) {
392 p = create_workqueue_thread(wq, cpu); 511 p = create_workqueue_thread(wq, cpu, freezeable);
393 if (p) { 512 if (p) {
394 kthread_bind(p, cpu); 513 kthread_bind(p, cpu);
395 wake_up_process(p); 514 wake_up_process(p);
@@ -468,38 +587,37 @@ EXPORT_SYMBOL(schedule_work);
468 587
469/** 588/**
470 * schedule_delayed_work - put work task in global workqueue after delay 589 * schedule_delayed_work - put work task in global workqueue after delay
471 * @work: job to be done 590 * @dwork: job to be done
472 * @delay: number of jiffies to wait 591 * @delay: number of jiffies to wait or 0 for immediate execution
473 * 592 *
474 * After waiting for a given time this puts a job in the kernel-global 593 * After waiting for a given time this puts a job in the kernel-global
475 * workqueue. 594 * workqueue.
476 */ 595 */
477int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
478{ 597{
479 return queue_delayed_work(keventd_wq, work, delay); 598 return queue_delayed_work(keventd_wq, dwork, delay);
480} 599}
481EXPORT_SYMBOL(schedule_delayed_work); 600EXPORT_SYMBOL(schedule_delayed_work);
482 601
483/** 602/**
484 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 603 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
485 * @cpu: cpu to use 604 * @cpu: cpu to use
486 * @work: job to be done 605 * @dwork: job to be done
487 * @delay: number of jiffies to wait 606 * @delay: number of jiffies to wait
488 * 607 *
489 * After waiting for a given time this puts a job in the kernel-global 608 * After waiting for a given time this puts a job in the kernel-global
490 * workqueue on the specified CPU. 609 * workqueue on the specified CPU.
491 */ 610 */
492int schedule_delayed_work_on(int cpu, 611int schedule_delayed_work_on(int cpu,
493 struct work_struct *work, unsigned long delay) 612 struct delayed_work *dwork, unsigned long delay)
494{ 613{
495 return queue_delayed_work_on(cpu, keventd_wq, work, delay); 614 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
496} 615}
497EXPORT_SYMBOL(schedule_delayed_work_on); 616EXPORT_SYMBOL(schedule_delayed_work_on);
498 617
499/** 618/**
500 * schedule_on_each_cpu - call a function on each online CPU from keventd 619 * schedule_on_each_cpu - call a function on each online CPU from keventd
501 * @func: the function to call 620 * @func: the function to call
502 * @info: a pointer to pass to func()
503 * 621 *
504 * Returns zero on success. 622 * Returns zero on success.
505 * Returns -ve errno on failure. 623 * Returns -ve errno on failure.
@@ -508,7 +626,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
508 * 626 *
509 * schedule_on_each_cpu() is very slow. 627 * schedule_on_each_cpu() is very slow.
510 */ 628 */
511int schedule_on_each_cpu(void (*func)(void *info), void *info) 629int schedule_on_each_cpu(work_func_t func)
512{ 630{
513 int cpu; 631 int cpu;
514 struct work_struct *works; 632 struct work_struct *works;
@@ -519,7 +637,7 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
519 637
520 mutex_lock(&workqueue_mutex); 638 mutex_lock(&workqueue_mutex);
521 for_each_online_cpu(cpu) { 639 for_each_online_cpu(cpu) {
522 INIT_WORK(per_cpu_ptr(works, cpu), func, info); 640 INIT_WORK(per_cpu_ptr(works, cpu), func);
523 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 641 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
524 per_cpu_ptr(works, cpu)); 642 per_cpu_ptr(works, cpu));
525 } 643 }
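In the two hunks above schedule_on_each_cpu() drops its void *info argument and takes just a work_func_t, running the callback once on every online CPU from keventd. A minimal sketch with an invented callback name (show_cpu); raw_smp_processor_id() is used because the keventd threads are bound to their CPUs but run with preemption enabled:

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void show_cpu(struct work_struct *unused)
{
        /* Runs once per online CPU, in the context of that CPU's keventd thread. */
        printk(KERN_DEBUG "example: callback on CPU %d\n", raw_smp_processor_id());
}

static int poke_all_cpus(void)
{
        return schedule_on_each_cpu(show_cpu);   /* 0 on success, negative errno on failure */
}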
@@ -539,12 +657,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
539 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 657 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
540 * work whose handler rearms the delayed work. 658 * work whose handler rearms the delayed work.
541 * @wq: the controlling workqueue structure 659 * @wq: the controlling workqueue structure
542 * @work: the delayed work struct 660 * @dwork: the delayed work struct
543 */ 661 */
544void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, 662void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
545 struct work_struct *work) 663 struct delayed_work *dwork)
546{ 664{
547 while (!cancel_delayed_work(work)) 665 while (!cancel_delayed_work(dwork))
548 flush_workqueue(wq); 666 flush_workqueue(wq);
549} 667}
550EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); 668EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
@@ -552,18 +670,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
552/** 670/**
553 * cancel_rearming_delayed_work - reliably kill off a delayed keventd 671 * cancel_rearming_delayed_work - reliably kill off a delayed keventd
554 * work whose handler rearms the delayed work. 672 * work whose handler rearms the delayed work.
555 * @work: the delayed work struct 673 * @dwork: the delayed work struct
556 */ 674 */
557void cancel_rearming_delayed_work(struct work_struct *work) 675void cancel_rearming_delayed_work(struct delayed_work *dwork)
558{ 676{
559 cancel_rearming_delayed_workqueue(keventd_wq, work); 677 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
560} 678}
561EXPORT_SYMBOL(cancel_rearming_delayed_work); 679EXPORT_SYMBOL(cancel_rearming_delayed_work);
562 680
563/** 681/**
564 * execute_in_process_context - reliably execute the routine with user context 682 * execute_in_process_context - reliably execute the routine with user context
565 * @fn: the function to execute 683 * @fn: the function to execute
566 * @data: data to pass to the function
567 * @ew: guaranteed storage for the execute work structure (must 684 * @ew: guaranteed storage for the execute work structure (must
568 * be available when the work executes) 685 * be available when the work executes)
569 * 686 *
@@ -573,15 +690,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
573 * Returns: 0 - function was executed 690 * Returns: 0 - function was executed
574 * 1 - function was scheduled for execution 691 * 1 - function was scheduled for execution
575 */ 692 */
576int execute_in_process_context(void (*fn)(void *data), void *data, 693int execute_in_process_context(work_func_t fn, struct execute_work *ew)
577 struct execute_work *ew)
578{ 694{
579 if (!in_interrupt()) { 695 if (!in_interrupt()) {
580 fn(data); 696 fn(&ew->work);
581 return 0; 697 return 0;
582 } 698 }
583 699
584 INIT_WORK(&ew->work, fn, data); 700 INIT_WORK(&ew->work, fn);
585 schedule_work(&ew->work); 701 schedule_work(&ew->work);
586 702
587 return 1; 703 return 1;
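execute_in_process_context() follows the same conversion in the hunk above: it now takes a work_func_t plus the caller-provided struct execute_work, and the callback receives &ew->work when it has to be deferred. A minimal sketch with invented names (do_release, release_ew); the execute_work storage must stay valid until the callback has run:

#include <linux/workqueue.h>

static struct execute_work release_ew;

static void do_release(struct work_struct *work)
{
        /* May sleep: runs either directly in the caller or later from keventd. */
}

static void release_object(void)
{
        /* Runs do_release() immediately unless called from interrupt context,
         * in which case it is queued and executed later via release_ew. */
        execute_in_process_context(do_release, &release_ew);
}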
@@ -609,7 +725,6 @@ int current_is_keventd(void)
609 725
610} 726}
611 727
612#ifdef CONFIG_HOTPLUG_CPU
613/* Take the work from this (downed) CPU. */ 728/* Take the work from this (downed) CPU. */
614static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 729static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
615{ 730{
@@ -642,7 +757,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
642 mutex_lock(&workqueue_mutex); 757 mutex_lock(&workqueue_mutex);
643 /* Create a new workqueue thread for it. */ 758 /* Create a new workqueue thread for it. */
644 list_for_each_entry(wq, &workqueues, list) { 759 list_for_each_entry(wq, &workqueues, list) {
645 if (!create_workqueue_thread(wq, hotcpu)) { 760 if (!create_workqueue_thread(wq, hotcpu, 0)) {
646 printk("workqueue for %i failed\n", hotcpu); 761 printk("workqueue for %i failed\n", hotcpu);
647 return NOTIFY_BAD; 762 return NOTIFY_BAD;
648 } 763 }
@@ -692,7 +807,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
692 807
693 return NOTIFY_OK; 808 return NOTIFY_OK;
694} 809}
695#endif
696 810
697void init_workqueues(void) 811void init_workqueues(void)
698{ 812{