Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz          |  20
-rw-r--r--  kernel/acct.c              |  29
-rw-r--r--  kernel/audit.c             |   1
-rw-r--r--  kernel/auditfilter.c       |   3
-rw-r--r--  kernel/auditsc.c           |  13
-rw-r--r--  kernel/compat.c            |  33
-rw-r--r--  kernel/configs.c           |   2
-rw-r--r--  kernel/cpu.c               |   8
-rw-r--r--  kernel/cpuset.c            | 126
-rw-r--r--  kernel/delayacct.c         |  19
-rw-r--r--  kernel/dma.c               |   2
-rw-r--r--  kernel/exit.c              |  83
-rw-r--r--  kernel/fork.c              | 132
-rw-r--r--  kernel/futex.c             |  62
-rw-r--r--  kernel/irq/chip.c          |   2
-rw-r--r--  kernel/irq/handle.c        |   6
-rw-r--r--  kernel/irq/manage.c        |   9
-rw-r--r--  kernel/irq/proc.c          |   3
-rw-r--r--  kernel/kallsyms.c          |  33
-rw-r--r--  kernel/kexec.c             |  60
-rw-r--r--  kernel/kmod.c              |  26
-rw-r--r--  kernel/kprobes.c           | 117
-rw-r--r--  kernel/kthread.c           |  13
-rw-r--r--  kernel/latency.c           |   1
-rw-r--r--  kernel/lockdep.c           | 244
-rw-r--r--  kernel/lockdep_internals.h |   2
-rw-r--r--  kernel/lockdep_proc.c      |   6
-rw-r--r--  kernel/module.c            |  75
-rw-r--r--  kernel/mutex-debug.c       |   3
-rw-r--r--  kernel/mutex.c             |   9
-rw-r--r--  kernel/nsproxy.c           |  38
-rw-r--r--  kernel/pid.c               |  77
-rw-r--r--  kernel/posix-timers.c      |   2
-rw-r--r--  kernel/power/Kconfig       |  11
-rw-r--r--  kernel/power/disk.c        | 101
-rw-r--r--  kernel/power/main.c        |  14
-rw-r--r--  kernel/power/power.h       |  32
-rw-r--r--  kernel/power/poweroff.c    |   4
-rw-r--r--  kernel/power/process.c     | 143
-rw-r--r--  kernel/power/snapshot.c    | 860
-rw-r--r--  kernel/power/swap.c        | 347
-rw-r--r--  kernel/power/swsusp.c      |  98
-rw-r--r--  kernel/power/user.c        | 102
-rw-r--r--  kernel/printk.c            |  45
-rw-r--r--  kernel/profile.c           |  47
-rw-r--r--  kernel/rcupdate.c          |   4
-rw-r--r--  kernel/rcutorture.c        |   4
-rw-r--r--  kernel/relay.c             |  20
-rw-r--r--  kernel/resource.c          |   6
-rw-r--r--  kernel/rtmutex-tester.c    |   1
-rw-r--r--  kernel/sched.c             | 554
-rw-r--r--  kernel/signal.c            |  38
-rw-r--r--  kernel/softirq.c           |   2
-rw-r--r--  kernel/spinlock.c          |  21
-rw-r--r--  kernel/sys.c               |  31
-rw-r--r--  kernel/sys_ni.c            |   1
-rw-r--r--  kernel/sysctl.c            | 446
-rw-r--r--  kernel/taskstats.c         | 193
-rw-r--r--  kernel/time/clocksource.c  |   8
-rw-r--r--  kernel/timer.c             | 162
-rw-r--r--  kernel/tsacct.c            |  19
-rw-r--r--  kernel/unwind.c            | 212
-rw-r--r--  kernel/user.c              |  15
-rw-r--r--  kernel/workqueue.c         | 214
64 files changed, 3336 insertions(+), 1678 deletions(-)
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
7 default HZ_250 7 default HZ_250
8 help 8 help
9 Allows the configuration of the timer frequency. It is customary 9 Allows the configuration of the timer frequency. It is customary
10 to have the timer interrupt run at 1000 HZ but 100 HZ may be more 10 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
11 beneficial for servers and NUMA systems that do not need to have 11 beneficial for servers and NUMA systems that do not need to have
12 a fast response for user interaction and that may experience bus 12 a fast response for user interaction and that may experience bus
13 contention and cacheline bounces as a result of timer interrupts. 13 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
19 config HZ_100 19 config HZ_100
20 bool "100 HZ" 20 bool "100 HZ"
21 help 21 help
22 100 HZ is a typical choice for servers, SMP and NUMA systems 22 100 Hz is a typical choice for servers, SMP and NUMA systems
23 with lots of processors that may show reduced performance if 23 with lots of processors that may show reduced performance if
24 too many timer interrupts are occurring. 24 too many timer interrupts are occurring.
25 25
26 config HZ_250 26 config HZ_250
27 bool "250 HZ" 27 bool "250 HZ"
28 help 28 help
29 250 HZ is a good compromise choice allowing server performance 29 250 Hz is a good compromise choice allowing server performance
30 while also showing good interactive responsiveness even 30 while also showing good interactive responsiveness even
31 on SMP and NUMA systems. 31 on SMP and NUMA systems. If you are going to be using NTSC video
32 or multimedia, selected 300Hz instead.
33
34 config HZ_300
35 bool "300 HZ"
36 help
37 300 Hz is a good compromise choice allowing server performance
38 while also showing good interactive responsiveness even
39 on SMP and NUMA systems and exactly dividing by both PAL and
40 NTSC frame rates for video and multimedia work.
32 41
33 config HZ_1000 42 config HZ_1000
34 bool "1000 HZ" 43 bool "1000 HZ"
35 help 44 help
36 1000 HZ is the preferred choice for desktop systems and other 45 1000 Hz is the preferred choice for desktop systems and other
37 systems requiring fast interactive responses to events. 46 systems requiring fast interactive responses to events.
38 47
39endchoice 48endchoice
@@ -42,5 +51,6 @@ config HZ
42 int 51 int
43 default 100 if HZ_100 52 default 100 if HZ_100
44 default 250 if HZ_250 53 default 250 if HZ_250
54 default 300 if HZ_300
45 default 1000 if HZ_1000 55 default 1000 if HZ_1000
46 56
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..70d0d88e5554 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
89 struct timer_list timer; 89 struct timer_list timer;
90}; 90};
91 91
92static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; 92static struct acct_glbs acct_globals __cacheline_aligned =
93 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
93 94
94/* 95/*
95 * Called whenever the timer says to check the free space. 96 * Called whenever the timer says to check the free space.
@@ -117,7 +118,7 @@ static int check_free_space(struct file *file)
117 spin_unlock(&acct_globals.lock); 118 spin_unlock(&acct_globals.lock);
118 119
119 /* May block */ 120 /* May block */
120 if (vfs_statfs(file->f_dentry, &sbuf)) 121 if (vfs_statfs(file->f_path.dentry, &sbuf))
121 return res; 122 return res;
122 suspend = sbuf.f_blocks * SUSPEND; 123 suspend = sbuf.f_blocks * SUSPEND;
123 resume = sbuf.f_blocks * RESUME; 124 resume = sbuf.f_blocks * RESUME;
@@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file)
193 add_timer(&acct_globals.timer); 194 add_timer(&acct_globals.timer);
194 } 195 }
195 if (old_acct) { 196 if (old_acct) {
196 mnt_unpin(old_acct->f_vfsmnt); 197 mnt_unpin(old_acct->f_path.mnt);
197 spin_unlock(&acct_globals.lock); 198 spin_unlock(&acct_globals.lock);
198 do_acct_process(old_acct); 199 do_acct_process(old_acct);
199 filp_close(old_acct, NULL); 200 filp_close(old_acct, NULL);
@@ -211,7 +212,7 @@ static int acct_on(char *name)
211 if (IS_ERR(file)) 212 if (IS_ERR(file))
212 return PTR_ERR(file); 213 return PTR_ERR(file);
213 214
214 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { 215 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
215 filp_close(file, NULL); 216 filp_close(file, NULL);
216 return -EACCES; 217 return -EACCES;
217 } 218 }
@@ -228,11 +229,11 @@ static int acct_on(char *name)
228 } 229 }
229 230
230 spin_lock(&acct_globals.lock); 231 spin_lock(&acct_globals.lock);
231 mnt_pin(file->f_vfsmnt); 232 mnt_pin(file->f_path.mnt);
232 acct_file_reopen(file); 233 acct_file_reopen(file);
233 spin_unlock(&acct_globals.lock); 234 spin_unlock(&acct_globals.lock);
234 235
235 mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ 236 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
236 237
237 return 0; 238 return 0;
238} 239}
@@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
282void acct_auto_close_mnt(struct vfsmount *m) 283void acct_auto_close_mnt(struct vfsmount *m)
283{ 284{
284 spin_lock(&acct_globals.lock); 285 spin_lock(&acct_globals.lock);
285 if (acct_globals.file && acct_globals.file->f_vfsmnt == m) 286 if (acct_globals.file && acct_globals.file->f_path.mnt == m)
286 acct_file_reopen(NULL); 287 acct_file_reopen(NULL);
287 spin_unlock(&acct_globals.lock); 288 spin_unlock(&acct_globals.lock);
288} 289}
@@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
298{ 299{
299 spin_lock(&acct_globals.lock); 300 spin_lock(&acct_globals.lock);
300 if (acct_globals.file && 301 if (acct_globals.file &&
301 acct_globals.file->f_vfsmnt->mnt_sb == sb) { 302 acct_globals.file->f_path.mnt->mnt_sb == sb) {
302 acct_file_reopen(NULL); 303 acct_file_reopen(NULL);
303 } 304 }
304 spin_unlock(&acct_globals.lock); 305 spin_unlock(&acct_globals.lock);
@@ -427,6 +428,7 @@ static void do_acct_process(struct file *file)
427 u64 elapsed; 428 u64 elapsed;
428 u64 run_time; 429 u64 run_time;
429 struct timespec uptime; 430 struct timespec uptime;
431 struct tty_struct *tty;
430 432
431 /* 433 /*
432 * First check to see if there is enough free_space to continue 434 * First check to see if there is enough free_space to continue
@@ -483,16 +485,9 @@ static void do_acct_process(struct file *file)
483 ac.ac_ppid = current->parent->tgid; 485 ac.ac_ppid = current->parent->tgid;
484#endif 486#endif
485 487
486 mutex_lock(&tty_mutex);
487 /* FIXME: Whoever is responsible for current->signal locking needs
488 to use the same locking all over the kernel and document it */
489 read_lock(&tasklist_lock);
490 ac.ac_tty = current->signal->tty ?
491 old_encode_dev(tty_devnum(current->signal->tty)) : 0;
492 read_unlock(&tasklist_lock);
493 mutex_unlock(&tty_mutex);
494
495 spin_lock_irq(&current->sighand->siglock); 488 spin_lock_irq(&current->sighand->siglock);
489 tty = current->signal->tty;
490 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
496 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 491 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
497 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 492 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
498 ac.ac_flag = pacct->ac_flag; 493 ac.ac_flag = pacct->ac_flag;
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h> 59#include <linux/inotify.h>
60#include <linux/freezer.h>
60 61
61#include "audit.h" 62#include "audit.h"
62 63
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..2e896f8ae29e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
636 struct audit_rule *rule; 636 struct audit_rule *rule;
637 int i; 637 int i;
638 638
639 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 639 rule = kzalloc(sizeof(*rule), GFP_KERNEL);
640 if (unlikely(!rule)) 640 if (unlikely(!rule))
641 return NULL; 641 return NULL;
642 memset(rule, 0, sizeof(*rule));
643 642
644 rule->flags = krule->flags | krule->listnr; 643 rule->flags = krule->flags | krule->listnr;
645 rule->action = krule->action; 644 rule->action = krule->action;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 42f2f1179711..298897559ca4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
64#include <linux/tty.h> 64#include <linux/tty.h>
65#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h> 66#include <linux/binfmts.h>
67#include <linux/highmem.h>
67#include <linux/syscalls.h> 68#include <linux/syscalls.h>
68 69
69#include "audit.h" 70#include "audit.h"
@@ -730,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context)
730 printk(KERN_ERR "audit: freed %d contexts\n", count); 731 printk(KERN_ERR "audit: freed %d contexts\n", count);
731} 732}
732 733
733static void audit_log_task_context(struct audit_buffer *ab) 734void audit_log_task_context(struct audit_buffer *ab)
734{ 735{
735 char *ctx = NULL; 736 char *ctx = NULL;
736 ssize_t len = 0; 737 ssize_t len = 0;
@@ -759,6 +760,8 @@ error_path:
759 return; 760 return;
760} 761}
761 762
763EXPORT_SYMBOL(audit_log_task_context);
764
762static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 765static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
763{ 766{
764 char name[sizeof(tsk->comm)]; 767 char name[sizeof(tsk->comm)];
@@ -778,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
778 if ((vma->vm_flags & VM_EXECUTABLE) && 781 if ((vma->vm_flags & VM_EXECUTABLE) &&
779 vma->vm_file) { 782 vma->vm_file) {
780 audit_log_d_path(ab, "exe=", 783 audit_log_d_path(ab, "exe=",
781 vma->vm_file->f_dentry, 784 vma->vm_file->f_path.dentry,
782 vma->vm_file->f_vfsmnt); 785 vma->vm_file->f_path.mnt);
783 break; 786 break;
784 } 787 }
785 vma = vma->vm_next; 788 vma = vma->vm_next;
@@ -823,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
823 context->return_code); 826 context->return_code);
824 827
825 mutex_lock(&tty_mutex); 828 mutex_lock(&tty_mutex);
829 read_lock(&tasklist_lock);
826 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 830 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
827 tty = tsk->signal->tty->name; 831 tty = tsk->signal->tty->name;
828 else 832 else
829 tty = "(none)"; 833 tty = "(none)";
834 read_unlock(&tasklist_lock);
830 audit_log_format(ab, 835 audit_log_format(ab,
831 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 836 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
832 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 837 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1487,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1487 return ctx ? ctx->loginuid : -1; 1492 return ctx ? ctx->loginuid : -1;
1488} 1493}
1489 1494
1495EXPORT_SYMBOL(audit_get_loginuid);
1496
1490/** 1497/**
1491 * __audit_mq_open - record audit data for a POSIX MQ open 1498 * __audit_mq_open - record audit data for a POSIX MQ open
1492 * @oflag: open flag 1499 * @oflag: open flag
diff --git a/kernel/compat.c b/kernel/compat.c
index d4898aad6cfa..6952dd057300 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
982 } 982 }
983 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); 983 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
984} 984}
985
986asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
987 compat_ulong_t maxnode,
988 const compat_ulong_t __user *old_nodes,
989 const compat_ulong_t __user *new_nodes)
990{
991 unsigned long __user *old = NULL;
992 unsigned long __user *new = NULL;
993 nodemask_t tmp_mask;
994 unsigned long nr_bits;
995 unsigned long size;
996
997 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
998 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
999 if (old_nodes) {
1000 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1001 return -EFAULT;
1002 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1003 if (new_nodes)
1004 new = old + size / sizeof(unsigned long);
1005 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1006 return -EFAULT;
1007 }
1008 if (new_nodes) {
1009 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1010 return -EFAULT;
1011 if (new == NULL)
1012 new = compat_alloc_user_space(size);
1013 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1014 return -EFAULT;
1015 }
1016 return sys_migrate_pages(pid, nr_bits + 1, old, new);
1017}
985#endif 1018#endif
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
75 return count; 75 return count;
76} 76}
77 77
78static struct file_operations ikconfig_file_ops = { 78static const struct file_operations ikconfig_file_ops = {
79 .owner = THIS_MODULE, 79 .owner = THIS_MODULE,
80 .read = ikconfig_read_current, 80 .read = ikconfig_read_current,
81}; 81};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 663c920b2234..9124669f4586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void)
58 recursive_depth--; 58 recursive_depth--;
59 return; 59 return;
60 } 60 }
61 mutex_unlock(&cpu_bitmask_lock);
62 recursive = NULL; 61 recursive = NULL;
62 mutex_unlock(&cpu_bitmask_lock);
63} 63}
64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
65 65
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
270 goto out; 270 goto out;
271 } 271 }
272 } 272 }
273 error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); 273
274 if (error) {
275 printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
276 goto out;
277 }
278 /* We take down all of the non-boot CPUs in one shot to avoid races 274 /* We take down all of the non-boot CPUs in one shot to avoid races
279 * with the userspace trying to use the CPU hotplug at the same time 275 * with the userspace trying to use the CPU hotplug at the same time
280 */ 276 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..232aed2b10f9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
413 * 413 *
414 * 414 *
415 * When reading/writing to a file: 415 * When reading/writing to a file:
416 * - the cpuset to use in file->f_dentry->d_parent->d_fsdata 416 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
417 * - the 'cftype' of the file is file->f_dentry->d_fsdata 417 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
418 */ 418 */
419 419
420struct cftype { 420struct cftype {
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
729 } 729 }
730 730
731 /* Remaining checks don't apply to root cpuset */ 731 /* Remaining checks don't apply to root cpuset */
732 if ((par = cur->parent) == NULL) 732 if (cur == &top_cpuset)
733 return 0; 733 return 0;
734 734
735 par = cur->parent;
736
735 /* We must be a subset of our parent cpuset */ 737 /* We must be a subset of our parent cpuset */
736 if (!is_cpuset_subset(trial, par)) 738 if (!is_cpuset_subset(trial, par))
737 return -EACCES; 739 return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1060 cpu_exclusive_changed = 1062 cpu_exclusive_changed =
1061 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1063 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
1062 mutex_lock(&callback_mutex); 1064 mutex_lock(&callback_mutex);
1063 if (turning_on) 1065 cs->flags = trialcs.flags;
1064 set_bit(bit, &cs->flags);
1065 else
1066 clear_bit(bit, &cs->flags);
1067 mutex_unlock(&callback_mutex); 1066 mutex_unlock(&callback_mutex);
1068 1067
1069 if (cpu_exclusive_changed) 1068 if (cpu_exclusive_changed)
@@ -1281,18 +1280,19 @@ typedef enum {
1281 FILE_TASKLIST, 1280 FILE_TASKLIST,
1282} cpuset_filetype_t; 1281} cpuset_filetype_t;
1283 1282
1284static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, 1283static ssize_t cpuset_common_file_write(struct file *file,
1284 const char __user *userbuf,
1285 size_t nbytes, loff_t *unused_ppos) 1285 size_t nbytes, loff_t *unused_ppos)
1286{ 1286{
1287 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1287 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1288 struct cftype *cft = __d_cft(file->f_dentry); 1288 struct cftype *cft = __d_cft(file->f_path.dentry);
1289 cpuset_filetype_t type = cft->private; 1289 cpuset_filetype_t type = cft->private;
1290 char *buffer; 1290 char *buffer;
1291 char *pathbuf = NULL; 1291 char *pathbuf = NULL;
1292 int retval = 0; 1292 int retval = 0;
1293 1293
1294 /* Crude upper limit on largest legitimate cpulist user might write. */ 1294 /* Crude upper limit on largest legitimate cpulist user might write. */
1295 if (nbytes > 100 + 6 * NR_CPUS) 1295 if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
1296 return -E2BIG; 1296 return -E2BIG;
1297 1297
1298 /* +1 for nul-terminator */ 1298 /* +1 for nul-terminator */
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1367 size_t nbytes, loff_t *ppos) 1367 size_t nbytes, loff_t *ppos)
1368{ 1368{
1369 ssize_t retval = 0; 1369 ssize_t retval = 0;
1370 struct cftype *cft = __d_cft(file->f_dentry); 1370 struct cftype *cft = __d_cft(file->f_path.dentry);
1371 if (!cft) 1371 if (!cft)
1372 return -ENODEV; 1372 return -ENODEV;
1373 1373
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1418 size_t nbytes, loff_t *ppos) 1418 size_t nbytes, loff_t *ppos)
1419{ 1419{
1420 struct cftype *cft = __d_cft(file->f_dentry); 1420 struct cftype *cft = __d_cft(file->f_path.dentry);
1421 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1421 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1422 cpuset_filetype_t type = cft->private; 1422 cpuset_filetype_t type = cft->private;
1423 char *page; 1423 char *page;
1424 ssize_t retval = 0; 1424 ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
1476 loff_t *ppos) 1476 loff_t *ppos)
1477{ 1477{
1478 ssize_t retval = 0; 1478 ssize_t retval = 0;
1479 struct cftype *cft = __d_cft(file->f_dentry); 1479 struct cftype *cft = __d_cft(file->f_path.dentry);
1480 if (!cft) 1480 if (!cft)
1481 return -ENODEV; 1481 return -ENODEV;
1482 1482
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1498 if (err) 1498 if (err)
1499 return err; 1499 return err;
1500 1500
1501 cft = __d_cft(file->f_dentry); 1501 cft = __d_cft(file->f_path.dentry);
1502 if (!cft) 1502 if (!cft)
1503 return -ENODEV; 1503 return -ENODEV;
1504 if (cft->open) 1504 if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1511 1511
1512static int cpuset_file_release(struct inode *inode, struct file *file) 1512static int cpuset_file_release(struct inode *inode, struct file *file)
1513{ 1513{
1514 struct cftype *cft = __d_cft(file->f_dentry); 1514 struct cftype *cft = __d_cft(file->f_path.dentry);
1515 if (cft->release) 1515 if (cft->release)
1516 return cft->release(inode, file); 1516 return cft->release(inode, file);
1517 return 0; 1517 return 0;
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1533} 1533}
1534 1534
1535static struct file_operations cpuset_file_operations = { 1535static const struct file_operations cpuset_file_operations = {
1536 .read = cpuset_file_read, 1536 .read = cpuset_file_read,
1537 .write = cpuset_file_write, 1537 .write = cpuset_file_write,
1538 .llseek = generic_file_llseek, 1538 .llseek = generic_file_llseek,
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1700 */ 1700 */
1701static int cpuset_tasks_open(struct inode *unused, struct file *file) 1701static int cpuset_tasks_open(struct inode *unused, struct file *file)
1702{ 1702{
1703 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1703 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1704 struct ctr_struct *ctr; 1704 struct ctr_struct *ctr;
1705 pid_t *pidarray; 1705 pid_t *pidarray;
1706 int npids; 1706 int npids;
@@ -2045,7 +2045,6 @@ out:
2045 return err; 2045 return err;
2046} 2046}
2047 2047
2048#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
2049/* 2048/*
2050 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 2049 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
2051 * or memory nodes, we need to walk over the cpuset hierarchy, 2050 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void)
2109 mutex_unlock(&callback_mutex); 2108 mutex_unlock(&callback_mutex);
2110 mutex_unlock(&manage_mutex); 2109 mutex_unlock(&manage_mutex);
2111} 2110}
2112#endif
2113 2111
2114#ifdef CONFIG_HOTPLUG_CPU
2115/* 2112/*
2116 * The top_cpuset tracks what CPUs and Memory Nodes are online, 2113 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2117 * period. This is necessary in order to make cpusets transparent 2114 * period. This is necessary in order to make cpusets transparent
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
2128 common_cpu_mem_hotplug_unplug(); 2125 common_cpu_mem_hotplug_unplug();
2129 return 0; 2126 return 0;
2130} 2127}
2131#endif
2132 2128
2133#ifdef CONFIG_MEMORY_HOTPLUG 2129#ifdef CONFIG_MEMORY_HOTPLUG
2134/* 2130/*
@@ -2346,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2346} 2342}
2347 2343
2348/** 2344/**
2349 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? 2345 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
2350 * @z: is this zone on an allowed node? 2346 * @z: is this zone on an allowed node?
2351 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) 2347 * @gfp_mask: memory allocation flags
2352 * 2348 *
2353 * If we're in interrupt, yes, we can always allocate. If zone 2349 * If we're in interrupt, yes, we can always allocate. If
2350 * __GFP_THISNODE is set, yes, we can always allocate. If zone
2354 * z's node is in our tasks mems_allowed, yes. If it's not a 2351 * z's node is in our tasks mems_allowed, yes. If it's not a
2355 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2352 * __GFP_HARDWALL request and this zone's nodes is in the nearest
2356 * mem_exclusive cpuset ancestor to this tasks cpuset, yes. 2353 * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
2357 * Otherwise, no. 2354 * Otherwise, no.
2358 * 2355 *
2356 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
2357 * reduces to cpuset_zone_allowed_hardwall(). Otherwise,
2358 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
2359 * from an enclosing cpuset.
2360 *
2361 * cpuset_zone_allowed_hardwall() only handles the simpler case of
2362 * hardwall cpusets, and never sleeps.
2363 *
2364 * The __GFP_THISNODE placement logic is really handled elsewhere,
2365 * by forcibly using a zonelist starting at a specified node, and by
2366 * (in get_page_from_freelist()) refusing to consider the zones for
2367 * any node on the zonelist except the first. By the time any such
2368 * calls get to this routine, we should just shut up and say 'yes'.
2369 *
2359 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 2370 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2360 * and do not allow allocations outside the current tasks cpuset. 2371 * and do not allow allocations outside the current tasks cpuset.
2361 * GFP_KERNEL allocations are not so marked, so can escape to the 2372 * GFP_KERNEL allocations are not so marked, so can escape to the
2362 * nearest mem_exclusive ancestor cpuset. 2373 * nearest enclosing mem_exclusive ancestor cpuset.
2363 * 2374 *
2364 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() 2375 * Scanning up parent cpusets requires callback_mutex. The
2365 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2376 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2366 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2377 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2367 * mems_allowed came up empty on the first pass over the zonelist. 2378 * current tasks mems_allowed came up empty on the first pass over
2368 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2379 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
2369 * short of memory, might require taking the callback_mutex mutex. 2380 * cpuset are short of memory, might require taking the callback_mutex
2381 * mutex.
2370 * 2382 *
2371 * The first call here from mm/page_alloc:get_page_from_freelist() 2383 * The first call here from mm/page_alloc:get_page_from_freelist()
2372 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so 2384 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
2373 * no allocation on a node outside the cpuset is allowed (unless in 2385 * so no allocation on a node outside the cpuset is allowed (unless
2374 * interrupt, of course). 2386 * in interrupt, of course).
2375 * 2387 *
2376 * The second pass through get_page_from_freelist() doesn't even call 2388 * The second pass through get_page_from_freelist() doesn't even call
2377 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() 2389 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
@@ -2384,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2384 * GFP_USER - only nodes in current tasks mems allowed ok. 2396 * GFP_USER - only nodes in current tasks mems allowed ok.
2385 * 2397 *
2386 * Rule: 2398 * Rule:
2387 * Don't call cpuset_zone_allowed() if you can't sleep, unless you 2399 * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
2388 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables 2400 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2389 * the code that might scan up ancestor cpusets and sleep. 2401 * the code that might scan up ancestor cpusets and sleep.
2390 **/ 2402 */
2391 2403
2392int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2404int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2393{ 2405{
2394 int node; /* node that zone z is on */ 2406 int node; /* node that zone z is on */
2395 const struct cpuset *cs; /* current cpuset ancestors */ 2407 const struct cpuset *cs; /* current cpuset ancestors */
@@ -2419,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2419 return allowed; 2431 return allowed;
2420} 2432}
2421 2433
2434/*
2435 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
2436 * @z: is this zone on an allowed node?
2437 * @gfp_mask: memory allocation flags
2438 *
2439 * If we're in interrupt, yes, we can always allocate.
2440 * If __GFP_THISNODE is set, yes, we can always allocate. If zone
2441 * z's node is in our tasks mems_allowed, yes. Otherwise, no.
2442 *
2443 * The __GFP_THISNODE placement logic is really handled elsewhere,
2444 * by forcibly using a zonelist starting at a specified node, and by
2445 * (in get_page_from_freelist()) refusing to consider the zones for
2446 * any node on the zonelist except the first. By the time any such
2447 * calls get to this routine, we should just shut up and say 'yes'.
2448 *
2449 * Unlike the cpuset_zone_allowed_softwall() variant, above,
2450 * this variant requires that the zone be in the current tasks
2451 * mems_allowed or that we're in interrupt. It does not scan up the
2452 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2453 * It never sleeps.
2454 */
2455
2456int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2457{
2458 int node; /* node that zone z is on */
2459
2460 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2461 return 1;
2462 node = zone_to_nid(z);
2463 if (node_isset(node, current->mems_allowed))
2464 return 1;
2465 return 0;
2466}
2467
2422/** 2468/**
2423 * cpuset_lock - lock out any changes to cpuset structures 2469 * cpuset_lock - lock out any changes to cpuset structures
2424 * 2470 *
@@ -2610,7 +2656,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
2610 return single_open(file, proc_cpuset_show, pid); 2656 return single_open(file, proc_cpuset_show, pid);
2611} 2657}
2612 2658
2613struct file_operations proc_cpuset_operations = { 2659const struct file_operations proc_cpuset_operations = {
2614 .open = cpuset_open, 2660 .open = cpuset_open,
2615 .read = seq_read, 2661 .read = seq_read,
2616 .llseek = seq_lseek, 2662 .llseek = seq_lseek,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 36752f124c6a..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
20#include <linux/delayacct.h> 20#include <linux/delayacct.h>
21 21
22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache; 23struct kmem_cache *delayacct_cache;
24 24
25static int __init delayacct_setup_disable(char *str) 25static int __init delayacct_setup_disable(char *str)
26{ 26{
@@ -41,7 +41,7 @@ void delayacct_init(void)
41 41
42void __delayacct_tsk_init(struct task_struct *tsk) 42void __delayacct_tsk_init(struct task_struct *tsk)
43{ 43{
44 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); 44 tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
45 if (tsk->delays) 45 if (tsk->delays)
46 spin_lock_init(&tsk->delays->lock); 46 spin_lock_init(&tsk->delays->lock);
47} 47}
@@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
66{ 66{
67 struct timespec ts; 67 struct timespec ts;
68 s64 ns; 68 s64 ns;
69 unsigned long flags;
69 70
70 do_posix_clock_monotonic_gettime(end); 71 do_posix_clock_monotonic_gettime(end);
71 ts = timespec_sub(*end, *start); 72 ts = timespec_sub(*end, *start);
@@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
73 if (ns < 0) 74 if (ns < 0)
74 return; 75 return;
75 76
76 spin_lock(&current->delays->lock); 77 spin_lock_irqsave(&current->delays->lock, flags);
77 *total += ns; 78 *total += ns;
78 (*count)++; 79 (*count)++;
79 spin_unlock(&current->delays->lock); 80 spin_unlock_irqrestore(&current->delays->lock, flags);
80} 81}
81 82
82void __delayacct_blkio_start(void) 83void __delayacct_blkio_start(void)
@@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
104 s64 tmp; 105 s64 tmp;
105 struct timespec ts; 106 struct timespec ts;
106 unsigned long t1,t2,t3; 107 unsigned long t1,t2,t3;
108 unsigned long flags;
107 109
108 /* Though tsk->delays accessed later, early exit avoids 110 /* Though tsk->delays accessed later, early exit avoids
109 * unnecessary returning of other data 111 * unnecessary returning of other data
@@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
136 138
137 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ 139 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
138 140
139 spin_lock(&tsk->delays->lock); 141 spin_lock_irqsave(&tsk->delays->lock, flags);
140 tmp = d->blkio_delay_total + tsk->delays->blkio_delay; 142 tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
141 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; 143 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
142 tmp = d->swapin_delay_total + tsk->delays->swapin_delay; 144 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
143 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; 145 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
144 d->blkio_count += tsk->delays->blkio_count; 146 d->blkio_count += tsk->delays->blkio_count;
145 d->swapin_count += tsk->delays->swapin_count; 147 d->swapin_count += tsk->delays->swapin_count;
146 spin_unlock(&tsk->delays->lock); 148 spin_unlock_irqrestore(&tsk->delays->lock, flags);
147 149
148done: 150done:
149 return 0; 151 return 0;
@@ -152,11 +154,12 @@ done:
152__u64 __delayacct_blkio_ticks(struct task_struct *tsk) 154__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
153{ 155{
154 __u64 ret; 156 __u64 ret;
157 unsigned long flags;
155 158
156 spin_lock(&tsk->delays->lock); 159 spin_lock_irqsave(&tsk->delays->lock, flags);
157 ret = nsec_to_clock_t(tsk->delays->blkio_delay + 160 ret = nsec_to_clock_t(tsk->delays->blkio_delay +
158 tsk->delays->swapin_delay); 161 tsk->delays->swapin_delay);
159 spin_unlock(&tsk->delays->lock); 162 spin_unlock_irqrestore(&tsk->delays->lock, flags);
160 return ret; 163 return ret;
161} 164}
162 165
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
140 return single_open(file, proc_dma_show, NULL); 140 return single_open(file, proc_dma_show, NULL);
141} 141}
142 142
143static struct file_operations proc_dma_operations = { 143static const struct file_operations proc_dma_operations = {
144 .open = proc_dma_open, 144 .open = proc_dma_open,
145 .read = seq_read, 145 .read = seq_read,
146 .llseek = seq_lseek, 146 .llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca3..122fadb972fc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/tty.h> 15#include <linux/tty.h>
16#include <linux/namespace.h> 16#include <linux/mnt_namespace.h>
17#include <linux/key.h> 17#include <linux/key.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -22,6 +22,7 @@
22#include <linux/file.h> 22#include <linux/file.h>
23#include <linux/binfmts.h> 23#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h>
25#include <linux/ptrace.h> 26#include <linux/ptrace.h>
26#include <linux/profile.h> 27#include <linux/profile.h>
27#include <linux/mount.h> 28#include <linux/mount.h>
@@ -48,7 +49,6 @@
48#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
49 50
50extern void sem_exit (void); 51extern void sem_exit (void);
51extern struct task_struct *child_reaper;
52 52
53static void exit_mm(struct task_struct * tsk); 53static void exit_mm(struct task_struct * tsk);
54 54
@@ -189,21 +189,18 @@ repeat:
189int session_of_pgrp(int pgrp) 189int session_of_pgrp(int pgrp)
190{ 190{
191 struct task_struct *p; 191 struct task_struct *p;
192 int sid = -1; 192 int sid = 0;
193 193
194 read_lock(&tasklist_lock); 194 read_lock(&tasklist_lock);
195 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 195
196 if (p->signal->session > 0) { 196 p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
197 sid = p->signal->session; 197 if (p == NULL)
198 goto out; 198 p = find_task_by_pid(pgrp);
199 } 199 if (p != NULL)
200 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 200 sid = process_session(p);
201 p = find_task_by_pid(pgrp); 201
202 if (p)
203 sid = p->signal->session;
204out:
205 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
206 203
207 return sid; 204 return sid;
208} 205}
209 206
@@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
225 || p->exit_state 222 || p->exit_state
226 || is_init(p->real_parent)) 223 || is_init(p->real_parent))
227 continue; 224 continue;
228 if (process_group(p->real_parent) != pgrp 225 if (process_group(p->real_parent) != pgrp &&
229 && p->real_parent->signal->session == p->signal->session) { 226 process_session(p->real_parent) == process_session(p)) {
230 ret = 0; 227 ret = 0;
231 break; 228 break;
232 } 229 }
@@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp)
260} 257}
261 258
262/** 259/**
263 * reparent_to_init - Reparent the calling kernel thread to the init task. 260 * reparent_to_init - Reparent the calling kernel thread to the init task
261 * of the pid space that the thread belongs to.
264 * 262 *
265 * If a kernel thread is launched as a result of a system call, or if 263 * If a kernel thread is launched as a result of a system call, or if
266 * it ever exits, it should generally reparent itself to init so that 264 * it ever exits, it should generally reparent itself to init so that
@@ -278,8 +276,8 @@ static void reparent_to_init(void)
278 ptrace_unlink(current); 276 ptrace_unlink(current);
279 /* Reparent to init */ 277 /* Reparent to init */
280 remove_parent(current); 278 remove_parent(current);
281 current->parent = child_reaper; 279 current->parent = child_reaper(current);
282 current->real_parent = child_reaper; 280 current->real_parent = child_reaper(current);
283 add_parent(current); 281 add_parent(current);
284 282
285 /* Set the exit signal to SIGCHLD so we signal init on exit */ 283 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
302{ 300{
303 struct task_struct *curr = current->group_leader; 301 struct task_struct *curr = current->group_leader;
304 302
305 if (curr->signal->session != session) { 303 if (process_session(curr) != session) {
306 detach_pid(curr, PIDTYPE_SID); 304 detach_pid(curr, PIDTYPE_SID);
307 curr->signal->session = session; 305 set_signal_session(curr->signal, session);
308 attach_pid(curr, PIDTYPE_SID, session); 306 attach_pid(curr, PIDTYPE_SID, session);
309 } 307 }
310 if (process_group(curr) != pgrp) { 308 if (process_group(curr) != pgrp) {
@@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
314 } 312 }
315} 313}
316 314
317void set_special_pids(pid_t session, pid_t pgrp) 315static void set_special_pids(pid_t session, pid_t pgrp)
318{ 316{
319 write_lock_irq(&tasklist_lock); 317 write_lock_irq(&tasklist_lock);
320 __set_special_pids(session, pgrp); 318 __set_special_pids(session, pgrp);
@@ -384,9 +382,7 @@ void daemonize(const char *name, ...)
384 exit_mm(current); 382 exit_mm(current);
385 383
386 set_special_pids(1, 1); 384 set_special_pids(1, 1);
387 mutex_lock(&tty_mutex); 385 proc_clear_tty(current);
388 current->signal->tty = NULL;
389 mutex_unlock(&tty_mutex);
390 386
391 /* Block and flush all signals */ 387 /* Block and flush all signals */
392 sigfillset(&blocked); 388 sigfillset(&blocked);
@@ -429,7 +425,7 @@ static void close_files(struct files_struct * files)
429 for (;;) { 425 for (;;) {
430 unsigned long set; 426 unsigned long set;
431 i = j * __NFDBITS; 427 i = j * __NFDBITS;
432 if (i >= fdt->max_fdset || i >= fdt->max_fds) 428 if (i >= fdt->max_fds)
433 break; 429 break;
434 set = fdt->open_fds->fds_bits[j++]; 430 set = fdt->open_fds->fds_bits[j++];
435 while (set) { 431 while (set) {
@@ -470,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files)
470 * you can free files immediately. 466 * you can free files immediately.
471 */ 467 */
472 fdt = files_fdtable(files); 468 fdt = files_fdtable(files);
473 if (fdt == &files->fdtab) 469 if (fdt != &files->fdtab)
474 fdt->free_files = files;
475 else
476 kmem_cache_free(files_cachep, files); 470 kmem_cache_free(files_cachep, files);
477 free_fdtable(fdt); 471 call_rcu(&fdt->rcu, free_fdtable_rcu);
478 } 472 }
479} 473}
480 474
@@ -649,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
649 * outside, so the child pgrp is now orphaned. 643 * outside, so the child pgrp is now orphaned.
650 */ 644 */
651 if ((process_group(p) != process_group(father)) && 645 if ((process_group(p) != process_group(father)) &&
652 (p->signal->session == father->signal->session)) { 646 (process_session(p) == process_session(father))) {
653 int pgrp = process_group(p); 647 int pgrp = process_group(p);
654 648
655 if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { 649 if (will_become_orphaned_pgrp(pgrp, NULL) &&
650 has_stopped_jobs(pgrp)) {
656 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); 651 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
657 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); 652 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
658 } 653 }
@@ -663,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
663 * When we die, we re-parent all our children. 658 * When we die, we re-parent all our children.
664 * Try to give them to another thread in our thread 659 * Try to give them to another thread in our thread
665 * group, and if no such member exists, give it to 660 * group, and if no such member exists, give it to
666 * the global child reaper process (ie "init") 661 * the child reaper process (ie "init") in our pid
662 * space.
667 */ 663 */
668static void 664static void
669forget_original_parent(struct task_struct *father, struct list_head *to_release) 665forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -674,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
674 do { 670 do {
675 reaper = next_thread(reaper); 671 reaper = next_thread(reaper);
676 if (reaper == father) { 672 if (reaper == father) {
677 reaper = child_reaper; 673 reaper = child_reaper(father);
678 break; 674 break;
679 } 675 }
680 } while (reaper->exit_state); 676 } while (reaper->exit_state);
@@ -786,7 +782,7 @@ static void exit_notify(struct task_struct *tsk)
786 t = tsk->real_parent; 782 t = tsk->real_parent;
787 783
788 if ((process_group(t) != process_group(tsk)) && 784 if ((process_group(t) != process_group(tsk)) &&
789 (t->signal->session == tsk->signal->session) && 785 (process_session(t) == process_session(tsk)) &&
790 will_become_orphaned_pgrp(process_group(tsk), tsk) && 786 will_become_orphaned_pgrp(process_group(tsk), tsk) &&
791 has_stopped_jobs(process_group(tsk))) { 787 has_stopped_jobs(process_group(tsk))) {
792 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); 788 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
@@ -850,9 +846,7 @@ static void exit_notify(struct task_struct *tsk)
850fastcall NORET_TYPE void do_exit(long code) 846fastcall NORET_TYPE void do_exit(long code)
851{ 847{
852 struct task_struct *tsk = current; 848 struct task_struct *tsk = current;
853 struct taskstats *tidstats;
854 int group_dead; 849 int group_dead;
855 unsigned int mycpu;
856 850
857 profile_task_exit(tsk); 851 profile_task_exit(tsk);
858 852
@@ -862,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code)
862 panic("Aiee, killing interrupt handler!"); 856 panic("Aiee, killing interrupt handler!");
863 if (unlikely(!tsk->pid)) 857 if (unlikely(!tsk->pid))
864 panic("Attempted to kill the idle task!"); 858 panic("Attempted to kill the idle task!");
865 if (unlikely(tsk == child_reaper)) 859 if (unlikely(tsk == child_reaper(tsk))) {
866 panic("Attempted to kill init!"); 860 if (tsk->nsproxy->pid_ns != &init_pid_ns)
861 tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
862 else
863 panic("Attempted to kill init!");
864 }
865
867 866
868 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 867 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
869 current->ptrace_message = code; 868 current->ptrace_message = code;
@@ -890,8 +889,6 @@ fastcall NORET_TYPE void do_exit(long code)
890 current->comm, current->pid, 889 current->comm, current->pid,
891 preempt_count()); 890 preempt_count());
892 891
893 taskstats_exit_alloc(&tidstats, &mycpu);
894
895 acct_update_integrals(tsk); 892 acct_update_integrals(tsk);
896 if (tsk->mm) { 893 if (tsk->mm) {
897 update_hiwater_rss(tsk->mm); 894 update_hiwater_rss(tsk->mm);
@@ -911,8 +908,8 @@ fastcall NORET_TYPE void do_exit(long code)
911#endif 908#endif
912 if (unlikely(tsk->audit_context)) 909 if (unlikely(tsk->audit_context))
913 audit_free(tsk); 910 audit_free(tsk);
914 taskstats_exit_send(tsk, tidstats, group_dead, mycpu); 911
915 taskstats_exit_free(tidstats); 912 taskstats_exit(tsk, group_dead);
916 913
917 exit_mm(tsk); 914 exit_mm(tsk);
918 915
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da978eec791..fc723e595cd5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/completion.h> 20#include <linux/completion.h>
21#include <linux/namespace.h> 21#include <linux/mnt_namespace.h>
22#include <linux/personality.h> 22#include <linux/personality.h>
23#include <linux/mempolicy.h> 23#include <linux/mempolicy.h>
24#include <linux/sem.h> 24#include <linux/sem.h>
@@ -36,6 +36,7 @@
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/jiffies.h> 37#include <linux/jiffies.h>
38#include <linux/futex.h> 38#include <linux/futex.h>
39#include <linux/task_io_accounting_ops.h>
39#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
40#include <linux/ptrace.h> 41#include <linux/ptrace.h>
41#include <linux/mount.h> 42#include <linux/mount.h>
@@ -82,26 +83,26 @@ int nr_processes(void)
82#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 83#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
83# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 84# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
84# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 85# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
85static kmem_cache_t *task_struct_cachep; 86static struct kmem_cache *task_struct_cachep;
86#endif 87#endif
87 88
88/* SLAB cache for signal_struct structures (tsk->signal) */ 89/* SLAB cache for signal_struct structures (tsk->signal) */
89static kmem_cache_t *signal_cachep; 90static struct kmem_cache *signal_cachep;
90 91
91/* SLAB cache for sighand_struct structures (tsk->sighand) */ 92/* SLAB cache for sighand_struct structures (tsk->sighand) */
92kmem_cache_t *sighand_cachep; 93struct kmem_cache *sighand_cachep;
93 94
94/* SLAB cache for files_struct structures (tsk->files) */ 95/* SLAB cache for files_struct structures (tsk->files) */
95kmem_cache_t *files_cachep; 96struct kmem_cache *files_cachep;
96 97
97/* SLAB cache for fs_struct structures (tsk->fs) */ 98/* SLAB cache for fs_struct structures (tsk->fs) */
98kmem_cache_t *fs_cachep; 99struct kmem_cache *fs_cachep;
99 100
100/* SLAB cache for vm_area_struct structures */ 101/* SLAB cache for vm_area_struct structures */
101kmem_cache_t *vm_area_cachep; 102struct kmem_cache *vm_area_cachep;
102 103
103/* SLAB cache for mm_struct structures (tsk->mm) */ 104/* SLAB cache for mm_struct structures (tsk->mm) */
104static kmem_cache_t *mm_cachep; 105static struct kmem_cache *mm_cachep;
105 106
106void free_task(struct task_struct *tsk) 107void free_task(struct task_struct *tsk)
107{ 108{
@@ -202,7 +203,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
202 struct mempolicy *pol; 203 struct mempolicy *pol;
203 204
204 down_write(&oldmm->mmap_sem); 205 down_write(&oldmm->mmap_sem);
205 flush_cache_mm(oldmm); 206 flush_cache_dup_mm(oldmm);
206 /* 207 /*
207 * Not linked in yet - no deadlock potential: 208 * Not linked in yet - no deadlock potential:
208 */ 209 */
@@ -237,7 +238,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
237 goto fail_nomem; 238 goto fail_nomem;
238 charge = len; 239 charge = len;
239 } 240 }
240 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 241 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
241 if (!tmp) 242 if (!tmp)
242 goto fail_nomem; 243 goto fail_nomem;
243 *tmp = *mpnt; 244 *tmp = *mpnt;
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
252 anon_vma_link(tmp); 253 anon_vma_link(tmp);
253 file = tmp->vm_file; 254 file = tmp->vm_file;
254 if (file) { 255 if (file) {
255 struct inode *inode = file->f_dentry->d_inode; 256 struct inode *inode = file->f_path.dentry->d_inode;
256 get_file(file); 257 get_file(file);
257 if (tmp->vm_flags & VM_DENYWRITE) 258 if (tmp->vm_flags & VM_DENYWRITE)
258 atomic_dec(&inode->i_writecount); 259 atomic_dec(&inode->i_writecount);
@@ -319,7 +320,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
319 320
320 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 321 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
321 322
322#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) 323#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
323#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 324#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
324 325
325#include <linux/init_task.h> 326#include <linux/init_task.h>
@@ -448,7 +449,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
448 tsk->vfork_done = NULL; 449 tsk->vfork_done = NULL;
449 complete(vfork_done); 450 complete(vfork_done);
450 } 451 }
451 if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { 452
453 /*
454 * If we're exiting normally, clear a user-space tid field if
455 * requested. We leave this alone when dying by signal, to leave
456 * the value intact in a core dump, and to save the unnecessary
457 * trouble otherwise. Userland only wants this done for a sys_exit.
458 */
459 if (tsk->clear_child_tid
460 && !(tsk->flags & PF_SIGNALED)
461 && atomic_read(&mm->mm_users) > 1) {
452 u32 __user * tidptr = tsk->clear_child_tid; 462 u32 __user * tidptr = tsk->clear_child_tid;
453 tsk->clear_child_tid = NULL; 463 tsk->clear_child_tid = NULL;
454 464
@@ -479,6 +489,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
479 489
480 memcpy(mm, oldmm, sizeof(*mm)); 490 memcpy(mm, oldmm, sizeof(*mm));
481 491
492 /* Initializing for Swap token stuff */
493 mm->token_priority = 0;
494 mm->last_interval = 0;
495
482 if (!mm_init(mm)) 496 if (!mm_init(mm))
483 goto fail_nomem; 497 goto fail_nomem;
484 498
@@ -542,6 +556,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
542 goto fail_nomem; 556 goto fail_nomem;
543 557
544good_mm: 558good_mm:
559 /* Initializing for Swap token stuff */
560 mm->token_priority = 0;
561 mm->last_interval = 0;
562
545 tsk->mm = mm; 563 tsk->mm = mm;
546 tsk->active_mm = mm; 564 tsk->active_mm = mm;
547 return 0; 565 return 0;
@@ -596,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
596 614
597static int count_open_files(struct fdtable *fdt) 615static int count_open_files(struct fdtable *fdt)
598{ 616{
599 int size = fdt->max_fdset; 617 int size = fdt->max_fds;
600 int i; 618 int i;
601 619
602 /* Find the last open fd */ 620 /* Find the last open fd */
@@ -613,7 +631,7 @@ static struct files_struct *alloc_files(void)
613 struct files_struct *newf; 631 struct files_struct *newf;
614 struct fdtable *fdt; 632 struct fdtable *fdt;
615 633
616 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 634 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
617 if (!newf) 635 if (!newf)
618 goto out; 636 goto out;
619 637
@@ -623,12 +641,10 @@ static struct files_struct *alloc_files(void)
623 newf->next_fd = 0; 641 newf->next_fd = 0;
624 fdt = &newf->fdtab; 642 fdt = &newf->fdtab;
625 fdt->max_fds = NR_OPEN_DEFAULT; 643 fdt->max_fds = NR_OPEN_DEFAULT;
626 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
627 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 644 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
628 fdt->open_fds = (fd_set *)&newf->open_fds_init; 645 fdt->open_fds = (fd_set *)&newf->open_fds_init;
629 fdt->fd = &newf->fd_array[0]; 646 fdt->fd = &newf->fd_array[0];
630 INIT_RCU_HEAD(&fdt->rcu); 647 INIT_RCU_HEAD(&fdt->rcu);
631 fdt->free_files = NULL;
632 fdt->next = NULL; 648 fdt->next = NULL;
633 rcu_assign_pointer(newf->fdt, fdt); 649 rcu_assign_pointer(newf->fdt, fdt);
634out: 650out:
@@ -644,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
644{ 660{
645 struct files_struct *newf; 661 struct files_struct *newf;
646 struct file **old_fds, **new_fds; 662 struct file **old_fds, **new_fds;
647 int open_files, size, i, expand; 663 int open_files, size, i;
648 struct fdtable *old_fdt, *new_fdt; 664 struct fdtable *old_fdt, *new_fdt;
649 665
650 *errorp = -ENOMEM; 666 *errorp = -ENOMEM;
@@ -655,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
655 spin_lock(&oldf->file_lock); 671 spin_lock(&oldf->file_lock);
656 old_fdt = files_fdtable(oldf); 672 old_fdt = files_fdtable(oldf);
657 new_fdt = files_fdtable(newf); 673 new_fdt = files_fdtable(newf);
658 size = old_fdt->max_fdset;
659 open_files = count_open_files(old_fdt); 674 open_files = count_open_files(old_fdt);
660 expand = 0;
661 675
662 /* 676 /*
663 * Check whether we need to allocate a larger fd array or fd set. 677 * Check whether we need to allocate a larger fd array and fd set.
664 * Note: we're not a clone task, so the open count won't change. 678 * Note: we're not a clone task, so the open count won't change.
665 */ 679 */
666 if (open_files > new_fdt->max_fdset) {
667 new_fdt->max_fdset = 0;
668 expand = 1;
669 }
670 if (open_files > new_fdt->max_fds) { 680 if (open_files > new_fdt->max_fds) {
671 new_fdt->max_fds = 0; 681 new_fdt->max_fds = 0;
672 expand = 1;
673 }
674
675 /* if the old fdset gets grown now, we'll only copy up to "size" fds */
676 if (expand) {
677 spin_unlock(&oldf->file_lock); 682 spin_unlock(&oldf->file_lock);
678 spin_lock(&newf->file_lock); 683 spin_lock(&newf->file_lock);
679 *errorp = expand_files(newf, open_files-1); 684 *errorp = expand_files(newf, open_files-1);
@@ -693,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
693 old_fds = old_fdt->fd; 698 old_fds = old_fdt->fd;
694 new_fds = new_fdt->fd; 699 new_fds = new_fdt->fd;
695 700
696 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); 701 memcpy(new_fdt->open_fds->fds_bits,
697 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); 702 old_fdt->open_fds->fds_bits, open_files/8);
703 memcpy(new_fdt->close_on_exec->fds_bits,
704 old_fdt->close_on_exec->fds_bits, open_files/8);
698 705
699 for (i = open_files; i != 0; i--) { 706 for (i = open_files; i != 0; i--) {
700 struct file *f = *old_fds++; 707 struct file *f = *old_fds++;
@@ -719,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
719 /* This is long word aligned thus could use a optimized version */ 726 /* This is long word aligned thus could use a optimized version */
720 memset(new_fds, 0, size); 727 memset(new_fds, 0, size);
721 728
722 if (new_fdt->max_fdset > open_files) { 729 if (new_fdt->max_fds > open_files) {
723 int left = (new_fdt->max_fdset-open_files)/8; 730 int left = (new_fdt->max_fds-open_files)/8;
724 int start = open_files / (8 * sizeof(unsigned long)); 731 int start = open_files / (8 * sizeof(unsigned long));
725 732
726 memset(&new_fdt->open_fds->fds_bits[start], 0, left); 733 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
727 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); 734 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
728 } 735 }
729 736
730out:
731 return newf; 737 return newf;
732 738
733out_release: 739out_release:
734 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
735 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
736 free_fd_array(new_fdt->fd, new_fdt->max_fds);
737 kmem_cache_free(files_cachep, newf); 740 kmem_cache_free(files_cachep, newf);
741out:
738 return NULL; 742 return NULL;
739} 743}
740 744
@@ -830,7 +834,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
830 if (clone_flags & CLONE_THREAD) { 834 if (clone_flags & CLONE_THREAD) {
831 atomic_inc(&current->signal->count); 835 atomic_inc(&current->signal->count);
832 atomic_inc(&current->signal->live); 836 atomic_inc(&current->signal->live);
833 taskstats_tgid_alloc(current);
834 return 0; 837 return 0;
835 } 838 }
836 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 839 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -1039,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1039 p->wchar = 0; /* I/O counter: bytes written */ 1042 p->wchar = 0; /* I/O counter: bytes written */
1040 p->syscr = 0; /* I/O counter: read syscalls */ 1043 p->syscr = 0; /* I/O counter: read syscalls */
1041 p->syscw = 0; /* I/O counter: write syscalls */ 1044 p->syscw = 0; /* I/O counter: write syscalls */
1045 task_io_accounting_init(p);
1042 acct_clear_integrals(p); 1046 acct_clear_integrals(p);
1043 1047
1044 p->it_virt_expires = cputime_zero; 1048 p->it_virt_expires = cputime_zero;
@@ -1243,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1243 if (thread_group_leader(p)) { 1247 if (thread_group_leader(p)) {
1244 p->signal->tty = current->signal->tty; 1248 p->signal->tty = current->signal->tty;
1245 p->signal->pgrp = process_group(current); 1249 p->signal->pgrp = process_group(current);
1246 p->signal->session = current->signal->session; 1250 set_signal_session(p->signal, process_session(current));
1247 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1251 attach_pid(p, PIDTYPE_PGID, process_group(p));
1248 attach_pid(p, PIDTYPE_SID, p->signal->session); 1252 attach_pid(p, PIDTYPE_SID, process_session(p));
1249 1253
1250 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1254 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1251 __get_cpu_var(process_counts)++; 1255 __get_cpu_var(process_counts)++;
@@ -1303,7 +1307,7 @@ fork_out:
1303 return ERR_PTR(retval); 1307 return ERR_PTR(retval);
1304} 1308}
1305 1309
1306struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1310noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1307{ 1311{
1308 memset(regs, 0, sizeof(struct pt_regs)); 1312 memset(regs, 0, sizeof(struct pt_regs));
1309 return regs; 1313 return regs;
@@ -1315,9 +1319,8 @@ struct task_struct * __devinit fork_idle(int cpu)
1315 struct pt_regs regs; 1319 struct pt_regs regs;
1316 1320
1317 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0); 1321 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
1318 if (!task) 1322 if (!IS_ERR(task))
1319 return ERR_PTR(-ENOMEM); 1323 init_idle(task, cpu);
1320 init_idle(task, cpu);
1321 1324
1322 return task; 1325 return task;
1323} 1326}
@@ -1414,7 +1417,7 @@ long do_fork(unsigned long clone_flags,
1414#define ARCH_MIN_MMSTRUCT_ALIGN 0 1417#define ARCH_MIN_MMSTRUCT_ALIGN 0
1415#endif 1418#endif
1416 1419
1417static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 1420static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
1418{ 1421{
1419 struct sighand_struct *sighand = data; 1422 struct sighand_struct *sighand = data;
1420 1423
@@ -1510,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1510} 1513}
1511 1514
1512/* 1515/*
1513 * Unshare the namespace structure if it is being shared 1516 * Unshare the mnt_namespace structure if it is being shared
1514 */ 1517 */
1515static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) 1518static int unshare_mnt_namespace(unsigned long unshare_flags,
1519 struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
1516{ 1520{
1517 struct namespace *ns = current->nsproxy->namespace; 1521 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
1518 1522
1519 if ((unshare_flags & CLONE_NEWNS) && ns) { 1523 if ((unshare_flags & CLONE_NEWNS) && ns) {
1520 if (!capable(CAP_SYS_ADMIN)) 1524 if (!capable(CAP_SYS_ADMIN))
1521 return -EPERM; 1525 return -EPERM;
1522 1526
1523 *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); 1527 *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
1524 if (!*new_nsp) 1528 if (!*new_nsp)
1525 return -ENOMEM; 1529 return -ENOMEM;
1526 } 1530 }
@@ -1529,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
1529} 1533}
1530 1534
1531/* 1535/*
1532 * Unsharing of sighand for tasks created with CLONE_SIGHAND is not 1536 * Unsharing of sighand is not supported yet
1533 * supported yet
1534 */ 1537 */
1535static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) 1538static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1536{ 1539{
1537 struct sighand_struct *sigh = current->sighand; 1540 struct sighand_struct *sigh = current->sighand;
1538 1541
1539 if ((unshare_flags & CLONE_SIGHAND) && 1542 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1540 (sigh && atomic_read(&sigh->count) > 1))
1541 return -EINVAL; 1543 return -EINVAL;
1542 else 1544 else
1543 return 0; 1545 return 0;
@@ -1610,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1610{ 1612{
1611 int err = 0; 1613 int err = 0;
1612 struct fs_struct *fs, *new_fs = NULL; 1614 struct fs_struct *fs, *new_fs = NULL;
1613 struct namespace *ns, *new_ns = NULL; 1615 struct mnt_namespace *ns, *new_ns = NULL;
1614 struct sighand_struct *sigh, *new_sigh = NULL; 1616 struct sighand_struct *new_sigh = NULL;
1615 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1617 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1616 struct files_struct *fd, *new_fd = NULL; 1618 struct files_struct *fd, *new_fd = NULL;
1617 struct sem_undo_list *new_ulist = NULL; 1619 struct sem_undo_list *new_ulist = NULL;
@@ -1632,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1632 goto bad_unshare_out; 1634 goto bad_unshare_out;
1633 if ((err = unshare_fs(unshare_flags, &new_fs))) 1635 if ((err = unshare_fs(unshare_flags, &new_fs)))
1634 goto bad_unshare_cleanup_thread; 1636 goto bad_unshare_cleanup_thread;
1635 if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) 1637 if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
1636 goto bad_unshare_cleanup_fs; 1638 goto bad_unshare_cleanup_fs;
1637 if ((err = unshare_sighand(unshare_flags, &new_sigh))) 1639 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1638 goto bad_unshare_cleanup_ns; 1640 goto bad_unshare_cleanup_ns;
@@ -1656,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1656 } 1658 }
1657 } 1659 }
1658 1660
1659 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || 1661 if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
1660 new_uts || new_ipc) { 1662 new_uts || new_ipc) {
1661 1663
1662 task_lock(current); 1664 task_lock(current);
@@ -1673,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1673 } 1675 }
1674 1676
1675 if (new_ns) { 1677 if (new_ns) {
1676 ns = current->nsproxy->namespace; 1678 ns = current->nsproxy->mnt_ns;
1677 current->nsproxy->namespace = new_ns; 1679 current->nsproxy->mnt_ns = new_ns;
1678 new_ns = ns; 1680 new_ns = ns;
1679 } 1681 }
1680 1682
1681 if (new_sigh) {
1682 sigh = current->sighand;
1683 rcu_assign_pointer(current->sighand, new_sigh);
1684 new_sigh = sigh;
1685 }
1686
1687 if (new_mm) { 1683 if (new_mm) {
1688 mm = current->mm; 1684 mm = current->mm;
1689 active_mm = current->active_mm; 1685 active_mm = current->active_mm;
@@ -1741,7 +1737,7 @@ bad_unshare_cleanup_sigh:
1741 1737
1742bad_unshare_cleanup_ns: 1738bad_unshare_cleanup_ns:
1743 if (new_ns) 1739 if (new_ns)
1744 put_namespace(new_ns); 1740 put_mnt_ns(new_ns);
1745 1741
1746bad_unshare_cleanup_fs: 1742bad_unshare_cleanup_fs:
1747 if (new_fs) 1743 if (new_fs)
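
The fork.c hunks above rename the mount-namespace plumbing (struct namespace becomes struct mnt_namespace, unshare_namespace() becomes unshare_mnt_namespace()) while keeping the CAP_SYS_ADMIN check on CLONE_NEWNS. As a rough illustration of the path sys_unshare() services, here is a minimal userspace sketch using the unshare(2) syscall; the tmpfs-over-/tmp mount is purely illustrative and the program must run as root.

#define _GNU_SOURCE
#include <sched.h>
#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* CLONE_NEWNS ends up in unshare_mnt_namespace(); needs CAP_SYS_ADMIN */
	if (unshare(CLONE_NEWNS) != 0) {
		perror("unshare(CLONE_NEWNS)");
		return 1;
	}
	/* tmpfs over /tmp is illustrative only; invisible outside this namespace */
	if (mount("none", "/tmp", "tmpfs", 0, NULL) != 0) {
		perror("mount");
		return 1;
	}
	printf("private /tmp mounted in the unshared mount namespace\n");
	return 0;
}

Mounts made after the unshare are visible only to this process and its children, which is the behaviour the dup_mnt_ns() copy above is providing.
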
diff --git a/kernel/futex.c b/kernel/futex.c
index b364e0026191..5a737de857d3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
166/* 166/*
167 * Get parameters which are the keys for a futex. 167 * Get parameters which are the keys for a futex.
168 * 168 *
169 * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, 169 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
170 * offset_within_page). For private mappings, it's (uaddr, current->mm). 170 * offset_within_page). For private mappings, it's (uaddr, current->mm).
171 * We can usually work out the index without swapping in the page. 171 * We can usually work out the index without swapping in the page.
172 * 172 *
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
223 /* 223 /*
224 * Linear file mappings are also simple. 224 * Linear file mappings are also simple.
225 */ 225 */
226 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
282{ 282{
283 int ret; 283 int ret;
284 284
285 inc_preempt_count(); 285 pagefault_disable();
286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
287 dec_preempt_count(); 287 pagefault_enable();
288 288
289 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
290} 290}
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
324 if (likely(current->pi_state_cache)) 324 if (likely(current->pi_state_cache))
325 return 0; 325 return 0;
326 326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); 327 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
328 328
329 if (!pi_state) 329 if (!pi_state)
330 return -ENOMEM; 330 return -ENOMEM;
331 331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list); 332 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */ 333 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL; 334 pi_state->owner = NULL;
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q)
553 * at the end of wake_up_all() does not prevent this store from 552 * at the end of wake_up_all() does not prevent this store from
554 * moving. 553 * moving.
555 */ 554 */
556 wmb(); 555 smp_wmb();
557 q->lock_ptr = NULL; 556 q->lock_ptr = NULL;
558} 557}
559 558
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
585 if (!(uval & FUTEX_OWNER_DIED)) { 584 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid; 585 newval = FUTEX_WAITERS | new_owner->pid;
587 586
588 inc_preempt_count(); 587 pagefault_disable();
589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 588 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
590 dec_preempt_count(); 589 pagefault_enable();
591 if (curval == -EFAULT) 590 if (curval == -EFAULT)
592 return -EFAULT; 591 return -EFAULT;
593 if (curval != uval) 592 if (curval != uval)
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
618 * There is no waiter, so we unlock the futex. The owner died 617 * There is no waiter, so we unlock the futex. The owner died
619 * bit has not to be preserved here. We are the owner: 618 * bit has not to be preserved here. We are the owner:
620 */ 619 */
621 inc_preempt_count(); 620 pagefault_disable();
622 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); 621 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
623 dec_preempt_count(); 622 pagefault_enable();
624 623
625 if (oldval == -EFAULT) 624 if (oldval == -EFAULT)
626 return oldval; 625 return oldval;
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1158 */ 1157 */
1159 newval = current->pid; 1158 newval = current->pid;
1160 1159
1161 inc_preempt_count(); 1160 pagefault_disable();
1162 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); 1161 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1163 dec_preempt_count(); 1162 pagefault_enable();
1164 1163
1165 if (unlikely(curval == -EFAULT)) 1164 if (unlikely(curval == -EFAULT))
1166 goto uaddr_faulted; 1165 goto uaddr_faulted;
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1183 uval = curval; 1182 uval = curval;
1184 newval = uval | FUTEX_WAITERS; 1183 newval = uval | FUTEX_WAITERS;
1185 1184
1186 inc_preempt_count(); 1185 pagefault_disable();
1187 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1186 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1188 dec_preempt_count(); 1187 pagefault_enable();
1189 1188
1190 if (unlikely(curval == -EFAULT)) 1189 if (unlikely(curval == -EFAULT))
1191 goto uaddr_faulted; 1190 goto uaddr_faulted;
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1215 newval = current->pid | 1214 newval = current->pid |
1216 FUTEX_OWNER_DIED | FUTEX_WAITERS; 1215 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1217 1216
1218 inc_preempt_count(); 1217 pagefault_disable();
1219 curval = futex_atomic_cmpxchg_inatomic(uaddr, 1218 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1220 uval, newval); 1219 uval, newval);
1221 dec_preempt_count(); 1220 pagefault_enable();
1222 1221
1223 if (unlikely(curval == -EFAULT)) 1222 if (unlikely(curval == -EFAULT))
1224 goto uaddr_faulted; 1223 goto uaddr_faulted;
@@ -1390,9 +1389,9 @@ retry_locked:
1390 * anyone else up: 1389 * anyone else up:
1391 */ 1390 */
1392 if (!(uval & FUTEX_OWNER_DIED)) { 1391 if (!(uval & FUTEX_OWNER_DIED)) {
1393 inc_preempt_count(); 1392 pagefault_disable();
1394 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1393 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1395 dec_preempt_count(); 1394 pagefault_enable();
1396 } 1395 }
1397 1396
1398 if (unlikely(uval == -EFAULT)) 1397 if (unlikely(uval == -EFAULT))
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
1493 return ret; 1492 return ret;
1494} 1493}
1495 1494
1496static struct file_operations futex_fops = { 1495static const struct file_operations futex_fops = {
1497 .release = futex_close, 1496 .release = futex_close,
1498 .poll = futex_poll, 1497 .poll = futex_poll,
1499}; 1498};
@@ -1507,6 +1506,13 @@ static int futex_fd(u32 __user *uaddr, int signal)
1507 struct futex_q *q; 1506 struct futex_q *q;
1508 struct file *filp; 1507 struct file *filp;
1509 int ret, err; 1508 int ret, err;
1509 static unsigned long printk_interval;
1510
1511 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1512 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1513 "will be removed from the kernel in June 2007\n",
1514 current->comm);
1515 }
1510 1516
1511 ret = -EINVAL; 1517 ret = -EINVAL;
1512 if (!valid_signal(signal)) 1518 if (!valid_signal(signal))
@@ -1522,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
1522 goto out; 1528 goto out;
1523 } 1529 }
1524 filp->f_op = &futex_fops; 1530 filp->f_op = &futex_fops;
1525 filp->f_vfsmnt = mntget(futex_mnt); 1531 filp->f_path.mnt = mntget(futex_mnt);
1526 filp->f_dentry = dget(futex_mnt->mnt_root); 1532 filp->f_path.dentry = dget(futex_mnt->mnt_root);
1527 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 1533 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1528 1534
1529 if (signal) { 1535 if (signal) {
1530 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); 1536 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
@@ -1851,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
1851 1857
1852static int __init init(void) 1858static int __init init(void)
1853{ 1859{
1854 unsigned int i; 1860 int i = register_filesystem(&futex_fs_type);
1861
1862 if (i)
1863 return i;
1855 1864
1856 register_filesystem(&futex_fs_type);
1857 futex_mnt = kern_mount(&futex_fs_type); 1865 futex_mnt = kern_mount(&futex_fs_type);
1866 if (IS_ERR(futex_mnt)) {
1867 unregister_filesystem(&futex_fs_type);
1868 return PTR_ERR(futex_mnt);
1869 }
1858 1870
1859 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 1871 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1860 INIT_LIST_HEAD(&futex_queues[i].chain); 1872 INIT_LIST_HEAD(&futex_queues[i].chain);
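
The futex.c hunks swap the open-coded preempt-count bumping around the in-atomic user accesses for pagefault_disable()/pagefault_enable(), switch to kzalloc(), and schedule FUTEX_FD for removal. For readers less familiar with the interface these functions implement, a minimal userspace wait/wake pair (not FUTEX_FD) might look like the sketch below; the raw syscall wrapper and the constants from linux/futex.h are the standard ones.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdio.h>

static int futex_word;			/* the u32 the kernel hashes on */

static long futex(int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waker(void *unused)
{
	sleep(1);
	__sync_lock_test_and_set(&futex_word, 1);	/* publish the new value */
	futex(&futex_word, FUTEX_WAKE, 1);		/* wake one waiter */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);
	while (futex_word == 0)				/* sleep only while still 0 */
		futex(&futex_word, FUTEX_WAIT, 0);
	pthread_join(t, NULL);
	printf("woken, futex word = %d\n", futex_word);
	return 0;
}

Build with something like "cc futex-demo.c -pthread": the waiter blocks in FUTEX_WAIT only while the futex word still holds the expected value 0, and the second thread publishes 1 and wakes it.
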
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2d0dc3efe813..ebfd24a41858 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
233 chip->shutdown = chip->disable; 233 chip->shutdown = chip->disable;
234 if (!chip->name) 234 if (!chip->name)
235 chip->name = chip->typename; 235 chip->name = chip->typename;
236 if (!chip->end)
237 chip->end = dummy_irq_chip.end;
236} 238}
237 239
238static inline void mask_ack_irq(struct irq_desc *desc, int irq) 240static inline void mask_ack_irq(struct irq_desc *desc, int irq)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 42aa6f1a3f0f..aff1f0fabb0d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
54 .chip = &no_irq_chip, 54 .chip = &no_irq_chip,
55 .handle_irq = handle_bad_irq, 55 .handle_irq = handle_bad_irq,
56 .depth = 1, 56 .depth = 1,
57 .lock = SPIN_LOCK_UNLOCKED, 57 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
58#ifdef CONFIG_SMP 58#ifdef CONFIG_SMP
59 .affinity = CPU_MASK_ALL 59 .affinity = CPU_MASK_ALL
60#endif 60#endif
@@ -231,10 +231,10 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
231 spin_unlock(&desc->lock); 231 spin_unlock(&desc->lock);
232 232
233 action_ret = handle_IRQ_event(irq, action); 233 action_ret = handle_IRQ_event(irq, action);
234
235 spin_lock(&desc->lock);
236 if (!noirqdebug) 234 if (!noirqdebug)
237 note_interrupt(irq, desc, action_ret); 235 note_interrupt(irq, desc, action_ret);
236
237 spin_lock(&desc->lock);
238 if (likely(!(desc->status & IRQ_PENDING))) 238 if (likely(!(desc->status & IRQ_PENDING)))
239 break; 239 break;
240 desc->status &= ~IRQ_PENDING; 240 desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6879202afe9a..b385878c6e80 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
216{ 216{
217 struct irq_desc *desc = irq_desc + irq; 217 struct irq_desc *desc = irq_desc + irq;
218 struct irqaction *old, **p; 218 struct irqaction *old, **p;
219 const char *old_name = NULL;
219 unsigned long flags; 220 unsigned long flags;
220 int shared = 0; 221 int shared = 0;
221 222
@@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
255 * set the trigger type must match. 256 * set the trigger type must match.
256 */ 257 */
257 if (!((old->flags & new->flags) & IRQF_SHARED) || 258 if (!((old->flags & new->flags) & IRQF_SHARED) ||
258 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) 259 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
260 old_name = old->name;
259 goto mismatch; 261 goto mismatch;
262 }
260 263
261#if defined(CONFIG_IRQ_PER_CPU) 264#if defined(CONFIG_IRQ_PER_CPU)
262 /* All handlers must agree on per-cpuness */ 265 /* All handlers must agree on per-cpuness */
@@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new)
322 return 0; 325 return 0;
323 326
324mismatch: 327mismatch:
325 spin_unlock_irqrestore(&desc->lock, flags);
326 if (!(new->flags & IRQF_PROBE_SHARED)) { 328 if (!(new->flags & IRQF_PROBE_SHARED)) {
327 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); 329 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
330 if (old_name)
331 printk(KERN_ERR "current handler: %s\n", old_name);
328 dump_stack(); 332 dump_stack();
329 } 333 }
334 spin_unlock_irqrestore(&desc->lock, flags);
330 return -EBUSY; 335 return -EBUSY;
331} 336}
332 337
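
The setup_irq() change above keeps desc->lock held until after the mismatch report so it can also print the name of the action already installed on the line. For context, a module-style sketch of a handler that is allowed to share an IRQ follows: both the existing and the new action must pass IRQF_SHARED and agree on the IRQF_TRIGGER_* bits, otherwise the mismatch path fires. This is only a sketch against roughly this kernel's API; MYDEV_IRQ, mydev_cookie and the handler body are placeholders.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/interrupt.h>

#define MYDEV_IRQ	19		/* placeholder: a line shared with another device */

static int mydev_cookie;		/* shared handlers must pass a dev_id */

static irqreturn_t mydev_isr(int irq, void *dev_id)
{
	/* a real handler would check whether its device raised the interrupt */
	return IRQ_NONE;
}

static int __init mydev_init(void)
{
	return request_irq(MYDEV_IRQ, mydev_isr,
			   IRQF_SHARED | IRQF_TRIGGER_RISING,
			   "mydev", &mydev_cookie);
}

static void __exit mydev_exit(void)
{
	free_irq(MYDEV_IRQ, &mydev_cookie);
}

module_init(mydev_init);
module_exit(mydev_exit);
MODULE_LICENSE("GPL");
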
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a352667007c..61f5c717a8f5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
54 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
55 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
56 56
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status))
58 return -EIO; 59 return -EIO;
59 60
60 err = cpumask_parse_user(buffer, count, new_value); 61 err = cpumask_parse_user(buffer, count, new_value);
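
The proc write handler above now also refuses affinity writes for per-CPU interrupts (CHECK_IRQ_PER_CPU), returning -EIO just as it already did for chips without a set_affinity hook. A tiny userspace sketch of the operation being gated; IRQ 19 is an arbitrary placeholder and root is required.

#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	const char *path = "/proc/irq/19/smp_affinity";	/* IRQ 19 is a placeholder */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* hex CPU mask: 1 == CPU0; per-CPU IRQs now fail this write with EIO */
	if (fprintf(f, "1\n") < 0 || fclose(f) != 0) {
		fprintf(stderr, "%s: %s\n", path, strerror(errno));
		return 1;
	}
	return 0;
}
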
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2b..6f294ff4f9ee 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h>
23 24
24#include <asm/sections.h> 25#include <asm/sections.h>
25 26
@@ -30,14 +31,14 @@
30#endif 31#endif
31 32
32/* These will be re-linked against their real values during the second link stage */ 33/* These will be re-linked against their real values during the second link stage */
33extern unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
34extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); 35extern const unsigned long kallsyms_num_syms __attribute__((weak));
35extern u8 kallsyms_names[] __attribute__((weak)); 36extern const u8 kallsyms_names[] __attribute__((weak));
36 37
37extern u8 kallsyms_token_table[] __attribute__((weak)); 38extern const u8 kallsyms_token_table[] __attribute__((weak));
38extern u16 kallsyms_token_index[] __attribute__((weak)); 39extern const u16 kallsyms_token_index[] __attribute__((weak));
39 40
40extern unsigned long kallsyms_markers[] __attribute__((weak)); 41extern const unsigned long kallsyms_markers[] __attribute__((weak));
41 42
42static inline int is_kernel_inittext(unsigned long addr) 43static inline int is_kernel_inittext(unsigned long addr)
43{ 44{
@@ -83,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
83static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 84static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
84{ 85{
85 int len, skipped_first = 0; 86 int len, skipped_first = 0;
86 u8 *tptr, *data; 87 const u8 *tptr, *data;
87 88
88 /* get the compressed symbol length from the first symbol byte */ 89 /* get the compressed symbol length from the first symbol byte */
89 data = &kallsyms_names[off]; 90 data = &kallsyms_names[off];
@@ -131,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
131 * kallsyms array */ 132 * kallsyms array */
132static unsigned int get_symbol_offset(unsigned long pos) 133static unsigned int get_symbol_offset(unsigned long pos)
133{ 134{
134 u8 *name; 135 const u8 *name;
135 int i; 136 int i;
136 137
137 /* use the closest marker we have. We have markers every 256 positions, 138 /* use the closest marker we have. We have markers every 256 positions,
@@ -301,13 +302,6 @@ struct kallsym_iter
301 char name[KSYM_NAME_LEN+1]; 302 char name[KSYM_NAME_LEN+1];
302}; 303};
303 304
304/* Only label it "global" if it is exported. */
305static void upcase_if_global(struct kallsym_iter *iter)
306{
307 if (is_exported(iter->name, iter->owner))
308 iter->type += 'A' - 'a';
309}
310
311static int get_ksymbol_mod(struct kallsym_iter *iter) 305static int get_ksymbol_mod(struct kallsym_iter *iter)
312{ 306{
313 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 307 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
316 if (iter->owner == NULL) 310 if (iter->owner == NULL)
317 return 0; 311 return 0;
318 312
319 upcase_if_global(iter); 313 /* Label it "global" if it is exported, "local" if not exported. */
314 iter->type = is_exported(iter->name, iter->owner)
315 ? toupper(iter->type) : tolower(iter->type);
316
320 return 1; 317 return 1;
321} 318}
322 319
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
401 return 0; 398 return 0;
402} 399}
403 400
404static struct seq_operations kallsyms_op = { 401static const struct seq_operations kallsyms_op = {
405 .start = s_start, 402 .start = s_start,
406 .next = s_next, 403 .next = s_next,
407 .stop = s_stop, 404 .stop = s_stop,
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
436 return seq_release(inode, file); 433 return seq_release(inode, file);
437} 434}
438 435
439static struct file_operations kallsyms_operations = { 436static const struct file_operations kallsyms_operations = {
440 .open = kallsyms_open, 437 .open = kallsyms_open,
441 .read = seq_read, 438 .read = seq_read,
442 .llseek = seq_lseek, 439 .llseek = seq_lseek,
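
Besides constifying the symbol tables, the kallsyms hunks replace upcase_if_global() with an explicit toupper()/tolower(), so module symbols are reported lower-case when they are not exported. Below is a userspace sketch that consumes the resulting /proc/kallsyms the same way, keying off the case of the type column; note that for built-in symbols the case still reflects the symbol binding rather than EXPORT_SYMBOL.

#include <stdio.h>
#include <ctype.h>

int main(void)
{
	char addr[32], type[8], name[256];
	unsigned long global = 0, local = 0;
	FILE *f = fopen("/proc/kallsyms", "r");
	int c;

	if (!f) {
		perror("/proc/kallsyms");
		return 1;
	}
	while (fscanf(f, "%31s %7s %255s", addr, type, name) == 3) {
		/* skip the optional trailing "[module]" column */
		while ((c = fgetc(f)) != '\n' && c != EOF)
			;
		if (isupper((unsigned char)type[0]))
			global++;	/* for module symbols: exported */
		else
			local++;	/* for module symbols: not exported */
	}
	fclose(f);
	printf("upper-case (global): %lu, lower-case (local): %lu\n",
	       global, local);
	return 0;
}
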
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f4..2a59c8a01ae0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/elf.h>
24#include <linux/elfcore.h>
23 25
24#include <asm/page.h> 26#include <asm/page.h>
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
108 110
109 /* Allocate a controlling structure */ 111 /* Allocate a controlling structure */
110 result = -ENOMEM; 112 result = -ENOMEM;
111 image = kmalloc(sizeof(*image), GFP_KERNEL); 113 image = kzalloc(sizeof(*image), GFP_KERNEL);
112 if (!image) 114 if (!image)
113 goto out; 115 goto out;
114 116
115 memset(image, 0, sizeof(*image));
116 image->head = 0; 117 image->head = 0;
117 image->entry = &image->head; 118 image->entry = &image->head;
118 image->last_entry = &image->head; 119 image->last_entry = &image->head;
@@ -851,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image,
851 memset(ptr + uchunk, 0, mchunk - uchunk); 852 memset(ptr + uchunk, 0, mchunk - uchunk);
852 } 853 }
853 result = copy_from_user(ptr, buf, uchunk); 854 result = copy_from_user(ptr, buf, uchunk);
855 kexec_flush_icache_page(page);
854 kunmap(page); 856 kunmap(page);
855 if (result) { 857 if (result) {
856 result = (result < 0) ? result : -EIO; 858 result = (result < 0) ? result : -EIO;
@@ -1067,6 +1069,60 @@ void crash_kexec(struct pt_regs *regs)
1067 } 1069 }
1068} 1070}
1069 1071
1072static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1073 size_t data_len)
1074{
1075 struct elf_note note;
1076
1077 note.n_namesz = strlen(name) + 1;
1078 note.n_descsz = data_len;
1079 note.n_type = type;
1080 memcpy(buf, &note, sizeof(note));
1081 buf += (sizeof(note) + 3)/4;
1082 memcpy(buf, name, note.n_namesz);
1083 buf += (note.n_namesz + 3)/4;
1084 memcpy(buf, data, note.n_descsz);
1085 buf += (note.n_descsz + 3)/4;
1086
1087 return buf;
1088}
1089
1090static void final_note(u32 *buf)
1091{
1092 struct elf_note note;
1093
1094 note.n_namesz = 0;
1095 note.n_descsz = 0;
1096 note.n_type = 0;
1097 memcpy(buf, &note, sizeof(note));
1098}
1099
1100void crash_save_cpu(struct pt_regs *regs, int cpu)
1101{
1102 struct elf_prstatus prstatus;
1103 u32 *buf;
1104
1105 if ((cpu < 0) || (cpu >= NR_CPUS))
1106 return;
1107
1108 /* Using ELF notes here is opportunistic.
1109 * I need a well defined structure format
1110 * for the data I pass, and I need tags
1111 * on the data to indicate what information I have
1112 * squirrelled away. ELF notes happen to provide
1113 * all of that, so there is no need to invent something new.
1114 */
1115 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1116 if (!buf)
1117 return;
1118 memset(&prstatus, 0, sizeof(prstatus));
1119 prstatus.pr_pid = current->pid;
1120 elf_core_copy_regs(&prstatus.pr_reg, regs);
1121 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
1122 sizeof(prstatus));
1123 final_note(buf);
1124}
1125
1070static int __init crash_notes_memory_init(void) 1126static int __init crash_notes_memory_init(void)
1071{ 1127{
1072 /* Allocate memory for saving cpu registers. */ 1128 /* Allocate memory for saving cpu registers. */
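
crash_save_cpu() above serializes the crashing CPU's registers as an ELF note via append_elf_note(), rounding both the name and the descriptor up to 4-byte units and terminating the buffer with an all-zero header. The userspace sketch below reproduces just that layout, with a local header struct standing in for struct elf_note and a dummy array standing in for elf_prstatus.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct note_hdr {			/* local stand-in for struct elf_note */
	uint32_t n_namesz;
	uint32_t n_descsz;
	uint32_t n_type;
};

static uint32_t *append_note(uint32_t *buf, const char *name,
			     uint32_t type, const void *data, size_t len)
{
	struct note_hdr note = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = len,
		.n_type   = type,
	};

	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3) / 4;		/* header: already 3 words */
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3) / 4;		/* name padded to 4-byte units */
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3) / 4;		/* descriptor padded likewise */
	return buf;
}

int main(void)
{
	uint32_t buf[64] = { 0 };
	uint32_t regs[4] = { 1, 2, 3, 4 };	/* dummy stand-in for elf_prstatus */
	uint32_t *end = append_note(buf, "CORE", 1 /* NT_PRSTATUS */, regs,
				    sizeof(regs));

	memset(end, 0, sizeof(struct note_hdr));	/* the empty final note */
	printf("note buffer uses %zu bytes\n",
	       (end - buf) * 4 + sizeof(struct note_hdr));
	return 0;
}
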
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb4e29d924e4..3a7379aa31ca 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/namespace.h> 28#include <linux/mnt_namespace.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
30#include <linux/file.h> 30#include <linux/file.h>
31#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
114#endif /* CONFIG_KMOD */ 114#endif /* CONFIG_KMOD */
115 115
116struct subprocess_info { 116struct subprocess_info {
117 struct work_struct work;
117 struct completion *complete; 118 struct completion *complete;
118 char *path; 119 char *path;
119 char **argv; 120 char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
221} 222}
222 223
223/* This is run by khelper thread */ 224/* This is run by khelper thread */
224static void __call_usermodehelper(void *data) 225static void __call_usermodehelper(struct work_struct *work)
225{ 226{
226 struct subprocess_info *sub_info = data; 227 struct subprocess_info *sub_info =
228 container_of(work, struct subprocess_info, work);
227 pid_t pid; 229 pid_t pid;
228 int wait = sub_info->wait; 230 int wait = sub_info->wait;
229 231
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
264{ 266{
265 DECLARE_COMPLETION_ONSTACK(done); 267 DECLARE_COMPLETION_ONSTACK(done);
266 struct subprocess_info sub_info = { 268 struct subprocess_info sub_info = {
269 .work = __WORK_INITIALIZER(sub_info.work,
270 __call_usermodehelper),
267 .complete = &done, 271 .complete = &done,
268 .path = path, 272 .path = path,
269 .argv = argv, 273 .argv = argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
272 .wait = wait, 276 .wait = wait,
273 .retval = 0, 277 .retval = 0,
274 }; 278 };
275 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
276 279
277 if (!khelper_wq) 280 if (!khelper_wq)
278 return -EBUSY; 281 return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
280 if (path[0] == '\0') 283 if (path[0] == '\0')
281 return 0; 284 return 0;
282 285
283 queue_work(khelper_wq, &work); 286 queue_work(khelper_wq, &sub_info.work);
284 wait_for_completion(&done); 287 wait_for_completion(&done);
285 return sub_info.retval; 288 return sub_info.retval;
286} 289}
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
291{ 294{
292 DECLARE_COMPLETION(done); 295 DECLARE_COMPLETION(done);
293 struct subprocess_info sub_info = { 296 struct subprocess_info sub_info = {
297 .work = __WORK_INITIALIZER(sub_info.work,
298 __call_usermodehelper),
294 .complete = &done, 299 .complete = &done,
295 .path = path, 300 .path = path,
296 .argv = argv, 301 .argv = argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
298 .retval = 0, 303 .retval = 0,
299 }; 304 };
300 struct file *f; 305 struct file *f;
301 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
302 306
303 if (!khelper_wq) 307 if (!khelper_wq)
304 return -EBUSY; 308 return -EBUSY;
@@ -307,18 +311,18 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
307 return 0; 311 return 0;
308 312
309 f = create_write_pipe(); 313 f = create_write_pipe();
310 if (!f) 314 if (IS_ERR(f))
311 return -ENOMEM; 315 return PTR_ERR(f);
312 *filp = f; 316 *filp = f;
313 317
314 f = create_read_pipe(f); 318 f = create_read_pipe(f);
315 if (!f) { 319 if (IS_ERR(f)) {
316 free_write_pipe(*filp); 320 free_write_pipe(*filp);
317 return -ENOMEM; 321 return PTR_ERR(f);
318 } 322 }
319 sub_info.stdin = f; 323 sub_info.stdin = f;
320 324
321 queue_work(khelper_wq, &work); 325 queue_work(khelper_wq, &sub_info.work);
322 wait_for_completion(&done); 326 wait_for_completion(&done);
323 return sub_info.retval; 327 return sub_info.retval;
324} 328}
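
Both call_usermodehelper variants now embed the work_struct inside struct subprocess_info and hand the workqueue a handler that takes a struct work_struct *, recovering its context with container_of() instead of carrying a separate void * payload (the kthread.c hunks below make the same conversion). Here is a self-contained userspace sketch of that embed-and-recover idiom; the macro and the two structs are local stand-ins, not the kernel definitions.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {			/* local stand-in for the kernel type */
	void (*func)(struct work_struct *work);
};

struct subprocess_info {		/* mirrors the layout in the hunk above */
	struct work_struct work;
	const char *path;
	int retval;
};

static void call_usermodehelper_stub(struct work_struct *work)
{
	struct subprocess_info *sub_info =
		container_of(work, struct subprocess_info, work);

	printf("running helper for %s\n", sub_info->path);
	sub_info->retval = 0;
}

int main(void)
{
	struct subprocess_info sub_info = {
		.work = { .func = call_usermodehelper_stub },
		.path = "/sbin/hotplug",	/* illustrative path */
		.retval = -1,
	};

	/* what queue_work()'s worker thread would eventually do: */
	sub_info.work.func(&sub_info.work);
	return sub_info.retval;
}

The reworked workqueue API no longer carries a data pointer in the work_struct itself, so any context has to travel embedded in the queued structure, which is exactly what these hunks arrange.
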
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e0..17ec4afb0994 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
41#include <linux/freezer.h>
41#include <asm-generic/sections.h> 42#include <asm-generic/sections.h>
42#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
43#include <asm/errno.h> 44#include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
83 kprobe_opcode_t *insns; /* Page of instruction slots */ 84 kprobe_opcode_t *insns; /* Page of instruction slots */
84 char slot_used[INSNS_PER_PAGE]; 85 char slot_used[INSNS_PER_PAGE];
85 int nused; 86 int nused;
87 int ngarbage;
86}; 88};
87 89
88static struct hlist_head kprobe_insn_pages; 90static struct hlist_head kprobe_insn_pages;
91static int kprobe_garbage_slots;
92static int collect_garbage_slots(void);
93
94static int __kprobes check_safety(void)
95{
96 int ret = 0;
97#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
98 ret = freeze_processes();
99 if (ret == 0) {
100 struct task_struct *p, *q;
101 do_each_thread(p, q) {
102 if (p != current && p->state == TASK_RUNNING &&
103 p->pid != 0) {
104 printk("Check failed: %s is running\n",p->comm);
105 ret = -1;
106 goto loop_end;
107 }
108 } while_each_thread(p, q);
109 }
110loop_end:
111 thaw_processes();
112#else
113 synchronize_sched();
114#endif
115 return ret;
116}
89 117
90/** 118/**
91 * get_insn_slot() - Find a slot on an executable page for an instruction. 119 * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
96 struct kprobe_insn_page *kip; 124 struct kprobe_insn_page *kip;
97 struct hlist_node *pos; 125 struct hlist_node *pos;
98 126
127 retry:
99 hlist_for_each(pos, &kprobe_insn_pages) { 128 hlist_for_each(pos, &kprobe_insn_pages) {
100 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 129 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
101 if (kip->nused < INSNS_PER_PAGE) { 130 if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
112 } 141 }
113 } 142 }
114 143
115 /* All out of space. Need to allocate a new page. Use slot 0.*/ 144 /* If there are any garbage slots, collect it and try again. */
145 if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
146 goto retry;
147 }
148 /* All out of space. Need to allocate a new page. Use slot 0. */
116 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 149 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
117 if (!kip) { 150 if (!kip) {
118 return NULL; 151 return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
133 memset(kip->slot_used, 0, INSNS_PER_PAGE); 166 memset(kip->slot_used, 0, INSNS_PER_PAGE);
134 kip->slot_used[0] = 1; 167 kip->slot_used[0] = 1;
135 kip->nused = 1; 168 kip->nused = 1;
169 kip->ngarbage = 0;
136 return kip->insns; 170 return kip->insns;
137} 171}
138 172
139void __kprobes free_insn_slot(kprobe_opcode_t *slot) 173/* Return 1 if all garbages are collected, otherwise 0. */
174static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
175{
176 kip->slot_used[idx] = 0;
177 kip->nused--;
178 if (kip->nused == 0) {
179 /*
180 * Page is no longer in use. Free it unless
181 * it's the last one. We keep the last one
182 * so as not to have to set it up again the
183 * next time somebody inserts a probe.
184 */
185 hlist_del(&kip->hlist);
186 if (hlist_empty(&kprobe_insn_pages)) {
187 INIT_HLIST_NODE(&kip->hlist);
188 hlist_add_head(&kip->hlist,
189 &kprobe_insn_pages);
190 } else {
191 module_free(NULL, kip->insns);
192 kfree(kip);
193 }
194 return 1;
195 }
196 return 0;
197}
198
199static int __kprobes collect_garbage_slots(void)
200{
201 struct kprobe_insn_page *kip;
202 struct hlist_node *pos, *next;
203
204 /* Ensure no-one is preepmted on the garbages */
205 if (check_safety() != 0)
206 return -EAGAIN;
207
208 hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
209 int i;
210 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
211 if (kip->ngarbage == 0)
212 continue;
213 kip->ngarbage = 0; /* we will collect all garbages */
214 for (i = 0; i < INSNS_PER_PAGE; i++) {
215 if (kip->slot_used[i] == -1 &&
216 collect_one_slot(kip, i))
217 break;
218 }
219 }
220 kprobe_garbage_slots = 0;
221 return 0;
222}
223
224void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
140{ 225{
141 struct kprobe_insn_page *kip; 226 struct kprobe_insn_page *kip;
142 struct hlist_node *pos; 227 struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
146 if (kip->insns <= slot && 231 if (kip->insns <= slot &&
147 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 232 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
148 int i = (slot - kip->insns) / MAX_INSN_SIZE; 233 int i = (slot - kip->insns) / MAX_INSN_SIZE;
149 kip->slot_used[i] = 0; 234 if (dirty) {
150 kip->nused--; 235 kip->slot_used[i] = -1;
151 if (kip->nused == 0) { 236 kip->ngarbage++;
152 /* 237 } else {
153 * Page is no longer in use. Free it unless 238 collect_one_slot(kip, i);
154 * it's the last one. We keep the last one
155 * so as not to have to set it up again the
156 * next time somebody inserts a probe.
157 */
158 hlist_del(&kip->hlist);
159 if (hlist_empty(&kprobe_insn_pages)) {
160 INIT_HLIST_NODE(&kip->hlist);
161 hlist_add_head(&kip->hlist,
162 &kprobe_insn_pages);
163 } else {
164 module_free(NULL, kip->insns);
165 kfree(kip);
166 }
167 } 239 }
168 return; 240 break;
169 } 241 }
170 } 242 }
243 if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
244 collect_garbage_slots();
245 }
171} 246}
172#endif 247#endif
173 248
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e8..1db8c72d0d38 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
31 /* Result passed back to kthread_create() from keventd. */ 31 /* Result passed back to kthread_create() from keventd. */
32 struct task_struct *result; 32 struct task_struct *result;
33 struct completion done; 33 struct completion done;
34
35 struct work_struct work;
34}; 36};
35 37
36struct kthread_stop_info 38struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
111} 113}
112 114
113/* We are keventd: create a thread. */ 115/* We are keventd: create a thread. */
114static void keventd_create_kthread(void *_create) 116static void keventd_create_kthread(struct work_struct *work)
115{ 117{
116 struct kthread_create_info *create = _create; 118 struct kthread_create_info *create =
119 container_of(work, struct kthread_create_info, work);
117 int pid; 120 int pid;
118 121
119 /* We want our own signal handler (we take no signals by default). */ 122 /* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
154 ...) 157 ...)
155{ 158{
156 struct kthread_create_info create; 159 struct kthread_create_info create;
157 DECLARE_WORK(work, keventd_create_kthread, &create);
158 160
159 create.threadfn = threadfn; 161 create.threadfn = threadfn;
160 create.data = data; 162 create.data = data;
161 init_completion(&create.started); 163 init_completion(&create.started);
162 init_completion(&create.done); 164 init_completion(&create.done);
165 INIT_WORK(&create.work, keventd_create_kthread);
163 166
164 /* 167 /*
165 * The workqueue needs to start up first: 168 * The workqueue needs to start up first:
166 */ 169 */
167 if (!helper_wq) 170 if (!helper_wq)
168 work.func(work.data); 171 create.work.func(&create.work);
169 else { 172 else {
170 queue_work(helper_wq, &work); 173 queue_work(helper_wq, &create.work);
171 wait_for_completion(&create.done); 174 wait_for_completion(&create.done);
172 } 175 }
173 if (!IS_ERR(create.result)) { 176 if (!IS_ERR(create.result)) {
diff --git a/kernel/latency.c b/kernel/latency.c
index 258f2555abbc..e63fcacb61a7 100644
--- a/kernel/latency.c
+++ b/kernel/latency.c
@@ -36,6 +36,7 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/jiffies.h>
39#include <asm/atomic.h> 40#include <asm/atomic.h>
40 41
41struct latency_info { 42struct latency_info {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b739be2a6dc9..01e750559034 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,13 +43,49 @@
43#include "lockdep_internals.h" 43#include "lockdep_internals.h"
44 44
45/* 45/*
46 * hash_lock: protects the lockdep hashes and class/list/hash allocators. 46 * lockdep_lock: protects the lockdep graph, the hashes and the
47 * class/list/hash allocators.
47 * 48 *
48 * This is one of the rare exceptions where it's justified 49 * This is one of the rare exceptions where it's justified
49 * to use a raw spinlock - we really dont want the spinlock 50 * to use a raw spinlock - we really dont want the spinlock
50 * code to recurse back into the lockdep code. 51 * code to recurse back into the lockdep code...
51 */ 52 */
52static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 53static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
54
55static int graph_lock(void)
56{
57 __raw_spin_lock(&lockdep_lock);
58 /*
59 * Make sure that if another CPU detected a bug while
60 * walking the graph we dont change it (while the other
61 * CPU is busy printing out stuff with the graph lock
62 * dropped already)
63 */
64 if (!debug_locks) {
65 __raw_spin_unlock(&lockdep_lock);
66 return 0;
67 }
68 return 1;
69}
70
71static inline int graph_unlock(void)
72{
73 __raw_spin_unlock(&lockdep_lock);
74 return 0;
75}
76
77/*
78 * Turn lock debugging off and return with 0 if it was off already,
79 * and also release the graph lock:
80 */
81static inline int debug_locks_off_graph_unlock(void)
82{
83 int ret = debug_locks_off();
84
85 __raw_spin_unlock(&lockdep_lock);
86
87 return ret;
88}
53 89
54static int lockdep_initialized; 90static int lockdep_initialized;
55 91
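
The graph_lock()/graph_unlock()/debug_locks_off_graph_unlock() helpers introduced in the hunk above encode a convention the rest of the patch relies on: taking the graph lock can fail (return 0) once another CPU has turned lock debugging off mid-report, and every caller is expected to bail out instead of touching the graph. A minimal userspace sketch of that convention, using a pthread spinlock and a plain flag in place of lockdep_lock and debug_locks:

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t graph_spin;	/* stands in for lockdep_lock */
static volatile int debug_locks = 1;

static int graph_lock(void)
{
	pthread_spin_lock(&graph_spin);
	if (!debug_locks) {		/* another thread is already reporting a bug */
		pthread_spin_unlock(&graph_spin);
		return 0;
	}
	return 1;
}

static void graph_unlock(void)
{
	pthread_spin_unlock(&graph_spin);
}

static int add_dependency(void)
{
	if (!graph_lock())
		return 0;		/* validator disabled: leave the graph alone */
	/* ... mutate the dependency graph here ... */
	graph_unlock();
	return 1;
}

int main(void)
{
	pthread_spin_init(&graph_spin, PTHREAD_PROCESS_PRIVATE);
	printf("with debugging on : %d\n", add_dependency());
	debug_locks = 0;
	printf("with debugging off: %d\n", add_dependency());
	pthread_spin_destroy(&graph_spin);
	return 0;
}
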
@@ -57,14 +93,15 @@ unsigned long nr_list_entries;
57static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; 93static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
58 94
59/* 95/*
60 * Allocate a lockdep entry. (assumes hash_lock held, returns 96 * Allocate a lockdep entry. (assumes the graph_lock held, returns
61 * with NULL on failure) 97 * with NULL on failure)
62 */ 98 */
63static struct lock_list *alloc_list_entry(void) 99static struct lock_list *alloc_list_entry(void)
64{ 100{
65 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { 101 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
66 __raw_spin_unlock(&hash_lock); 102 if (!debug_locks_off_graph_unlock())
67 debug_locks_off(); 103 return NULL;
104
68 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 105 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
69 printk("turning off the locking correctness validator.\n"); 106 printk("turning off the locking correctness validator.\n");
70 return NULL; 107 return NULL;
@@ -140,21 +177,12 @@ void lockdep_on(void)
140 177
141EXPORT_SYMBOL(lockdep_on); 178EXPORT_SYMBOL(lockdep_on);
142 179
143int lockdep_internal(void)
144{
145 return current->lockdep_recursion != 0;
146}
147
148EXPORT_SYMBOL(lockdep_internal);
149
150/* 180/*
151 * Debugging switches: 181 * Debugging switches:
152 */ 182 */
153 183
154#define VERBOSE 0 184#define VERBOSE 0
155#ifdef VERBOSE 185#define VERY_VERBOSE 0
156# define VERY_VERBOSE 0
157#endif
158 186
159#if VERBOSE 187#if VERBOSE
160# define HARDIRQ_VERBOSE 1 188# define HARDIRQ_VERBOSE 1
@@ -179,8 +207,8 @@ static int class_filter(struct lock_class *class)
179 !strcmp(class->name, "&struct->lockfield")) 207 !strcmp(class->name, "&struct->lockfield"))
180 return 1; 208 return 1;
181#endif 209#endif
182 /* Allow everything else. 0 would be filter everything else */ 210 /* Filter everything else. 1 would be to allow everything else */
183 return 1; 211 return 0;
184} 212}
185#endif 213#endif
186 214
@@ -214,7 +242,7 @@ static int softirq_verbose(struct lock_class *class)
214 242
215/* 243/*
216 * Stack-trace: tightly packed array of stack backtrace 244 * Stack-trace: tightly packed array of stack backtrace
217 * addresses. Protected by the hash_lock. 245 * addresses. Protected by the graph_lock.
218 */ 246 */
219unsigned long nr_stack_trace_entries; 247unsigned long nr_stack_trace_entries;
220static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; 248static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
@@ -228,25 +256,20 @@ static int save_trace(struct stack_trace *trace)
228 trace->skip = 3; 256 trace->skip = 3;
229 trace->all_contexts = 0; 257 trace->all_contexts = 0;
230 258
231 /* Make sure to not recurse in case the the unwinder needs to tak
232e locks. */
233 lockdep_off();
234 save_stack_trace(trace, NULL); 259 save_stack_trace(trace, NULL);
235 lockdep_on();
236 260
237 trace->max_entries = trace->nr_entries; 261 trace->max_entries = trace->nr_entries;
238 262
239 nr_stack_trace_entries += trace->nr_entries; 263 nr_stack_trace_entries += trace->nr_entries;
240 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
241 return 0;
242 264
243 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 265 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
244 __raw_spin_unlock(&hash_lock); 266 if (!debug_locks_off_graph_unlock())
245 if (debug_locks_off()) { 267 return 0;
246 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); 268
247 printk("turning off the locking correctness validator.\n"); 269 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
248 dump_stack(); 270 printk("turning off the locking correctness validator.\n");
249 } 271 dump_stack();
272
250 return 0; 273 return 0;
251 } 274 }
252 275
@@ -357,7 +380,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
357 380
358static void print_lock_name(struct lock_class *class) 381static void print_lock_name(struct lock_class *class)
359{ 382{
360 char str[128], c1, c2, c3, c4; 383 char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
361 const char *name; 384 const char *name;
362 385
363 get_usage_chars(class, &c1, &c2, &c3, &c4); 386 get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -379,7 +402,7 @@ static void print_lock_name(struct lock_class *class)
379static void print_lockdep_cache(struct lockdep_map *lock) 402static void print_lockdep_cache(struct lockdep_map *lock)
380{ 403{
381 const char *name; 404 const char *name;
382 char str[128]; 405 char str[KSYM_NAME_LEN + 1];
383 406
384 name = lock->name; 407 name = lock->name;
385 if (!name) 408 if (!name)
@@ -449,7 +472,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
449 print_lock_class_header(class, depth); 472 print_lock_class_header(class, depth);
450 473
451 list_for_each_entry(entry, &class->locks_after, entry) { 474 list_for_each_entry(entry, &class->locks_after, entry) {
452 DEBUG_LOCKS_WARN_ON(!entry->class); 475 if (DEBUG_LOCKS_WARN_ON(!entry->class))
476 return;
477
453 print_lock_dependencies(entry->class, depth + 1); 478 print_lock_dependencies(entry->class, depth + 1);
454 479
455 printk("%*s ... acquired at:\n",depth,""); 480 printk("%*s ... acquired at:\n",depth,"");
@@ -474,7 +499,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
474 return 0; 499 return 0;
475 500
476 entry->class = this; 501 entry->class = this;
477 save_trace(&entry->trace); 502 if (!save_trace(&entry->trace))
503 return 0;
478 504
479 /* 505 /*
480 * Since we never remove from the dependency list, the list can 506 * Since we never remove from the dependency list, the list can
@@ -532,9 +558,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
532{ 558{
533 struct task_struct *curr = current; 559 struct task_struct *curr = current;
534 560
535 __raw_spin_unlock(&hash_lock); 561 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
536 debug_locks_off();
537 if (debug_locks_silent)
538 return 0; 562 return 0;
539 563
540 printk("\n=======================================================\n"); 564 printk("\n=======================================================\n");
@@ -563,7 +587,9 @@ static noinline int print_circular_bug_tail(void)
563 return 0; 587 return 0;
564 588
565 this.class = check_source->class; 589 this.class = check_source->class;
566 save_trace(&this.trace); 590 if (!save_trace(&this.trace))
591 return 0;
592
567 print_circular_bug_entry(&this, 0); 593 print_circular_bug_entry(&this, 0);
568 594
569 printk("\nother info that might help us debug this:\n\n"); 595 printk("\nother info that might help us debug this:\n\n");
@@ -579,8 +605,10 @@ static noinline int print_circular_bug_tail(void)
579 605
580static int noinline print_infinite_recursion_bug(void) 606static int noinline print_infinite_recursion_bug(void)
581{ 607{
582 __raw_spin_unlock(&hash_lock); 608 if (!debug_locks_off_graph_unlock())
583 DEBUG_LOCKS_WARN_ON(1); 609 return 0;
610
611 WARN_ON(1);
584 612
585 return 0; 613 return 0;
586} 614}
@@ -715,9 +743,7 @@ print_bad_irq_dependency(struct task_struct *curr,
715 enum lock_usage_bit bit2, 743 enum lock_usage_bit bit2,
716 const char *irqclass) 744 const char *irqclass)
717{ 745{
718 __raw_spin_unlock(&hash_lock); 746 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
719 debug_locks_off();
720 if (debug_locks_silent)
721 return 0; 747 return 0;
722 748
723 printk("\n======================================================\n"); 749 printk("\n======================================================\n");
@@ -798,9 +824,7 @@ static int
798print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 824print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
799 struct held_lock *next) 825 struct held_lock *next)
800{ 826{
801 debug_locks_off(); 827 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
802 __raw_spin_unlock(&hash_lock);
803 if (debug_locks_silent)
804 return 0; 828 return 0;
805 829
806 printk("\n=============================================\n"); 830 printk("\n=============================================\n");
@@ -966,27 +990,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
966 &prev->class->locks_after, next->acquire_ip); 990 &prev->class->locks_after, next->acquire_ip);
967 if (!ret) 991 if (!ret)
968 return 0; 992 return 0;
969 /* 993
970 * Return value of 2 signals 'dependency already added',
971 * in that case we dont have to add the backlink either.
972 */
973 if (ret == 2)
974 return 2;
975 ret = add_lock_to_list(next->class, prev->class, 994 ret = add_lock_to_list(next->class, prev->class,
976 &next->class->locks_before, next->acquire_ip); 995 &next->class->locks_before, next->acquire_ip);
996 if (!ret)
997 return 0;
977 998
978 /* 999 /*
979 * Debugging printouts: 1000 * Debugging printouts:
980 */ 1001 */
981 if (verbose(prev->class) || verbose(next->class)) { 1002 if (verbose(prev->class) || verbose(next->class)) {
982 __raw_spin_unlock(&hash_lock); 1003 graph_unlock();
983 printk("\n new dependency: "); 1004 printk("\n new dependency: ");
984 print_lock_name(prev->class); 1005 print_lock_name(prev->class);
985 printk(" => "); 1006 printk(" => ");
986 print_lock_name(next->class); 1007 print_lock_name(next->class);
987 printk("\n"); 1008 printk("\n");
988 dump_stack(); 1009 dump_stack();
989 __raw_spin_lock(&hash_lock); 1010 return graph_lock();
990 } 1011 }
991 return 1; 1012 return 1;
992} 1013}
@@ -1025,7 +1046,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1025 * added: 1046 * added:
1026 */ 1047 */
1027 if (hlock->read != 2) { 1048 if (hlock->read != 2) {
1028 check_prev_add(curr, hlock, next); 1049 if (!check_prev_add(curr, hlock, next))
1050 return 0;
1029 /* 1051 /*
1030 * Stop after the first non-trylock entry, 1052 * Stop after the first non-trylock entry,
1031 * as non-trylock entries have added their 1053 * as non-trylock entries have added their
@@ -1050,8 +1072,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1050 } 1072 }
1051 return 1; 1073 return 1;
1052out_bug: 1074out_bug:
1053 __raw_spin_unlock(&hash_lock); 1075 if (!debug_locks_off_graph_unlock())
1054 DEBUG_LOCKS_WARN_ON(1); 1076 return 0;
1077
1078 WARN_ON(1);
1055 1079
1056 return 0; 1080 return 0;
1057} 1081}
@@ -1081,7 +1105,8 @@ static int static_obj(void *obj)
1081 */ 1105 */
1082 for_each_possible_cpu(i) { 1106 for_each_possible_cpu(i) {
1083 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 1107 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
1084 end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); 1108 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
1109 + per_cpu_offset(i);
1085 1110
1086 if ((addr >= start) && (addr < end)) 1111 if ((addr >= start) && (addr < end))
1087 return 1; 1112 return 1;
@@ -1181,6 +1206,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1181 struct lockdep_subclass_key *key; 1206 struct lockdep_subclass_key *key;
1182 struct list_head *hash_head; 1207 struct list_head *hash_head;
1183 struct lock_class *class; 1208 struct lock_class *class;
1209 unsigned long flags;
1184 1210
1185 class = look_up_lock_class(lock, subclass); 1211 class = look_up_lock_class(lock, subclass);
1186 if (likely(class)) 1212 if (likely(class))
@@ -1202,7 +1228,11 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1202 key = lock->key->subkeys + subclass; 1228 key = lock->key->subkeys + subclass;
1203 hash_head = classhashentry(key); 1229 hash_head = classhashentry(key);
1204 1230
1205 __raw_spin_lock(&hash_lock); 1231 raw_local_irq_save(flags);
1232 if (!graph_lock()) {
1233 raw_local_irq_restore(flags);
1234 return NULL;
1235 }
1206 /* 1236 /*
1207 * We have to do the hash-walk again, to avoid races 1237 * We have to do the hash-walk again, to avoid races
1208 * with another CPU: 1238 * with another CPU:
@@ -1215,8 +1245,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1215 * the hash: 1245 * the hash:
1216 */ 1246 */
1217 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { 1247 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1218 __raw_spin_unlock(&hash_lock); 1248 if (!debug_locks_off_graph_unlock()) {
1219 debug_locks_off(); 1249 raw_local_irq_restore(flags);
1250 return NULL;
1251 }
1252 raw_local_irq_restore(flags);
1253
1220 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 1254 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1221 printk("turning off the locking correctness validator.\n"); 1255 printk("turning off the locking correctness validator.\n");
1222 return NULL; 1256 return NULL;
@@ -1237,16 +1271,24 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1237 list_add_tail_rcu(&class->hash_entry, hash_head); 1271 list_add_tail_rcu(&class->hash_entry, hash_head);
1238 1272
1239 if (verbose(class)) { 1273 if (verbose(class)) {
1240 __raw_spin_unlock(&hash_lock); 1274 graph_unlock();
1275 raw_local_irq_restore(flags);
1276
1241 printk("\nnew class %p: %s", class->key, class->name); 1277 printk("\nnew class %p: %s", class->key, class->name);
1242 if (class->name_version > 1) 1278 if (class->name_version > 1)
1243 printk("#%d", class->name_version); 1279 printk("#%d", class->name_version);
1244 printk("\n"); 1280 printk("\n");
1245 dump_stack(); 1281 dump_stack();
1246 __raw_spin_lock(&hash_lock); 1282
1283 raw_local_irq_save(flags);
1284 if (!graph_lock()) {
1285 raw_local_irq_restore(flags);
1286 return NULL;
1287 }
1247 } 1288 }
1248out_unlock_set: 1289out_unlock_set:
1249 __raw_spin_unlock(&hash_lock); 1290 graph_unlock();
1291 raw_local_irq_restore(flags);
1250 1292
1251 if (!subclass || force) 1293 if (!subclass || force)
1252 lock->class_cache = class; 1294 lock->class_cache = class;
@@ -1261,7 +1303,7 @@ out_unlock_set:
1261 * add it and return 0 - in this case the new dependency chain is 1303 * add it and return 0 - in this case the new dependency chain is
1262 * validated. If the key is already hashed, return 1. 1304 * validated. If the key is already hashed, return 1.
1263 */ 1305 */
1264static inline int lookup_chain_cache(u64 chain_key) 1306static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
1265{ 1307{
1266 struct list_head *hash_head = chainhashentry(chain_key); 1308 struct list_head *hash_head = chainhashentry(chain_key);
1267 struct lock_chain *chain; 1309 struct lock_chain *chain;
@@ -1275,34 +1317,32 @@ static inline int lookup_chain_cache(u64 chain_key)
1275 if (chain->chain_key == chain_key) { 1317 if (chain->chain_key == chain_key) {
1276cache_hit: 1318cache_hit:
1277 debug_atomic_inc(&chain_lookup_hits); 1319 debug_atomic_inc(&chain_lookup_hits);
1278 /* 1320 if (very_verbose(class))
1279 * In the debugging case, force redundant checking 1321 printk("\nhash chain already cached, key: %016Lx tail class: [%p] %s\n", chain_key, class->key, class->name);
1280 * by returning 1:
1281 */
1282#ifdef CONFIG_DEBUG_LOCKDEP
1283 __raw_spin_lock(&hash_lock);
1284 return 1;
1285#endif
1286 return 0; 1322 return 0;
1287 } 1323 }
1288 } 1324 }
1325 if (very_verbose(class))
1326 printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", chain_key, class->key, class->name);
1289 /* 1327 /*
1290 * Allocate a new chain entry from the static array, and add 1328 * Allocate a new chain entry from the static array, and add
1291 * it to the hash: 1329 * it to the hash:
1292 */ 1330 */
1293 __raw_spin_lock(&hash_lock); 1331 if (!graph_lock())
1332 return 0;
1294 /* 1333 /*
1295 * We have to walk the chain again locked - to avoid duplicates: 1334 * We have to walk the chain again locked - to avoid duplicates:
1296 */ 1335 */
1297 list_for_each_entry(chain, hash_head, entry) { 1336 list_for_each_entry(chain, hash_head, entry) {
1298 if (chain->chain_key == chain_key) { 1337 if (chain->chain_key == chain_key) {
1299 __raw_spin_unlock(&hash_lock); 1338 graph_unlock();
1300 goto cache_hit; 1339 goto cache_hit;
1301 } 1340 }
1302 } 1341 }
1303 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { 1342 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
1304 __raw_spin_unlock(&hash_lock); 1343 if (!debug_locks_off_graph_unlock())
1305 debug_locks_off(); 1344 return 0;
1345
1306 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 1346 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1307 printk("turning off the locking correctness validator.\n"); 1347 printk("turning off the locking correctness validator.\n");
1308 return 0; 1348 return 0;
@@ -1378,9 +1418,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1378 struct held_lock *this, int forwards, 1418 struct held_lock *this, int forwards,
1379 const char *irqclass) 1419 const char *irqclass)
1380{ 1420{
1381 __raw_spin_unlock(&hash_lock); 1421 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1382 debug_locks_off();
1383 if (debug_locks_silent)
1384 return 0; 1422 return 0;
1385 1423
1386 printk("\n=========================================================\n"); 1424 printk("\n=========================================================\n");
@@ -1450,7 +1488,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1450 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 1488 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
1451} 1489}
1452 1490
1453static inline void print_irqtrace_events(struct task_struct *curr) 1491void print_irqtrace_events(struct task_struct *curr)
1454{ 1492{
1455 printk("irq event stamp: %u\n", curr->irq_events); 1493 printk("irq event stamp: %u\n", curr->irq_events);
1456 printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); 1494 printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
@@ -1463,19 +1501,13 @@ static inline void print_irqtrace_events(struct task_struct *curr)
1463 print_ip_sym(curr->softirq_disable_ip); 1501 print_ip_sym(curr->softirq_disable_ip);
1464} 1502}
1465 1503
1466#else
1467static inline void print_irqtrace_events(struct task_struct *curr)
1468{
1469}
1470#endif 1504#endif
1471 1505
1472static int 1506static int
1473print_usage_bug(struct task_struct *curr, struct held_lock *this, 1507print_usage_bug(struct task_struct *curr, struct held_lock *this,
1474 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 1508 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1475{ 1509{
1476 __raw_spin_unlock(&hash_lock); 1510 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1477 debug_locks_off();
1478 if (debug_locks_silent)
1479 return 0; 1511 return 0;
1480 1512
1481 printk("\n=================================\n"); 1513 printk("\n=================================\n");
@@ -1536,12 +1568,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1536 if (likely(this->class->usage_mask & new_mask)) 1568 if (likely(this->class->usage_mask & new_mask))
1537 return 1; 1569 return 1;
1538 1570
1539 __raw_spin_lock(&hash_lock); 1571 if (!graph_lock())
1572 return 0;
1540 /* 1573 /*
1541 * Make sure we didnt race: 1574 * Make sure we didnt race:
1542 */ 1575 */
1543 if (unlikely(this->class->usage_mask & new_mask)) { 1576 if (unlikely(this->class->usage_mask & new_mask)) {
1544 __raw_spin_unlock(&hash_lock); 1577 graph_unlock();
1545 return 1; 1578 return 1;
1546 } 1579 }
1547 1580
@@ -1727,15 +1760,16 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1727 debug_atomic_dec(&nr_unused_locks); 1760 debug_atomic_dec(&nr_unused_locks);
1728 break; 1761 break;
1729 default: 1762 default:
1730 debug_locks_off(); 1763 if (!debug_locks_off_graph_unlock())
1764 return 0;
1731 WARN_ON(1); 1765 WARN_ON(1);
1732 return 0; 1766 return 0;
1733 } 1767 }
1734 1768
1735 __raw_spin_unlock(&hash_lock); 1769 graph_unlock();
1736 1770
1737 /* 1771 /*
1738 * We must printk outside of the hash_lock: 1772 * We must printk outside of the graph_lock:
1739 */ 1773 */
1740 if (ret == 2) { 1774 if (ret == 2) {
1741 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); 1775 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
@@ -2133,9 +2167,9 @@ out_calc_hash:
2133 * We look up the chain_key and do the O(N^2) check and update of 2167 * We look up the chain_key and do the O(N^2) check and update of
2134 * the dependencies only if this is a new dependency chain. 2168 * the dependencies only if this is a new dependency chain.
2135 * (If lookup_chain_cache() returns with 1 it acquires 2169 * (If lookup_chain_cache() returns with 1 it acquires
2136 * hash_lock for us) 2170 * graph_lock for us)
2137 */ 2171 */
2138 if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { 2172 if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) {
2139 /* 2173 /*
2140 * Check whether last held lock: 2174 * Check whether last held lock:
2141 * 2175 *
@@ -2166,7 +2200,7 @@ out_calc_hash:
2166 if (!chain_head && ret != 2) 2200 if (!chain_head && ret != 2)
2167 if (!check_prevs_add(curr, hlock)) 2201 if (!check_prevs_add(curr, hlock))
2168 return 0; 2202 return 0;
2169 __raw_spin_unlock(&hash_lock); 2203 graph_unlock();
2170 } 2204 }
2171 curr->lockdep_depth++; 2205 curr->lockdep_depth++;
2172 check_chain_key(curr); 2206 check_chain_key(curr);
@@ -2429,6 +2463,7 @@ EXPORT_SYMBOL_GPL(lock_release);
2429void lockdep_reset(void) 2463void lockdep_reset(void)
2430{ 2464{
2431 unsigned long flags; 2465 unsigned long flags;
2466 int i;
2432 2467
2433 raw_local_irq_save(flags); 2468 raw_local_irq_save(flags);
2434 current->curr_chain_key = 0; 2469 current->curr_chain_key = 0;
@@ -2439,6 +2474,8 @@ void lockdep_reset(void)
2439 nr_softirq_chains = 0; 2474 nr_softirq_chains = 0;
2440 nr_process_chains = 0; 2475 nr_process_chains = 0;
2441 debug_locks = 1; 2476 debug_locks = 1;
2477 for (i = 0; i < CHAINHASH_SIZE; i++)
2478 INIT_LIST_HEAD(chainhash_table + i);
2442 raw_local_irq_restore(flags); 2479 raw_local_irq_restore(flags);
2443} 2480}
2444 2481
@@ -2475,7 +2512,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
2475 int i; 2512 int i;
2476 2513
2477 raw_local_irq_save(flags); 2514 raw_local_irq_save(flags);
2478 __raw_spin_lock(&hash_lock); 2515 graph_lock();
2479 2516
2480 /* 2517 /*
2481 * Unhash all classes that were created by this module: 2518 * Unhash all classes that were created by this module:
@@ -2489,7 +2526,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
2489 zap_class(class); 2526 zap_class(class);
2490 } 2527 }
2491 2528
2492 __raw_spin_unlock(&hash_lock); 2529 graph_unlock();
2493 raw_local_irq_restore(flags); 2530 raw_local_irq_restore(flags);
2494} 2531}
2495 2532
@@ -2517,20 +2554,20 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2517 * Debug check: in the end all mapped classes should 2554 * Debug check: in the end all mapped classes should
2518 * be gone. 2555 * be gone.
2519 */ 2556 */
2520 __raw_spin_lock(&hash_lock); 2557 graph_lock();
2521 for (i = 0; i < CLASSHASH_SIZE; i++) { 2558 for (i = 0; i < CLASSHASH_SIZE; i++) {
2522 head = classhash_table + i; 2559 head = classhash_table + i;
2523 if (list_empty(head)) 2560 if (list_empty(head))
2524 continue; 2561 continue;
2525 list_for_each_entry_safe(class, next, head, hash_entry) { 2562 list_for_each_entry_safe(class, next, head, hash_entry) {
2526 if (unlikely(class == lock->class_cache)) { 2563 if (unlikely(class == lock->class_cache)) {
2527 __raw_spin_unlock(&hash_lock); 2564 if (debug_locks_off_graph_unlock())
2528 DEBUG_LOCKS_WARN_ON(1); 2565 WARN_ON(1);
2529 goto out_restore; 2566 goto out_restore;
2530 } 2567 }
2531 } 2568 }
2532 } 2569 }
2533 __raw_spin_unlock(&hash_lock); 2570 graph_unlock();
2534 2571
2535out_restore: 2572out_restore:
2536 raw_local_irq_restore(flags); 2573 raw_local_irq_restore(flags);
@@ -2644,6 +2681,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2644 } 2681 }
2645 local_irq_restore(flags); 2682 local_irq_restore(flags);
2646} 2683}
2684EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
2647 2685
2648static void print_held_locks_bug(struct task_struct *curr) 2686static void print_held_locks_bug(struct task_struct *curr)
2649{ 2687{
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb2..8ce09bc4613d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
20#define MAX_LOCKDEP_KEYS_BITS 11 20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) 21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22 22
23#define MAX_LOCKDEP_CHAINS_BITS 13 23#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 25
26/* 26/*
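For reference, bumping MAX_LOCKDEP_CHAINS_BITS from 13 to 14 doubles the static chain table:

	/* before: MAX_LOCKDEP_CHAINS = 1UL << 13 =  8192 chains
	 * after:  MAX_LOCKDEP_CHAINS = 1UL << 14 = 16384 chains */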
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3fa..b554b40a4aa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
113 return 0; 113 return 0;
114} 114}
115 115
116static struct seq_operations lockdep_ops = { 116static const struct seq_operations lockdep_ops = {
117 .start = l_start, 117 .start = l_start,
118 .next = l_next, 118 .next = l_next,
119 .stop = l_stop, 119 .stop = l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
135 return res; 135 return res;
136} 136}
137 137
138static struct file_operations proc_lockdep_operations = { 138static const struct file_operations proc_lockdep_operations = {
139 .open = lockdep_open, 139 .open = lockdep_open,
140 .read = seq_read, 140 .read = seq_read,
141 .llseek = seq_lseek, 141 .llseek = seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
319 return single_open(file, lockdep_stats_show, NULL); 319 return single_open(file, lockdep_stats_show, NULL);
320} 320}
321 321
322static struct file_operations proc_lockdep_stats_operations = { 322static const struct file_operations proc_lockdep_stats_operations = {
323 .open = lockdep_stats_open, 323 .open = lockdep_stats_open,
324 .read = seq_read, 324 .read = seq_read,
325 .llseek = seq_lseek, 325 .llseek = seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index 5072a943fe35..b565eaeff7e6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -34,10 +34,10 @@
34#include <linux/err.h> 34#include <linux/err.h>
35#include <linux/vermagic.h> 35#include <linux/vermagic.h>
36#include <linux/notifier.h> 36#include <linux/notifier.h>
37#include <linux/sched.h>
37#include <linux/stop_machine.h> 38#include <linux/stop_machine.h>
38#include <linux/device.h> 39#include <linux/device.h>
39#include <linux/string.h> 40#include <linux/string.h>
40#include <linux/sched.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/unwind.h> 42#include <linux/unwind.h>
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -790,6 +790,19 @@ static struct module_attribute refcnt = {
790 .show = show_refcnt, 790 .show = show_refcnt,
791}; 791};
792 792
793void module_put(struct module *module)
794{
795 if (module) {
796 unsigned int cpu = get_cpu();
797 local_dec(&module->ref[cpu].count);
798 /* Maybe they're waiting for us to drop reference? */
799 if (unlikely(!module_is_live(module)))
800 wake_up_process(module->waiter);
801 put_cpu();
802 }
803}
804EXPORT_SYMBOL(module_put);
805
793#else /* !CONFIG_MODULE_UNLOAD */ 806#else /* !CONFIG_MODULE_UNLOAD */
794static void print_unload_info(struct seq_file *m, struct module *mod) 807static void print_unload_info(struct seq_file *m, struct module *mod)
795{ 808{
@@ -811,9 +824,34 @@ static inline void module_unload_init(struct module *mod)
811} 824}
812#endif /* CONFIG_MODULE_UNLOAD */ 825#endif /* CONFIG_MODULE_UNLOAD */
813 826
827static ssize_t show_initstate(struct module_attribute *mattr,
828 struct module *mod, char *buffer)
829{
830 const char *state = "unknown";
831
832 switch (mod->state) {
833 case MODULE_STATE_LIVE:
834 state = "live";
835 break;
836 case MODULE_STATE_COMING:
837 state = "coming";
838 break;
839 case MODULE_STATE_GOING:
840 state = "going";
841 break;
842 }
843 return sprintf(buffer, "%s\n", state);
844}
845
846static struct module_attribute initstate = {
847 .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE },
848 .show = show_initstate,
849};
850
814static struct module_attribute *modinfo_attrs[] = { 851static struct module_attribute *modinfo_attrs[] = {
815 &modinfo_version, 852 &modinfo_version,
816 &modinfo_srcversion, 853 &modinfo_srcversion,
854 &initstate,
817#ifdef CONFIG_MODULE_UNLOAD 855#ifdef CONFIG_MODULE_UNLOAD
818 &refcnt, 856 &refcnt,
819#endif 857#endif
@@ -1086,22 +1124,35 @@ static int mod_sysfs_setup(struct module *mod,
1086 goto out; 1124 goto out;
1087 kobj_set_kset_s(&mod->mkobj, module_subsys); 1125 kobj_set_kset_s(&mod->mkobj, module_subsys);
1088 mod->mkobj.mod = mod; 1126 mod->mkobj.mod = mod;
1089 err = kobject_register(&mod->mkobj.kobj); 1127
1128 /* delay uevent until full sysfs population */
1129 kobject_init(&mod->mkobj.kobj);
1130 err = kobject_add(&mod->mkobj.kobj);
1090 if (err) 1131 if (err)
1091 goto out; 1132 goto out;
1092 1133
1134 mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers");
1135 if (!mod->drivers_dir)
1136 goto out_unreg;
1137
1093 err = module_param_sysfs_setup(mod, kparam, num_params); 1138 err = module_param_sysfs_setup(mod, kparam, num_params);
1094 if (err) 1139 if (err)
1095 goto out_unreg; 1140 goto out_unreg_drivers;
1096 1141
1097 err = module_add_modinfo_attrs(mod); 1142 err = module_add_modinfo_attrs(mod);
1098 if (err) 1143 if (err)
1099 goto out_unreg; 1144 goto out_unreg_param;
1100 1145
1146 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1101 return 0; 1147 return 0;
1102 1148
1149out_unreg_drivers:
1150 kobject_unregister(mod->drivers_dir);
1151out_unreg_param:
1152 module_param_sysfs_remove(mod);
1103out_unreg: 1153out_unreg:
1104 kobject_unregister(&mod->mkobj.kobj); 1154 kobject_del(&mod->mkobj.kobj);
1155 kobject_put(&mod->mkobj.kobj);
1105out: 1156out:
1106 return err; 1157 return err;
1107} 1158}
@@ -1110,6 +1161,7 @@ static void mod_kobject_remove(struct module *mod)
1110{ 1161{
1111 module_remove_modinfo_attrs(mod); 1162 module_remove_modinfo_attrs(mod);
1112 module_param_sysfs_remove(mod); 1163 module_param_sysfs_remove(mod);
1164 kobject_unregister(mod->drivers_dir);
1113 1165
1114 kobject_unregister(&mod->mkobj.kobj); 1166 kobject_unregister(&mod->mkobj.kobj);
1115} 1167}
@@ -1718,7 +1770,7 @@ static struct module *load_module(void __user *umod,
1718 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1770 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1719 1771
1720 if (strcmp(mod->name, "ndiswrapper") == 0) 1772 if (strcmp(mod->name, "ndiswrapper") == 0)
1721 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1773 add_taint(TAINT_PROPRIETARY_MODULE);
1722 if (strcmp(mod->name, "driverloader") == 0) 1774 if (strcmp(mod->name, "driverloader") == 0)
1723 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1775 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1724 1776
@@ -2182,7 +2234,7 @@ static int m_show(struct seq_file *m, void *p)
2182 Where refcount is a number or -, and deps is a comma-separated list 2234 Where refcount is a number or -, and deps is a comma-separated list
2183 of depends or -. 2235 of depends or -.
2184*/ 2236*/
2185struct seq_operations modules_op = { 2237const struct seq_operations modules_op = {
2186 .start = m_start, 2238 .start = m_start,
2187 .next = m_next, 2239 .next = m_next,
2188 .stop = m_stop, 2240 .stop = m_stop,
@@ -2275,11 +2327,14 @@ void print_modules(void)
2275 2327
2276void module_add_driver(struct module *mod, struct device_driver *drv) 2328void module_add_driver(struct module *mod, struct device_driver *drv)
2277{ 2329{
2330 int no_warn;
2331
2278 if (!mod || !drv) 2332 if (!mod || !drv)
2279 return; 2333 return;
2280 2334
2281 /* Don't check return code; this call is idempotent */ 2335 /* Don't check return codes; these calls are idempotent */
2282 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); 2336 no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
2337 no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name);
2283} 2338}
2284EXPORT_SYMBOL(module_add_driver); 2339EXPORT_SYMBOL(module_add_driver);
2285 2340
@@ -2288,6 +2343,8 @@ void module_remove_driver(struct device_driver *drv)
2288 if (!drv) 2343 if (!drv)
2289 return; 2344 return;
2290 sysfs_remove_link(&drv->kobj, "module"); 2345 sysfs_remove_link(&drv->kobj, "module");
2346 if (drv->owner && drv->owner->drivers_dir)
2347 sysfs_remove_link(drv->owner->drivers_dir, drv->name);
2291} 2348}
2292EXPORT_SYMBOL(module_remove_driver); 2349EXPORT_SYMBOL(module_remove_driver);
2293 2350
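The module.c changes above export a new "initstate" attribute and a per-module "drivers" directory under /sys/module/<name>/, and delay the KOBJ_ADD uevent until both are populated. A minimal userspace sketch for reading the new attribute (the helper program and its error handling are illustrative, not part of the patch):

#include <stdio.h>

int main(int argc, char **argv)
{
	char path[256], state[32];
	FILE *f;

	if (argc < 2)
		return 1;
	snprintf(path, sizeof(path), "/sys/module/%s/initstate", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(state, sizeof(state), f))
		printf("%s: %s", argv[1], state);	/* "live", "coming" or "going" */
	fclose(f);
	return 0;
}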
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 18651641a7b5..841539d72c55 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
77 77
78void debug_mutex_unlock(struct mutex *lock) 78void debug_mutex_unlock(struct mutex *lock)
79{ 79{
80 if (unlikely(!debug_locks))
81 return;
82
80 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
81 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 84 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 85 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a497..e7cbbb82765b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
206} 206}
207 207
208EXPORT_SYMBOL_GPL(mutex_lock_nested); 208EXPORT_SYMBOL_GPL(mutex_lock_nested);
209
210int __sched
211mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
212{
213 might_sleep();
214 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
215}
216
217EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
209#endif 218#endif
210 219
211/* 220/*
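mutex_lock_interruptible_nested() gives the interruptible variant the same lockdep subclass annotation that mutex_lock_nested() already has. A hedged usage sketch (struct my_obj and the parent/child locking order are hypothetical):

#include <linux/mutex.h>
#include <linux/lockdep.h>
#include <linux/errno.h>

struct my_obj {
	struct mutex lock;
};

static int lock_parent_and_child(struct my_obj *parent, struct my_obj *child)
{
	int err;

	mutex_lock(&parent->lock);
	/* Same lock class, one level of nesting; may return -EINTR. */
	err = mutex_lock_interruptible_nested(&child->lock,
					      SINGLE_DEPTH_NESTING);
	if (err) {
		mutex_unlock(&parent->lock);
		return err;
	}
	/* ... work on both objects ... */
	mutex_unlock(&child->lock);
	mutex_unlock(&parent->lock);
	return 0;
}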
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 674aceb7335a..f5b9ee6f6bbb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,8 +17,9 @@
17#include <linux/version.h> 17#include <linux/version.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/namespace.h> 20#include <linux/mnt_namespace.h>
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h>
22 23
23struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 24struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
24 25
@@ -60,12 +61,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
60 struct nsproxy *ns = clone_namespaces(orig); 61 struct nsproxy *ns = clone_namespaces(orig);
61 62
62 if (ns) { 63 if (ns) {
63 if (ns->namespace) 64 if (ns->mnt_ns)
64 get_namespace(ns->namespace); 65 get_mnt_ns(ns->mnt_ns);
65 if (ns->uts_ns) 66 if (ns->uts_ns)
66 get_uts_ns(ns->uts_ns); 67 get_uts_ns(ns->uts_ns);
67 if (ns->ipc_ns) 68 if (ns->ipc_ns)
68 get_ipc_ns(ns->ipc_ns); 69 get_ipc_ns(ns->ipc_ns);
70 if (ns->pid_ns)
71 get_pid_ns(ns->pid_ns);
69 } 72 }
70 73
71 return ns; 74 return ns;
@@ -97,7 +100,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
97 100
98 tsk->nsproxy = new_ns; 101 tsk->nsproxy = new_ns;
99 102
100 err = copy_namespace(flags, tsk); 103 err = copy_mnt_ns(flags, tsk);
101 if (err) 104 if (err)
102 goto out_ns; 105 goto out_ns;
103 106
@@ -109,16 +112,23 @@ int copy_namespaces(int flags, struct task_struct *tsk)
109 if (err) 112 if (err)
110 goto out_ipc; 113 goto out_ipc;
111 114
115 err = copy_pid_ns(flags, tsk);
116 if (err)
117 goto out_pid;
118
112out: 119out:
113 put_nsproxy(old_ns); 120 put_nsproxy(old_ns);
114 return err; 121 return err;
115 122
123out_pid:
124 if (new_ns->ipc_ns)
125 put_ipc_ns(new_ns->ipc_ns);
116out_ipc: 126out_ipc:
117 if (new_ns->uts_ns) 127 if (new_ns->uts_ns)
118 put_uts_ns(new_ns->uts_ns); 128 put_uts_ns(new_ns->uts_ns);
119out_uts: 129out_uts:
120 if (new_ns->namespace) 130 if (new_ns->mnt_ns)
121 put_namespace(new_ns->namespace); 131 put_mnt_ns(new_ns->mnt_ns);
122out_ns: 132out_ns:
123 tsk->nsproxy = old_ns; 133 tsk->nsproxy = old_ns;
124 kfree(new_ns); 134 kfree(new_ns);
@@ -127,11 +137,13 @@ out_ns:
127 137
128void free_nsproxy(struct nsproxy *ns) 138void free_nsproxy(struct nsproxy *ns)
129{ 139{
130 if (ns->namespace) 140 if (ns->mnt_ns)
131 put_namespace(ns->namespace); 141 put_mnt_ns(ns->mnt_ns);
132 if (ns->uts_ns) 142 if (ns->uts_ns)
133 put_uts_ns(ns->uts_ns); 143 put_uts_ns(ns->uts_ns);
134 if (ns->ipc_ns) 144 if (ns->ipc_ns)
135 put_ipc_ns(ns->ipc_ns); 145 put_ipc_ns(ns->ipc_ns);
136 kfree(ns); 146 if (ns->pid_ns)
147 put_pid_ns(ns->pid_ns);
148 kfree(ns);
137} 149}
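The new pid_ns handling above leans on get_pid_ns()/put_pid_ns() helpers declared in linux/pid_namespace.h (not shown in this diff). They are assumed to be thin kref wrappers along these lines; note that init_pid_ns starts with a refcount of 2, so it never reaches free_pid_ns():

#include <linux/kref.h>

extern void free_pid_ns(struct kref *kref);	/* added in kernel/pid.c below */

static inline void get_pid_ns(struct pid_namespace *ns)
{
	kref_get(&ns->kref);
}

static inline void put_pid_ns(struct pid_namespace *ns)
{
	kref_put(&ns->kref, free_pid_ns);
}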
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f9..2efe9d8d367b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,12 +26,12 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/hash.h> 28#include <linux/hash.h>
29#include <linux/pspace.h> 29#include <linux/pid_namespace.h>
30 30
31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
32static struct hlist_head *pid_hash; 32static struct hlist_head *pid_hash;
33static int pidhash_shift; 33static int pidhash_shift;
34static kmem_cache_t *pid_cachep; 34static struct kmem_cache *pid_cachep;
35 35
36int pid_max = PID_MAX_DEFAULT; 36int pid_max = PID_MAX_DEFAULT;
37 37
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT;
43#define BITS_PER_PAGE (PAGE_SIZE*8) 43#define BITS_PER_PAGE (PAGE_SIZE*8)
44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
45 45
46static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) 46static inline int mk_pid(struct pid_namespace *pid_ns,
47 struct pidmap *map, int off)
47{ 48{
48 return (map - pspace->pidmap)*BITS_PER_PAGE + off; 49 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
49} 50}
50 51
51#define find_next_offset(map, off) \ 52#define find_next_offset(map, off) \
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
57 * value does not cause lots of bitmaps to be allocated, but 58 * value does not cause lots of bitmaps to be allocated, but
58 * the scheme scales to up to 4 million PIDs, runtime. 59 * the scheme scales to up to 4 million PIDs, runtime.
59 */ 60 */
60struct pspace init_pspace = { 61struct pid_namespace init_pid_ns = {
62 .kref = {
63 .refcount = ATOMIC_INIT(2),
64 },
61 .pidmap = { 65 .pidmap = {
62 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 66 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
63 }, 67 },
64 .last_pid = 0 68 .last_pid = 0,
69 .child_reaper = &init_task
65}; 70};
66 71
67/* 72/*
@@ -80,25 +85,25 @@ struct pspace init_pspace = {
80 85
81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 86static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
82 87
83static fastcall void free_pidmap(struct pspace *pspace, int pid) 88static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
84{ 89{
85 struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; 90 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
86 int offset = pid & BITS_PER_PAGE_MASK; 91 int offset = pid & BITS_PER_PAGE_MASK;
87 92
88 clear_bit(offset, map->page); 93 clear_bit(offset, map->page);
89 atomic_inc(&map->nr_free); 94 atomic_inc(&map->nr_free);
90} 95}
91 96
92static int alloc_pidmap(struct pspace *pspace) 97static int alloc_pidmap(struct pid_namespace *pid_ns)
93{ 98{
94 int i, offset, max_scan, pid, last = pspace->last_pid; 99 int i, offset, max_scan, pid, last = pid_ns->last_pid;
95 struct pidmap *map; 100 struct pidmap *map;
96 101
97 pid = last + 1; 102 pid = last + 1;
98 if (pid >= pid_max) 103 if (pid >= pid_max)
99 pid = RESERVED_PIDS; 104 pid = RESERVED_PIDS;
100 offset = pid & BITS_PER_PAGE_MASK; 105 offset = pid & BITS_PER_PAGE_MASK;
101 map = &pspace->pidmap[pid/BITS_PER_PAGE]; 106 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
102 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 107 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
103 for (i = 0; i <= max_scan; ++i) { 108 for (i = 0; i <= max_scan; ++i) {
104 if (unlikely(!map->page)) { 109 if (unlikely(!map->page)) {
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace)
120 do { 125 do {
121 if (!test_and_set_bit(offset, map->page)) { 126 if (!test_and_set_bit(offset, map->page)) {
122 atomic_dec(&map->nr_free); 127 atomic_dec(&map->nr_free);
123 pspace->last_pid = pid; 128 pid_ns->last_pid = pid;
124 return pid; 129 return pid;
125 } 130 }
126 offset = find_next_offset(map, offset); 131 offset = find_next_offset(map, offset);
127 pid = mk_pid(pspace, map, offset); 132 pid = mk_pid(pid_ns, map, offset);
128 /* 133 /*
129 * find_next_offset() found a bit, the pid from it 134 * find_next_offset() found a bit, the pid from it
130 * is in-bounds, and if we fell back to the last 135 * is in-bounds, and if we fell back to the last
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace)
135 (i != max_scan || pid < last || 140 (i != max_scan || pid < last ||
136 !((last+1) & BITS_PER_PAGE_MASK))); 141 !((last+1) & BITS_PER_PAGE_MASK)));
137 } 142 }
138 if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 143 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
139 ++map; 144 ++map;
140 offset = 0; 145 offset = 0;
141 } else { 146 } else {
142 map = &pspace->pidmap[0]; 147 map = &pid_ns->pidmap[0];
143 offset = RESERVED_PIDS; 148 offset = RESERVED_PIDS;
144 if (unlikely(last == offset)) 149 if (unlikely(last == offset))
145 break; 150 break;
146 } 151 }
147 pid = mk_pid(pspace, map, offset); 152 pid = mk_pid(pid_ns, map, offset);
148 } 153 }
149 return -1; 154 return -1;
150} 155}
151 156
152static int next_pidmap(struct pspace *pspace, int last) 157static int next_pidmap(struct pid_namespace *pid_ns, int last)
153{ 158{
154 int offset; 159 int offset;
155 struct pidmap *map, *end; 160 struct pidmap *map, *end;
156 161
157 offset = (last + 1) & BITS_PER_PAGE_MASK; 162 offset = (last + 1) & BITS_PER_PAGE_MASK;
158 map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; 163 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
159 end = &pspace->pidmap[PIDMAP_ENTRIES]; 164 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
160 for (; map < end; map++, offset = 0) { 165 for (; map < end; map++, offset = 0) {
161 if (unlikely(!map->page)) 166 if (unlikely(!map->page))
162 continue; 167 continue;
163 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); 168 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
164 if (offset < BITS_PER_PAGE) 169 if (offset < BITS_PER_PAGE)
165 return mk_pid(pspace, map, offset); 170 return mk_pid(pid_ns, map, offset);
166 } 171 }
167 return -1; 172 return -1;
168} 173}
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid)
192 hlist_del_rcu(&pid->pid_chain); 197 hlist_del_rcu(&pid->pid_chain);
193 spin_unlock_irqrestore(&pidmap_lock, flags); 198 spin_unlock_irqrestore(&pidmap_lock, flags);
194 199
195 free_pidmap(&init_pspace, pid->nr); 200 free_pidmap(current->nsproxy->pid_ns, pid->nr);
196 call_rcu(&pid->rcu, delayed_put_pid); 201 call_rcu(&pid->rcu, delayed_put_pid);
197} 202}
198 203
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void)
206 if (!pid) 211 if (!pid)
207 goto out; 212 goto out;
208 213
209 nr = alloc_pidmap(&init_pspace); 214 nr = alloc_pidmap(current->nsproxy->pid_ns);
210 if (nr < 0) 215 if (nr < 0)
211 goto out_free; 216 goto out_free;
212 217
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr)
348 pid = find_pid(nr); 353 pid = find_pid(nr);
349 if (pid) 354 if (pid)
350 break; 355 break;
351 nr = next_pidmap(&init_pspace, nr); 356 nr = next_pidmap(current->nsproxy->pid_ns, nr);
352 } while (nr > 0); 357 } while (nr > 0);
353 358
354 return pid; 359 return pid;
355} 360}
356EXPORT_SYMBOL_GPL(find_get_pid); 361EXPORT_SYMBOL_GPL(find_get_pid);
357 362
363int copy_pid_ns(int flags, struct task_struct *tsk)
364{
365 struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
366 int err = 0;
367
368 if (!old_ns)
369 return 0;
370
371 get_pid_ns(old_ns);
372 return err;
373}
374
375void free_pid_ns(struct kref *kref)
376{
377 struct pid_namespace *ns;
378
379 ns = container_of(kref, struct pid_namespace, kref);
380 kfree(ns);
381}
382
358/* 383/*
359 * The pid hash table is scaled according to the amount of memory in the 384 * The pid hash table is scaled according to the amount of memory in the
360 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 385 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -382,10 +407,10 @@ void __init pidhash_init(void)
382 407
383void __init pidmap_init(void) 408void __init pidmap_init(void)
384{ 409{
385 init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 410 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
386 /* Reserve PID 0. We never call free_pidmap(0) */ 411 /* Reserve PID 0. We never call free_pidmap(0) */
387 set_bit(0, init_pspace.pidmap[0].page); 412 set_bit(0, init_pid_ns.pidmap[0].page);
388 atomic_dec(&init_pspace.pidmap[0].nr_free); 413 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
389 414
390 pid_cachep = kmem_cache_create("pid", sizeof(struct pid), 415 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
391 __alignof__(struct pid), 416 __alignof__(struct pid),
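For reference, the mk_pid() arithmetic used throughout the pid.c hunks, worked through for a 4 KiB page size:

	/* pid 40000 with PAGE_SIZE = 4096:
	 *   BITS_PER_PAGE         = 4096 * 8             = 32768
	 *   map                   = &pidmap[40000/32768] = &pidmap[1]
	 *   offset                = 40000 & 32767        = 7232
	 *   mk_pid(ns, map, 7232) = 1 * 32768 + 7232     = 40000
	 */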
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06f..5fe87de10ff0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
70/* 70/*
71 * Lets keep our timers in a slab cache :-) 71 * Lets keep our timers in a slab cache :-)
72 */ 72 */
73static kmem_cache_t *posix_timers_cache; 73static struct kmem_cache *posix_timers_cache;
74static struct idr posix_timers_id; 74static struct idr posix_timers_id;
75static DEFINE_SPINLOCK(idr_lock); 75static DEFINE_SPINLOCK(idr_lock);
76 76
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca3479..ed296225dcd4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -20,13 +20,14 @@ config PM
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY 22config PM_LEGACY
23 bool "Legacy Power Management API" 23 bool "Legacy Power Management API (DEPRECATED)"
24 depends on PM 24 depends on PM
25 default y 25 default n
26 ---help--- 26 ---help---
27 Support for pm_register() and friends. 27 Support for pm_register() and friends. This old API is obsoleted
28 by the driver model.
28 29
29 If unsure, say Y. 30 If unsure, say N.
30 31
31config PM_DEBUG 32config PM_DEBUG
32 bool "Power Management Debug Support" 33 bool "Power Management Debug Support"
@@ -78,7 +79,7 @@ config PM_SYSFS_DEPRECATED
78 79
79config SOFTWARE_SUSPEND 80config SOFTWARE_SUSPEND
80 bool "Software Suspend" 81 bool "Software Suspend"
81 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) 82 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
82 ---help--- 83 ---help---
83 Enable the possibility of suspending the machine. 84 Enable the possibility of suspending the machine.
84 It doesn't need ACPI or APM. 85 It doesn't need ACPI or APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index ae6bbc903b7d..88fc5d7ac737 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -27,6 +28,23 @@
27static int noresume = 0; 28static int noresume = 0;
28char resume_file[256] = CONFIG_PM_STD_PARTITION; 29char resume_file[256] = CONFIG_PM_STD_PARTITION;
29dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block;
32
33/**
34 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails
36 */
37
38static inline int platform_prepare(void)
39{
40 int error = 0;
41
42 if (pm_disk_mode == PM_DISK_PLATFORM) {
43 if (pm_ops && pm_ops->prepare)
44 error = pm_ops->prepare(PM_SUSPEND_DISK);
45 }
46 return error;
47}
30 48
31/** 49/**
32 * power_down - Shut machine down for hibernate. 50 * power_down - Shut machine down for hibernate.
@@ -40,13 +58,11 @@ dev_t swsusp_resume_device;
40 58
41static void power_down(suspend_disk_method_t mode) 59static void power_down(suspend_disk_method_t mode)
42{ 60{
43 int error = 0;
44
45 switch(mode) { 61 switch(mode) {
46 case PM_DISK_PLATFORM: 62 case PM_DISK_PLATFORM:
47 if (pm_ops && pm_ops->enter) { 63 if (pm_ops && pm_ops->enter) {
48 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 64 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
49 error = pm_ops->enter(PM_SUSPEND_DISK); 65 pm_ops->enter(PM_SUSPEND_DISK);
50 break; 66 break;
51 } 67 }
52 case PM_DISK_SHUTDOWN: 68 case PM_DISK_SHUTDOWN:
@@ -73,7 +89,7 @@ static inline void platform_finish(void)
73 89
74static int prepare_processes(void) 90static int prepare_processes(void)
75{ 91{
76 int error; 92 int error = 0;
77 93
78 pm_prepare_console(); 94 pm_prepare_console();
79 95
@@ -86,12 +102,24 @@ static int prepare_processes(void)
86 goto thaw; 102 goto thaw;
87 } 103 }
88 104
105 if (pm_disk_mode == PM_DISK_TESTPROC) {
106 printk("swsusp debug: Waiting for 5 seconds.\n");
107 mdelay(5000);
108 goto thaw;
109 }
110
111 error = platform_prepare();
112 if (error)
113 goto thaw;
114
89 /* Free memory before shutting down devices. */ 115 /* Free memory before shutting down devices. */
90 if (!(error = swsusp_shrink_memory())) 116 if (!(error = swsusp_shrink_memory()))
91 return 0; 117 return 0;
92thaw: 118
119 platform_finish();
120 thaw:
93 thaw_processes(); 121 thaw_processes();
94enable_cpus: 122 enable_cpus:
95 enable_nonboot_cpus(); 123 enable_nonboot_cpus();
96 pm_restore_console(); 124 pm_restore_console();
97 return error; 125 return error;
@@ -122,13 +150,21 @@ int pm_suspend_disk(void)
122 if (error) 150 if (error)
123 return error; 151 return error;
124 152
153 if (pm_disk_mode == PM_DISK_TESTPROC)
154 return 0;
155
125 suspend_console(); 156 suspend_console();
126 error = device_suspend(PMSG_FREEZE); 157 error = device_suspend(PMSG_FREEZE);
127 if (error) { 158 if (error) {
128 resume_console(); 159 resume_console();
129 printk("Some devices failed to suspend\n"); 160 printk("Some devices failed to suspend\n");
130 unprepare_processes(); 161 goto Thaw;
131 return error; 162 }
163
164 if (pm_disk_mode == PM_DISK_TEST) {
165 printk("swsusp debug: Waiting for 5 seconds.\n");
166 mdelay(5000);
167 goto Done;
132 } 168 }
133 169
134 pr_debug("PM: snapshotting memory.\n"); 170 pr_debug("PM: snapshotting memory.\n");
@@ -145,16 +181,17 @@ int pm_suspend_disk(void)
145 power_down(pm_disk_mode); 181 power_down(pm_disk_mode);
146 else { 182 else {
147 swsusp_free(); 183 swsusp_free();
148 unprepare_processes(); 184 goto Thaw;
149 return error;
150 } 185 }
151 } else 186 } else {
152 pr_debug("PM: Image restored successfully.\n"); 187 pr_debug("PM: Image restored successfully.\n");
188 }
153 189
154 swsusp_free(); 190 swsusp_free();
155 Done: 191 Done:
156 device_resume(); 192 device_resume();
157 resume_console(); 193 resume_console();
194 Thaw:
158 unprepare_processes(); 195 unprepare_processes();
159 return error; 196 return error;
160} 197}
@@ -176,10 +213,10 @@ static int software_resume(void)
176{ 213{
177 int error; 214 int error;
178 215
179 down(&pm_sem); 216 mutex_lock(&pm_mutex);
180 if (!swsusp_resume_device) { 217 if (!swsusp_resume_device) {
181 if (!strlen(resume_file)) { 218 if (!strlen(resume_file)) {
182 up(&pm_sem); 219 mutex_unlock(&pm_mutex);
183 return -ENOENT; 220 return -ENOENT;
184 } 221 }
185 swsusp_resume_device = name_to_dev_t(resume_file); 222 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -194,7 +231,7 @@ static int software_resume(void)
194 * FIXME: If noresume is specified, we need to find the partition 231 * FIXME: If noresume is specified, we need to find the partition
195 * and reset it back to normal swap space. 232 * and reset it back to normal swap space.
196 */ 233 */
197 up(&pm_sem); 234 mutex_unlock(&pm_mutex);
198 return 0; 235 return 0;
199 } 236 }
200 237
@@ -238,7 +275,7 @@ static int software_resume(void)
238 unprepare_processes(); 275 unprepare_processes();
239 Done: 276 Done:
240 /* For success case, the suspend path will release the lock */ 277 /* For success case, the suspend path will release the lock */
241 up(&pm_sem); 278 mutex_unlock(&pm_mutex);
242 pr_debug("PM: Resume from disk failed.\n"); 279 pr_debug("PM: Resume from disk failed.\n");
243 return 0; 280 return 0;
244} 281}
@@ -251,6 +288,8 @@ static const char * const pm_disk_modes[] = {
251 [PM_DISK_PLATFORM] = "platform", 288 [PM_DISK_PLATFORM] = "platform",
252 [PM_DISK_SHUTDOWN] = "shutdown", 289 [PM_DISK_SHUTDOWN] = "shutdown",
253 [PM_DISK_REBOOT] = "reboot", 290 [PM_DISK_REBOOT] = "reboot",
291 [PM_DISK_TEST] = "test",
292 [PM_DISK_TESTPROC] = "testproc",
254}; 293};
255 294
256/** 295/**
@@ -297,7 +336,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
297 p = memchr(buf, '\n', n); 336 p = memchr(buf, '\n', n);
298 len = p ? p - buf : n; 337 len = p ? p - buf : n;
299 338
300 down(&pm_sem); 339 mutex_lock(&pm_mutex);
301 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { 340 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
302 if (!strncmp(buf, pm_disk_modes[i], len)) { 341 if (!strncmp(buf, pm_disk_modes[i], len)) {
303 mode = i; 342 mode = i;
@@ -305,21 +344,23 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
305 } 344 }
306 } 345 }
307 if (mode) { 346 if (mode) {
308 if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) 347 if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
348 mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
309 pm_disk_mode = mode; 349 pm_disk_mode = mode;
310 else { 350 } else {
311 if (pm_ops && pm_ops->enter && 351 if (pm_ops && pm_ops->enter &&
312 (mode == pm_ops->pm_disk_mode)) 352 (mode == pm_ops->pm_disk_mode))
313 pm_disk_mode = mode; 353 pm_disk_mode = mode;
314 else 354 else
315 error = -EINVAL; 355 error = -EINVAL;
316 } 356 }
317 } else 357 } else {
318 error = -EINVAL; 358 error = -EINVAL;
359 }
319 360
320 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 361 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
321 pm_disk_modes[mode]); 362 pm_disk_modes[mode]);
322 up(&pm_sem); 363 mutex_unlock(&pm_mutex);
323 return error ? error : n; 364 return error ? error : n;
324} 365}
325 366
@@ -344,14 +385,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
344 if (maj != MAJOR(res) || min != MINOR(res)) 385 if (maj != MAJOR(res) || min != MINOR(res))
345 goto out; 386 goto out;
346 387
347 down(&pm_sem); 388 mutex_lock(&pm_mutex);
348 swsusp_resume_device = res; 389 swsusp_resume_device = res;
349 up(&pm_sem); 390 mutex_unlock(&pm_mutex);
350 printk("Attempting manual resume\n"); 391 printk("Attempting manual resume\n");
351 noresume = 0; 392 noresume = 0;
352 software_resume(); 393 software_resume();
353 ret = n; 394 ret = n;
354out: 395 out:
355 return ret; 396 return ret;
356} 397}
357 398
@@ -406,6 +447,19 @@ static int __init resume_setup(char *str)
406 return 1; 447 return 1;
407} 448}
408 449
450static int __init resume_offset_setup(char *str)
451{
452 unsigned long long offset;
453
454 if (noresume)
455 return 1;
456
457 if (sscanf(str, "%llu", &offset) == 1)
458 swsusp_resume_block = offset;
459
460 return 1;
461}
462
409static int __init noresume_setup(char *str) 463static int __init noresume_setup(char *str)
410{ 464{
411 noresume = 1; 465 noresume = 1;
@@ -413,4 +467,5 @@ static int __init noresume_setup(char *str)
413} 467}
414 468
415__setup("noresume", noresume_setup); 469__setup("noresume", noresume_setup);
470__setup("resume_offset=", resume_offset_setup);
416__setup("resume=", resume_setup); 471__setup("resume=", resume_setup);
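The disk.c changes add two debugging targets, "test" (freeze tasks and suspend devices, wait five seconds, resume) and "testproc" (freeze tasks only), plus a resume_offset= boot parameter (e.g. booting with resume=<swap device> resume_offset=<offset>) for images stored at an offset inside a swap file. A hypothetical userspace sketch that exercises the "test" mode; it needs root and CONFIG_SOFTWARE_SUSPEND, and uses the standard sysfs files:

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	if (write_str("/sys/power/disk", "test\n"))	/* select the new test mode */
		return 1;
	return write_str("/sys/power/state", "disk\n") ? 1 : 0;
}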
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1210961a5aa7..ff3a6182f5f0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
11#include <linux/suspend.h> 12#include <linux/suspend.h>
12#include <linux/kobject.h> 13#include <linux/kobject.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -18,13 +19,14 @@
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/resume-trace.h> 21#include <linux/resume-trace.h>
22#include <linux/freezer.h>
21 23
22#include "power.h" 24#include "power.h"
23 25
24/*This is just an arbitrary number */ 26/*This is just an arbitrary number */
25#define FREE_PAGE_NUMBER (100) 27#define FREE_PAGE_NUMBER (100)
26 28
27DECLARE_MUTEX(pm_sem); 29DEFINE_MUTEX(pm_mutex);
28 30
29struct pm_ops *pm_ops; 31struct pm_ops *pm_ops;
30suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM; 32suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
36 38
37void pm_set_ops(struct pm_ops * ops) 39void pm_set_ops(struct pm_ops * ops)
38{ 40{
39 down(&pm_sem); 41 mutex_lock(&pm_mutex);
40 pm_ops = ops; 42 pm_ops = ops;
41 up(&pm_sem); 43 mutex_unlock(&pm_mutex);
42} 44}
43 45
44 46
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state)
182 184
183 if (!valid_state(state)) 185 if (!valid_state(state))
184 return -ENODEV; 186 return -ENODEV;
185 if (down_trylock(&pm_sem)) 187 if (!mutex_trylock(&pm_mutex))
186 return -EBUSY; 188 return -EBUSY;
187 189
188 if (state == PM_SUSPEND_DISK) { 190 if (state == PM_SUSPEND_DISK) {
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state)
200 pr_debug("PM: Finishing wakeup.\n"); 202 pr_debug("PM: Finishing wakeup.\n");
201 suspend_finish(state); 203 suspend_finish(state);
202 Unlock: 204 Unlock:
203 up(&pm_sem); 205 mutex_unlock(&pm_mutex);
204 return error; 206 return error;
205} 207}
206 208
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state)
229 return -EINVAL; 231 return -EINVAL;
230} 232}
231 233
232 234EXPORT_SYMBOL(pm_suspend);
233 235
234decl_subsys(power,NULL,NULL); 236decl_subsys(power,NULL,NULL);
235 237
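main.c converts the pm_sem semaphore into pm_mutex. The only subtle part of such a conversion is the trylock path, because down_trylock() returns nonzero on failure while mutex_trylock() returns nonzero on success; a generic sketch of the pattern (names are made up):

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(example_mutex);		/* was: DECLARE_MUTEX(example_sem) */

static int example_enter(void)
{
	if (!mutex_trylock(&example_mutex))	/* was: if (down_trylock(&example_sem)) */
		return -EBUSY;
	/* ... critical section ... */
	mutex_unlock(&example_mutex);		/* was: up(&example_sem) */
	return 0;
}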
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b272..eb461b816bf4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
22 return -EPERM; 22 return -EPERM;
23} 23}
24#endif 24#endif
25extern struct semaphore pm_sem; 25
26extern struct mutex pm_mutex;
27
26#define power_attr(_name) \ 28#define power_attr(_name) \
27static struct subsys_attribute _name##_attr = { \ 29static struct subsys_attribute _name##_attr = { \
28 .attr = { \ 30 .attr = { \
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end;
42extern unsigned long image_size; 44extern unsigned long image_size;
43extern int in_suspend; 45extern int in_suspend;
44extern dev_t swsusp_resume_device; 46extern dev_t swsusp_resume_device;
47extern sector_t swsusp_resume_block;
45 48
46extern asmlinkage int swsusp_arch_suspend(void); 49extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 50extern asmlinkage int swsusp_arch_resume(void);
@@ -102,8 +105,18 @@ struct snapshot_handle {
102extern unsigned int snapshot_additional_pages(struct zone *zone); 105extern unsigned int snapshot_additional_pages(struct zone *zone);
103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 106extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 107extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
108extern void snapshot_write_finalize(struct snapshot_handle *handle);
105extern int snapshot_image_loaded(struct snapshot_handle *handle); 109extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle); 110
111/*
112 * This structure is used to pass the values needed for the identification
113 * of the resume swap area from a user space to the kernel via the
114 * SNAPSHOT_SET_SWAP_AREA ioctl
115 */
116struct resume_swap_area {
117 loff_t offset;
118 u_int32_t dev;
119} __attribute__((packed));
107 120
108#define SNAPSHOT_IOC_MAGIC '3' 121#define SNAPSHOT_IOC_MAGIC '3'
109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 122#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
117#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) 130#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
118#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) 131#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
119#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) 132#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
120#define SNAPSHOT_IOC_MAXNR 11 133#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
134#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
135 struct resume_swap_area)
136#define SNAPSHOT_IOC_MAXNR 13
137
138#define PMOPS_PREPARE 1
139#define PMOPS_ENTER 2
140#define PMOPS_FINISH 3
121 141
122/** 142/**
123 * The bitmap is used for tracing allocated swap pages 143 * The bitmap is used for tracing allocated swap pages
@@ -141,7 +161,7 @@ struct bitmap_page {
141 161
142extern void free_bitmap(struct bitmap_page *bitmap); 162extern void free_bitmap(struct bitmap_page *bitmap);
143extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); 163extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
144extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); 164extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
145extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); 165extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
146 166
147extern int swsusp_check(void); 167extern int swsusp_check(void);
@@ -153,3 +173,7 @@ extern int swsusp_read(void);
153extern int swsusp_write(void); 173extern int swsusp_write(void);
154extern void swsusp_close(void); 174extern void swsusp_close(void);
155extern int suspend_enter(suspend_state_t state); 175extern int suspend_enter(suspend_state_t state);
176
177struct timeval;
178extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 unsigned int, char *);
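The new SNAPSHOT_SET_SWAP_AREA ioctl lets a userspace suspend tool hand the kernel both the swap device and an offset inside it, instead of a device number alone. A hypothetical userspace sketch (the device path is an example, and real tools such as uswsusp are more careful about how the device number is encoded):

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

struct resume_swap_area {
	loff_t offset;
	u_int32_t dev;
} __attribute__((packed));

#define SNAPSHOT_IOC_MAGIC	'3'
#define SNAPSHOT_SET_SWAP_AREA	_IOW(SNAPSHOT_IOC_MAGIC, 13, struct resume_swap_area)

int main(void)
{
	struct resume_swap_area swap = { .offset = 0 };	/* 0 = start of partition */
	struct stat st;
	int fd;

	if (stat("/dev/sda2", &st))		/* example swap partition */
		return 1;
	swap.dev = st.st_rdev;			/* fine for small major/minor numbers */

	fd = open("/dev/snapshot", O_RDONLY);
	if (fd < 0)
		return 1;
	if (ioctl(fd, SNAPSHOT_SET_SWAP_AREA, &swap))
		perror("SNAPSHOT_SET_SWAP_AREA");
	close(fd);
	return 0;
}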
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac3164..678ec736076b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
16 * callback we use. 16 * callback we use.
17 */ 17 */
18 18
19static void do_poweroff(void *dummy) 19static void do_poweroff(struct work_struct *dummy)
20{ 20{
21 kernel_power_off(); 21 kernel_power_off();
22} 22}
23 23
24static DECLARE_WORK(poweroff_work, do_poweroff, NULL); 24static DECLARE_WORK(poweroff_work, do_poweroff);
25 25
26static void handle_poweroff(int key, struct tty_struct *tty) 26static void handle_poweroff(int key, struct tty_struct *tty)
27{ 27{
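do_poweroff() is adjusted to the reworked workqueue API: handlers now receive the work_struct itself rather than a void *data cookie, and DECLARE_WORK()/INIT_WORK() lose their third argument. Handlers that need per-instance data recover it with container_of(), roughly like this (struct my_dev is hypothetical):

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_dev {
	int event_count;
	struct work_struct work;
};

static void my_dev_work(struct work_struct *work)
{
	struct my_dev *dev = container_of(work, struct my_dev, work);

	dev->event_count++;		/* example: touch the owning object */
}

static void my_dev_init(struct my_dev *dev)
{
	INIT_WORK(&dev->work, my_dev_work);	/* no 'data' argument any more */
}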
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e6..6d566bf7085c 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,20 +13,22 @@
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h>
16 17
17/* 18/*
18 * Timeout for stopping processes 19 * Timeout for stopping processes
19 */ 20 */
20#define TIMEOUT (20 * HZ) 21#define TIMEOUT (20 * HZ)
21 22
23#define FREEZER_KERNEL_THREADS 0
24#define FREEZER_USER_SPACE 1
22 25
23static inline int freezeable(struct task_struct * p) 26static inline int freezeable(struct task_struct * p)
24{ 27{
25 if ((p == current) || 28 if ((p == current) ||
26 (p->flags & PF_NOFREEZE) || 29 (p->flags & PF_NOFREEZE) ||
27 (p->exit_state == EXIT_ZOMBIE) || 30 (p->exit_state == EXIT_ZOMBIE) ||
28 (p->exit_state == EXIT_DEAD) || 31 (p->exit_state == EXIT_DEAD))
29 (p->state == TASK_STOPPED))
30 return 0; 32 return 0;
31 return 1; 33 return 1;
32} 34}
@@ -39,7 +41,6 @@ void refrigerator(void)
39 long save; 41 long save;
40 save = current->state; 42 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 43 pr_debug("%s entered refrigerator\n", current->comm);
42 printk("=");
43 44
44 frozen_process(current); 45 frozen_process(current);
45 spin_lock_irq(&current->sighand->siglock); 46 spin_lock_irq(&current->sighand->siglock);
@@ -59,10 +60,16 @@ static inline void freeze_process(struct task_struct *p)
59 unsigned long flags; 60 unsigned long flags;
60 61
61 if (!freezing(p)) { 62 if (!freezing(p)) {
62 freeze(p); 63 rmb();
63 spin_lock_irqsave(&p->sighand->siglock, flags); 64 if (!frozen(p)) {
64 signal_wake_up(p, 0); 65 if (p->state == TASK_STOPPED)
65 spin_unlock_irqrestore(&p->sighand->siglock, flags); 66 force_sig_specific(SIGSTOP, p);
67
68 freeze(p);
69 spin_lock_irqsave(&p->sighand->siglock, flags);
70 signal_wake_up(p, p->state == TASK_STOPPED);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags);
72 }
66 } 73 }
67} 74}
68 75
@@ -79,96 +86,134 @@ static void cancel_freezing(struct task_struct *p)
79 } 86 }
80} 87}
81 88
82/* 0 = success, else # of processes that we failed to stop */ 89static inline int is_user_space(struct task_struct *p)
83int freeze_processes(void) 90{
91 return p->mm && !(p->flags & PF_BORROWED_MM);
92}
93
94static unsigned int try_to_freeze_tasks(int freeze_user_space)
84{ 95{
85 int todo, nr_user, user_frozen;
86 unsigned long start_time;
87 struct task_struct *g, *p; 96 struct task_struct *g, *p;
97 unsigned long end_time;
98 unsigned int todo;
88 99
89 printk( "Stopping tasks: " ); 100 end_time = jiffies + TIMEOUT;
90 start_time = jiffies;
91 user_frozen = 0;
92 do { 101 do {
93 nr_user = todo = 0; 102 todo = 0;
94 read_lock(&tasklist_lock); 103 read_lock(&tasklist_lock);
95 do_each_thread(g, p) { 104 do_each_thread(g, p) {
96 if (!freezeable(p)) 105 if (!freezeable(p))
97 continue; 106 continue;
107
98 if (frozen(p)) 108 if (frozen(p))
99 continue; 109 continue;
110
100 if (p->state == TASK_TRACED && frozen(p->parent)) { 111 if (p->state == TASK_TRACED && frozen(p->parent)) {
101 cancel_freezing(p); 112 cancel_freezing(p);
102 continue; 113 continue;
103 } 114 }
104 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 115 if (is_user_space(p)) {
105 /* The task is a user-space one. 116 if (!freeze_user_space)
106 * Freeze it unless there's a vfork completion 117 continue;
107 * pending 118
119 /* Freeze the task unless there is a vfork
120 * completion pending
108 */ 121 */
109 if (!p->vfork_done) 122 if (!p->vfork_done)
110 freeze_process(p); 123 freeze_process(p);
111 nr_user++;
112 } else { 124 } else {
113 /* Freeze only if the user space is frozen */ 125 if (freeze_user_space)
114 if (user_frozen) 126 continue;
115 freeze_process(p); 127
116 todo++; 128 freeze_process(p);
117 } 129 }
130 todo++;
118 } while_each_thread(g, p); 131 } while_each_thread(g, p);
119 read_unlock(&tasklist_lock); 132 read_unlock(&tasklist_lock);
120 todo += nr_user;
121 if (!user_frozen && !nr_user) {
122 sys_sync();
123 start_time = jiffies;
124 }
125 user_frozen = !nr_user;
126 yield(); /* Yield is okay here */ 133 yield(); /* Yield is okay here */
127 if (todo && time_after(jiffies, start_time + TIMEOUT)) 134 if (todo && time_after(jiffies, end_time))
128 break; 135 break;
129 } while(todo); 136 } while (todo);
130 137
131 /* This does not unfreeze processes that are already frozen
132 * (we have slightly ugly calling convention in that respect,
133 * and caller must call thaw_processes() if something fails),
134 * but it cleans up leftover PF_FREEZE requests.
135 */
136 if (todo) { 138 if (todo) {
137 printk( "\n" ); 139 /* This does not unfreeze processes that are already frozen
138 printk(KERN_ERR " stopping tasks timed out " 140 * (we have slightly ugly calling convention in that respect,
139 "after %d seconds (%d tasks remaining):\n", 141 * and caller must call thaw_processes() if something fails),
140 TIMEOUT / HZ, todo); 142 * but it cleans up leftover PF_FREEZE requests.
143 */
144 printk("\n");
145 printk(KERN_ERR "Stopping %s timed out after %d seconds "
146 "(%d tasks refusing to freeze):\n",
147 freeze_user_space ? "user space processes" :
148 "kernel threads",
149 TIMEOUT / HZ, todo);
141 read_lock(&tasklist_lock); 150 read_lock(&tasklist_lock);
142 do_each_thread(g, p) { 151 do_each_thread(g, p) {
152 if (is_user_space(p) == !freeze_user_space)
153 continue;
154
143 if (freezeable(p) && !frozen(p)) 155 if (freezeable(p) && !frozen(p))
144 printk(KERN_ERR " %s\n", p->comm); 156 printk(KERN_ERR " %s\n", p->comm);
157
145 cancel_freezing(p); 158 cancel_freezing(p);
146 } while_each_thread(g, p); 159 } while_each_thread(g, p);
147 read_unlock(&tasklist_lock); 160 read_unlock(&tasklist_lock);
148 return todo;
149 } 161 }
150 162
151 printk( "|\n" ); 163 return todo;
164}
165
166/**
167 * freeze_processes - tell processes to enter the refrigerator
168 *
169 * Returns 0 on success, or the number of processes that didn't freeze,
170 * although they were told to.
171 */
172int freeze_processes(void)
173{
174 unsigned int nr_unfrozen;
175
176 printk("Stopping tasks ... ");
177 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
178 if (nr_unfrozen)
179 return nr_unfrozen;
180
181 sys_sync();
182 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
183 if (nr_unfrozen)
184 return nr_unfrozen;
185
186 printk("done.\n");
152 BUG_ON(in_atomic()); 187 BUG_ON(in_atomic());
153 return 0; 188 return 0;
154} 189}
155 190
156void thaw_processes(void) 191static void thaw_tasks(int thaw_user_space)
157{ 192{
158 struct task_struct *g, *p; 193 struct task_struct *g, *p;
159 194
160 printk( "Restarting tasks..." );
161 read_lock(&tasklist_lock); 195 read_lock(&tasklist_lock);
162 do_each_thread(g, p) { 196 do_each_thread(g, p) {
163 if (!freezeable(p)) 197 if (!freezeable(p))
164 continue; 198 continue;
199
200 if (is_user_space(p) == !thaw_user_space)
201 continue;
202
165 if (!thaw_process(p)) 203 if (!thaw_process(p))
166 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 204 printk(KERN_WARNING " Strange, %s not stopped\n",
205 p->comm );
167 } while_each_thread(g, p); 206 } while_each_thread(g, p);
168
169 read_unlock(&tasklist_lock); 207 read_unlock(&tasklist_lock);
208}
209
210void thaw_processes(void)
211{
212 printk("Restarting tasks ... ");
213 thaw_tasks(FREEZER_KERNEL_THREADS);
214 thaw_tasks(FREEZER_USER_SPACE);
170 schedule(); 215 schedule();
171 printk( " done\n" ); 216 printk("done.\n");
172} 217}
173 218
174EXPORT_SYMBOL(refrigerator); 219EXPORT_SYMBOL(refrigerator);
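The refrigerator() exported here is what freezable kernel threads fall into once try_to_freeze() notices the freeze request set by freeze_processes() above. A minimal, illustrative loop for such a thread might look like the sketch below (not part of this patch; the header holding the freezer helpers differs between kernel versions, and example_freezable_thread is a made-up name):

        #include <linux/sched.h>
        #include <linux/kthread.h>
        #include <linux/freezer.h>      /* on older trees the freezer helpers live in <linux/sched.h> */

        static int example_freezable_thread(void *unused)
        {
                /* On kernels where threads are not freezable by default, a call
                 * to set_freezable() would also be needed here. */
                while (!kthread_should_stop()) {
                        try_to_freeze();        /* parks in refrigerator() while freezing(current) is set */
                        /* ... one unit of work ... */
                        schedule_timeout_interruptible(HZ);
                }
                return 0;
        }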
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d6..c024606221c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
1/* 1/*
2 * linux/kernel/power/snapshot.c 2 * linux/kernel/power/snapshot.c
3 * 3 *
4 * This file provide system snapshot/restore functionality. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 * 8 *
8 * This file is released under the GPLv2, and is based on swsusp.c. 9 * This file is released under the GPLv2.
9 * 10 *
10 */ 11 */
11 12
12
13#include <linux/version.h> 13#include <linux/version.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
@@ -34,137 +34,24 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37/* List of PBEs used for creating and restoring the suspend image */ 37/* List of PBEs needed for restoring the pages that were allocated before
38 * the suspend and included in the suspend image, but have also been
39 * allocated by the "resume" kernel, so their contents cannot be written
40 * directly to their "original" page frames.
41 */
38struct pbe *restore_pblist; 42struct pbe *restore_pblist;
39 43
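The restore_pblist declared above is consumed at the very end of resume, when the low-level (largely architecture-specific) code copies every image page back into its original frame. A rough sketch of that final walk, assuming (as the later hunks suggest) that pbe->address and pbe->orig_address hold kernel virtual addresses:

        struct pbe *pbe;

        for (pbe = restore_pblist; pbe; pbe = pbe->next)
                /* move the loaded data from its temporary ("safe") location back
                 * into the page frame it occupied before the suspend */
                copy_page(pbe->orig_address, pbe->address);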
40static unsigned int nr_copy_pages; 44/* Pointer to an auxiliary buffer (1 page) */
41static unsigned int nr_meta_pages;
42static void *buffer; 45static void *buffer;
43 46
44#ifdef CONFIG_HIGHMEM
45unsigned int count_highmem_pages(void)
46{
47 struct zone *zone;
48 unsigned long zone_pfn;
49 unsigned int n = 0;
50
51 for_each_zone (zone)
52 if (is_highmem(zone)) {
53 mark_free_pages(zone);
54 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
55 struct page *page;
56 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 if (PageReserved(page))
61 continue;
62 if (PageNosaveFree(page))
63 continue;
64 n++;
65 }
66 }
67 return n;
68}
69
70struct highmem_page {
71 char *data;
72 struct page *page;
73 struct highmem_page *next;
74};
75
76static struct highmem_page *highmem_copy;
77
78static int save_highmem_zone(struct zone *zone)
79{
80 unsigned long zone_pfn;
81 mark_free_pages(zone);
82 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
83 struct page *page;
84 struct highmem_page *save;
85 void *kaddr;
86 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
87
88 if (!(pfn%10000))
89 printk(".");
90 if (!pfn_valid(pfn))
91 continue;
92 page = pfn_to_page(pfn);
93 /*
94 * This condition results from rvmalloc() sans vmalloc_32()
95 * and architectural memory reservations. This should be
96 * corrected eventually when the cases giving rise to this
97 * are better understood.
98 */
99 if (PageReserved(page))
100 continue;
101 BUG_ON(PageNosave(page));
102 if (PageNosaveFree(page))
103 continue;
104 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
105 if (!save)
106 return -ENOMEM;
107 save->next = highmem_copy;
108 save->page = page;
109 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
110 if (!save->data) {
111 kfree(save);
112 return -ENOMEM;
113 }
114 kaddr = kmap_atomic(page, KM_USER0);
115 memcpy(save->data, kaddr, PAGE_SIZE);
116 kunmap_atomic(kaddr, KM_USER0);
117 highmem_copy = save;
118 }
119 return 0;
120}
121
122int save_highmem(void)
123{
124 struct zone *zone;
125 int res = 0;
126
127 pr_debug("swsusp: Saving Highmem");
128 drain_local_pages();
129 for_each_zone (zone) {
130 if (is_highmem(zone))
131 res = save_highmem_zone(zone);
132 if (res)
133 return res;
134 }
135 printk("\n");
136 return 0;
137}
138
139int restore_highmem(void)
140{
141 printk("swsusp: Restoring Highmem\n");
142 while (highmem_copy) {
143 struct highmem_page *save = highmem_copy;
144 void *kaddr;
145 highmem_copy = save->next;
146
147 kaddr = kmap_atomic(save->page, KM_USER0);
148 memcpy(kaddr, save->data, PAGE_SIZE);
149 kunmap_atomic(kaddr, KM_USER0);
150 free_page((long) save->data);
151 kfree(save);
152 }
153 return 0;
154}
155#else
156static inline unsigned int count_highmem_pages(void) {return 0;}
157static inline int save_highmem(void) {return 0;}
158static inline int restore_highmem(void) {return 0;}
159#endif
160
161/** 47/**
162 * @safe_needed - on resume, for storing the PBE list and the image, 48 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages 49 * we can only use memory pages that do not conflict with the pages
164 * used before suspend. 50 * used before suspend. The unsafe pages have PageNosaveFree set
51 * and we count them using unsafe_pages.
165 * 52 *
166 * The unsafe pages are marked with the PG_nosave_free flag 53 * Each allocated image page is marked as PageNosave and PageNosaveFree
167 * and we count them using unsafe_pages 54 * so that swsusp_free() can release it.
168 */ 55 */
169 56
170#define PG_ANY 0 57#define PG_ANY 0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
174 61
175static unsigned int allocated_unsafe_pages; 62static unsigned int allocated_unsafe_pages;
176 63
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 64static void *get_image_page(gfp_t gfp_mask, int safe_needed)
178{ 65{
179 void *res; 66 void *res;
180 67
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
195 82
196unsigned long get_safe_page(gfp_t gfp_mask) 83unsigned long get_safe_page(gfp_t gfp_mask)
197{ 84{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); 85 return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
86}
87
88static struct page *alloc_image_page(gfp_t gfp_mask)
89{
90 struct page *page;
91
92 page = alloc_page(gfp_mask);
93 if (page) {
94 SetPageNosave(page);
95 SetPageNosaveFree(page);
96 }
97 return page;
199} 98}
200 99
201/** 100/**
202 * free_image_page - free page represented by @addr, allocated with 101 * free_image_page - free page represented by @addr, allocated with
203 * alloc_image_page (page flags set by it must be cleared) 102 * get_image_page (page flags set by it must be cleared)
204 */ 103 */
205 104
206static inline void free_image_page(void *addr, int clear_nosave_free) 105static inline void free_image_page(void *addr, int clear_nosave_free)
207{ 106{
208 ClearPageNosave(virt_to_page(addr)); 107 struct page *page;
108
109 BUG_ON(!virt_addr_valid(addr));
110
111 page = virt_to_page(addr);
112
113 ClearPageNosave(page);
209 if (clear_nosave_free) 114 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr)); 115 ClearPageNosaveFree(page);
211 free_page((unsigned long)addr); 116
117 __free_page(page);
212} 118}
213 119
214/* struct linked_page is used to build chains of pages */ 120/* struct linked_page is used to build chains of pages */
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 175 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
270 struct linked_page *lp; 176 struct linked_page *lp;
271 177
272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); 178 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
273 if (!lp) 179 if (!lp)
274 return NULL; 180 return NULL;
275 181
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
446 352
447 /* Compute the number of zones */ 353 /* Compute the number of zones */
448 nr = 0; 354 nr = 0;
449 for_each_zone (zone) 355 for_each_zone(zone)
450 if (populated_zone(zone) && !is_highmem(zone)) 356 if (populated_zone(zone))
451 nr++; 357 nr++;
452 358
453 /* Allocate the list of zones bitmap objects */ 359 /* Allocate the list of zones bitmap objects */
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
459 } 365 }
460 366
461 /* Initialize the zone bitmap objects */ 367 /* Initialize the zone bitmap objects */
462 for_each_zone (zone) { 368 for_each_zone(zone) {
463 unsigned long pfn; 369 unsigned long pfn;
464 370
465 if (!populated_zone(zone) || is_highmem(zone)) 371 if (!populated_zone(zone))
466 continue; 372 continue;
467 373
468 zone_bm->start_pfn = zone->zone_start_pfn; 374 zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
481 while (bb) { 387 while (bb) {
482 unsigned long *ptr; 388 unsigned long *ptr;
483 389
484 ptr = alloc_image_page(gfp_mask, safe_needed); 390 ptr = get_image_page(gfp_mask, safe_needed);
485 bb->data = ptr; 391 bb->data = ptr;
486 if (!ptr) 392 if (!ptr)
487 goto Free; 393 goto Free;
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
505 memory_bm_position_reset(bm); 411 memory_bm_position_reset(bm);
506 return 0; 412 return 0;
507 413
508Free: 414 Free:
509 bm->p_list = ca.chain; 415 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR); 416 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM; 417 return -ENOMEM;
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
651 memory_bm_position_reset(bm); 557 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP; 558 return BM_END_OF_MAP;
653 559
654Return_pfn: 560 Return_pfn:
655 bm->cur.chunk = chunk; 561 bm->cur.chunk = chunk;
656 bm->cur.bit = bit; 562 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; 563 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone)
669 575
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 576 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 577 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res; 578 return 2 * res;
579}
580
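To put the doubled snapshot_additional_pages() estimate above into numbers, here is a hypothetical example; the constants are assumptions (BM_BITS_PER_BLOCK taken as roughly one page worth of bits, 32768 with 4 KiB pages, and struct bm_block assumed to be a few dozen bytes), and the doubling presumably accounts for both orig_bm and copy_bm now having to cover every populated zone, highmem included:

        zone->spanned_pages                 = 262144   (a 1 GiB zone of 4 KiB pages)
        res  = DIV_ROUND_UP(262144, 32768)  = 8        (bitmap blocks)
        res += DIV_ROUND_UP(8 * sizeof(struct bm_block), PAGE_SIZE) = 8 + 1 = 9
        snapshot_additional_pages(zone)     = 2 * 9    = 18 pages for this zone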
581#ifdef CONFIG_HIGHMEM
582/**
583 * count_free_highmem_pages - compute the total number of free highmem
584 * pages, system-wide.
585 */
586
587static unsigned int count_free_highmem_pages(void)
588{
589 struct zone *zone;
590 unsigned int cnt = 0;
591
592 for_each_zone(zone)
593 if (populated_zone(zone) && is_highmem(zone))
594 cnt += zone->free_pages;
595
596 return cnt;
597}
598
599/**
600 * saveable_highmem_page - Determine whether a highmem page should be
601 * included in the suspend image.
602 *
603 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
604 * and it isn't a part of a free chunk of pages.
605 */
606
607static struct page *saveable_highmem_page(unsigned long pfn)
608{
609 struct page *page;
610
611 if (!pfn_valid(pfn))
612 return NULL;
613
614 page = pfn_to_page(pfn);
615
616 BUG_ON(!PageHighMem(page));
617
618 if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
619 return NULL;
620
621 return page;
673} 622}
674 623
675/** 624/**
625 * count_highmem_pages - compute the total number of saveable highmem
626 * pages.
627 */
628
629unsigned int count_highmem_pages(void)
630{
631 struct zone *zone;
632 unsigned int n = 0;
633
634 for_each_zone(zone) {
635 unsigned long pfn, max_zone_pfn;
636
637 if (!is_highmem(zone))
638 continue;
639
640 mark_free_pages(zone);
641 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
642 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
643 if (saveable_highmem_page(pfn))
644 n++;
645 }
646 return n;
647}
648#else
649static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
650static inline unsigned int count_highmem_pages(void) { return 0; }
651#endif /* CONFIG_HIGHMEM */
652
653/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section 654 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */ 655 */
678 656
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
684} 662}
685 663
686/** 664/**
687 * saveable - Determine whether a page should be cloned or not. 665 * saveable - Determine whether a non-highmem page should be included in
688 * @pfn: The page 666 * the suspend image.
689 * 667 *
690 * We save a page if it isn't Nosave, and is not in the range of pages 668 * We should save the page if it isn't Nosave, and is not in the range
691 * statically defined as 'unsaveable', and it 669 * of pages statically defined as 'unsaveable', and it isn't a part of
692 * isn't a part of a free chunk of pages. 670 * a free chunk of pages.
693 */ 671 */
694 672
695static struct page *saveable_page(unsigned long pfn) 673static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn)
701 679
702 page = pfn_to_page(pfn); 680 page = pfn_to_page(pfn);
703 681
704 if (PageNosave(page)) 682 BUG_ON(PageHighMem(page));
683
684 if (PageNosave(page) || PageNosaveFree(page))
705 return NULL; 685 return NULL;
686
706 if (PageReserved(page) && pfn_is_nosave(pfn)) 687 if (PageReserved(page) && pfn_is_nosave(pfn))
707 return NULL; 688 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710 689
711 return page; 690 return page;
712} 691}
713 692
693/**
694 * count_data_pages - compute the total number of saveable non-highmem
695 * pages.
696 */
697
714unsigned int count_data_pages(void) 698unsigned int count_data_pages(void)
715{ 699{
716 struct zone *zone; 700 struct zone *zone;
717 unsigned long pfn, max_zone_pfn; 701 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0; 702 unsigned int n = 0;
719 703
720 for_each_zone (zone) { 704 for_each_zone(zone) {
721 if (is_highmem(zone)) 705 if (is_highmem(zone))
722 continue; 706 continue;
707
723 mark_free_pages(zone); 708 mark_free_pages(zone);
724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 709 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 710 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
726 n += !!saveable_page(pfn); 711 if(saveable_page(pfn))
712 n++;
727 } 713 }
728 return n; 714 return n;
729} 715}
730 716
731static inline void copy_data_page(long *dst, long *src) 717/* This is needed, because copy_page and memcpy are not usable for copying
718 * task structs.
719 */
720static inline void do_copy_page(long *dst, long *src)
732{ 721{
733 int n; 722 int n;
734 723
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--) 724 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++; 725 *dst++ = *src++;
738} 726}
739 727
728#ifdef CONFIG_HIGHMEM
729static inline struct page *
730page_is_saveable(struct zone *zone, unsigned long pfn)
731{
732 return is_highmem(zone) ?
733 saveable_highmem_page(pfn) : saveable_page(pfn);
734}
735
736static inline void
737copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
738{
739 struct page *s_page, *d_page;
740 void *src, *dst;
741
742 s_page = pfn_to_page(src_pfn);
743 d_page = pfn_to_page(dst_pfn);
744 if (PageHighMem(s_page)) {
745 src = kmap_atomic(s_page, KM_USER0);
746 dst = kmap_atomic(d_page, KM_USER1);
747 do_copy_page(dst, src);
748 kunmap_atomic(src, KM_USER0);
749 kunmap_atomic(dst, KM_USER1);
750 } else {
751 src = page_address(s_page);
752 if (PageHighMem(d_page)) {
753 /* Page pointed to by src may contain some kernel
754 * data modified by kmap_atomic()
755 */
756 do_copy_page(buffer, src);
757 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
758 memcpy(dst, buffer, PAGE_SIZE);
759 kunmap_atomic(dst, KM_USER0);
760 } else {
761 dst = page_address(d_page);
762 do_copy_page(dst, src);
763 }
764 }
765}
766#else
767#define page_is_saveable(zone, pfn) saveable_page(pfn)
768
769static inline void
770copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
771{
772 do_copy_page(page_address(pfn_to_page(dst_pfn)),
773 page_address(pfn_to_page(src_pfn)));
774}
775#endif /* CONFIG_HIGHMEM */
776
740static void 777static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 778copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{ 779{
743 struct zone *zone; 780 struct zone *zone;
744 unsigned long pfn; 781 unsigned long pfn;
745 782
746 for_each_zone (zone) { 783 for_each_zone(zone) {
747 unsigned long max_zone_pfn; 784 unsigned long max_zone_pfn;
748 785
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone); 786 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn)) 789 if (page_is_saveable(zone, pfn))
756 memory_bm_set_bit(orig_bm, pfn); 790 memory_bm_set_bit(orig_bm, pfn);
757 } 791 }
758 memory_bm_position_reset(orig_bm); 792 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm); 793 memory_bm_position_reset(copy_bm);
760 do { 794 do {
761 pfn = memory_bm_next_pfn(orig_bm); 795 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) { 796 if (likely(pfn != BM_END_OF_MAP))
763 struct page *page; 797 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP); 798 } while (pfn != BM_END_OF_MAP);
772} 799}
773 800
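copy_data_page() above leans on the era's two-argument kmap_atomic() API, where each of the two simultaneous mappings needs its own KM_* slot (KM_USER0 for the source, KM_USER1 for the destination). Stripped of the swsusp-specific do_copy_page() (which exists only because copy_page()/memcpy() cannot be used on pages holding task structs), the bare pattern is sketched below; copy_highmem_page_example is a made-up name:

        #include <linux/highmem.h>
        #include <linux/mm.h>
        #include <linux/string.h>

        static void copy_highmem_page_example(struct page *dst, struct page *src)
        {
                void *vsrc, *vdst;

                vsrc = kmap_atomic(src, KM_USER0);      /* each concurrent mapping needs its own slot */
                vdst = kmap_atomic(dst, KM_USER1);
                memcpy(vdst, vsrc, PAGE_SIZE);          /* fine for ordinary data pages */
                kunmap_atomic(vdst, KM_USER1);
                kunmap_atomic(vsrc, KM_USER0);
        }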
801/* Total number of image pages */
802static unsigned int nr_copy_pages;
803/* Number of pages needed for saving the original pfns of the image pages */
804static unsigned int nr_meta_pages;
805
774/** 806/**
775 * swsusp_free - free pages allocated for the suspend. 807 * swsusp_free - free pages allocated for the suspend.
776 * 808 *
@@ -792,7 +824,7 @@ void swsusp_free(void)
792 if (PageNosave(page) && PageNosaveFree(page)) { 824 if (PageNosave(page) && PageNosaveFree(page)) {
793 ClearPageNosave(page); 825 ClearPageNosave(page);
794 ClearPageNosaveFree(page); 826 ClearPageNosaveFree(page);
795 free_page((long) page_address(page)); 827 __free_page(page);
796 } 828 }
797 } 829 }
798 } 830 }
@@ -802,34 +834,108 @@ void swsusp_free(void)
802 buffer = NULL; 834 buffer = NULL;
803} 835}
804 836
837#ifdef CONFIG_HIGHMEM
838/**
839 * count_pages_for_highmem - compute the number of non-highmem pages
840 * that will be necessary for creating copies of highmem pages.
841 */
842
843static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
844{
845 unsigned int free_highmem = count_free_highmem_pages();
846
847 if (free_highmem >= nr_highmem)
848 nr_highmem = 0;
849 else
850 nr_highmem -= free_highmem;
851
852 return nr_highmem;
853}
854#else
855static unsigned int
856count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
857#endif /* CONFIG_HIGHMEM */
805 858
806/** 859/**
807 * enough_free_mem - Make sure we enough free memory to snapshot. 860 * enough_free_mem - Make sure we have enough free memory for the
808 * 861 * snapshot image.
809 * Returns TRUE or FALSE after checking the number of available
810 * free pages.
811 */ 862 */
812 863
813static int enough_free_mem(unsigned int nr_pages) 864static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
814{ 865{
815 struct zone *zone; 866 struct zone *zone;
816 unsigned int free = 0, meta = 0; 867 unsigned int free = 0, meta = 0;
817 868
818 for_each_zone (zone) 869 for_each_zone(zone) {
819 if (!is_highmem(zone)) { 870 meta += snapshot_additional_pages(zone);
871 if (!is_highmem(zone))
820 free += zone->free_pages; 872 free += zone->free_pages;
821 meta += snapshot_additional_pages(zone); 873 }
822 }
823 874
824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", 875 nr_pages += count_pages_for_highmem(nr_highmem);
876 pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
825 nr_pages, PAGES_FOR_IO, meta, free); 877 nr_pages, PAGES_FOR_IO, meta, free);
826 878
827 return free > nr_pages + PAGES_FOR_IO + meta; 879 return free > nr_pages + PAGES_FOR_IO + meta;
828} 880}
829 881
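For a feel of the enough_free_mem() check above, a worked example with purely hypothetical numbers (PAGES_FOR_IO is assumed to be 1024 here, i.e. 4 MiB of 4 KiB pages):

        nr_pages   (saveable lowmem pages)        = 50000
        nr_highmem (saveable highmem pages)       = 20000
        count_free_highmem_pages()                = 15000
        count_pages_for_highmem(20000)            = 20000 - 15000 = 5000
        meta (sum of snapshot_additional_pages()) =   300
        PAGES_FOR_IO (assumed)                    =  1024
        check: free > 50000 + 5000 + 1024 + 300   = 56324 free lowmem pages required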
882#ifdef CONFIG_HIGHMEM
883/**
884 * get_highmem_buffer - if there are some highmem pages in the suspend
885 * image, we may need the buffer to copy them and/or load their data.
886 */
887
888static inline int get_highmem_buffer(int safe_needed)
889{
890 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
891 return buffer ? 0 : -ENOMEM;
892}
893
894/**
895 * alloc_highmem_image_pages - allocate some highmem pages for the image.
896 * Try to allocate as many pages as needed, but if the number of free
 897 * highmem pages is smaller than that, allocate them all.

898 */
899
900static inline unsigned int
901alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
902{
903 unsigned int to_alloc = count_free_highmem_pages();
904
905 if (to_alloc > nr_highmem)
906 to_alloc = nr_highmem;
907
908 nr_highmem -= to_alloc;
909 while (to_alloc-- > 0) {
910 struct page *page;
911
912 page = alloc_image_page(__GFP_HIGHMEM);
913 memory_bm_set_bit(bm, page_to_pfn(page));
914 }
915 return nr_highmem;
916}
917#else
918static inline int get_highmem_buffer(int safe_needed) { return 0; }
919
920static inline unsigned int
921alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
922#endif /* CONFIG_HIGHMEM */
923
924/**
925 * swsusp_alloc - allocate memory for the suspend image
926 *
927 * We first try to allocate as many highmem pages as there are
928 * saveable highmem pages in the system. If that fails, we allocate
929 * non-highmem pages for the copies of the remaining highmem ones.
930 *
931 * In this approach it is likely that the copies of highmem pages will
932 * also be located in the high memory, because of the way in which
933 * copy_data_pages() works.
934 */
935
830static int 936static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 937swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages) 938 unsigned int nr_pages, unsigned int nr_highmem)
833{ 939{
834 int error; 940 int error;
835 941
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
841 if (error) 947 if (error)
842 goto Free; 948 goto Free;
843 949
950 if (nr_highmem > 0) {
951 error = get_highmem_buffer(PG_ANY);
952 if (error)
953 goto Free;
954
955 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
956 }
844 while (nr_pages-- > 0) { 957 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); 958 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
959
846 if (!page) 960 if (!page)
847 goto Free; 961 goto Free;
848 962
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 963 memory_bm_set_bit(copy_bm, page_to_pfn(page));
852 } 964 }
853 return 0; 965 return 0;
854 966
855Free: 967 Free:
856 swsusp_free(); 968 swsusp_free();
857 return -ENOMEM; 969 return -ENOMEM;
858} 970}
859 971
860/* Memory bitmap used for marking saveable pages */ 972/* Memory bitmap used for marking saveable pages (during suspend) or the
973 * suspend image pages (during resume)
974 */
861static struct memory_bitmap orig_bm; 975static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies 976/* Memory bitmap used on suspend for marking allocated pages that will contain
863 * of saveable pages 977 * the copies of saveable pages. During resume it is initially used for
978 * marking the suspend image pages, but then its set bits are duplicated in
979 * @orig_bm and it is released. Next, on systems with high memory, it may be
980 * used for marking "safe" highmem pages, but it has to be reinitialized for
981 * this purpose.
864 */ 982 */
865static struct memory_bitmap copy_bm; 983static struct memory_bitmap copy_bm;
866 984
867asmlinkage int swsusp_save(void) 985asmlinkage int swsusp_save(void)
868{ 986{
869 unsigned int nr_pages; 987 unsigned int nr_pages, nr_highmem;
870 988
871 pr_debug("swsusp: critical section: \n"); 989 printk("swsusp: critical section: \n");
872 990
873 drain_local_pages(); 991 drain_local_pages();
874 nr_pages = count_data_pages(); 992 nr_pages = count_data_pages();
875 printk("swsusp: Need to copy %u pages\n", nr_pages); 993 nr_highmem = count_highmem_pages();
994 printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
876 995
877 if (!enough_free_mem(nr_pages)) { 996 if (!enough_free_mem(nr_pages, nr_highmem)) {
878 printk(KERN_ERR "swsusp: Not enough free memory\n"); 997 printk(KERN_ERR "swsusp: Not enough free memory\n");
879 return -ENOMEM; 998 return -ENOMEM;
880 } 999 }
881 1000
882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages)) 1001 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1002 printk(KERN_ERR "swsusp: Memory allocation failed\n");
883 return -ENOMEM; 1003 return -ENOMEM;
1004 }
884 1005
885 /* During allocating of suspend pagedir, new cold pages may appear. 1006 /* During allocating of suspend pagedir, new cold pages may appear.
886 * Kill them. 1007 * Kill them.
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void)
894 * touch swap space! Except we must write out our image of course. 1015 * touch swap space! Except we must write out our image of course.
895 */ 1016 */
896 1017
1018 nr_pages += nr_highmem;
897 nr_copy_pages = nr_pages; 1019 nr_copy_pages = nr_pages;
898 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1020 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
899 1021
900 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1022 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
1023
901 return 0; 1024 return 0;
902} 1025}
903 1026
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
960 1083
961 if (!buffer) { 1084 if (!buffer) {
962 /* This makes the buffer be freed by swsusp_free() */ 1085 /* This makes the buffer be freed by swsusp_free() */
963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1086 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
964 if (!buffer) 1087 if (!buffer)
965 return -ENOMEM; 1088 return -ENOMEM;
966 } 1089 }
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
975 memset(buffer, 0, PAGE_SIZE); 1098 memset(buffer, 0, PAGE_SIZE);
976 pack_pfns(buffer, &orig_bm); 1099 pack_pfns(buffer, &orig_bm);
977 } else { 1100 } else {
978 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1101 struct page *page;
979 1102
980 handle->buffer = page_address(pfn_to_page(pfn)); 1103 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1104 if (PageHighMem(page)) {
1105 /* Highmem pages are copied to the buffer,
1106 * because we can't return with a kmapped
1107 * highmem page (we may not be called again).
1108 */
1109 void *kaddr;
1110
1111 kaddr = kmap_atomic(page, KM_USER0);
1112 memcpy(buffer, kaddr, PAGE_SIZE);
1113 kunmap_atomic(kaddr, KM_USER0);
1114 handle->buffer = buffer;
1115 } else {
1116 handle->buffer = page_address(page);
1117 }
981 } 1118 }
982 handle->prev = handle->cur; 1119 handle->prev = handle->cur;
983 } 1120 }
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1005 unsigned long pfn, max_zone_pfn; 1142 unsigned long pfn, max_zone_pfn;
1006 1143
1007 /* Clear page flags */ 1144 /* Clear page flags */
1008 for_each_zone (zone) { 1145 for_each_zone(zone) {
1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1146 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1147 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1011 if (pfn_valid(pfn)) 1148 if (pfn_valid(pfn))
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1101 } 1238 }
1102} 1239}
1103 1240
1241/* List of "safe" pages that may be used to store data loaded from the suspend
1242 * image
1243 */
1244static struct linked_page *safe_pages_list;
1245
1246#ifdef CONFIG_HIGHMEM
1247/* struct highmem_pbe is used for creating the list of highmem pages that
1248 * should be restored atomically during the resume from disk, because the page
1249 * frames they have occupied before the suspend are in use.
1250 */
1251struct highmem_pbe {
1252 struct page *copy_page; /* data is here now */
1253 struct page *orig_page; /* data was here before the suspend */
1254 struct highmem_pbe *next;
1255};
1256
1257/* List of highmem PBEs needed for restoring the highmem pages that were
1258 * allocated before the suspend and included in the suspend image, but have
1259 * also been allocated by the "resume" kernel, so their contents cannot be
1260 * written directly to their "original" page frames.
1261 */
1262static struct highmem_pbe *highmem_pblist;
1263
1264/**
1265 * count_highmem_image_pages - compute the number of highmem pages in the
1266 * suspend image. The bits in the memory bitmap @bm that correspond to the
1267 * image pages are assumed to be set.
1268 */
1269
1270static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
1271{
1272 unsigned long pfn;
1273 unsigned int cnt = 0;
1274
1275 memory_bm_position_reset(bm);
1276 pfn = memory_bm_next_pfn(bm);
1277 while (pfn != BM_END_OF_MAP) {
1278 if (PageHighMem(pfn_to_page(pfn)))
1279 cnt++;
1280
1281 pfn = memory_bm_next_pfn(bm);
1282 }
1283 return cnt;
1284}
1285
1286/**
1287 * prepare_highmem_image - try to allocate as many highmem pages as
1288 * there are highmem image pages (@nr_highmem_p points to the variable
1289 * containing the number of highmem image pages). The pages that are
1290 * "safe" (ie. will not be overwritten when the suspend image is
1291 * restored) have the corresponding bits set in @bm (it must be
 1292 * uninitialized).
1293 *
1294 * NOTE: This function should not be called if there are no highmem
1295 * image pages.
1296 */
1297
1298static unsigned int safe_highmem_pages;
1299
1300static struct memory_bitmap *safe_highmem_bm;
1301
1302static int
1303prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1304{
1305 unsigned int to_alloc;
1306
1307 if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
1308 return -ENOMEM;
1309
1310 if (get_highmem_buffer(PG_SAFE))
1311 return -ENOMEM;
1312
1313 to_alloc = count_free_highmem_pages();
1314 if (to_alloc > *nr_highmem_p)
1315 to_alloc = *nr_highmem_p;
1316 else
1317 *nr_highmem_p = to_alloc;
1318
1319 safe_highmem_pages = 0;
1320 while (to_alloc-- > 0) {
1321 struct page *page;
1322
1323 page = alloc_page(__GFP_HIGHMEM);
1324 if (!PageNosaveFree(page)) {
 1325 /* The page is "safe", set its bit in the bitmap */
1326 memory_bm_set_bit(bm, page_to_pfn(page));
1327 safe_highmem_pages++;
1328 }
1329 /* Mark the page as allocated */
1330 SetPageNosave(page);
1331 SetPageNosaveFree(page);
1332 }
1333 memory_bm_position_reset(bm);
1334 safe_highmem_bm = bm;
1335 return 0;
1336}
1337
1338/**
1339 * get_highmem_page_buffer - for given highmem image page find the buffer
1340 * that suspend_write_next() should set for its caller to write to.
1341 *
1342 * If the page is to be saved to its "original" page frame or a copy of
1343 * the page is to be made in the highmem, @buffer is returned. Otherwise,
1344 * the copy of the page is to be made in normal memory, so the address of
1345 * the copy is returned.
1346 *
1347 * If @buffer is returned, the caller of suspend_write_next() will write
1348 * the page's contents to @buffer, so they will have to be copied to the
1349 * right location on the next call to suspend_write_next() and it is done
1350 * with the help of copy_last_highmem_page(). For this purpose, if
1351 * @buffer is returned, @last_highmem page is set to the page to which
1352 * the data will have to be copied from @buffer.
1353 */
1354
1355static struct page *last_highmem_page;
1356
1357static void *
1358get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1359{
1360 struct highmem_pbe *pbe;
1361 void *kaddr;
1362
1363 if (PageNosave(page) && PageNosaveFree(page)) {
1364 /* We have allocated the "original" page frame and we can
1365 * use it directly to store the loaded page.
1366 */
1367 last_highmem_page = page;
1368 return buffer;
1369 }
1370 /* The "original" page frame has not been allocated and we have to
1371 * use a "safe" page frame to store the loaded page.
1372 */
1373 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1374 if (!pbe) {
1375 swsusp_free();
1376 return NULL;
1377 }
1378 pbe->orig_page = page;
1379 if (safe_highmem_pages > 0) {
1380 struct page *tmp;
1381
1382 /* Copy of the page will be stored in high memory */
1383 kaddr = buffer;
1384 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
1385 safe_highmem_pages--;
1386 last_highmem_page = tmp;
1387 pbe->copy_page = tmp;
1388 } else {
1389 /* Copy of the page will be stored in normal memory */
1390 kaddr = safe_pages_list;
1391 safe_pages_list = safe_pages_list->next;
1392 pbe->copy_page = virt_to_page(kaddr);
1393 }
1394 pbe->next = highmem_pblist;
1395 highmem_pblist = pbe;
1396 return kaddr;
1397}
1398
1399/**
1400 * copy_last_highmem_page - copy the contents of a highmem image from
 1401 * @buffer, where the caller of snapshot_write_next() has placed them,
 1402 * to the right location represented by @last_highmem_page.
1403 */
1404
1405static void copy_last_highmem_page(void)
1406{
1407 if (last_highmem_page) {
1408 void *dst;
1409
1410 dst = kmap_atomic(last_highmem_page, KM_USER0);
1411 memcpy(dst, buffer, PAGE_SIZE);
1412 kunmap_atomic(dst, KM_USER0);
1413 last_highmem_page = NULL;
1414 }
1415}
1416
1417static inline int last_highmem_page_copied(void)
1418{
1419 return !last_highmem_page;
1420}
1421
1422static inline void free_highmem_data(void)
1423{
1424 if (safe_highmem_bm)
1425 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
1426
1427 if (buffer)
1428 free_image_page(buffer, PG_UNSAFE_CLEAR);
1429}
1430#else
1431static inline int get_safe_write_buffer(void) { return 0; }
1432
1433static unsigned int
1434count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
1435
1436static inline int
1437prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1438{
1439 return 0;
1440}
1441
1442static inline void *
1443get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1444{
1445 return NULL;
1446}
1447
1448static inline void copy_last_highmem_page(void) {}
1449static inline int last_highmem_page_copied(void) { return 1; }
1450static inline void free_highmem_data(void) {}
1451#endif /* CONFIG_HIGHMEM */
1452
1104/** 1453/**
1105 * prepare_image - use the memory bitmap @bm to mark the pages that will 1454 * prepare_image - use the memory bitmap @bm to mark the pages that will
1106 * be overwritten in the process of restoring the system memory state 1455 * be overwritten in the process of restoring the system memory state
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1110 * The idea is to allocate a new memory bitmap first and then allocate 1459 * The idea is to allocate a new memory bitmap first and then allocate
1111 * as many pages as needed for the image data, but not to assign these 1460 * as many pages as needed for the image data, but not to assign these
1112 * pages to specific tasks initially. Instead, we just mark them as 1461 * pages to specific tasks initially. Instead, we just mark them as
1113 * allocated and create a list of "safe" pages that will be used later. 1462 * allocated and create a lists of "safe" pages that will be used
1463 * later. On systems with high memory a list of "safe" highmem pages is
1464 * also created.
1114 */ 1465 */
1115 1466
1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) 1467#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
1117 1468
1118static struct linked_page *safe_pages_list;
1119
1120static int 1469static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) 1470prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1122{ 1471{
1123 unsigned int nr_pages; 1472 unsigned int nr_pages, nr_highmem;
1124 struct linked_page *sp_list, *lp; 1473 struct linked_page *sp_list, *lp;
1125 int error; 1474 int error;
1126 1475
1476 /* If there is no highmem, the buffer will not be necessary */
1477 free_image_page(buffer, PG_UNSAFE_CLEAR);
1478 buffer = NULL;
1479
1480 nr_highmem = count_highmem_image_pages(bm);
1127 error = mark_unsafe_pages(bm); 1481 error = mark_unsafe_pages(bm);
1128 if (error) 1482 if (error)
1129 goto Free; 1483 goto Free;
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1134 1488
1135 duplicate_memory_bitmap(new_bm, bm); 1489 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP); 1490 memory_bm_free(bm, PG_UNSAFE_KEEP);
1491 if (nr_highmem > 0) {
1492 error = prepare_highmem_image(bm, &nr_highmem);
1493 if (error)
1494 goto Free;
1495 }
1137 /* Reserve some safe pages for potential later use. 1496 /* Reserve some safe pages for potential later use.
1138 * 1497 *
1139 * NOTE: This way we make sure there will be enough safe pages for the 1498 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1142 */ 1501 */
1143 sp_list = NULL; 1502 sp_list = NULL;
1144 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ 1503 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1504 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 1505 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) { 1506 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); 1507 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) { 1508 if (!lp) {
1150 error = -ENOMEM; 1509 error = -ENOMEM;
1151 goto Free; 1510 goto Free;
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1156 } 1515 }
1157 /* Preallocate memory for the image */ 1516 /* Preallocate memory for the image */
1158 safe_pages_list = NULL; 1517 safe_pages_list = NULL;
1159 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1518 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1160 while (nr_pages > 0) { 1519 while (nr_pages > 0) {
1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 1520 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
1162 if (!lp) { 1521 if (!lp) {
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1181 } 1540 }
1182 return 0; 1541 return 0;
1183 1542
1184Free: 1543 Free:
1185 swsusp_free(); 1544 swsusp_free();
1186 return error; 1545 return error;
1187} 1546}
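The safe-page reservation in prepare_image() above is sized from the PBE bookkeeping; with hypothetical numbers, and assuming struct pbe is three pointers (12 bytes on a 32-bit build) and LINKED_PAGE_DATA_SIZE is a page minus the next pointer, the arithmetic comes out roughly as:

        nr_copy_pages (total image pages)     = 60000
        nr_highmem (highmem image pages)      = 20000
        allocated_unsafe_pages                =   500
        PBEs needing lowmem bookkeeping       = 60000 - 20000 - 500 = 39500
        PBES_PER_LINKED_PAGE                  = (4096 - 4) / 12 = 341   (32-bit example)
        linked pages reserved for PBEs        = DIV_ROUND_UP(39500, 341) = 116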
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1196 struct pbe *pbe; 1555 struct pbe *pbe;
1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1556 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
1198 1557
1558 if (PageHighMem(page))
1559 return get_highmem_page_buffer(page, ca);
1560
1199 if (PageNosave(page) && PageNosaveFree(page)) 1561 if (PageNosave(page) && PageNosaveFree(page))
1200 /* We have allocated the "original" page frame and we can 1562 /* We have allocated the "original" page frame and we can
1201 * use it directly to store the loaded page. 1563 * use it directly to store the loaded page.
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1210 swsusp_free(); 1572 swsusp_free();
1211 return NULL; 1573 return NULL;
1212 } 1574 }
1213 pbe->orig_address = (unsigned long)page_address(page); 1575 pbe->orig_address = page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list; 1576 pbe->address = safe_pages_list;
1215 safe_pages_list = safe_pages_list->next; 1577 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist; 1578 pbe->next = restore_pblist;
1217 restore_pblist = pbe; 1579 restore_pblist = pbe;
1218 return (void *)pbe->address; 1580 return pbe->address;
1219} 1581}
1220 1582
1221/** 1583/**
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1611 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1250 return 0; 1612 return 0;
1251 1613
1252 if (!buffer) { 1614 if (handle->offset == 0) {
1253 /* This makes the buffer be freed by swsusp_free() */ 1615 if (!buffer)
1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1616 /* This makes the buffer be freed by swsusp_free() */
1617 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1618
1255 if (!buffer) 1619 if (!buffer)
1256 return -ENOMEM; 1620 return -ENOMEM;
1257 } 1621
1258 if (!handle->offset)
1259 handle->buffer = buffer; 1622 handle->buffer = buffer;
1623 }
1260 handle->sync_read = 1; 1624 handle->sync_read = 1;
1261 if (handle->prev < handle->cur) { 1625 if (handle->prev < handle->cur) {
1262 if (handle->prev == 0) { 1626 if (handle->prev == 0) {
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1284 return -ENOMEM; 1648 return -ENOMEM;
1285 } 1649 }
1286 } else { 1650 } else {
1651 copy_last_highmem_page();
1287 handle->buffer = get_buffer(&orig_bm, &ca); 1652 handle->buffer = get_buffer(&orig_bm, &ca);
1288 handle->sync_read = 0; 1653 if (handle->buffer != buffer)
1654 handle->sync_read = 0;
1289 } 1655 }
1290 handle->prev = handle->cur; 1656 handle->prev = handle->cur;
1291 } 1657 }
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1301 return count; 1667 return count;
1302} 1668}
1303 1669
1670/**
1671 * snapshot_write_finalize - must be called after the last call to
1672 * snapshot_write_next() in case the last page in the image happens
1673 * to be a highmem page and its contents should be stored in the
1674 * highmem. Additionally, it releases the memory that will not be
1675 * used any more.
1676 */
1677
1678void snapshot_write_finalize(struct snapshot_handle *handle)
1679{
1680 copy_last_highmem_page();
1681 /* Free only if we have loaded the image entirely */
1682 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
1683 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
1684 free_highmem_data();
1685 }
1686}
1687
1304int snapshot_image_loaded(struct snapshot_handle *handle) 1688int snapshot_image_loaded(struct snapshot_handle *handle)
1305{ 1689{
1306 return !(!nr_copy_pages || 1690 return !(!nr_copy_pages || !last_highmem_page_copied() ||
1307 handle->cur <= nr_meta_pages + nr_copy_pages); 1691 handle->cur <= nr_meta_pages + nr_copy_pages);
1308} 1692}
1309 1693
1310void snapshot_free_unused_memory(struct snapshot_handle *handle) 1694#ifdef CONFIG_HIGHMEM
1695/* Assumes that @buf is ready and points to a "safe" page */
1696static inline void
1697swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
1311{ 1698{
1312 /* Free only if we have loaded the image entirely */ 1699 void *kaddr1, *kaddr2;
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1700
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 1701 kaddr1 = kmap_atomic(p1, KM_USER0);
1702 kaddr2 = kmap_atomic(p2, KM_USER1);
1703 memcpy(buf, kaddr1, PAGE_SIZE);
1704 memcpy(kaddr1, kaddr2, PAGE_SIZE);
1705 memcpy(kaddr2, buf, PAGE_SIZE);
1706 kunmap_atomic(kaddr1, KM_USER0);
1707 kunmap_atomic(kaddr2, KM_USER1);
1708}
1709
1710/**
1711 * restore_highmem - for each highmem page that was allocated before
1712 * the suspend and included in the suspend image, and also has been
 1713 * allocated by the "resume" kernel, swap its current (ie. "before
1714 * resume") contents with the previous (ie. "before suspend") one.
1715 *
1716 * If the resume eventually fails, we can call this function once
1717 * again and restore the "before resume" highmem state.
1718 */
1719
1720int restore_highmem(void)
1721{
1722 struct highmem_pbe *pbe = highmem_pblist;
1723 void *buf;
1724
1725 if (!pbe)
1726 return 0;
1727
1728 buf = get_image_page(GFP_ATOMIC, PG_SAFE);
1729 if (!buf)
1730 return -ENOMEM;
1731
1732 while (pbe) {
1733 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
1734 pbe = pbe->next;
1735 }
1736 free_image_page(buf, PG_UNSAFE_CLEAR);
1737 return 0;
1315} 1738}
1739#endif /* CONFIG_HIGHMEM */
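snapshot_write_finalize() and snapshot_image_loaded() above are meant to bracket the loop that feeds image pages into snapshot_write_next() on the resume path. A rough sketch of that calling sequence follows (the real consumer lives in the swap/user I/O code; read_next_image_page is a placeholder and error handling is trimmed):

        struct snapshot_handle handle;
        int ret;

        memset(&handle, 0, sizeof(struct snapshot_handle));
        for (;;) {
                ret = snapshot_write_next(&handle, PAGE_SIZE);
                if (ret <= 0)
                        break;
                /* fill the buffer the handle currently points at with the next
                 * PAGE_SIZE chunk of the image (placeholder reader) */
                ret = read_next_image_page(data_of(handle));
                if (ret)
                        break;
        }
        snapshot_write_finalize(&handle);
        if (!ret && !snapshot_image_loaded(&handle))
                ret = -ENODATA;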
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd2c3fc..f133d4a6d817 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,34 +34,123 @@ extern char resume_file[];
34#define SWSUSP_SIG "S1SUSPEND" 34#define SWSUSP_SIG "S1SUSPEND"
35 35
36static struct swsusp_header { 36static struct swsusp_header {
37 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
38 swp_entry_t image; 38 sector_t image;
39 char orig_sig[10]; 39 char orig_sig[10];
40 char sig[10]; 40 char sig[10];
41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
42 42
43/* 43/*
44 * Saving part... 44 * General things
45 */ 45 */
46 46
47static unsigned short root_swap = 0xffff; 47static unsigned short root_swap = 0xffff;
48static struct block_device *resume_bdev;
49
50/**
51 * submit - submit BIO request.
52 * @rw: READ or WRITE.
53 * @off physical offset of page.
54 * @page: page we're reading or writing.
55 * @bio_chain: list of pending biod (for async reading)
56 *
57 * Straight from the textbook - allocate and initialize the bio.
58 * If we're reading, make sure the page is marked as dirty.
59 * Then submit it and, if @bio_chain == NULL, wait.
60 */
61static int submit(int rw, pgoff_t page_off, struct page *page,
62 struct bio **bio_chain)
63{
64 struct bio *bio;
65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read;
72
73 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
74 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
75 bio_put(bio);
76 return -EFAULT;
77 }
78
79 lock_page(page);
80 bio_get(bio);
81
82 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
84 wait_on_page_locked(page);
85 if (rw == READ)
86 bio_set_pages_dirty(bio);
87 bio_put(bio);
88 } else {
89 if (rw == READ)
90 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain;
92 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
94 }
95 return 0;
96}
97
98static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
99{
100 return submit(READ, page_off, virt_to_page(addr), bio_chain);
101}
102
103static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
104{
105 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
106}
107
108static int wait_on_bio_chain(struct bio **bio_chain)
109{
110 struct bio *bio;
111 struct bio *next_bio;
112 int ret = 0;
113
114 if (bio_chain == NULL)
115 return 0;
116
117 bio = *bio_chain;
118 if (bio == NULL)
119 return 0;
120 while (bio) {
121 struct page *page;
122
123 next_bio = bio->bi_private;
124 page = bio->bi_io_vec[0].bv_page;
125 wait_on_page_locked(page);
126 if (!PageUptodate(page) || PageError(page))
127 ret = -EIO;
128 put_page(page);
129 bio_put(bio);
130 bio = next_bio;
131 }
132 *bio_chain = NULL;
133 return ret;
134}
135
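The @bio_chain argument threaded through the helpers above turns each page write into a fire-and-forget submission that is reaped later by wait_on_bio_chain(). Typical write-path usage looks roughly like the sketch below (assuming a swap_map_handle named handle already set up by get_swap_writer(); more_pages_to_write and next_image_page are placeholders):

        struct bio *bio_chain = NULL;
        int error = 0;

        while (more_pages_to_write() && !error)         /* placeholder loop condition */
                error = swap_write_page(&handle, next_image_page(), &bio_chain);

        if (!error)
                error = wait_on_bio_chain(&bio_chain);  /* reap everything queued above */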
136/*
137 * Saving part
138 */
48 139
49static int mark_swapfiles(swp_entry_t start) 140static int mark_swapfiles(sector_t start)
50{ 141{
51 int error; 142 int error;
52 143
53 rw_swap_page_sync(READ, swp_entry(root_swap, 0), 144 bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
54 virt_to_page((unsigned long)&swsusp_header), NULL);
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 145 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 146 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 147 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 148 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 149 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), 150 error = bio_write_page(swsusp_resume_block,
61 virt_to_page((unsigned long)&swsusp_header), 151 &swsusp_header, NULL);
62 NULL);
63 } else { 152 } else {
64 pr_debug("swsusp: Partition is not swap space.\n"); 153 printk(KERN_ERR "swsusp: Swap header not found!\n");
65 error = -ENODEV; 154 error = -ENODEV;
66 } 155 }
67 return error; 156 return error;
@@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start)
74 163
75static int swsusp_swap_check(void) /* This is called before saving image */ 164static int swsusp_swap_check(void) /* This is called before saving image */
76{ 165{
77 int res = swap_type_of(swsusp_resume_device); 166 int res;
167
168 res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
169 if (res < 0)
170 return res;
171
172 root_swap = res;
173 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
174 if (IS_ERR(resume_bdev))
175 return PTR_ERR(resume_bdev);
176
177 res = set_blocksize(resume_bdev, PAGE_SIZE);
178 if (res < 0)
179 blkdev_put(resume_bdev);
78 180
79 if (res >= 0) {
80 root_swap = res;
81 return 0;
82 }
83 return res; 181 return res;
84} 182}
85 183
@@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
90 * @bio_chain: Link the next write BIO here 188 * @bio_chain: Link the next write BIO here
91 */ 189 */
92 190
93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) 191static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
94{ 192{
95 swp_entry_t entry; 193 void *src;
96 int error = -ENOSPC; 194
97 195 if (!offset)
98 if (offset) { 196 return -ENOSPC;
99 struct page *page = virt_to_page(buf); 197
100 198 if (bio_chain) {
101 if (bio_chain) { 199 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
102 /* 200 if (src) {
103 * Whether or not we successfully allocated a copy page, 201 memcpy(src, buf, PAGE_SIZE);
104 * we take a ref on the page here. It gets undone in 202 } else {
105 * wait_on_bio_chain(). 203 WARN_ON_ONCE(1);
106 */ 204 bio_chain = NULL; /* Go synchronous */
107 struct page *page_copy; 205 src = buf;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 } 206 }
119 entry = swp_entry(root_swap, offset); 207 } else {
120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain); 208 src = buf;
121 } 209 }
122 return error; 210 return bio_write_page(offset, src, bio_chain);
123} 211}
124 212
125/* 213/*
@@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
137 * at a time. 225 * at a time.
138 */ 226 */
139 227
140#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) 228#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
141 229
142struct swap_map_page { 230struct swap_map_page {
143 unsigned long entries[MAP_PAGE_ENTRIES]; 231 sector_t entries[MAP_PAGE_ENTRIES];
144 unsigned long next_swap; 232 sector_t next_swap;
145}; 233};
146 234
147/** 235/**
@@ -151,7 +239,7 @@ struct swap_map_page {
151 239
152struct swap_map_handle { 240struct swap_map_handle {
153 struct swap_map_page *cur; 241 struct swap_map_page *cur;
154 unsigned long cur_swap; 242 sector_t cur_swap;
155 struct bitmap_page *bitmap; 243 struct bitmap_page *bitmap;
156 unsigned int k; 244 unsigned int k;
157}; 245};
@@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
166 handle->bitmap = NULL; 254 handle->bitmap = NULL;
167} 255}
168 256
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
189static int get_swap_writer(struct swap_map_handle *handle) 257static int get_swap_writer(struct swap_map_handle *handle)
190{ 258{
191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 259 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
196 release_swap_writer(handle); 264 release_swap_writer(handle);
197 return -ENOMEM; 265 return -ENOMEM;
198 } 266 }
199 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); 267 handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
200 if (!handle->cur_swap) { 268 if (!handle->cur_swap) {
201 release_swap_writer(handle); 269 release_swap_writer(handle);
202 return -ENOSPC; 270 return -ENOSPC;
@@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle)
205 return 0; 273 return 0;
206} 274}
207 275
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
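
wait_on_bio_chain(), removed from this spot (swap_write_page() below still calls it, so the helper presumably survives elsewhere in the file), drains a list of in-flight bios linked through bi_private and reports the first error it finds. The same "chain pending requests, then sweep the chain and collect one error" shape in plain C, with a hypothetical request type rather than the bio API:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical async request, chained the way the bios are via bi_private. */
struct request {
    struct request *next;
    int failed;
};

static int wait_on_chain(struct request **chain)
{
    int ret = 0;

    if (!chain)
        return 0;

    for (struct request *req = *chain; req; ) {
        struct request *next = req->next;

        if (req->failed)            /* first error wins, keep draining */
            ret = -EIO;
        free(req);
        req = next;
    }
    *chain = NULL;
    return ret;
}

int main(void)
{
    struct request *chain = NULL;

    for (int i = 0; i < 3; i++) {
        struct request *req = calloc(1, sizeof(*req));
        req->failed = (i == 1);         /* pretend one request failed */
        req->next = chain;              /* push onto the chain */
        chain = req;
    }
    printf("chain result: %d\n", wait_on_chain(&chain));
    return 0;
}
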
236static int swap_write_page(struct swap_map_handle *handle, void *buf, 276static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain) 277 struct bio **bio_chain)
238{ 278{
239 int error = 0; 279 int error = 0;
240 unsigned long offset; 280 sector_t offset;
241 281
242 if (!handle->cur) 282 if (!handle->cur)
243 return -EINVAL; 283 return -EINVAL;
244 offset = alloc_swap_page(root_swap, handle->bitmap); 284 offset = alloc_swapdev_block(root_swap, handle->bitmap);
245 error = write_page(buf, offset, bio_chain); 285 error = write_page(buf, offset, bio_chain);
246 if (error) 286 if (error)
247 return error; 287 return error;
@@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
250 error = wait_on_bio_chain(bio_chain); 290 error = wait_on_bio_chain(bio_chain);
251 if (error) 291 if (error)
252 goto out; 292 goto out;
253 offset = alloc_swap_page(root_swap, handle->bitmap); 293 offset = alloc_swapdev_block(root_swap, handle->bitmap);
254 if (!offset) 294 if (!offset)
255 return -ENOSPC; 295 return -ENOSPC;
256 handle->cur->next_swap = offset; 296 handle->cur->next_swap = offset;
@@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
261 handle->cur_swap = offset; 301 handle->cur_swap = offset;
262 handle->k = 0; 302 handle->k = 0;
263 } 303 }
264out: 304 out:
265 return error; 305 return error;
266} 306}
267 307
@@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
315 error = err2; 355 error = err2;
316 if (!error) 356 if (!error)
317 printk("\b\b\b\bdone\n"); 357 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote"); 358 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
319 return error; 359 return error;
320} 360}
321 361
@@ -350,100 +390,50 @@ int swsusp_write(void)
350 struct swsusp_info *header; 390 struct swsusp_info *header;
351 int error; 391 int error;
352 392
353 if ((error = swsusp_swap_check())) { 393 error = swsusp_swap_check();
394 if (error) {
354 printk(KERN_ERR "swsusp: Cannot find swap device, try " 395 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n"); 396 "swapon -a.\n");
356 return error; 397 return error;
357 } 398 }
358 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 399 memset(&snapshot, 0, sizeof(struct snapshot_handle));
359 error = snapshot_read_next(&snapshot, PAGE_SIZE); 400 error = snapshot_read_next(&snapshot, PAGE_SIZE);
360 if (error < PAGE_SIZE) 401 if (error < PAGE_SIZE) {
361 return error < 0 ? error : -EFAULT; 402 if (error >= 0)
403 error = -EFAULT;
404
405 goto out;
406 }
362 header = (struct swsusp_info *)data_of(snapshot); 407 header = (struct swsusp_info *)data_of(snapshot);
363 if (!enough_swap(header->pages)) { 408 if (!enough_swap(header->pages)) {
364 printk(KERN_ERR "swsusp: Not enough free swap\n"); 409 printk(KERN_ERR "swsusp: Not enough free swap\n");
365 return -ENOSPC; 410 error = -ENOSPC;
411 goto out;
366 } 412 }
367 error = get_swap_writer(&handle); 413 error = get_swap_writer(&handle);
368 if (!error) { 414 if (!error) {
369 unsigned long start = handle.cur_swap; 415 sector_t start = handle.cur_swap;
416
370 error = swap_write_page(&handle, header, NULL); 417 error = swap_write_page(&handle, header, NULL);
371 if (!error) 418 if (!error)
372 error = save_image(&handle, &snapshot, 419 error = save_image(&handle, &snapshot,
373 header->pages - 1); 420 header->pages - 1);
421
374 if (!error) { 422 if (!error) {
375 flush_swap_writer(&handle); 423 flush_swap_writer(&handle);
376 printk("S"); 424 printk("S");
377 error = mark_swapfiles(swp_entry(root_swap, start)); 425 error = mark_swapfiles(start);
378 printk("|\n"); 426 printk("|\n");
379 } 427 }
380 } 428 }
381 if (error) 429 if (error)
382 free_all_swap_pages(root_swap, handle.bitmap); 430 free_all_swap_pages(root_swap, handle.bitmap);
383 release_swap_writer(&handle); 431 release_swap_writer(&handle);
432 out:
433 swsusp_close();
384 return error; 434 return error;
385} 435}
386 436
387static struct block_device *resume_bdev;
388
389/**
390 * submit - submit BIO request.
391 * @rw: READ or WRITE.
392 * @off physical offset of page.
393 * @page: page we're reading or writing.
394 * @bio_chain: list of pending biod (for async reading)
395 *
396 * Straight from the textbook - allocate and initialize the bio.
397 * If we're reading, make sure the page is marked as dirty.
398 * Then submit it and, if @bio_chain == NULL, wait.
399 */
400static int submit(int rw, pgoff_t page_off, struct page *page,
401 struct bio **bio_chain)
402{
403 struct bio *bio;
404
405 bio = bio_alloc(GFP_ATOMIC, 1);
406 if (!bio)
407 return -ENOMEM;
408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
409 bio->bi_bdev = resume_bdev;
410 bio->bi_end_io = end_swap_bio_read;
411
412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
414 bio_put(bio);
415 return -EFAULT;
416 }
417
418 lock_page(page);
419 bio_get(bio);
420
421 if (bio_chain == NULL) {
422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
423 wait_on_page_locked(page);
424 if (rw == READ)
425 bio_set_pages_dirty(bio);
426 bio_put(bio);
427 } else {
428 if (rw == READ)
429 get_page(page); /* These pages are freed later */
430 bio->bi_private = *bio_chain;
431 *bio_chain = bio;
432 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
433 }
434 return 0;
435}
436
437static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
438{
439 return submit(READ, page_off, virt_to_page(addr), bio_chain);
440}
441
442static int bio_write_page(pgoff_t page_off, void *addr)
443{
444 return submit(WRITE, page_off, virt_to_page(addr), NULL);
445}
446
447/** 437/**
448 * The following functions allow us to read data using a swap map 438 * The following functions allow us to read data using a swap map
449 * in a file-alike way 439 * in a file-alike way
@@ -456,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
456 handle->cur = NULL; 446 handle->cur = NULL;
457} 447}
458 448
459static int get_swap_reader(struct swap_map_handle *handle, 449static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
460 swp_entry_t start)
461{ 450{
462 int error; 451 int error;
463 452
464 if (!swp_offset(start)) 453 if (!start)
465 return -EINVAL; 454 return -EINVAL;
466 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 455
456 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
467 if (!handle->cur) 457 if (!handle->cur)
468 return -ENOMEM; 458 return -ENOMEM;
469 error = bio_read_page(swp_offset(start), handle->cur, NULL); 459
460 error = bio_read_page(start, handle->cur, NULL);
470 if (error) { 461 if (error) {
471 release_swap_reader(handle); 462 release_swap_reader(handle);
472 return error; 463 return error;
@@ -478,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
478static int swap_read_page(struct swap_map_handle *handle, void *buf, 469static int swap_read_page(struct swap_map_handle *handle, void *buf,
479 struct bio **bio_chain) 470 struct bio **bio_chain)
480{ 471{
481 unsigned long offset; 472 sector_t offset;
482 int error; 473 int error;
483 474
484 if (!handle->cur) 475 if (!handle->cur)
@@ -547,11 +538,11 @@ static int load_image(struct swap_map_handle *handle,
547 error = err2; 538 error = err2;
548 if (!error) { 539 if (!error) {
549 printk("\b\b\b\bdone\n"); 540 printk("\b\b\b\bdone\n");
550 snapshot_free_unused_memory(snapshot); 541 snapshot_write_finalize(snapshot);
551 if (!snapshot_image_loaded(snapshot)) 542 if (!snapshot_image_loaded(snapshot))
552 error = -ENODATA; 543 error = -ENODATA;
553 } 544 }
554 show_speed(&start, &stop, nr_to_read, "Read"); 545 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
555 return error; 546 return error;
556} 547}
557 548
@@ -600,12 +591,16 @@ int swsusp_check(void)
600 if (!IS_ERR(resume_bdev)) { 591 if (!IS_ERR(resume_bdev)) {
601 set_blocksize(resume_bdev, PAGE_SIZE); 592 set_blocksize(resume_bdev, PAGE_SIZE);
602 memset(&swsusp_header, 0, sizeof(swsusp_header)); 593 memset(&swsusp_header, 0, sizeof(swsusp_header));
603 if ((error = bio_read_page(0, &swsusp_header, NULL))) 594 error = bio_read_page(swsusp_resume_block,
595 &swsusp_header, NULL);
596 if (error)
604 return error; 597 return error;
598
605 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 599 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
606 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 600 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
607 /* Reset swap signature now */ 601 /* Reset swap signature now */
608 error = bio_write_page(0, &swsusp_header); 602 error = bio_write_page(swsusp_resume_block,
603 &swsusp_header, NULL);
609 } else { 604 } else {
610 return -EINVAL; 605 return -EINVAL;
611 } 606 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc516..31aa0390c777 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
49#include <linux/bootmem.h> 49#include <linux/bootmem.h>
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h>
52 53
53#include "power.h" 54#include "power.h"
54 55
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0;
64 65
65#ifdef CONFIG_HIGHMEM 66#ifdef CONFIG_HIGHMEM
66unsigned int count_highmem_pages(void); 67unsigned int count_highmem_pages(void);
67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static inline int save_highmem(void) { return 0; }
71static inline int restore_highmem(void) { return 0; } 70static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; } 71static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 72#endif
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
134 return 0; 133 return 0;
135} 134}
136 135
137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) 136sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
138{ 137{
139 unsigned long offset; 138 unsigned long offset;
140 139
141 offset = swp_offset(get_swap_page_of_type(swap)); 140 offset = swp_offset(get_swap_page_of_type(swap));
142 if (offset) { 141 if (offset) {
143 if (bitmap_set(bitmap, offset)) { 142 if (bitmap_set(bitmap, offset))
144 swap_free(swp_entry(swap, offset)); 143 swap_free(swp_entry(swap, offset));
145 offset = 0; 144 else
146 } 145 return swapdev_block(swap, offset);
147 } 146 }
148 return offset; 147 return 0;
149} 148}
150 149
151void free_all_swap_pages(int swap, struct bitmap_page *bitmap) 150void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
166} 165}
167 166
168/** 167/**
168 * swsusp_show_speed - print the time elapsed between two events represented by
169 * @start and @stop
170 *
171 * @nr_pages - number of pages processed between @start and @stop
172 * @msg - introductory message to print
173 */
174
175void swsusp_show_speed(struct timeval *start, struct timeval *stop,
176 unsigned nr_pages, char *msg)
177{
178 s64 elapsed_centisecs64;
179 int centisecs;
180 int k;
181 int kps;
182
183 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
184 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
185 centisecs = elapsed_centisecs64;
186 if (centisecs == 0)
187 centisecs = 1; /* avoid div-by-zero */
188 k = nr_pages * (PAGE_SIZE / 1024);
189 kps = (k * 100) / centisecs;
190 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
191 centisecs / 100, centisecs % 100,
192 kps / 1000, (kps % 1000) / 10);
193}
194
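
swsusp_show_speed() reduces the elapsed time to centiseconds and derives KB/s from it; for example, 50,000 pages in 2.5 s is 200,000 KB at 80,000 KB/s, printed as 80.00 MB/s. The same arithmetic reproduced standalone with those hypothetical numbers:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE    4096
#define NSEC_PER_SEC 1000000000LL

int main(void)
{
    int64_t elapsed_ns = 2500000000LL;              /* 2.5 s for 50000 pages */
    unsigned nr_pages = 50000;

    int centisecs = elapsed_ns / (NSEC_PER_SEC / 100);
    if (centisecs == 0)
        centisecs = 1;                              /* avoid div-by-zero */
    int k = nr_pages * (PAGE_SIZE / 1024);          /* kilobytes moved */
    int kps = (k * 100) / centisecs;                /* KB per second */

    printf("Wrote %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
           k, centisecs / 100, centisecs % 100,
           kps / 1000, (kps % 1000) / 10);
    return 0;
}
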
195/**
169 * swsusp_shrink_memory - Try to free as much memory as needed 196 * swsusp_shrink_memory - Try to free as much memory as needed
170 * 197 *
171 * ... but do not OOM-kill anyone 198 * ... but do not OOM-kill anyone
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp)
184 211
185int swsusp_shrink_memory(void) 212int swsusp_shrink_memory(void)
186{ 213{
187 long size, tmp; 214 long tmp;
188 struct zone *zone; 215 struct zone *zone;
189 unsigned long pages = 0; 216 unsigned long pages = 0;
190 unsigned int i = 0; 217 unsigned int i = 0;
191 char *p = "-\\|/"; 218 char *p = "-\\|/";
219 struct timeval start, stop;
192 220
193 printk("Shrinking memory... "); 221 printk("Shrinking memory... ");
222 do_gettimeofday(&start);
194 do { 223 do {
195 size = 2 * count_highmem_pages(); 224 long size, highmem_size;
196 size += size / 50 + count_data_pages() + PAGES_FOR_IO; 225
226 highmem_size = count_highmem_pages();
227 size = count_data_pages() + PAGES_FOR_IO;
197 tmp = size; 228 tmp = size;
229 size += highmem_size;
198 for_each_zone (zone) 230 for_each_zone (zone)
199 if (!is_highmem(zone) && populated_zone(zone)) { 231 if (populated_zone(zone)) {
200 tmp -= zone->free_pages; 232 if (is_highmem(zone)) {
201 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 233 highmem_size -= zone->free_pages;
202 tmp += snapshot_additional_pages(zone); 234 } else {
235 tmp -= zone->free_pages;
236 tmp += zone->lowmem_reserve[ZONE_NORMAL];
237 tmp += snapshot_additional_pages(zone);
238 }
203 } 239 }
240
241 if (highmem_size < 0)
242 highmem_size = 0;
243
244 tmp += highmem_size;
204 if (tmp > 0) { 245 if (tmp > 0) {
205 tmp = __shrink_memory(tmp); 246 tmp = __shrink_memory(tmp);
206 if (!tmp) 247 if (!tmp)
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void)
212 } 253 }
213 printk("\b%c", p[i++%4]); 254 printk("\b%c", p[i++%4]);
214 } while (tmp > 0); 255 } while (tmp > 0);
256 do_gettimeofday(&stop);
215 printk("\bdone (%lu pages freed)\n", pages); 257 printk("\bdone (%lu pages freed)\n", pages);
258 swsusp_show_speed(&start, &stop, pages, "Freed");
216 259
217 return 0; 260 return 0;
218} 261}
@@ -223,6 +266,7 @@ int swsusp_suspend(void)
223 266
224 if ((error = arch_prepare_suspend())) 267 if ((error = arch_prepare_suspend()))
225 return error; 268 return error;
269
226 local_irq_disable(); 270 local_irq_disable();
227 /* At this point, device_suspend() has been called, but *not* 271 /* At this point, device_suspend() has been called, but *not*
228 * device_power_down(). We *must* device_power_down() now. 272 * device_power_down(). We *must* device_power_down() now.
@@ -235,23 +279,16 @@ int swsusp_suspend(void)
235 goto Enable_irqs; 279 goto Enable_irqs;
236 } 280 }
237 281
238 if ((error = save_highmem())) {
239 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
240 goto Restore_highmem;
241 }
242
243 save_processor_state(); 282 save_processor_state();
244 if ((error = swsusp_arch_suspend())) 283 if ((error = swsusp_arch_suspend()))
245 printk(KERN_ERR "Error %d suspending\n", error); 284 printk(KERN_ERR "Error %d suspending\n", error);
246 /* Restore control flow magically appears here */ 285 /* Restore control flow magically appears here */
247 restore_processor_state(); 286 restore_processor_state();
248Restore_highmem:
249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices 287 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup. 288 * that suspended with irqs off ... no overall powerup.
252 */ 289 */
253 device_power_up(); 290 device_power_up();
254Enable_irqs: 291 Enable_irqs:
255 local_irq_enable(); 292 local_irq_enable();
256 return error; 293 return error;
257} 294}
@@ -268,18 +305,23 @@ int swsusp_resume(void)
268 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 305 printk(KERN_ERR "Some devices failed to power down, very bad\n");
269 /* We'll ignore saved state, but this gets preempt count (etc) right */ 306 /* We'll ignore saved state, but this gets preempt count (etc) right */
270 save_processor_state(); 307 save_processor_state();
271 error = swsusp_arch_resume(); 308 error = restore_highmem();
272 /* Code below is only ever reached in case of failure. Otherwise 309 if (!error) {
273 * execution continues at place where swsusp_arch_suspend was called 310 error = swsusp_arch_resume();
274 */ 311 /* The code below is only ever reached in case of a failure.
275 BUG_ON(!error); 312 * Otherwise execution continues at place where
313 * swsusp_arch_suspend() was called
314 */
315 BUG_ON(!error);
316 /* This call to restore_highmem() undos the previous one */
317 restore_highmem();
318 }
276 /* The only reason why swsusp_arch_resume() can fail is memory being 319 /* The only reason why swsusp_arch_resume() can fail is memory being
277 * very tight, so we have to free it as soon as we can to avoid 320 * very tight, so we have to free it as soon as we can to avoid
278 * subsequent failures 321 * subsequent failures
279 */ 322 */
280 swsusp_free(); 323 swsusp_free();
281 restore_processor_state(); 324 restore_processor_state();
282 restore_highmem();
283 touch_softlockup_watchdog(); 325 touch_softlockup_watchdog();
284 device_power_up(); 326 device_power_up();
285 local_irq_enable(); 327 local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a4..89443b85163b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/device.h> 16#include <linux/device.h>
16#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -21,6 +22,7 @@
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/console.h> 23#include <linux/console.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h>
24 26
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
26 28
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
54 filp->private_data = data; 56 filp->private_data = data;
55 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 57 memset(&data->handle, 0, sizeof(struct snapshot_handle));
56 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 58 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
57 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; 59 data->swap = swsusp_resume_device ?
60 swap_type_of(swsusp_resume_device, 0) : -1;
58 data->mode = O_RDONLY; 61 data->mode = O_RDONLY;
59 } else { 62 } else {
60 data->swap = -1; 63 data->swap = -1;
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
76 free_all_swap_pages(data->swap, data->bitmap); 79 free_all_swap_pages(data->swap, data->bitmap);
77 free_bitmap(data->bitmap); 80 free_bitmap(data->bitmap);
78 if (data->frozen) { 81 if (data->frozen) {
79 down(&pm_sem); 82 mutex_lock(&pm_mutex);
80 thaw_processes(); 83 thaw_processes();
81 enable_nonboot_cpus(); 84 enable_nonboot_cpus();
82 up(&pm_sem); 85 mutex_unlock(&pm_mutex);
83 } 86 }
84 atomic_inc(&device_available); 87 atomic_inc(&device_available);
85 return 0; 88 return 0;
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
124{ 127{
125 int error = 0; 128 int error = 0;
126 struct snapshot_data *data; 129 struct snapshot_data *data;
127 loff_t offset, avail; 130 loff_t avail;
131 sector_t offset;
128 132
129 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) 133 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
130 return -ENOTTY; 134 return -ENOTTY;
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
140 case SNAPSHOT_FREEZE: 144 case SNAPSHOT_FREEZE:
141 if (data->frozen) 145 if (data->frozen)
142 break; 146 break;
143 down(&pm_sem); 147 mutex_lock(&pm_mutex);
144 error = disable_nonboot_cpus(); 148 error = disable_nonboot_cpus();
145 if (!error) { 149 if (!error) {
146 error = freeze_processes(); 150 error = freeze_processes();
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
150 error = -EBUSY; 154 error = -EBUSY;
151 } 155 }
152 } 156 }
153 up(&pm_sem); 157 mutex_unlock(&pm_mutex);
154 if (!error) 158 if (!error)
155 data->frozen = 1; 159 data->frozen = 1;
156 break; 160 break;
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
158 case SNAPSHOT_UNFREEZE: 162 case SNAPSHOT_UNFREEZE:
159 if (!data->frozen) 163 if (!data->frozen)
160 break; 164 break;
161 down(&pm_sem); 165 mutex_lock(&pm_mutex);
162 thaw_processes(); 166 thaw_processes();
163 enable_nonboot_cpus(); 167 enable_nonboot_cpus();
164 up(&pm_sem); 168 mutex_unlock(&pm_mutex);
165 data->frozen = 0; 169 data->frozen = 0;
166 break; 170 break;
167 171
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
170 error = -EPERM; 174 error = -EPERM;
171 break; 175 break;
172 } 176 }
173 down(&pm_sem); 177 mutex_lock(&pm_mutex);
174 /* Free memory before shutting down devices. */ 178 /* Free memory before shutting down devices. */
175 error = swsusp_shrink_memory(); 179 error = swsusp_shrink_memory();
176 if (!error) { 180 if (!error) {
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
183 } 187 }
184 resume_console(); 188 resume_console();
185 } 189 }
186 up(&pm_sem); 190 mutex_unlock(&pm_mutex);
187 if (!error) 191 if (!error)
188 error = put_user(in_suspend, (unsigned int __user *)arg); 192 error = put_user(in_suspend, (unsigned int __user *)arg);
189 if (!error) 193 if (!error)
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
191 break; 195 break;
192 196
193 case SNAPSHOT_ATOMIC_RESTORE: 197 case SNAPSHOT_ATOMIC_RESTORE:
198 snapshot_write_finalize(&data->handle);
194 if (data->mode != O_WRONLY || !data->frozen || 199 if (data->mode != O_WRONLY || !data->frozen ||
195 !snapshot_image_loaded(&data->handle)) { 200 !snapshot_image_loaded(&data->handle)) {
196 error = -EPERM; 201 error = -EPERM;
197 break; 202 break;
198 } 203 }
199 snapshot_free_unused_memory(&data->handle); 204 mutex_lock(&pm_mutex);
200 down(&pm_sem);
201 pm_prepare_console(); 205 pm_prepare_console();
202 suspend_console(); 206 suspend_console();
203 error = device_suspend(PMSG_PRETHAW); 207 error = device_suspend(PMSG_PRETHAW);
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
207 } 211 }
208 resume_console(); 212 resume_console();
209 pm_restore_console(); 213 pm_restore_console();
210 up(&pm_sem); 214 mutex_unlock(&pm_mutex);
211 break; 215 break;
212 216
213 case SNAPSHOT_FREE: 217 case SNAPSHOT_FREE:
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
238 break; 242 break;
239 } 243 }
240 } 244 }
241 offset = alloc_swap_page(data->swap, data->bitmap); 245 offset = alloc_swapdev_block(data->swap, data->bitmap);
242 if (offset) { 246 if (offset) {
243 offset <<= PAGE_SHIFT; 247 offset <<= PAGE_SHIFT;
244 error = put_user(offset, (loff_t __user *)arg); 248 error = put_user(offset, (sector_t __user *)arg);
245 } else { 249 } else {
246 error = -ENOSPC; 250 error = -ENOSPC;
247 } 251 }
@@ -264,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
264 * so we need to recode them 268 * so we need to recode them
265 */ 269 */
266 if (old_decode_dev(arg)) { 270 if (old_decode_dev(arg)) {
267 data->swap = swap_type_of(old_decode_dev(arg)); 271 data->swap = swap_type_of(old_decode_dev(arg), 0);
268 if (data->swap < 0) 272 if (data->swap < 0)
269 error = -ENODEV; 273 error = -ENODEV;
270 } else { 274 } else {
@@ -282,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
282 break; 286 break;
283 } 287 }
284 288
285 if (down_trylock(&pm_sem)) { 289 if (!mutex_trylock(&pm_mutex)) {
286 error = -EBUSY; 290 error = -EBUSY;
287 break; 291 break;
288 } 292 }

@@ -309,8 +313,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
309 if (pm_ops->finish) 313 if (pm_ops->finish)
310 pm_ops->finish(PM_SUSPEND_MEM); 314 pm_ops->finish(PM_SUSPEND_MEM);
311 315
312OutS3: 316 OutS3:
313 up(&pm_sem); 317 mutex_unlock(&pm_mutex);
318 break;
319
320 case SNAPSHOT_PMOPS:
321 switch (arg) {
322
323 case PMOPS_PREPARE:
324 if (pm_ops->prepare) {
325 error = pm_ops->prepare(PM_SUSPEND_DISK);
326 }
327 break;
328
329 case PMOPS_ENTER:
330 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
331 error = pm_ops->enter(PM_SUSPEND_DISK);
332 break;
333
334 case PMOPS_FINISH:
335 if (pm_ops && pm_ops->finish) {
336 pm_ops->finish(PM_SUSPEND_DISK);
337 }
338 break;
339
340 default:
341 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
342 error = -EINVAL;
343
344 }
345 break;
346
347 case SNAPSHOT_SET_SWAP_AREA:
348 if (data->bitmap) {
349 error = -EPERM;
350 } else {
351 struct resume_swap_area swap_area;
352 dev_t swdev;
353
354 error = copy_from_user(&swap_area, (void __user *)arg,
355 sizeof(struct resume_swap_area));
356 if (error) {
357 error = -EFAULT;
358 break;
359 }
360
361 /*
362 * User space encodes device types as two-byte values,
363 * so we need to recode them
364 */
365 swdev = old_decode_dev(swap_area.dev);
366 if (swdev) {
367 offset = swap_area.offset;
368 data->swap = swap_type_of(swdev, offset);
369 if (data->swap < 0)
370 error = -ENODEV;
371 } else {
372 data->swap = -1;
373 error = -EINVAL;
374 }
375 }
314 break; 376 break;
315 377
316 default: 378 default:
@@ -321,7 +383,7 @@ OutS3:
321 return error; 383 return error;
322} 384}
323 385
324static struct file_operations snapshot_fops = { 386static const struct file_operations snapshot_fops = {
325 .open = snapshot_open, 387 .open = snapshot_open,
326 .release = snapshot_release, 388 .release = snapshot_release,
327 .read = snapshot_read, 389 .read = snapshot_read,
diff --git a/kernel/printk.c b/kernel/printk.c
index f7d427ef5038..185bb45eacf7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/bootmem.h> 32#include <linux/bootmem.h>
33#include <linux/syscalls.h> 33#include <linux/syscalls.h>
34#include <linux/jiffies.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
@@ -52,8 +53,6 @@ int console_printk[4] = {
52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
53}; 54};
54 55
55EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
56
57/* 56/*
58 * Low lever drivers may need that to know if they can schedule in 57 * Low lever drivers may need that to know if they can schedule in
59 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
@@ -334,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
334 } 333 }
335} 334}
336 335
336static int __read_mostly ignore_loglevel;
337
338int __init ignore_loglevel_setup(char *str)
339{
340 ignore_loglevel = 1;
341 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
342
343 return 1;
344}
345
346__setup("ignore_loglevel", ignore_loglevel_setup);
347
337/* 348/*
338 * Write out chars from start to end - 1 inclusive 349 * Write out chars from start to end - 1 inclusive
339 */ 350 */
340static void _call_console_drivers(unsigned long start, 351static void _call_console_drivers(unsigned long start,
341 unsigned long end, int msg_log_level) 352 unsigned long end, int msg_log_level)
342{ 353{
343 if (msg_log_level < console_loglevel && 354 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
344 console_drivers && start != end) { 355 console_drivers && start != end) {
345 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 356 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
346 /* wrapped write */ 357 /* wrapped write */
@@ -630,12 +641,7 @@ EXPORT_SYMBOL(vprintk);
630 641
631asmlinkage long sys_syslog(int type, char __user *buf, int len) 642asmlinkage long sys_syslog(int type, char __user *buf, int len)
632{ 643{
633 return 0; 644 return -ENOSYS;
634}
635
636int do_syslog(int type, char __user *buf, int len)
637{
638 return 0;
639} 645}
640 646
641static void call_console_drivers(unsigned long start, unsigned long end) 647static void call_console_drivers(unsigned long start, unsigned long end)
@@ -776,7 +782,6 @@ int is_console_locked(void)
776{ 782{
777 return console_locked; 783 return console_locked;
778} 784}
779EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
780 785
781/** 786/**
782 * release_console_sem - unlock the console system 787 * release_console_sem - unlock the console system
@@ -1101,3 +1106,23 @@ int printk_ratelimit(void)
1101 printk_ratelimit_burst); 1106 printk_ratelimit_burst);
1102} 1107}
1103EXPORT_SYMBOL(printk_ratelimit); 1108EXPORT_SYMBOL(printk_ratelimit);
1109
1110/**
1111 * printk_timed_ratelimit - caller-controlled printk ratelimiting
1112 * @caller_jiffies: pointer to caller's state
1113 * @interval_msecs: minimum interval between prints
1114 *
1115 * printk_timed_ratelimit() returns true if more than @interval_msecs
1116 * milliseconds have elapsed since the last time printk_timed_ratelimit()
1117 * returned true.
1118 */
1119bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1120 unsigned int interval_msecs)
1121{
1122 if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
1123 *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
1124 return true;
1125 }
1126 return false;
1127}
1128EXPORT_SYMBOL(printk_timed_ratelimit);
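
printk_timed_ratelimit() keeps its state in a caller-supplied jiffies variable: it returns true, and arms the next deadline, only when the stored deadline has passed or is still unset. A userspace sketch of the same pattern, with clock_gettime() standing in for jiffies:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Return true at most once per interval_ms; *deadline_ms is caller-owned
 * state initialised to 0, like @caller_jiffies in the kernel helper. */
static bool timed_ratelimit(long long *deadline_ms, unsigned int interval_ms)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    long long now_ms = ts.tv_sec * 1000LL + ts.tv_nsec / 1000000LL;

    if (*deadline_ms == 0 || now_ms > *deadline_ms) {
        *deadline_ms = now_ms + interval_ms;
        return true;
    }
    return false;
}

int main(void)
{
    long long state = 0;
    struct timespec nap = { 0, 100 * 1000 * 1000 };     /* 100 ms */

    for (int i = 0; i < 20; i++) {
        if (timed_ratelimit(&state, 500))
            printf("tick at iteration %d\n", i);        /* ~every 500 ms */
        nanosleep(&nap, NULL);
    }
    return 0;
}
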
diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec9..fb5e03d57e9d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
40 40
41static atomic_t *prof_buffer; 41static atomic_t *prof_buffer;
42static unsigned long prof_len, prof_shift; 42static unsigned long prof_len, prof_shift;
43static int prof_on __read_mostly; 43int prof_on __read_mostly;
44static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 44static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
51static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
52{ 52{
53 static char __initdata schedstr[] = "schedule"; 53 static char __initdata schedstr[] = "schedule";
54 static char __initdata sleepstr[] = "sleep";
54 int par; 55 int par;
55 56
56 if (!strncmp(str, schedstr, strlen(schedstr))) { 57 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
58 prof_on = SLEEP_PROFILING;
59 if (str[strlen(sleepstr)] == ',')
60 str += strlen(sleepstr) + 1;
61 if (get_option(&str, &par))
62 prof_shift = par;
63 printk(KERN_INFO
64 "kernel sleep profiling enabled (shift: %ld)\n",
65 prof_shift);
 66 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
57 prof_on = SCHED_PROFILING; 67 prof_on = SCHED_PROFILING;
58 if (str[strlen(schedstr)] == ',') 68 if (str[strlen(schedstr)] == ',')
59 str += strlen(schedstr) + 1; 69 str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
204 * positions to which hits are accounted during short intervals (e.g. 214 * positions to which hits are accounted during short intervals (e.g.
205 * several seconds) is usually very small. Exclusion from buffer 215 * several seconds) is usually very small. Exclusion from buffer
206 * flipping is provided by interrupt disablement (note that for 216 * flipping is provided by interrupt disablement (note that for
207 * SCHED_PROFILING profile_hit() may be called from process context). 217 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
218 * process context).
208 * The hash function is meant to be lightweight as opposed to strong, 219 * The hash function is meant to be lightweight as opposed to strong,
209 * and was vaguely inspired by ppc64 firmware-supported inverted 220 * and was vaguely inspired by ppc64 firmware-supported inverted
210 * pagetable hash functions, but uses a full hashtable full of finite 221 * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
257 mutex_unlock(&profile_flip_mutex); 268 mutex_unlock(&profile_flip_mutex);
258} 269}
259 270
260void profile_hit(int type, void *__pc) 271void profile_hits(int type, void *__pc, unsigned int nr_hits)
261{ 272{
262 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 273 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
263 int i, j, cpu; 274 int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
274 put_cpu(); 285 put_cpu();
275 return; 286 return;
276 } 287 }
288 /*
289 * We buffer the global profiler buffer into a per-CPU
290 * queue and thus reduce the number of global (and possibly
291 * NUMA-alien) accesses. The write-queue is self-coalescing:
292 */
277 local_irq_save(flags); 293 local_irq_save(flags);
278 do { 294 do {
279 for (j = 0; j < PROFILE_GRPSZ; ++j) { 295 for (j = 0; j < PROFILE_GRPSZ; ++j) {
280 if (hits[i + j].pc == pc) { 296 if (hits[i + j].pc == pc) {
281 hits[i + j].hits++; 297 hits[i + j].hits += nr_hits;
282 goto out; 298 goto out;
283 } else if (!hits[i + j].hits) { 299 } else if (!hits[i + j].hits) {
284 hits[i + j].pc = pc; 300 hits[i + j].pc = pc;
285 hits[i + j].hits = 1; 301 hits[i + j].hits = nr_hits;
286 goto out; 302 goto out;
287 } 303 }
288 } 304 }
289 i = (i + secondary) & (NR_PROFILE_HIT - 1); 305 i = (i + secondary) & (NR_PROFILE_HIT - 1);
290 } while (i != primary); 306 } while (i != primary);
291 atomic_inc(&prof_buffer[pc]); 307
308 /*
309 * Add the current hit(s) and flush the write-queue out
310 * to the global buffer:
311 */
312 atomic_add(nr_hits, &prof_buffer[pc]);
292 for (i = 0; i < NR_PROFILE_HIT; ++i) { 313 for (i = 0; i < NR_PROFILE_HIT; ++i) {
293 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); 314 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
294 hits[i].pc = hits[i].hits = 0; 315 hits[i].pc = hits[i].hits = 0;
@@ -298,7 +319,6 @@ out:
298 put_cpu(); 319 put_cpu();
299} 320}
300 321
301#ifdef CONFIG_HOTPLUG_CPU
302static int __devinit profile_cpu_callback(struct notifier_block *info, 322static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 323 unsigned long action, void *__cpu)
304{ 324{
@@ -351,19 +371,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
351 } 371 }
352 return NOTIFY_OK; 372 return NOTIFY_OK;
353} 373}
354#endif /* CONFIG_HOTPLUG_CPU */
355#else /* !CONFIG_SMP */ 374#else /* !CONFIG_SMP */
356#define profile_flip_buffers() do { } while (0) 375#define profile_flip_buffers() do { } while (0)
357#define profile_discard_flip_buffers() do { } while (0) 376#define profile_discard_flip_buffers() do { } while (0)
377#define profile_cpu_callback NULL
358 378
359void profile_hit(int type, void *__pc) 379void profile_hits(int type, void *__pc, unsigned int nr_hits)
360{ 380{
361 unsigned long pc; 381 unsigned long pc;
362 382
363 if (prof_on != type || !prof_buffer) 383 if (prof_on != type || !prof_buffer)
364 return; 384 return;
365 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 385 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
366 atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); 386 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
367} 387}
368#endif /* !CONFIG_SMP */ 388#endif /* !CONFIG_SMP */
369 389
@@ -442,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
442 read = 0; 462 read = 0;
443 463
444 while (p < sizeof(unsigned int) && count > 0) { 464 while (p < sizeof(unsigned int) && count > 0) {
445 put_user(*((char *)(&sample_step)+p),buf); 465 if (put_user(*((char *)(&sample_step)+p),buf))
466 return -EFAULT;
446 buf++; p++; count--; read++; 467 buf++; p++; count--; read++;
447 } 468 }
448 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 469 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
@@ -480,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
480 return count; 501 return count;
481} 502}
482 503
483static struct file_operations proc_profile_operations = { 504static const struct file_operations proc_profile_operations = {
484 .read = read_profile, 505 .read = read_profile,
485 .write = write_profile, 506 .write = write_profile,
486}; 507};
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef1..3554b76da84c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
235 235
236 list = rdp->donelist; 236 list = rdp->donelist;
237 while (list) { 237 while (list) {
238 next = rdp->donelist = list->next; 238 next = list->next;
239 prefetch(next);
239 list->func(list); 240 list->func(list);
240 list = next; 241 list = next;
241 if (++count >= rdp->blimit) 242 if (++count >= rdp->blimit)
242 break; 243 break;
243 } 244 }
245 rdp->donelist = list;
244 246
245 local_irq_disable(); 247 local_irq_disable();
246 rdp->qlen -= count; 248 rdp->qlen -= count;
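
The rcu_do_batch() change above does two things: it prefetches the next callback before invoking the current one, and it writes rdp->donelist back once after the loop rather than on every iteration. The same loop shape in a standalone sketch, with a hypothetical callback list and __builtin_prefetch standing in for the kernel's prefetch():

#include <stdio.h>
#include <stdlib.h>

struct cb {
    struct cb *next;
    void (*func)(struct cb *);
};

static void run_batch(struct cb **head, int limit)
{
    struct cb *list = *head;
    int count = 0;

    while (list) {
        struct cb *next = list->next;

        __builtin_prefetch(next);   /* warm the cache for the next node */
        list->func(list);           /* may free 'list'; 'next' is saved */
        list = next;
        if (++count >= limit)
            break;
    }
    *head = list;                   /* single store, as in the patch */
}

static void free_cb(struct cb *c)
{
    free(c);
}

int main(void)
{
    struct cb *head = NULL;

    for (int i = 0; i < 8; i++) {
        struct cb *c = malloc(sizeof(*c));
        c->func = free_cb;
        c->next = head;
        head = c;
    }
    run_batch(&head, 4);            /* leaves 4 callbacks pending */
    printf("pending after batch: %s\n", head ? "yes" : "no");
    return 0;
}
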
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f42..c52f981ea008 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
401 cleanup_srcu_struct(&srcu_ctl); 401 cleanup_srcu_struct(&srcu_ctl);
402} 402}
403 403
404static int srcu_torture_read_lock(void) 404static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
405{ 405{
406 return srcu_read_lock(&srcu_ctl); 406 return srcu_read_lock(&srcu_ctl);
407} 407}
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
419 schedule_timeout_interruptible(longdelay); 419 schedule_timeout_interruptible(longdelay);
420} 420}
421 421
422static void srcu_torture_read_unlock(int idx) 422static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
423{ 423{
424 srcu_read_unlock(&srcu_ctl, idx); 424 srcu_read_unlock(&srcu_ctl, idx);
425} 425}
diff --git a/kernel/relay.c b/kernel/relay.c
index f04bbdb56ac2..a4701e7ba7d0 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -138,7 +138,7 @@ depopulate:
138 */ 138 */
139struct rchan_buf *relay_create_buf(struct rchan *chan) 139struct rchan_buf *relay_create_buf(struct rchan *chan)
140{ 140{
141 struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL); 141 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
142 if (!buf) 142 if (!buf)
143 return NULL; 143 return NULL;
144 144
@@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = {
308 * reason waking is deferred is that calling directly from write 308 * reason waking is deferred is that calling directly from write
309 * causes problems if you're writing from say the scheduler. 309 * causes problems if you're writing from say the scheduler.
310 */ 310 */
311static void wakeup_readers(void *private) 311static void wakeup_readers(struct work_struct *work)
312{ 312{
313 struct rchan_buf *buf = private; 313 struct rchan_buf *buf =
314 container_of(work, struct rchan_buf, wake_readers.work);
314 wake_up_interruptible(&buf->read_wait); 315 wake_up_interruptible(&buf->read_wait);
315} 316}
316 317
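
The new wakeup_readers() receives a struct work_struct pointer and recovers its rchan_buf with container_of(), matching the delayed-work conversion in the hunks below (INIT_DELAYED_WORK / PREPARE_DELAYED_WORK). A standalone illustration of that container_of recovery, using toy types rather than the workqueue API:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct work { void (*fn)(struct work *); };

struct rchan_buf_like {
    int id;
    struct work wake_readers;       /* embedded, like the kernel's member */
};

static void wakeup_readers(struct work *w)
{
    struct rchan_buf_like *buf =
        container_of(w, struct rchan_buf_like, wake_readers);
    printf("waking readers of buffer %d\n", buf->id);
}

int main(void)
{
    struct rchan_buf_like buf = { .id = 7 };

    buf.wake_readers.fn = wakeup_readers;
    buf.wake_readers.fn(&buf.wake_readers);     /* what the workqueue would do */
    return 0;
}
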
@@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
328 if (init) { 329 if (init) {
329 init_waitqueue_head(&buf->read_wait); 330 init_waitqueue_head(&buf->read_wait);
330 kref_init(&buf->kref); 331 kref_init(&buf->kref);
331 INIT_WORK(&buf->wake_readers, NULL, NULL); 332 INIT_DELAYED_WORK(&buf->wake_readers, NULL);
332 } else { 333 } else {
333 cancel_delayed_work(&buf->wake_readers); 334 cancel_delayed_work(&buf->wake_readers);
334 flush_scheduled_work(); 335 flush_scheduled_work();
@@ -478,7 +479,7 @@ struct rchan *relay_open(const char *base_filename,
478 if (!(subbuf_size && n_subbufs)) 479 if (!(subbuf_size && n_subbufs))
479 return NULL; 480 return NULL;
480 481
481 chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL); 482 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
482 if (!chan) 483 if (!chan)
483 return NULL; 484 return NULL;
484 485
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
549 buf->padding[old_subbuf]; 550 buf->padding[old_subbuf];
550 smp_mb(); 551 smp_mb();
551 if (waitqueue_active(&buf->read_wait)) { 552 if (waitqueue_active(&buf->read_wait)) {
552 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); 553 PREPARE_DELAYED_WORK(&buf->wake_readers,
554 wakeup_readers);
553 schedule_delayed_work(&buf->wake_readers, 1); 555 schedule_delayed_work(&buf->wake_readers, 1);
554 } 556 }
555 } 557 }
@@ -957,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
957 if (!desc->count) 959 if (!desc->count)
958 return 0; 960 return 0;
959 961
960 mutex_lock(&filp->f_dentry->d_inode->i_mutex); 962 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
961 do { 963 do {
962 if (!relay_file_read_avail(buf, *ppos)) 964 if (!relay_file_read_avail(buf, *ppos))
963 break; 965 break;
@@ -977,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
977 *ppos = relay_file_read_end_pos(buf, read_start, ret); 979 *ppos = relay_file_read_end_pos(buf, read_start, ret);
978 } 980 }
979 } while (desc->count && ret); 981 } while (desc->count && ret);
980 mutex_unlock(&filp->f_dentry->d_inode->i_mutex); 982 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
981 983
982 return desc->written; 984 return desc->written;
983} 985}
@@ -1011,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
1011 actor, &desc); 1013 actor, &desc);
1012} 1014}
1013 1015
1014struct file_operations relay_file_operations = { 1016const struct file_operations relay_file_operations = {
1015 .open = relay_file_open, 1017 .open = relay_file_open,
1016 .poll = relay_file_poll, 1018 .poll = relay_file_poll,
1017 .mmap = relay_file_mmap, 1019 .mmap = relay_file_mmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143e..7b9a497419d9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
88 return 0; 88 return 0;
89} 89}
90 90
91static struct seq_operations resource_op = { 91static const struct seq_operations resource_op = {
92 .start = r_start, 92 .start = r_start,
93 .next = r_next, 93 .next = r_next,
94 .stop = r_stop, 94 .stop = r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
115 return res; 115 return res;
116} 116}
117 117
118static struct file_operations proc_ioports_operations = { 118static const struct file_operations proc_ioports_operations = {
119 .open = ioports_open, 119 .open = ioports_open,
120 .read = seq_read, 120 .read = seq_read,
121 .llseek = seq_lseek, 121 .llseek = seq_lseek,
122 .release = seq_release, 122 .release = seq_release,
123}; 123};
124 124
125static struct file_operations proc_iomem_operations = { 125static const struct file_operations proc_iomem_operations = {
126 .open = iomem_open, 126 .open = iomem_open,
127 .read = seq_read, 127 .read = seq_read,
128 .llseek = seq_lseek, 128 .llseek = seq_lseek,
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c94..015fc633c96c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/sysdev.h> 14#include <linux/sysdev.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h>
16 17
17#include "rtmutex.h" 18#include "rtmutex.h"
18 19
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701c680e..5cd833bc2173 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/profile.h> 36#include <linux/profile.h>
37#include <linux/suspend.h> 37#include <linux/freezer.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
@@ -225,8 +225,10 @@ struct rq {
225 unsigned long nr_uninterruptible; 225 unsigned long nr_uninterruptible;
226 226
227 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
228 unsigned long long timestamp_last_tick; 228 /* Cached timestamp set by update_cpu_clock() */
229 unsigned long long most_recent_timestamp;
229 struct task_struct *curr, *idle; 230 struct task_struct *curr, *idle;
231 unsigned long next_balance;
230 struct mm_struct *prev_mm; 232 struct mm_struct *prev_mm;
231 struct prio_array *active, *expired, arrays[2]; 233 struct prio_array *active, *expired, arrays[2];
232 int best_expired_prio; 234 int best_expired_prio;
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
426 * bump this up when changing the output format or the meaning of an existing 428 * bump this up when changing the output format or the meaning of an existing
427 * format, so that tools can adapt (or abort) 429 * format, so that tools can adapt (or abort)
428 */ 430 */
429#define SCHEDSTAT_VERSION 12 431#define SCHEDSTAT_VERSION 14
430 432
431static int show_schedstat(struct seq_file *seq, void *v) 433static int show_schedstat(struct seq_file *seq, void *v)
432{ 434{
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
464 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 466 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
465 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 467 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
466 itype++) { 468 itype++) {
467 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 469 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
470 "%lu",
468 sd->lb_cnt[itype], 471 sd->lb_cnt[itype],
469 sd->lb_balanced[itype], 472 sd->lb_balanced[itype],
470 sd->lb_failed[itype], 473 sd->lb_failed[itype],
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
474 sd->lb_nobusyq[itype], 477 sd->lb_nobusyq[itype],
475 sd->lb_nobusyg[itype]); 478 sd->lb_nobusyg[itype]);
476 } 479 }
477 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", 480 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
481 " %lu %lu %lu\n",
478 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 482 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
479 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 483 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
480 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 484 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
481 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 485 sd->ttwu_wake_remote, sd->ttwu_move_affine,
486 sd->ttwu_move_balance);
482 } 487 }
483 preempt_enable(); 488 preempt_enable();
484#endif 489#endif
@@ -505,7 +510,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
505 return res; 510 return res;
506} 511}
507 512
508struct file_operations proc_schedstat_operations = { 513const struct file_operations proc_schedstat_operations = {
509 .open = schedstat_open, 514 .open = schedstat_open,
510 .read = seq_read, 515 .read = seq_read,
511 .llseek = seq_lseek, 516 .llseek = seq_lseek,
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
547#endif 552#endif
548 553
549/* 554/*
550 * rq_lock - lock a given runqueue and disable interrupts. 555 * this_rq_lock - lock this runqueue and disable interrupts.
551 */ 556 */
552static inline struct rq *this_rq_lock(void) 557static inline struct rq *this_rq_lock(void)
553 __acquires(rq->lock) 558 __acquires(rq->lock)
@@ -938,18 +943,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
938{ 943{
939 unsigned long long now; 944 unsigned long long now;
940 945
946 if (rt_task(p))
947 goto out;
948
941 now = sched_clock(); 949 now = sched_clock();
942#ifdef CONFIG_SMP 950#ifdef CONFIG_SMP
943 if (!local) { 951 if (!local) {
944 /* Compensate for drifting sched_clock */ 952 /* Compensate for drifting sched_clock */
945 struct rq *this_rq = this_rq(); 953 struct rq *this_rq = this_rq();
946 now = (now - this_rq->timestamp_last_tick) 954 now = (now - this_rq->most_recent_timestamp)
947 + rq->timestamp_last_tick; 955 + rq->most_recent_timestamp;
948 } 956 }
949#endif 957#endif
950 958
951 if (!rt_task(p)) 959 /*
952 p->prio = recalc_task_prio(p, now); 960 * Sleep time is in units of nanosecs, so shift by 20 to get a
961 * milliseconds-range estimation of the amount of time that the task
962 * spent sleeping:
963 */
964 if (unlikely(prof_on == SLEEP_PROFILING)) {
965 if (p->state == TASK_UNINTERRUPTIBLE)
966 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
967 (now - p->timestamp) >> 20);
968 }
969
970 p->prio = recalc_task_prio(p, now);
953 971
954 /* 972 /*
955 * This checks to make sure it's not an uninterruptible task 973 * This checks to make sure it's not an uninterruptible task
@@ -974,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
974 } 992 }
975 } 993 }
976 p->timestamp = now; 994 p->timestamp = now;
977 995out:
978 __activate_task(p, rq); 996 __activate_task(p, rq);
979} 997}
980 998
@@ -1439,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1439 1457
1440 if (this_sd->flags & SD_WAKE_AFFINE) { 1458 if (this_sd->flags & SD_WAKE_AFFINE) {
1441 unsigned long tl = this_load; 1459 unsigned long tl = this_load;
1442 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); 1460 unsigned long tl_per_task;
1461
1462 tl_per_task = cpu_avg_load_per_task(this_cpu);
1443 1463
1444 /* 1464 /*
1445 * If sync wakeup then subtract the (maximum possible) 1465 * If sync wakeup then subtract the (maximum possible)
@@ -1677,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1677 * Not the local CPU - must adjust timestamp. This should 1697 * Not the local CPU - must adjust timestamp. This should
1678 * get optimised away in the !CONFIG_SMP case. 1698 * get optimised away in the !CONFIG_SMP case.
1679 */ 1699 */
1680 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1700 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1681 + rq->timestamp_last_tick; 1701 + rq->most_recent_timestamp;
1682 __activate_task(p, rq); 1702 __activate_task(p, rq);
1683 if (TASK_PREEMPTS_CURR(p, rq)) 1703 if (TASK_PREEMPTS_CURR(p, rq))
1684 resched_task(rq->curr); 1704 resched_task(rq->curr);
@@ -1941,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1941 __acquires(rq1->lock) 1961 __acquires(rq1->lock)
1942 __acquires(rq2->lock) 1962 __acquires(rq2->lock)
1943{ 1963{
1964 BUG_ON(!irqs_disabled());
1944 if (rq1 == rq2) { 1965 if (rq1 == rq2) {
1945 spin_lock(&rq1->lock); 1966 spin_lock(&rq1->lock);
1946 __acquire(rq2->lock); /* Fake it out ;) */ 1967 __acquire(rq2->lock); /* Fake it out ;) */
@@ -1980,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1980 __acquires(busiest->lock) 2001 __acquires(busiest->lock)
1981 __acquires(this_rq->lock) 2002 __acquires(this_rq->lock)
1982{ 2003{
2004 if (unlikely(!irqs_disabled())) {
2005 /* printk() doesn't work good under rq->lock */
2006 spin_unlock(&this_rq->lock);
2007 BUG_ON(1);
2008 }
1983 if (unlikely(!spin_trylock(&busiest->lock))) { 2009 if (unlikely(!spin_trylock(&busiest->lock))) {
1984 if (busiest < this_rq) { 2010 if (busiest < this_rq) {
1985 spin_unlock(&this_rq->lock); 2011 spin_unlock(&this_rq->lock);
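
double_lock_balance() (context above) shows the usual recipe for taking a second runqueue lock while already holding one: try it opportunistically, and if that fails, drop the held lock and reacquire both in a fixed order (lowest lock first) so two CPUs can never deadlock against each other. A pthread sketch of the same ordering rule; it compares lock addresses as the scheduler does, which is good enough in practice though not strictly portable C:

#include <pthread.h>
#include <stdio.h>

/* 'held' is already locked by the caller; acquire 'busiest' too without
 * risking an ABBA deadlock against a thread doing the same in reverse. */
static void double_lock(pthread_mutex_t *held, pthread_mutex_t *busiest)
{
    if (pthread_mutex_trylock(busiest) == 0)
        return;                             /* fast path, no ordering issue */

    if (busiest < held) {
        pthread_mutex_unlock(held);         /* wrong order: back off ... */
        pthread_mutex_lock(busiest);        /* ... and retake lowest first */
        pthread_mutex_lock(held);
    } else {
        pthread_mutex_lock(busiest);
    }
}

int main(void)
{
    pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

    pthread_mutex_lock(&a);
    double_lock(&a, &b);
    printf("both locks held\n");
    pthread_mutex_unlock(&b);
    pthread_mutex_unlock(&a);
    return 0;
}
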
@@ -2050,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2050 set_task_cpu(p, this_cpu); 2076 set_task_cpu(p, this_cpu);
2051 inc_nr_running(p, this_rq); 2077 inc_nr_running(p, this_rq);
2052 enqueue_task(p, this_array); 2078 enqueue_task(p, this_array);
2053 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2079 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2054 + this_rq->timestamp_last_tick; 2080 + this_rq->most_recent_timestamp;
2055 /* 2081 /*
2056 * Note that idle threads have a prio of MAX_PRIO, for this test 2082 * Note that idle threads have a prio of MAX_PRIO, for this test
2057 * to be always true for them. 2083 * to be always true for them.
@@ -2087,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2087 * 2) too many balance attempts have failed. 2113 * 2) too many balance attempts have failed.
2088 */ 2114 */
2089 2115
2090 if (sd->nr_balance_failed > sd->cache_nice_tries) 2116 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2117#ifdef CONFIG_SCHEDSTATS
2118 if (task_hot(p, rq->most_recent_timestamp, sd))
2119 schedstat_inc(sd, lb_hot_gained[idle]);
2120#endif
2091 return 1; 2121 return 1;
2122 }
2092 2123
2093 if (task_hot(p, rq->timestamp_last_tick, sd)) 2124 if (task_hot(p, rq->most_recent_timestamp, sd))
2094 return 0; 2125 return 0;
2095 return 1; 2126 return 1;
2096} 2127}
@@ -2188,11 +2219,6 @@ skip_queue:
2188 goto skip_bitmap; 2219 goto skip_bitmap;
2189 } 2220 }
2190 2221
2191#ifdef CONFIG_SCHEDSTATS
2192 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
2193 schedstat_inc(sd, lb_hot_gained[idle]);
2194#endif
2195
2196 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2222 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2197 pulled++; 2223 pulled++;
2198 rem_load_move -= tmp->load_weight; 2224 rem_load_move -= tmp->load_weight;
@@ -2230,7 +2256,7 @@ out:
2230static struct sched_group * 2256static struct sched_group *
2231find_busiest_group(struct sched_domain *sd, int this_cpu, 2257find_busiest_group(struct sched_domain *sd, int this_cpu,
2232 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2258 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2233 cpumask_t *cpus) 2259 cpumask_t *cpus, int *balance)
2234{ 2260{
2235 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2261 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2236 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2262 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2259,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2259 unsigned long load, group_capacity; 2285 unsigned long load, group_capacity;
2260 int local_group; 2286 int local_group;
2261 int i; 2287 int i;
2288 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2262 unsigned long sum_nr_running, sum_weighted_load; 2289 unsigned long sum_nr_running, sum_weighted_load;
2263 2290
2264 local_group = cpu_isset(this_cpu, group->cpumask); 2291 local_group = cpu_isset(this_cpu, group->cpumask);
2265 2292
2293 if (local_group)
2294 balance_cpu = first_cpu(group->cpumask);
2295
2266 /* Tally up the load of all CPUs in the group */ 2296 /* Tally up the load of all CPUs in the group */
2267 sum_weighted_load = sum_nr_running = avg_load = 0; 2297 sum_weighted_load = sum_nr_running = avg_load = 0;
2268 2298
@@ -2278,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2278 *sd_idle = 0; 2308 *sd_idle = 0;
2279 2309
2280 /* Bias balancing toward cpus of our domain */ 2310 /* Bias balancing toward cpus of our domain */
2281 if (local_group) 2311 if (local_group) {
2312 if (idle_cpu(i) && !first_idle_cpu) {
2313 first_idle_cpu = 1;
2314 balance_cpu = i;
2315 }
2316
2282 load = target_load(i, load_idx); 2317 load = target_load(i, load_idx);
2283 else 2318 } else
2284 load = source_load(i, load_idx); 2319 load = source_load(i, load_idx);
2285 2320
2286 avg_load += load; 2321 avg_load += load;
@@ -2288,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2288 sum_weighted_load += rq->raw_weighted_load; 2323 sum_weighted_load += rq->raw_weighted_load;
2289 } 2324 }
2290 2325
2326 /*
2327 * First idle cpu or the first cpu(busiest) in this sched group
2328 * is eligible for doing load balancing at this and above
2329 * domains.
2330 */
2331 if (local_group && balance_cpu != this_cpu && balance) {
2332 *balance = 0;
2333 goto ret;
2334 }
2335
2291 total_load += avg_load; 2336 total_load += avg_load;
2292 total_pwr += group->cpu_power; 2337 total_pwr += group->cpu_power;
2293 2338
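
The hunk above designates a single CPU per sched group to carry out balancing: the first idle CPU seen while tallying the group, or failing that the first CPU of the group; every other CPU in the local group sets *balance = 0 and backs off. A minimal userspace sketch of the same selection rule (illustrative only, not kernel code; the cpu and idle arrays are made up):

#include <stdio.h>

/* Pick the balancer for a group: first idle CPU if any, else the first CPU. */
static int pick_balance_cpu(const int *group_cpus, const int *idle, int n)
{
	int balance_cpu = group_cpus[0];   /* default: first CPU of the group */
	int first_idle_found = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (idle[group_cpus[i]] && !first_idle_found) {
			first_idle_found = 1;
			balance_cpu = group_cpus[i];
		}
	}
	return balance_cpu;
}

int main(void)
{
	int group_cpus[] = { 4, 5, 6, 7 };              /* hypothetical local group */
	int idle[8]      = { 0, 0, 0, 0, 0, 1, 0, 1 };  /* CPUs 5 and 7 are idle */
	int this_cpu = 6;
	int balancer = pick_balance_cpu(group_cpus, idle, 4);

	if (this_cpu != balancer)
		printf("cpu%d skips balancing, cpu%d is responsible\n",
		       this_cpu, balancer);
	return 0;
}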
@@ -2447,18 +2492,21 @@ small_imbalance:
2447 pwr_now /= SCHED_LOAD_SCALE; 2492 pwr_now /= SCHED_LOAD_SCALE;
2448 2493
2449 /* Amount of load we'd subtract */ 2494 /* Amount of load we'd subtract */
2450 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; 2495 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2496 busiest->cpu_power;
2451 if (max_load > tmp) 2497 if (max_load > tmp)
2452 pwr_move += busiest->cpu_power * 2498 pwr_move += busiest->cpu_power *
2453 min(busiest_load_per_task, max_load - tmp); 2499 min(busiest_load_per_task, max_load - tmp);
2454 2500
2455 /* Amount of load we'd add */ 2501 /* Amount of load we'd add */
2456 if (max_load*busiest->cpu_power < 2502 if (max_load * busiest->cpu_power <
2457 busiest_load_per_task*SCHED_LOAD_SCALE) 2503 busiest_load_per_task * SCHED_LOAD_SCALE)
2458 tmp = max_load*busiest->cpu_power/this->cpu_power; 2504 tmp = max_load * busiest->cpu_power / this->cpu_power;
2459 else 2505 else
2460 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; 2506 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2461 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); 2507 this->cpu_power;
2508 pwr_move += this->cpu_power *
2509 min(this_load_per_task, this_load + tmp);
2462 pwr_move /= SCHED_LOAD_SCALE; 2510 pwr_move /= SCHED_LOAD_SCALE;
2463 2511
2464 /* Move if we gain throughput */ 2512 /* Move if we gain throughput */
@@ -2479,8 +2527,8 @@ out_balanced:
2479 *imbalance = min_load_per_task; 2527 *imbalance = min_load_per_task;
2480 return group_min; 2528 return group_min;
2481 } 2529 }
2482ret:
2483#endif 2530#endif
2531ret:
2484 *imbalance = 0; 2532 *imbalance = 0;
2485 return NULL; 2533 return NULL;
2486} 2534}
@@ -2529,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2529/* 2577/*
2530 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2578 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2531 * tasks if there is an imbalance. 2579 * tasks if there is an imbalance.
2532 *
2533 * Called with this_rq unlocked.
2534 */ 2580 */
2535static int load_balance(int this_cpu, struct rq *this_rq, 2581static int load_balance(int this_cpu, struct rq *this_rq,
2536 struct sched_domain *sd, enum idle_type idle) 2582 struct sched_domain *sd, enum idle_type idle,
2583 int *balance)
2537{ 2584{
2538 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2585 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2539 struct sched_group *group; 2586 struct sched_group *group;
2540 unsigned long imbalance; 2587 unsigned long imbalance;
2541 struct rq *busiest; 2588 struct rq *busiest;
2542 cpumask_t cpus = CPU_MASK_ALL; 2589 cpumask_t cpus = CPU_MASK_ALL;
2590 unsigned long flags;
2543 2591
2544 /* 2592 /*
2545 * When power savings policy is enabled for the parent domain, idle 2593 * When power savings policy is enabled for the parent domain, idle
@@ -2555,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2555 2603
2556redo: 2604redo:
2557 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2605 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2558 &cpus); 2606 &cpus, balance);
2607
2608 if (*balance == 0)
2609 goto out_balanced;
2610
2559 if (!group) { 2611 if (!group) {
2560 schedstat_inc(sd, lb_nobusyg[idle]); 2612 schedstat_inc(sd, lb_nobusyg[idle]);
2561 goto out_balanced; 2613 goto out_balanced;
@@ -2579,11 +2631,13 @@ redo:
2579 * still unbalanced. nr_moved simply stays zero, so it is 2631 * still unbalanced. nr_moved simply stays zero, so it is
2580 * correctly treated as an imbalance. 2632 * correctly treated as an imbalance.
2581 */ 2633 */
2634 local_irq_save(flags);
2582 double_rq_lock(this_rq, busiest); 2635 double_rq_lock(this_rq, busiest);
2583 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2636 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2584 minus_1_or_zero(busiest->nr_running), 2637 minus_1_or_zero(busiest->nr_running),
2585 imbalance, sd, idle, &all_pinned); 2638 imbalance, sd, idle, &all_pinned);
2586 double_rq_unlock(this_rq, busiest); 2639 double_rq_unlock(this_rq, busiest);
2640 local_irq_restore(flags);
2587 2641
2588 /* All tasks on this runqueue were pinned by CPU affinity */ 2642 /* All tasks on this runqueue were pinned by CPU affinity */
2589 if (unlikely(all_pinned)) { 2643 if (unlikely(all_pinned)) {
@@ -2600,13 +2654,13 @@ redo:
2600 2654
2601 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2655 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2602 2656
2603 spin_lock(&busiest->lock); 2657 spin_lock_irqsave(&busiest->lock, flags);
2604 2658
2605 /* don't kick the migration_thread, if the curr 2659 /* don't kick the migration_thread, if the curr
2606 * task on busiest cpu can't be moved to this_cpu 2660 * task on busiest cpu can't be moved to this_cpu
2607 */ 2661 */
2608 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2662 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2609 spin_unlock(&busiest->lock); 2663 spin_unlock_irqrestore(&busiest->lock, flags);
2610 all_pinned = 1; 2664 all_pinned = 1;
2611 goto out_one_pinned; 2665 goto out_one_pinned;
2612 } 2666 }
@@ -2616,7 +2670,7 @@ redo:
2616 busiest->push_cpu = this_cpu; 2670 busiest->push_cpu = this_cpu;
2617 active_balance = 1; 2671 active_balance = 1;
2618 } 2672 }
2619 spin_unlock(&busiest->lock); 2673 spin_unlock_irqrestore(&busiest->lock, flags);
2620 if (active_balance) 2674 if (active_balance)
2621 wake_up_process(busiest->migration_thread); 2675 wake_up_process(busiest->migration_thread);
2622 2676
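
Because load_balance() is now reached from a softirq rather than directly from the timer tick, interrupts are no longer guaranteed to be disabled on entry, so the runqueue locking in this path switches to the irqsave variants (local_irq_save() around double_rq_lock(), spin_lock_irqsave() on busiest->lock). A kernel-style sketch of that locking pattern (illustrative module code, not part of this patch; the lock and counter names are made up):

#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);	/* made-up lock for illustration */
static unsigned long example_counter;

/* Safe from process, softirq and hardirq context alike. */
static void example_update(void)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);  /* disables irqs locally */
	example_counter++;
	spin_unlock_irqrestore(&example_lock, flags);
}

static int __init example_init(void)
{
	example_update();
	return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");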
@@ -2695,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2695 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2749 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2696redo: 2750redo:
2697 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2751 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2698 &sd_idle, &cpus); 2752 &sd_idle, &cpus, NULL);
2699 if (!group) { 2753 if (!group) {
2700 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2754 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2701 goto out_balanced; 2755 goto out_balanced;
@@ -2755,14 +2809,28 @@ out_balanced:
2755static void idle_balance(int this_cpu, struct rq *this_rq) 2809static void idle_balance(int this_cpu, struct rq *this_rq)
2756{ 2810{
2757 struct sched_domain *sd; 2811 struct sched_domain *sd;
2812 int pulled_task = 0;
2813 unsigned long next_balance = jiffies + 60 * HZ;
2758 2814
2759 for_each_domain(this_cpu, sd) { 2815 for_each_domain(this_cpu, sd) {
2760 if (sd->flags & SD_BALANCE_NEWIDLE) { 2816 if (sd->flags & SD_BALANCE_NEWIDLE) {
2761 /* If we've pulled tasks over stop searching: */ 2817 /* If we've pulled tasks over stop searching: */
2762 if (load_balance_newidle(this_cpu, this_rq, sd)) 2818 pulled_task = load_balance_newidle(this_cpu,
2819 this_rq, sd);
2820 if (time_after(next_balance,
2821 sd->last_balance + sd->balance_interval))
2822 next_balance = sd->last_balance
2823 + sd->balance_interval;
2824 if (pulled_task)
2763 break; 2825 break;
2764 } 2826 }
2765 } 2827 }
2828 if (!pulled_task)
2829 /*
2830 * We are going idle. next_balance may be set based on
2831 * a busy processor. So reset next_balance.
2832 */
2833 this_rq->next_balance = next_balance;
2766} 2834}
2767 2835
2768/* 2836/*
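
idle_balance() now also records the earliest point at which any of its domains will want balancing again, using the wraparound-safe jiffies comparison. The comparison is just a signed subtraction; a self-contained sketch (the macro mirrors the kernel's time_after(), the tick values are invented):

#include <stdio.h>

/* Wraparound-safe "a is later than b", in the spirit of the kernel's time_after(). */
#define time_after(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long jiffies = (unsigned long)-10;  /* counter about to wrap */
	unsigned long next_balance = jiffies + 60;   /* wraps past zero */
	unsigned long candidate = jiffies + 5;       /* an earlier deadline */

	/* Keep the earliest deadline, exactly like the idle_balance() loop. */
	if (time_after(next_balance, candidate))
		next_balance = candidate;

	printf("next balance due at %lu (jiffies now %lu)\n",
	       next_balance, jiffies);
	return 0;
}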
@@ -2815,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2815 spin_unlock(&target_rq->lock); 2883 spin_unlock(&target_rq->lock);
2816} 2884}
2817 2885
2818/* 2886static void update_load(struct rq *this_rq)
2819 * rebalance_tick will get called every timer tick, on every CPU.
2820 *
2821 * It checks each scheduling domain to see if it is due to be balanced,
2822 * and initiates a balancing operation if so.
2823 *
2824 * Balancing parameters are set up in arch_init_sched_domains.
2825 */
2826
2827/* Don't have all balancing operations going off at once: */
2828static inline unsigned long cpu_offset(int cpu)
2829{ 2887{
2830 return jiffies + cpu * HZ / NR_CPUS; 2888 unsigned long this_load;
2831}
2832
2833static void
2834rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2835{
2836 unsigned long this_load, interval, j = cpu_offset(this_cpu);
2837 struct sched_domain *sd;
2838 int i, scale; 2889 int i, scale;
2839 2890
2840 this_load = this_rq->raw_weighted_load; 2891 this_load = this_rq->raw_weighted_load;
@@ -2854,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2854 new_load += scale-1; 2905 new_load += scale-1;
2855 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2906 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2856 } 2907 }
2908}
2909
2910/*
2911 * run_rebalance_domains is triggered when needed from the scheduler tick.
2912 *
2913 * It checks each scheduling domain to see if it is due to be balanced,
2914 * and initiates a balancing operation if so.
2915 *
2916 * Balancing parameters are set up in arch_init_sched_domains.
2917 */
2918static DEFINE_SPINLOCK(balancing);
2919
2920static void run_rebalance_domains(struct softirq_action *h)
2921{
2922 int this_cpu = smp_processor_id(), balance = 1;
2923 struct rq *this_rq = cpu_rq(this_cpu);
2924 unsigned long interval;
2925 struct sched_domain *sd;
2926 /*
2927 * We are idle if there are no processes running. This
2928 * is valid even if we are the idle process (SMT).
2929 */
2930 enum idle_type idle = !this_rq->nr_running ?
2931 SCHED_IDLE : NOT_IDLE;
2932 /* Earliest time when we have to call run_rebalance_domains again */
2933 unsigned long next_balance = jiffies + 60*HZ;
2857 2934
2858 for_each_domain(this_cpu, sd) { 2935 for_each_domain(this_cpu, sd) {
2859 if (!(sd->flags & SD_LOAD_BALANCE)) 2936 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2868,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2868 if (unlikely(!interval)) 2945 if (unlikely(!interval))
2869 interval = 1; 2946 interval = 1;
2870 2947
2871 if (j - sd->last_balance >= interval) { 2948 if (sd->flags & SD_SERIALIZE) {
2872 if (load_balance(this_cpu, this_rq, sd, idle)) { 2949 if (!spin_trylock(&balancing))
2950 goto out;
2951 }
2952
2953 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2954 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
2873 /* 2955 /*
2874 * We've pulled tasks over so either we're no 2956 * We've pulled tasks over so either we're no
2875 * longer idle, or one of our SMT siblings is 2957 * longer idle, or one of our SMT siblings is
@@ -2877,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2877 */ 2959 */
2878 idle = NOT_IDLE; 2960 idle = NOT_IDLE;
2879 } 2961 }
2880 sd->last_balance += interval; 2962 sd->last_balance = jiffies;
2881 } 2963 }
2964 if (sd->flags & SD_SERIALIZE)
2965 spin_unlock(&balancing);
2966out:
2967 if (time_after(next_balance, sd->last_balance + interval))
2968 next_balance = sd->last_balance + interval;
2969
2970 /*
2971 * Stop the load balance at this level. There is another
2972 * CPU in our sched group which is doing load balancing more
2973 * actively.
2974 */
2975 if (!balance)
2976 break;
2882 } 2977 }
2978 this_rq->next_balance = next_balance;
2883} 2979}
2884#else 2980#else
2885/* 2981/*
2886 * on UP we do not need to balance between CPUs: 2982 * on UP we do not need to balance between CPUs:
2887 */ 2983 */
2888static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2889{
2890}
2891static inline void idle_balance(int cpu, struct rq *rq) 2984static inline void idle_balance(int cpu, struct rq *rq)
2892{ 2985{
2893} 2986}
2894#endif 2987#endif
2895 2988
2896static inline int wake_priority_sleeper(struct rq *rq) 2989static inline void wake_priority_sleeper(struct rq *rq)
2897{ 2990{
2898 int ret = 0;
2899
2900#ifdef CONFIG_SCHED_SMT 2991#ifdef CONFIG_SCHED_SMT
2992 if (!rq->nr_running)
2993 return;
2994
2901 spin_lock(&rq->lock); 2995 spin_lock(&rq->lock);
2902 /* 2996 /*
2903 * If an SMT sibling task has been put to sleep for priority 2997 * If an SMT sibling task has been put to sleep for priority
2904 * reasons reschedule the idle task to see if it can now run. 2998 * reasons reschedule the idle task to see if it can now run.
2905 */ 2999 */
2906 if (rq->nr_running) { 3000 if (rq->nr_running)
2907 resched_task(rq->idle); 3001 resched_task(rq->idle);
2908 ret = 1;
2909 }
2910 spin_unlock(&rq->lock); 3002 spin_unlock(&rq->lock);
2911#endif 3003#endif
2912 return ret;
2913} 3004}
2914 3005
2915DEFINE_PER_CPU(struct kernel_stat, kstat); 3006DEFINE_PER_CPU(struct kernel_stat, kstat);
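
For domains flagged SD_SERIALIZE (typically the large NUMA-level domains) the rebalance pass is guarded by a trylock on the static "balancing" lock, so at most one CPU at a time walks those domains and everyone else simply skips this round. The same skip-if-busy shape in portable C (a pthread sketch of the idea, not the kernel code; the function names are made up):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t balancing = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical expensive, cluster-wide rebalance step. */
static void rebalance_serialized_domain(int cpu)
{
	printf("cpu%d: doing the serialized rebalance\n", cpu);
}

static void rebalance_tick(int cpu)
{
	/* Only one caller runs the serialized part; others just return. */
	if (pthread_mutex_trylock(&balancing) != 0) {
		printf("cpu%d: someone else is balancing, skip this round\n", cpu);
		return;
	}
	rebalance_serialized_domain(cpu);
	pthread_mutex_unlock(&balancing);
}

int main(void)
{
	rebalance_tick(0);
	rebalance_tick(1);
	return 0;
}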
@@ -2923,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2923static inline void 3014static inline void
2924update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) 3015update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2925{ 3016{
2926 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); 3017 p->sched_time += now - p->last_ran;
3018 p->last_ran = rq->most_recent_timestamp = now;
2927} 3019}
2928 3020
2929/* 3021/*
@@ -2936,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
2936 unsigned long flags; 3028 unsigned long flags;
2937 3029
2938 local_irq_save(flags); 3030 local_irq_save(flags);
2939 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); 3031 ns = p->sched_time + sched_clock() - p->last_ran;
2940 ns = p->sched_time + sched_clock() - ns;
2941 local_irq_restore(flags); 3032 local_irq_restore(flags);
2942 3033
2943 return ns; 3034 return ns;
@@ -3037,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3037 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3128 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3038} 3129}
3039 3130
3040/* 3131static void task_running_tick(struct rq *rq, struct task_struct *p)
3041 * This function gets called by the timer code, with HZ frequency.
3042 * We call it with interrupts disabled.
3043 *
3044 * It also gets called by the fork code, when changing the parent's
3045 * timeslices.
3046 */
3047void scheduler_tick(void)
3048{ 3132{
3049 unsigned long long now = sched_clock();
3050 struct task_struct *p = current;
3051 int cpu = smp_processor_id();
3052 struct rq *rq = cpu_rq(cpu);
3053
3054 update_cpu_clock(p, rq, now);
3055
3056 rq->timestamp_last_tick = now;
3057
3058 if (p == rq->idle) {
3059 if (wake_priority_sleeper(rq))
3060 goto out;
3061 rebalance_tick(cpu, rq, SCHED_IDLE);
3062 return;
3063 }
3064
3065 /* Task might have expired already, but not scheduled off yet */
3066 if (p->array != rq->active) { 3133 if (p->array != rq->active) {
3134 /* Task has expired but was not scheduled yet */
3067 set_tsk_need_resched(p); 3135 set_tsk_need_resched(p);
3068 goto out; 3136 return;
3069 } 3137 }
3070 spin_lock(&rq->lock); 3138 spin_lock(&rq->lock);
3071 /* 3139 /*
@@ -3133,8 +3201,34 @@ void scheduler_tick(void)
3133 } 3201 }
3134out_unlock: 3202out_unlock:
3135 spin_unlock(&rq->lock); 3203 spin_unlock(&rq->lock);
3136out: 3204}
3137 rebalance_tick(cpu, rq, NOT_IDLE); 3205
3206/*
3207 * This function gets called by the timer code, with HZ frequency.
3208 * We call it with interrupts disabled.
3209 *
3210 * It also gets called by the fork code, when changing the parent's
3211 * timeslices.
3212 */
3213void scheduler_tick(void)
3214{
3215 unsigned long long now = sched_clock();
3216 struct task_struct *p = current;
3217 int cpu = smp_processor_id();
3218 struct rq *rq = cpu_rq(cpu);
3219
3220 update_cpu_clock(p, rq, now);
3221
3222 if (p == rq->idle)
3223 /* Task on the idle queue */
3224 wake_priority_sleeper(rq);
3225 else
3226 task_running_tick(rq, p);
3227#ifdef CONFIG_SMP
3228 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance))
3230 raise_softirq(SCHED_SOFTIRQ);
3231#endif
3138} 3232}
3139 3233
3140#ifdef CONFIG_SCHED_SMT 3234#ifdef CONFIG_SCHED_SMT
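
scheduler_tick() itself is reduced to stamping the clock, running the per-task tick work, and, once rq->next_balance is due, raising SCHED_SOFTIRQ; the heavy domain walk happens later in run_rebalance_domains(). A minimal sketch of splitting work between a cheap tick and a deferred handler (plain C with a pending flag, purely illustrative; in the kernel the deferral is done with open_softirq()/raise_softirq()):

#include <stdio.h>

static volatile int rebalance_pending;   /* stand-in for a raised softirq */
static unsigned long jiffies, next_balance = 10;

static void run_rebalance_domains(void)  /* the deferred, heavier half */
{
	printf("rebalancing at tick %lu\n", jiffies);
	next_balance = jiffies + 10;         /* hypothetical interval */
}

static void scheduler_tick(void)         /* the cheap, per-tick half */
{
	jiffies++;
	if (jiffies >= next_balance)
		rebalance_pending = 1;       /* just flag it, do no real work */
}

int main(void)
{
	int i;

	for (i = 0; i < 25; i++) {
		scheduler_tick();
		if (rebalance_pending) {     /* the "softirq" runs after the tick */
			rebalance_pending = 0;
			run_rebalance_domains();
		}
	}
	return 0;
}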
@@ -3280,7 +3374,8 @@ void fastcall add_preempt_count(int val)
3280 /* 3374 /*
3281 * Spinlock count overflowing soon? 3375 * Spinlock count overflowing soon?
3282 */ 3376 */
3283 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3377 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3378 PREEMPT_MASK - 10);
3284} 3379}
3285EXPORT_SYMBOL(add_preempt_count); 3380EXPORT_SYMBOL(add_preempt_count);
3286 3381
@@ -3333,6 +3428,9 @@ asmlinkage void __sched schedule(void)
3333 printk(KERN_ERR "BUG: scheduling while atomic: " 3428 printk(KERN_ERR "BUG: scheduling while atomic: "
3334 "%s/0x%08x/%d\n", 3429 "%s/0x%08x/%d\n",
3335 current->comm, preempt_count(), current->pid); 3430 current->comm, preempt_count(), current->pid);
3431 debug_show_held_locks(current);
3432 if (irqs_disabled())
3433 print_irqtrace_events(current);
3336 dump_stack(); 3434 dump_stack();
3337 } 3435 }
3338 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3436 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4804,18 +4902,18 @@ static void show_task(struct task_struct *p)
4804 show_stack(p, NULL); 4902 show_stack(p, NULL);
4805} 4903}
4806 4904
4807void show_state(void) 4905void show_state_filter(unsigned long state_filter)
4808{ 4906{
4809 struct task_struct *g, *p; 4907 struct task_struct *g, *p;
4810 4908
4811#if (BITS_PER_LONG == 32) 4909#if (BITS_PER_LONG == 32)
4812 printk("\n" 4910 printk("\n"
4813 " sibling\n"); 4911 " free sibling\n");
4814 printk(" task PC pid father child younger older\n"); 4912 printk(" task PC stack pid father child younger older\n");
4815#else 4913#else
4816 printk("\n" 4914 printk("\n"
4817 " sibling\n"); 4915 " free sibling\n");
4818 printk(" task PC pid father child younger older\n"); 4916 printk(" task PC stack pid father child younger older\n");
4819#endif 4917#endif
4820 read_lock(&tasklist_lock); 4918 read_lock(&tasklist_lock);
4821 do_each_thread(g, p) { 4919 do_each_thread(g, p) {
@@ -4824,11 +4922,16 @@ void show_state(void)
4824 * console might take alot of time: 4922 * console might take alot of time:
4825 */ 4923 */
4826 touch_nmi_watchdog(); 4924 touch_nmi_watchdog();
4827 show_task(p); 4925 if (p->state & state_filter)
4926 show_task(p);
4828 } while_each_thread(g, p); 4927 } while_each_thread(g, p);
4829 4928
4830 read_unlock(&tasklist_lock); 4929 read_unlock(&tasklist_lock);
4831 debug_show_all_locks(); 4930 /*
4931 * Only show locks if all tasks are dumped:
4932 */
4933 if (state_filter == -1)
4934 debug_show_all_locks();
4832} 4935}
4833 4936
4834/** 4937/**
@@ -4973,8 +5076,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4973 * afterwards, and pretending it was a local activate. 5076 * afterwards, and pretending it was a local activate.
4974 * This way is cleaner and logically correct. 5077 * This way is cleaner and logically correct.
4975 */ 5078 */
4976 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 5079 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
4977 + rq_dest->timestamp_last_tick; 5080 + rq_dest->most_recent_timestamp;
4978 deactivate_task(p, rq_src); 5081 deactivate_task(p, rq_src);
4979 __activate_task(p, rq_dest); 5082 __activate_task(p, rq_dest);
4980 if (TASK_PREEMPTS_CURR(p, rq_dest)) 5083 if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5050,7 +5153,10 @@ wait_to_die:
5050} 5153}
5051 5154
5052#ifdef CONFIG_HOTPLUG_CPU 5155#ifdef CONFIG_HOTPLUG_CPU
5053/* Figure out where task on dead CPU should go, use force if necessary. */ 5156/*
 5157 * Figure out where task on dead CPU should go, use force if necessary.
5158 * NOTE: interrupts should be disabled by the caller
5159 */
5054static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5160static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5055{ 5161{
5056 unsigned long flags; 5162 unsigned long flags;
@@ -5170,6 +5276,7 @@ void idle_task_exit(void)
5170 mmdrop(mm); 5276 mmdrop(mm);
5171} 5277}
5172 5278
5279/* called under rq->lock with disabled interrupts */
5173static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5280static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5174{ 5281{
5175 struct rq *rq = cpu_rq(dead_cpu); 5282 struct rq *rq = cpu_rq(dead_cpu);
@@ -5186,10 +5293,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5186 * Drop lock around migration; if someone else moves it, 5293 * Drop lock around migration; if someone else moves it,
5187 * that's OK. No task can be added to this CPU, so iteration is 5294 * that's OK. No task can be added to this CPU, so iteration is
5188 * fine. 5295 * fine.
5296 * NOTE: interrupts should be left disabled --dev@
5189 */ 5297 */
5190 spin_unlock_irq(&rq->lock); 5298 spin_unlock(&rq->lock);
5191 move_task_off_dead_cpu(dead_cpu, p); 5299 move_task_off_dead_cpu(dead_cpu, p);
5192 spin_lock_irq(&rq->lock); 5300 spin_lock(&rq->lock);
5193 5301
5194 put_task_struct(p); 5302 put_task_struct(p);
5195} 5303}
@@ -5342,16 +5450,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5342 if (!(sd->flags & SD_LOAD_BALANCE)) { 5450 if (!(sd->flags & SD_LOAD_BALANCE)) {
5343 printk("does not load-balance\n"); 5451 printk("does not load-balance\n");
5344 if (sd->parent) 5452 if (sd->parent)
5345 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 5453 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5454 " has parent");
5346 break; 5455 break;
5347 } 5456 }
5348 5457
5349 printk("span %s\n", str); 5458 printk("span %s\n", str);
5350 5459
5351 if (!cpu_isset(cpu, sd->span)) 5460 if (!cpu_isset(cpu, sd->span))
5352 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 5461 printk(KERN_ERR "ERROR: domain->span does not contain "
5462 "CPU%d\n", cpu);
5353 if (!cpu_isset(cpu, group->cpumask)) 5463 if (!cpu_isset(cpu, group->cpumask))
5354 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 5464 printk(KERN_ERR "ERROR: domain->groups does not contain"
5465 " CPU%d\n", cpu);
5355 5466
5356 printk(KERN_DEBUG); 5467 printk(KERN_DEBUG);
5357 for (i = 0; i < level + 2; i++) 5468 for (i = 0; i < level + 2; i++)
@@ -5366,7 +5477,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5366 5477
5367 if (!group->cpu_power) { 5478 if (!group->cpu_power) {
5368 printk("\n"); 5479 printk("\n");
5369 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 5480 printk(KERN_ERR "ERROR: domain->cpu_power not "
5481 "set\n");
5370 } 5482 }
5371 5483
5372 if (!cpus_weight(group->cpumask)) { 5484 if (!cpus_weight(group->cpumask)) {
@@ -5389,15 +5501,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5389 printk("\n"); 5501 printk("\n");
5390 5502
5391 if (!cpus_equal(sd->span, groupmask)) 5503 if (!cpus_equal(sd->span, groupmask))
5392 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5504 printk(KERN_ERR "ERROR: groups don't span "
5505 "domain->span\n");
5393 5506
5394 level++; 5507 level++;
5395 sd = sd->parent; 5508 sd = sd->parent;
5509 if (!sd)
5510 continue;
5396 5511
5397 if (sd) { 5512 if (!cpus_subset(groupmask, sd->span))
5398 if (!cpus_subset(groupmask, sd->span)) 5513 printk(KERN_ERR "ERROR: parent span is not a superset "
5399 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 5514 "of domain->span\n");
5400 }
5401 5515
5402 } while (sd); 5516 } while (sd);
5403} 5517}
@@ -5511,28 +5625,27 @@ static int __init isolated_cpu_setup(char *str)
5511__setup ("isolcpus=", isolated_cpu_setup); 5625__setup ("isolcpus=", isolated_cpu_setup);
5512 5626
5513/* 5627/*
5514 * init_sched_build_groups takes an array of groups, the cpumask we wish 5628 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5515 * to span, and a pointer to a function which identifies what group a CPU 5629 * to a function which identifies what group (along with sched group) a CPU
5516 * belongs to. The return value of group_fn must be a valid index into the 5630 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5517 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 5631 * (due to the fact that we keep track of groups covered with a cpumask_t).
5518 * keep track of groups covered with a cpumask_t).
5519 * 5632 *
5520 * init_sched_build_groups will build a circular linked list of the groups 5633 * init_sched_build_groups will build a circular linked list of the groups
5521 * covered by the given span, and will set each group's ->cpumask correctly, 5634 * covered by the given span, and will set each group's ->cpumask correctly,
5522 * and ->cpu_power to 0. 5635 * and ->cpu_power to 0.
5523 */ 5636 */
5524static void 5637static void
5525init_sched_build_groups(struct sched_group groups[], cpumask_t span, 5638init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5526 const cpumask_t *cpu_map, 5639 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5527 int (*group_fn)(int cpu, const cpumask_t *cpu_map)) 5640 struct sched_group **sg))
5528{ 5641{
5529 struct sched_group *first = NULL, *last = NULL; 5642 struct sched_group *first = NULL, *last = NULL;
5530 cpumask_t covered = CPU_MASK_NONE; 5643 cpumask_t covered = CPU_MASK_NONE;
5531 int i; 5644 int i;
5532 5645
5533 for_each_cpu_mask(i, span) { 5646 for_each_cpu_mask(i, span) {
5534 int group = group_fn(i, cpu_map); 5647 struct sched_group *sg;
5535 struct sched_group *sg = &groups[group]; 5648 int group = group_fn(i, cpu_map, &sg);
5536 int j; 5649 int j;
5537 5650
5538 if (cpu_isset(i, covered)) 5651 if (cpu_isset(i, covered))
@@ -5542,7 +5655,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5542 sg->cpu_power = 0; 5655 sg->cpu_power = 0;
5543 5656
5544 for_each_cpu_mask(j, span) { 5657 for_each_cpu_mask(j, span) {
5545 if (group_fn(j, cpu_map) != group) 5658 if (group_fn(j, cpu_map, NULL) != group)
5546 continue; 5659 continue;
5547 5660
5548 cpu_set(j, covered); 5661 cpu_set(j, covered);
@@ -5716,8 +5829,9 @@ __setup("max_cache_size=", setup_max_cache_size);
5716 */ 5829 */
5717static void touch_cache(void *__cache, unsigned long __size) 5830static void touch_cache(void *__cache, unsigned long __size)
5718{ 5831{
5719 unsigned long size = __size/sizeof(long), chunk1 = size/3, 5832 unsigned long size = __size / sizeof(long);
5720 chunk2 = 2*size/3; 5833 unsigned long chunk1 = size / 3;
5834 unsigned long chunk2 = 2 * size / 3;
5721 unsigned long *cache = __cache; 5835 unsigned long *cache = __cache;
5722 int i; 5836 int i;
5723 5837
@@ -5826,11 +5940,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5826 */ 5940 */
5827 measure_one(cache, size, cpu1, cpu2); 5941 measure_one(cache, size, cpu1, cpu2);
5828 for (i = 0; i < ITERATIONS; i++) 5942 for (i = 0; i < ITERATIONS; i++)
5829 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); 5943 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
5830 5944
5831 measure_one(cache, size, cpu2, cpu1); 5945 measure_one(cache, size, cpu2, cpu1);
5832 for (i = 0; i < ITERATIONS; i++) 5946 for (i = 0; i < ITERATIONS; i++)
5833 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); 5947 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
5834 5948
5835 /* 5949 /*
5836 * (We measure the non-migrating [cached] cost on both 5950 * (We measure the non-migrating [cached] cost on both
@@ -5840,17 +5954,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5840 5954
5841 measure_one(cache, size, cpu1, cpu1); 5955 measure_one(cache, size, cpu1, cpu1);
5842 for (i = 0; i < ITERATIONS; i++) 5956 for (i = 0; i < ITERATIONS; i++)
5843 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); 5957 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
5844 5958
5845 measure_one(cache, size, cpu2, cpu2); 5959 measure_one(cache, size, cpu2, cpu2);
5846 for (i = 0; i < ITERATIONS; i++) 5960 for (i = 0; i < ITERATIONS; i++)
5847 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); 5961 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
5848 5962
5849 /* 5963 /*
5850 * Get the per-iteration migration cost: 5964 * Get the per-iteration migration cost:
5851 */ 5965 */
5852 do_div(cost1, 2*ITERATIONS); 5966 do_div(cost1, 2 * ITERATIONS);
5853 do_div(cost2, 2*ITERATIONS); 5967 do_div(cost2, 2 * ITERATIONS);
5854 5968
5855 return cost1 - cost2; 5969 return cost1 - cost2;
5856} 5970}
@@ -5888,7 +6002,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5888 */ 6002 */
5889 cache = vmalloc(max_size); 6003 cache = vmalloc(max_size);
5890 if (!cache) { 6004 if (!cache) {
5891 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 6005 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
5892 return 1000000; /* return 1 msec on very small boxen */ 6006 return 1000000; /* return 1 msec on very small boxen */
5893 } 6007 }
5894 6008
@@ -5913,7 +6027,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5913 avg_fluct = (avg_fluct + fluct)/2; 6027 avg_fluct = (avg_fluct + fluct)/2;
5914 6028
5915 if (migration_debug) 6029 if (migration_debug)
5916 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", 6030 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6031 "(%8Ld %8Ld)\n",
5917 cpu1, cpu2, size, 6032 cpu1, cpu2, size,
5918 (long)cost / 1000000, 6033 (long)cost / 1000000,
5919 ((long)cost / 100000) % 10, 6034 ((long)cost / 100000) % 10,
@@ -6008,20 +6123,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
6008 -1 6123 -1
6009#endif 6124#endif
6010 ); 6125 );
6011 if (system_state == SYSTEM_BOOTING) { 6126 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6012 if (num_online_cpus() > 1) { 6127 printk("migration_cost=");
6013 printk("migration_cost="); 6128 for (distance = 0; distance <= max_distance; distance++) {
6014 for (distance = 0; distance <= max_distance; distance++) { 6129 if (distance)
6015 if (distance) 6130 printk(",");
6016 printk(","); 6131 printk("%ld", (long)migration_cost[distance] / 1000);
6017 printk("%ld", (long)migration_cost[distance] / 1000);
6018 }
6019 printk("\n");
6020 } 6132 }
6133 printk("\n");
6021 } 6134 }
6022 j1 = jiffies; 6135 j1 = jiffies;
6023 if (migration_debug) 6136 if (migration_debug)
6024 printk("migration: %ld seconds\n", (j1-j0)/HZ); 6137 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6025 6138
6026 /* 6139 /*
6027 * Move back to the original CPU. NUMA-Q gets confused 6140 * Move back to the original CPU. NUMA-Q gets confused
@@ -6118,10 +6231,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6118 */ 6231 */
6119#ifdef CONFIG_SCHED_SMT 6232#ifdef CONFIG_SCHED_SMT
6120static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6233static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6121static struct sched_group sched_group_cpus[NR_CPUS]; 6234static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6122 6235
6123static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) 6236static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
6237 struct sched_group **sg)
6124{ 6238{
6239 if (sg)
6240 *sg = &per_cpu(sched_group_cpus, cpu);
6125 return cpu; 6241 return cpu;
6126} 6242}
6127#endif 6243#endif
@@ -6131,39 +6247,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
6131 */ 6247 */
6132#ifdef CONFIG_SCHED_MC 6248#ifdef CONFIG_SCHED_MC
6133static DEFINE_PER_CPU(struct sched_domain, core_domains); 6249static DEFINE_PER_CPU(struct sched_domain, core_domains);
6134static struct sched_group sched_group_core[NR_CPUS]; 6250static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6135#endif 6251#endif
6136 6252
6137#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6253#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6138static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6254static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6255 struct sched_group **sg)
6139{ 6256{
6257 int group;
6140 cpumask_t mask = cpu_sibling_map[cpu]; 6258 cpumask_t mask = cpu_sibling_map[cpu];
6141 cpus_and(mask, mask, *cpu_map); 6259 cpus_and(mask, mask, *cpu_map);
6142 return first_cpu(mask); 6260 group = first_cpu(mask);
6261 if (sg)
6262 *sg = &per_cpu(sched_group_core, group);
6263 return group;
6143} 6264}
6144#elif defined(CONFIG_SCHED_MC) 6265#elif defined(CONFIG_SCHED_MC)
6145static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6266static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6267 struct sched_group **sg)
6146{ 6268{
6269 if (sg)
6270 *sg = &per_cpu(sched_group_core, cpu);
6147 return cpu; 6271 return cpu;
6148} 6272}
6149#endif 6273#endif
6150 6274
6151static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6275static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6152static struct sched_group sched_group_phys[NR_CPUS]; 6276static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6153 6277
6154static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) 6278static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
6279 struct sched_group **sg)
6155{ 6280{
6281 int group;
6156#ifdef CONFIG_SCHED_MC 6282#ifdef CONFIG_SCHED_MC
6157 cpumask_t mask = cpu_coregroup_map(cpu); 6283 cpumask_t mask = cpu_coregroup_map(cpu);
6158 cpus_and(mask, mask, *cpu_map); 6284 cpus_and(mask, mask, *cpu_map);
6159 return first_cpu(mask); 6285 group = first_cpu(mask);
6160#elif defined(CONFIG_SCHED_SMT) 6286#elif defined(CONFIG_SCHED_SMT)
6161 cpumask_t mask = cpu_sibling_map[cpu]; 6287 cpumask_t mask = cpu_sibling_map[cpu];
6162 cpus_and(mask, mask, *cpu_map); 6288 cpus_and(mask, mask, *cpu_map);
6163 return first_cpu(mask); 6289 group = first_cpu(mask);
6164#else 6290#else
6165 return cpu; 6291 group = cpu;
6166#endif 6292#endif
6293 if (sg)
6294 *sg = &per_cpu(sched_group_phys, group);
6295 return group;
6167} 6296}
6168 6297
6169#ifdef CONFIG_NUMA 6298#ifdef CONFIG_NUMA
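
Here the static struct sched_group arrays sized by NR_CPUS become DEFINE_PER_CPU variables, so each group descriptor lives in the owning CPU's per-CPU area and is reached through per_cpu(). A kernel-style sketch of that idiom (illustrative module code; the structure and variable names are made up):

#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

struct example_group {                      /* made-up stand-in structure */
	unsigned int power;
};

static DEFINE_PER_CPU(struct example_group, example_groups);

static int __init example_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)              /* touch each CPU's copy */
		per_cpu(example_groups, cpu).power = 1;
	return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");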
@@ -6176,12 +6305,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
6176static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6305static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6177 6306
6178static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6307static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6179static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 6308static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6180 6309
6181static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) 6310static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6311 struct sched_group **sg)
6182{ 6312{
6183 return cpu_to_node(cpu); 6313 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6314 int group;
6315
6316 cpus_and(nodemask, nodemask, *cpu_map);
6317 group = first_cpu(nodemask);
6318
6319 if (sg)
6320 *sg = &per_cpu(sched_group_allnodes, group);
6321 return group;
6184} 6322}
6323
6185static void init_numa_sched_groups_power(struct sched_group *group_head) 6324static void init_numa_sched_groups_power(struct sched_group *group_head)
6186{ 6325{
6187 struct sched_group *sg = group_head; 6326 struct sched_group *sg = group_head;
@@ -6217,16 +6356,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6217 int cpu, i; 6356 int cpu, i;
6218 6357
6219 for_each_cpu_mask(cpu, *cpu_map) { 6358 for_each_cpu_mask(cpu, *cpu_map) {
6220 struct sched_group *sched_group_allnodes
6221 = sched_group_allnodes_bycpu[cpu];
6222 struct sched_group **sched_group_nodes 6359 struct sched_group **sched_group_nodes
6223 = sched_group_nodes_bycpu[cpu]; 6360 = sched_group_nodes_bycpu[cpu];
6224 6361
6225 if (sched_group_allnodes) {
6226 kfree(sched_group_allnodes);
6227 sched_group_allnodes_bycpu[cpu] = NULL;
6228 }
6229
6230 if (!sched_group_nodes) 6362 if (!sched_group_nodes)
6231 continue; 6363 continue;
6232 6364
@@ -6320,7 +6452,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6320 struct sched_domain *sd; 6452 struct sched_domain *sd;
6321#ifdef CONFIG_NUMA 6453#ifdef CONFIG_NUMA
6322 struct sched_group **sched_group_nodes = NULL; 6454 struct sched_group **sched_group_nodes = NULL;
6323 struct sched_group *sched_group_allnodes = NULL; 6455 int sd_allnodes = 0;
6324 6456
6325 /* 6457 /*
6326 * Allocate the per-node list of sched groups 6458 * Allocate the per-node list of sched groups
@@ -6338,7 +6470,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6338 * Set up domains for cpus specified by the cpu_map. 6470 * Set up domains for cpus specified by the cpu_map.
6339 */ 6471 */
6340 for_each_cpu_mask(i, *cpu_map) { 6472 for_each_cpu_mask(i, *cpu_map) {
6341 int group;
6342 struct sched_domain *sd = NULL, *p; 6473 struct sched_domain *sd = NULL, *p;
6343 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6474 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6344 6475
@@ -6347,26 +6478,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6347#ifdef CONFIG_NUMA 6478#ifdef CONFIG_NUMA
6348 if (cpus_weight(*cpu_map) 6479 if (cpus_weight(*cpu_map)
6349 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6480 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6350 if (!sched_group_allnodes) {
6351 sched_group_allnodes
6352 = kmalloc_node(sizeof(struct sched_group)
6353 * MAX_NUMNODES,
6354 GFP_KERNEL,
6355 cpu_to_node(i));
6356 if (!sched_group_allnodes) {
6357 printk(KERN_WARNING
6358 "Can not alloc allnodes sched group\n");
6359 goto error;
6360 }
6361 sched_group_allnodes_bycpu[i]
6362 = sched_group_allnodes;
6363 }
6364 sd = &per_cpu(allnodes_domains, i); 6481 sd = &per_cpu(allnodes_domains, i);
6365 *sd = SD_ALLNODES_INIT; 6482 *sd = SD_ALLNODES_INIT;
6366 sd->span = *cpu_map; 6483 sd->span = *cpu_map;
6367 group = cpu_to_allnodes_group(i, cpu_map); 6484 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6368 sd->groups = &sched_group_allnodes[group];
6369 p = sd; 6485 p = sd;
6486 sd_allnodes = 1;
6370 } else 6487 } else
6371 p = NULL; 6488 p = NULL;
6372 6489
@@ -6381,36 +6498,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6381 6498
6382 p = sd; 6499 p = sd;
6383 sd = &per_cpu(phys_domains, i); 6500 sd = &per_cpu(phys_domains, i);
6384 group = cpu_to_phys_group(i, cpu_map);
6385 *sd = SD_CPU_INIT; 6501 *sd = SD_CPU_INIT;
6386 sd->span = nodemask; 6502 sd->span = nodemask;
6387 sd->parent = p; 6503 sd->parent = p;
6388 if (p) 6504 if (p)
6389 p->child = sd; 6505 p->child = sd;
6390 sd->groups = &sched_group_phys[group]; 6506 cpu_to_phys_group(i, cpu_map, &sd->groups);
6391 6507
6392#ifdef CONFIG_SCHED_MC 6508#ifdef CONFIG_SCHED_MC
6393 p = sd; 6509 p = sd;
6394 sd = &per_cpu(core_domains, i); 6510 sd = &per_cpu(core_domains, i);
6395 group = cpu_to_core_group(i, cpu_map);
6396 *sd = SD_MC_INIT; 6511 *sd = SD_MC_INIT;
6397 sd->span = cpu_coregroup_map(i); 6512 sd->span = cpu_coregroup_map(i);
6398 cpus_and(sd->span, sd->span, *cpu_map); 6513 cpus_and(sd->span, sd->span, *cpu_map);
6399 sd->parent = p; 6514 sd->parent = p;
6400 p->child = sd; 6515 p->child = sd;
6401 sd->groups = &sched_group_core[group]; 6516 cpu_to_core_group(i, cpu_map, &sd->groups);
6402#endif 6517#endif
6403 6518
6404#ifdef CONFIG_SCHED_SMT 6519#ifdef CONFIG_SCHED_SMT
6405 p = sd; 6520 p = sd;
6406 sd = &per_cpu(cpu_domains, i); 6521 sd = &per_cpu(cpu_domains, i);
6407 group = cpu_to_cpu_group(i, cpu_map);
6408 *sd = SD_SIBLING_INIT; 6522 *sd = SD_SIBLING_INIT;
6409 sd->span = cpu_sibling_map[i]; 6523 sd->span = cpu_sibling_map[i];
6410 cpus_and(sd->span, sd->span, *cpu_map); 6524 cpus_and(sd->span, sd->span, *cpu_map);
6411 sd->parent = p; 6525 sd->parent = p;
6412 p->child = sd; 6526 p->child = sd;
6413 sd->groups = &sched_group_cpus[group]; 6527 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6414#endif 6528#endif
6415 } 6529 }
6416 6530
@@ -6422,8 +6536,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6422 if (i != first_cpu(this_sibling_map)) 6536 if (i != first_cpu(this_sibling_map))
6423 continue; 6537 continue;
6424 6538
6425 init_sched_build_groups(sched_group_cpus, this_sibling_map, 6539 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6426 cpu_map, &cpu_to_cpu_group);
6427 } 6540 }
6428#endif 6541#endif
6429 6542
@@ -6434,8 +6547,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6434 cpus_and(this_core_map, this_core_map, *cpu_map); 6547 cpus_and(this_core_map, this_core_map, *cpu_map);
6435 if (i != first_cpu(this_core_map)) 6548 if (i != first_cpu(this_core_map))
6436 continue; 6549 continue;
6437 init_sched_build_groups(sched_group_core, this_core_map, 6550 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6438 cpu_map, &cpu_to_core_group);
6439 } 6551 }
6440#endif 6552#endif
6441 6553
@@ -6448,15 +6560,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6448 if (cpus_empty(nodemask)) 6560 if (cpus_empty(nodemask))
6449 continue; 6561 continue;
6450 6562
6451 init_sched_build_groups(sched_group_phys, nodemask, 6563 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6452 cpu_map, &cpu_to_phys_group);
6453 } 6564 }
6454 6565
6455#ifdef CONFIG_NUMA 6566#ifdef CONFIG_NUMA
6456 /* Set up node groups */ 6567 /* Set up node groups */
6457 if (sched_group_allnodes) 6568 if (sd_allnodes)
6458 init_sched_build_groups(sched_group_allnodes, *cpu_map, 6569 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6459 cpu_map, &cpu_to_allnodes_group);
6460 6570
6461 for (i = 0; i < MAX_NUMNODES; i++) { 6571 for (i = 0; i < MAX_NUMNODES; i++) {
6462 /* Set up node groups */ 6572 /* Set up node groups */
@@ -6548,10 +6658,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6548 for (i = 0; i < MAX_NUMNODES; i++) 6658 for (i = 0; i < MAX_NUMNODES; i++)
6549 init_numa_sched_groups_power(sched_group_nodes[i]); 6659 init_numa_sched_groups_power(sched_group_nodes[i]);
6550 6660
6551 if (sched_group_allnodes) { 6661 if (sd_allnodes) {
6552 int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); 6662 struct sched_group *sg;
6553 struct sched_group *sg = &sched_group_allnodes[group];
6554 6663
6664 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6555 init_numa_sched_groups_power(sg); 6665 init_numa_sched_groups_power(sg);
6556 } 6666 }
6557#endif 6667#endif
@@ -6723,8 +6833,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6723 sched_smt_power_savings_store); 6833 sched_smt_power_savings_store);
6724#endif 6834#endif
6725 6835
6726
6727#ifdef CONFIG_HOTPLUG_CPU
6728/* 6836/*
6729 * Force a reinitialization of the sched domains hierarchy. The domains 6837 * Force a reinitialization of the sched domains hierarchy. The domains
6730 * and groups cannot be updated in place without racing with the balancing 6838 * and groups cannot be updated in place without racing with the balancing
@@ -6757,7 +6865,6 @@ static int update_sched_domains(struct notifier_block *nfb,
6757 6865
6758 return NOTIFY_OK; 6866 return NOTIFY_OK;
6759} 6867}
6760#endif
6761 6868
6762void __init sched_init_smp(void) 6869void __init sched_init_smp(void)
6763{ 6870{
@@ -6833,6 +6940,10 @@ void __init sched_init(void)
6833 6940
6834 set_load_weight(&init_task); 6941 set_load_weight(&init_task);
6835 6942
6943#ifdef CONFIG_SMP
6944 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6945#endif
6946
6836#ifdef CONFIG_RT_MUTEXES 6947#ifdef CONFIG_RT_MUTEXES
6837 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6948 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6838#endif 6949#endif
@@ -6867,6 +6978,9 @@ void __might_sleep(char *file, int line)
6867 " context at %s:%d\n", file, line); 6978 " context at %s:%d\n", file, line);
6868 printk("in_atomic():%d, irqs_disabled():%d\n", 6979 printk("in_atomic():%d, irqs_disabled():%d\n",
6869 in_atomic(), irqs_disabled()); 6980 in_atomic(), irqs_disabled());
6981 debug_show_held_locks(current);
6982 if (irqs_disabled())
6983 print_irqtrace_events(current);
6870 dump_stack(); 6984 dump_stack();
6871 } 6985 }
6872#endif 6986#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 7ed8d5304bec..5630255d2e2a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,10 @@
23#include <linux/ptrace.h> 23#include <linux/ptrace.h>
24#include <linux/signal.h> 24#include <linux/signal.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/freezer.h>
27#include <linux/pid_namespace.h>
28#include <linux/nsproxy.h>
29
26#include <asm/param.h> 30#include <asm/param.h>
27#include <asm/uaccess.h> 31#include <asm/uaccess.h>
28#include <asm/unistd.h> 32#include <asm/unistd.h>
@@ -33,7 +37,7 @@
33 * SLAB caches for signal bits. 37 * SLAB caches for signal bits.
34 */ 38 */
35 39
36static kmem_cache_t *sigqueue_cachep; 40static struct kmem_cache *sigqueue_cachep;
37 41
38/* 42/*
39 * In POSIX a signal is sent either to a specific thread (Linux task) 43 * In POSIX a signal is sent either to a specific thread (Linux task)
@@ -267,18 +271,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
267 int override_rlimit) 271 int override_rlimit)
268{ 272{
269 struct sigqueue *q = NULL; 273 struct sigqueue *q = NULL;
274 struct user_struct *user;
270 275
271 atomic_inc(&t->user->sigpending); 276 /*
277 * In order to avoid problems with "switch_user()", we want to make
278 * sure that the compiler doesn't re-load "t->user"
279 */
280 user = t->user;
281 barrier();
282 atomic_inc(&user->sigpending);
272 if (override_rlimit || 283 if (override_rlimit ||
273 atomic_read(&t->user->sigpending) <= 284 atomic_read(&user->sigpending) <=
274 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 285 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
275 q = kmem_cache_alloc(sigqueue_cachep, flags); 286 q = kmem_cache_alloc(sigqueue_cachep, flags);
276 if (unlikely(q == NULL)) { 287 if (unlikely(q == NULL)) {
277 atomic_dec(&t->user->sigpending); 288 atomic_dec(&user->sigpending);
278 } else { 289 } else {
279 INIT_LIST_HEAD(&q->list); 290 INIT_LIST_HEAD(&q->list);
280 q->flags = 0; 291 q->flags = 0;
281 q->user = get_uid(t->user); 292 q->user = get_uid(user);
282 } 293 }
283 return(q); 294 return(q);
284} 295}
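
The allocation path now reads t->user once into a local variable, with a compiler barrier so the increment, the rlimit check, the error-path decrement and get_uid() all act on the same user_struct even if t->user is switched concurrently. The shape of that idiom in plain C (illustrative only; barrier() is the usual GCC memory clobber, the structures are invented stand-ins):

#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

struct user {                      /* made-up stand-in for user_struct */
	int sigpending;
};

struct task {                      /* made-up stand-in for task_struct */
	struct user *user;         /* may be switched by another path */
};

static int charge_signal(struct task *t, int limit)
{
	struct user *user = t->user;   /* read the pointer exactly once */

	barrier();                     /* keep the compiler from re-reading t->user */
	user->sigpending++;
	if (user->sigpending > limit) {
		user->sigpending--;    /* undo against the same object we charged */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct user u = { 0 };
	struct task t = { &u };

	printf("charge: %d, pending now %d\n", charge_signal(&t, 4), u.sigpending);
	return 0;
}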
@@ -575,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
575 error = -EPERM; 586 error = -EPERM;
576 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 587 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
577 && ((sig != SIGCONT) || 588 && ((sig != SIGCONT) ||
578 (current->signal->session != t->signal->session)) 589 (process_session(current) != process_session(t)))
579 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 590 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
580 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 591 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
581 && !capable(CAP_KILL)) 592 && !capable(CAP_KILL))
@@ -1126,8 +1137,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1126 return error; 1137 return error;
1127} 1138}
1128 1139
1129int 1140static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1130kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1131{ 1141{
1132 int error; 1142 int error;
1133 rcu_read_lock(); 1143 rcu_read_lock();
@@ -1695,7 +1705,9 @@ finish_stop(int stop_count)
1695 read_unlock(&tasklist_lock); 1705 read_unlock(&tasklist_lock);
1696 } 1706 }
1697 1707
1698 schedule(); 1708 do {
1709 schedule();
1710 } while (try_to_freeze());
1699 /* 1711 /*
1700 * Now we don't run again until continued. 1712 * Now we don't run again until continued.
1701 */ 1713 */
@@ -1870,8 +1882,12 @@ relock:
1870 if (sig_kernel_ignore(signr)) /* Default is nothing. */ 1882 if (sig_kernel_ignore(signr)) /* Default is nothing. */
1871 continue; 1883 continue;
1872 1884
1873 /* Init gets no signals it doesn't want. */ 1885 /*
1874 if (current == child_reaper) 1886 * Init of a pid space gets no signals it doesn't want from
1887 * within that pid space. It can of course get signals from
1888 * its parent pid space.
1889 */
1890 if (current == child_reaper(current))
1875 continue; 1891 continue;
1876 1892
1877 if (sig_kernel_stop(signr)) { 1893 if (sig_kernel_stop(signr)) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce16..918e52df090e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
574 574
575 switch (action) { 575 switch (action) {
576 case CPU_UP_PREPARE: 576 case CPU_UP_PREPARE:
577 BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
578 BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
579 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 577 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
580 if (IS_ERR(p)) { 578 if (IS_ERR(p)) {
581 printk("ksoftirqd for %i failed\n", hotcpu); 579 printk("ksoftirqd for %i failed\n", hotcpu);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 476c3741511b..2c6c2bf85514 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -293,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
293} 293}
294 294
295EXPORT_SYMBOL(_spin_lock_nested); 295EXPORT_SYMBOL(_spin_lock_nested);
296unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
297{
298 unsigned long flags;
299
300 local_irq_save(flags);
301 preempt_disable();
302 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
303 /*
 304 * On lockdep we don't want the hand-coded irq-enable of
305 * _raw_spin_lock_flags() code, because lockdep assumes
306 * that interrupts are not re-enabled during lock-acquire:
307 */
308#ifdef CONFIG_PROVE_SPIN_LOCKING
309 _raw_spin_lock(lock);
310#else
311 _raw_spin_lock_flags(lock, &flags);
312#endif
313 return flags;
314}
315
316EXPORT_SYMBOL(_spin_lock_irqsave_nested);
296 317
297#endif 318#endif
298 319
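
The new _spin_lock_irqsave_nested() gives callers that take two locks of the same lock class a way to tell lockdep that the acquisition belongs to a distinct subclass rather than being a self-deadlock, while also saving interrupt flags. A hedged kernel-style sketch of the general subclass idea, using spin_lock_nested() for the second lock once interrupts are already off (the structures are invented; the irqsave-nested variant is for callers where the nested acquisition itself must disable interrupts):

#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct queue {                          /* made-up object carrying a lock */
	spinlock_t lock;
	int items;
};

/*
 * Take both queue locks. They belong to the same lock class, so without an
 * annotation lockdep would flag the second acquisition as a possible
 * recursive deadlock; SINGLE_DEPTH_NESTING marks the nesting as intended.
 */
static void double_lock(struct queue *a, struct queue *b, unsigned long *flags)
{
	if (a < b) {                     /* lock in address order */
		spin_lock_irqsave(&a->lock, *flags);
		spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock_irqsave(&b->lock, *flags);
		spin_lock_nested(&a->lock, SINGLE_DEPTH_NESTING);
	}
}

static void double_unlock(struct queue *a, struct queue *b, unsigned long flags)
{
	spin_unlock(&b->lock);
	spin_unlock_irqrestore(&a->lock, flags);  /* irqs stay off until both dropped */
}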
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801b..c7675c1bfdf2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
880 return 0; 880 return 0;
881} 881}
882 882
883static void deferred_cad(void *dummy) 883static void deferred_cad(struct work_struct *dummy)
884{ 884{
885 kernel_restart(NULL); 885 kernel_restart(NULL);
886} 886}
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
892 */ 892 */
893void ctrl_alt_del(void) 893void ctrl_alt_del(void)
894{ 894{
895 static DECLARE_WORK(cad_work, deferred_cad, NULL); 895 static DECLARE_WORK(cad_work, deferred_cad);
896 896
897 if (C_A_D) 897 if (C_A_D)
898 schedule_work(&cad_work); 898 schedule_work(&cad_work);
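
deferred_cad() is converted to the reworked workqueue convention: handlers receive the struct work_struct pointer itself, DECLARE_WORK()/INIT_WORK() no longer carry a data argument, and per-item data is recovered with container_of() from the embedding structure. A kernel-style sketch of the new-style pattern (illustrative module code; the item and handler names are made up):

#include <linux/module.h>
#include <linux/workqueue.h>

struct example_item {
	int value;
	struct work_struct work;        /* embedded, no more void *data */
};

static struct example_item item = { .value = 42 };

static void example_handler(struct work_struct *work)
{
	/* Recover the enclosing object from the work pointer. */
	struct example_item *it = container_of(work, struct example_item, work);

	printk(KERN_INFO "handled item with value %d\n", it->value);
}

static int __init example_init(void)
{
	INIT_WORK(&item.work, example_handler);  /* two arguments now */
	schedule_work(&item.work);
	return 0;
}

static void __exit example_exit(void)
{
	flush_scheduled_work();
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");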
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1102asmlinkage long sys_setuid(uid_t uid) 1102asmlinkage long sys_setuid(uid_t uid)
1103{ 1103{
1104 int old_euid = current->euid; 1104 int old_euid = current->euid;
1105 int old_ruid, old_suid, new_ruid, new_suid; 1105 int old_ruid, old_suid, new_suid;
1106 int retval; 1106 int retval;
1107 1107
1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
1109 if (retval) 1109 if (retval)
1110 return retval; 1110 return retval;
1111 1111
1112 old_ruid = new_ruid = current->uid; 1112 old_ruid = current->uid;
1113 old_suid = current->suid; 1113 old_suid = current->suid;
1114 new_suid = old_suid; 1114 new_suid = old_suid;
1115 1115
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1381 1381
1382 if (p->real_parent == group_leader) { 1382 if (p->real_parent == group_leader) {
1383 err = -EPERM; 1383 err = -EPERM;
1384 if (p->signal->session != group_leader->signal->session) 1384 if (process_session(p) != process_session(group_leader))
1385 goto out; 1385 goto out;
1386 err = -EACCES; 1386 err = -EACCES;
1387 if (p->did_exec) 1387 if (p->did_exec)
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1397 goto out; 1397 goto out;
1398 1398
1399 if (pgid != pid) { 1399 if (pgid != pid) {
1400 struct task_struct *p; 1400 struct task_struct *g =
1401 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1401 1402
1402 do_each_task_pid(pgid, PIDTYPE_PGID, p) { 1403 if (!g || process_session(g) != process_session(group_leader))
1403 if (p->signal->session == group_leader->signal->session) 1404 goto out;
1404 goto ok_pgid;
1405 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
1406 goto out;
1407 } 1405 }
1408 1406
1409ok_pgid:
1410 err = security_task_setpgid(p, pgid); 1407 err = security_task_setpgid(p, pgid);
1411 if (err) 1408 if (err)
1412 goto out; 1409 goto out;
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void)
1459asmlinkage long sys_getsid(pid_t pid) 1456asmlinkage long sys_getsid(pid_t pid)
1460{ 1457{
1461 if (!pid) 1458 if (!pid)
1462 return current->signal->session; 1459 return process_session(current);
1463 else { 1460 else {
1464 int retval; 1461 int retval;
1465 struct task_struct *p; 1462 struct task_struct *p;
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid)
1471 if (p) { 1468 if (p) {
1472 retval = security_task_getsid(p); 1469 retval = security_task_getsid(p);
1473 if (!retval) 1470 if (!retval)
1474 retval = p->signal->session; 1471 retval = process_session(p);
1475 } 1472 }
1476 read_unlock(&tasklist_lock); 1473 read_unlock(&tasklist_lock);
1477 return retval; 1474 return retval;
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void)
1484 pid_t session; 1481 pid_t session;
1485 int err = -EPERM; 1482 int err = -EPERM;
1486 1483
1487 mutex_lock(&tty_mutex);
1488 write_lock_irq(&tasklist_lock); 1484 write_lock_irq(&tasklist_lock);
1489 1485
1490 /* Fail if I am already a session leader */ 1486 /* Fail if I am already a session leader */
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void)
1504 1500
1505 group_leader->signal->leader = 1; 1501 group_leader->signal->leader = 1;
1506 __set_special_pids(session, session); 1502 __set_special_pids(session, session);
1503
1504 spin_lock(&group_leader->sighand->siglock);
1507 group_leader->signal->tty = NULL; 1505 group_leader->signal->tty = NULL;
1508 group_leader->signal->tty_old_pgrp = 0; 1506 group_leader->signal->tty_old_pgrp = 0;
1507 spin_unlock(&group_leader->sighand->siglock);
1508
1509 err = process_group(group_leader); 1509 err = process_group(group_leader);
1510out: 1510out:
1511 write_unlock_irq(&tasklist_lock); 1511 write_unlock_irq(&tasklist_lock);
1512 mutex_unlock(&tty_mutex);
1513 return err; 1512 return err;
1514} 1513}
1515 1514
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0e53314b14de..d7306d0f3dfc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,7 @@ cond_syscall(sys_madvise);
135cond_syscall(sys_mremap); 135cond_syscall(sys_mremap);
136cond_syscall(sys_remap_file_pages); 136cond_syscall(sys_remap_file_pages);
137cond_syscall(compat_sys_move_pages); 137cond_syscall(compat_sys_move_pages);
138cond_syscall(compat_sys_migrate_pages);
138 139
139/* block-layer dependent */ 140/* block-layer dependent */
140cond_syscall(sys_bdflush); 141cond_syscall(sys_bdflush);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bff2c18fb5a..600b33358ded 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
54 54
55#ifdef CONFIG_X86 55#ifdef CONFIG_X86
56#include <asm/nmi.h> 56#include <asm/nmi.h>
57#include <asm/stacktrace.h>
57#endif 58#endif
58 59
59#if defined(CONFIG_SYSCTL) 60#if defined(CONFIG_SYSCTL)
@@ -64,7 +65,6 @@ extern int sysctl_overcommit_memory;
64extern int sysctl_overcommit_ratio; 65extern int sysctl_overcommit_ratio;
65extern int sysctl_panic_on_oom; 66extern int sysctl_panic_on_oom;
66extern int max_threads; 67extern int max_threads;
67extern int sysrq_enabled;
68extern int core_uses_pid; 68extern int core_uses_pid;
69extern int suid_dumpable; 69extern int suid_dumpable;
70extern char core_pattern[]; 70extern char core_pattern[];
@@ -91,7 +91,9 @@ extern char modprobe_path[];
91extern int sg_big_buff; 91extern int sg_big_buff;
92#endif 92#endif
93#ifdef CONFIG_SYSVIPC 93#ifdef CONFIG_SYSVIPC
94static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 94static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos);
96static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos); 97 void __user *buffer, size_t *lenp, loff_t *ppos);
96#endif 98#endif
97 99
@@ -130,12 +132,22 @@ extern int max_lock_depth;
130 132
131#ifdef CONFIG_SYSCTL_SYSCALL 133#ifdef CONFIG_SYSCTL_SYSCALL
132static int parse_table(int __user *, int, void __user *, size_t __user *, 134static int parse_table(int __user *, int, void __user *, size_t __user *,
133 void __user *, size_t, ctl_table *, void **); 135 void __user *, size_t, ctl_table *);
134#endif 136#endif
135 137
136static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 138static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
137 void __user *buffer, size_t *lenp, loff_t *ppos); 139 void __user *buffer, size_t *lenp, loff_t *ppos);
138 140
141static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
142 void __user *oldval, size_t __user *oldlenp,
143 void __user *newval, size_t newlen);
144
145#ifdef CONFIG_SYSVIPC
146static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
147 void __user *oldval, size_t __user *oldlenp,
148 void __user *newval, size_t newlen);
149#endif
150
139#ifdef CONFIG_PROC_SYSCTL 151#ifdef CONFIG_PROC_SYSCTL
140static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 152static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
141 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -162,6 +174,40 @@ extern ctl_table inotify_table[];
162int sysctl_legacy_va_layout; 174int sysctl_legacy_va_layout;
163#endif 175#endif
164 176
177static void *get_uts(ctl_table *table, int write)
178{
179 char *which = table->data;
180#ifdef CONFIG_UTS_NS
181 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
182 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
183#endif
184 if (!write)
185 down_read(&uts_sem);
186 else
187 down_write(&uts_sem);
188 return which;
189}
190
191static void put_uts(ctl_table *table, int write, void *which)
192{
193 if (!write)
194 up_read(&uts_sem);
195 else
196 up_write(&uts_sem);
197}
198
199#ifdef CONFIG_SYSVIPC
200static void *get_ipc(ctl_table *table, int write)
201{
202 char *which = table->data;
203 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
204 which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
205 return which;
206}
207#else
208#define get_ipc(T,W) ((T)->data)
209#endif
210
165/* /proc declarations: */ 211/* /proc declarations: */
166 212
167#ifdef CONFIG_PROC_SYSCTL 213#ifdef CONFIG_PROC_SYSCTL
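
The get_uts()/get_ipc() helpers added above rebase a ctl_table's data pointer from the global init_uts_ns/init_ipc_ns instance onto the current task's namespace copy by preserving the field's byte offset. A standalone sketch of that offset arithmetic (function and parameter names are illustrative):

    #include <stddef.h>

    /* Rebase a pointer to a field of init_obj onto the same field of
     * ns_obj, the per-namespace copy, by keeping the byte offset. */
    static void *ns_rebase(void *init_obj, void *init_field, void *ns_obj)
    {
            size_t offset = (char *)init_field - (char *)init_obj;

            return (char *)ns_obj + offset;
    }
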
@@ -170,7 +216,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
170static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 216static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
171static int proc_opensys(struct inode *, struct file *); 217static int proc_opensys(struct inode *, struct file *);
172 218
173struct file_operations proc_sys_file_operations = { 219const struct file_operations proc_sys_file_operations = {
174 .open = proc_opensys, 220 .open = proc_opensys,
175 .read = proc_readsys, 221 .read = proc_readsys,
176 .write = proc_writesys, 222 .write = proc_writesys,
@@ -228,7 +274,6 @@ static ctl_table root_table[] = {
228}; 274};
229 275
230static ctl_table kern_table[] = { 276static ctl_table kern_table[] = {
231#ifndef CONFIG_UTS_NS
232 { 277 {
233 .ctl_name = KERN_OSTYPE, 278 .ctl_name = KERN_OSTYPE,
234 .procname = "ostype", 279 .procname = "ostype",
@@ -236,7 +281,7 @@ static ctl_table kern_table[] = {
236 .maxlen = sizeof(init_uts_ns.name.sysname), 281 .maxlen = sizeof(init_uts_ns.name.sysname),
237 .mode = 0444, 282 .mode = 0444,
238 .proc_handler = &proc_do_uts_string, 283 .proc_handler = &proc_do_uts_string,
239 .strategy = &sysctl_string, 284 .strategy = &sysctl_uts_string,
240 }, 285 },
241 { 286 {
242 .ctl_name = KERN_OSRELEASE, 287 .ctl_name = KERN_OSRELEASE,
@@ -245,7 +290,7 @@ static ctl_table kern_table[] = {
245 .maxlen = sizeof(init_uts_ns.name.release), 290 .maxlen = sizeof(init_uts_ns.name.release),
246 .mode = 0444, 291 .mode = 0444,
247 .proc_handler = &proc_do_uts_string, 292 .proc_handler = &proc_do_uts_string,
248 .strategy = &sysctl_string, 293 .strategy = &sysctl_uts_string,
249 }, 294 },
250 { 295 {
251 .ctl_name = KERN_VERSION, 296 .ctl_name = KERN_VERSION,
@@ -254,7 +299,7 @@ static ctl_table kern_table[] = {
254 .maxlen = sizeof(init_uts_ns.name.version), 299 .maxlen = sizeof(init_uts_ns.name.version),
255 .mode = 0444, 300 .mode = 0444,
256 .proc_handler = &proc_do_uts_string, 301 .proc_handler = &proc_do_uts_string,
257 .strategy = &sysctl_string, 302 .strategy = &sysctl_uts_string,
258 }, 303 },
259 { 304 {
260 .ctl_name = KERN_NODENAME, 305 .ctl_name = KERN_NODENAME,
@@ -263,7 +308,7 @@ static ctl_table kern_table[] = {
263 .maxlen = sizeof(init_uts_ns.name.nodename), 308 .maxlen = sizeof(init_uts_ns.name.nodename),
264 .mode = 0644, 309 .mode = 0644,
265 .proc_handler = &proc_do_uts_string, 310 .proc_handler = &proc_do_uts_string,
266 .strategy = &sysctl_string, 311 .strategy = &sysctl_uts_string,
267 }, 312 },
268 { 313 {
269 .ctl_name = KERN_DOMAINNAME, 314 .ctl_name = KERN_DOMAINNAME,
@@ -272,56 +317,8 @@ static ctl_table kern_table[] = {
272 .maxlen = sizeof(init_uts_ns.name.domainname), 317 .maxlen = sizeof(init_uts_ns.name.domainname),
273 .mode = 0644, 318 .mode = 0644,
274 .proc_handler = &proc_do_uts_string, 319 .proc_handler = &proc_do_uts_string,
275 .strategy = &sysctl_string, 320 .strategy = &sysctl_uts_string,
276 },
277#else /* !CONFIG_UTS_NS */
278 {
279 .ctl_name = KERN_OSTYPE,
280 .procname = "ostype",
281 .data = NULL,
282 /* could maybe use __NEW_UTS_LEN here? */
283 .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
284 .mode = 0444,
285 .proc_handler = &proc_do_uts_string,
286 .strategy = &sysctl_string,
287 },
288 {
289 .ctl_name = KERN_OSRELEASE,
290 .procname = "osrelease",
291 .data = NULL,
292 .maxlen = FIELD_SIZEOF(struct new_utsname, release),
293 .mode = 0444,
294 .proc_handler = &proc_do_uts_string,
295 .strategy = &sysctl_string,
296 },
297 {
298 .ctl_name = KERN_VERSION,
299 .procname = "version",
300 .data = NULL,
301 .maxlen = FIELD_SIZEOF(struct new_utsname, version),
302 .mode = 0444,
303 .proc_handler = &proc_do_uts_string,
304 .strategy = &sysctl_string,
305 },
306 {
307 .ctl_name = KERN_NODENAME,
308 .procname = "hostname",
309 .data = NULL,
310 .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
311 .mode = 0644,
312 .proc_handler = &proc_do_uts_string,
313 .strategy = &sysctl_string,
314 },
315 {
316 .ctl_name = KERN_DOMAINNAME,
317 .procname = "domainname",
318 .data = NULL,
319 .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
320 .mode = 0644,
321 .proc_handler = &proc_do_uts_string,
322 .strategy = &sysctl_string,
323 }, 321 },
324#endif /* !CONFIG_UTS_NS */
325 { 322 {
326 .ctl_name = KERN_PANIC, 323 .ctl_name = KERN_PANIC,
327 .procname = "panic", 324 .procname = "panic",
@@ -480,65 +477,72 @@ static ctl_table kern_table[] = {
480 { 477 {
481 .ctl_name = KERN_SHMMAX, 478 .ctl_name = KERN_SHMMAX,
482 .procname = "shmmax", 479 .procname = "shmmax",
483 .data = NULL, 480 .data = &init_ipc_ns.shm_ctlmax,
484 .maxlen = sizeof (size_t), 481 .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
485 .mode = 0644, 482 .mode = 0644,
486 .proc_handler = &proc_do_ipc_string, 483 .proc_handler = &proc_ipc_doulongvec_minmax,
484 .strategy = sysctl_ipc_data,
487 }, 485 },
488 { 486 {
489 .ctl_name = KERN_SHMALL, 487 .ctl_name = KERN_SHMALL,
490 .procname = "shmall", 488 .procname = "shmall",
491 .data = NULL, 489 .data = &init_ipc_ns.shm_ctlall,
492 .maxlen = sizeof (size_t), 490 .maxlen = sizeof (init_ipc_ns.shm_ctlall),
493 .mode = 0644, 491 .mode = 0644,
494 .proc_handler = &proc_do_ipc_string, 492 .proc_handler = &proc_ipc_doulongvec_minmax,
493 .strategy = sysctl_ipc_data,
495 }, 494 },
496 { 495 {
497 .ctl_name = KERN_SHMMNI, 496 .ctl_name = KERN_SHMMNI,
498 .procname = "shmmni", 497 .procname = "shmmni",
499 .data = NULL, 498 .data = &init_ipc_ns.shm_ctlmni,
500 .maxlen = sizeof (int), 499 .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
501 .mode = 0644, 500 .mode = 0644,
502 .proc_handler = &proc_do_ipc_string, 501 .proc_handler = &proc_ipc_dointvec,
502 .strategy = sysctl_ipc_data,
503 }, 503 },
504 { 504 {
505 .ctl_name = KERN_MSGMAX, 505 .ctl_name = KERN_MSGMAX,
506 .procname = "msgmax", 506 .procname = "msgmax",
507 .data = NULL, 507 .data = &init_ipc_ns.msg_ctlmax,
508 .maxlen = sizeof (int), 508 .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
509 .mode = 0644, 509 .mode = 0644,
510 .proc_handler = &proc_do_ipc_string, 510 .proc_handler = &proc_ipc_dointvec,
511 .strategy = sysctl_ipc_data,
511 }, 512 },
512 { 513 {
513 .ctl_name = KERN_MSGMNI, 514 .ctl_name = KERN_MSGMNI,
514 .procname = "msgmni", 515 .procname = "msgmni",
515 .data = NULL, 516 .data = &init_ipc_ns.msg_ctlmni,
516 .maxlen = sizeof (int), 517 .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
517 .mode = 0644, 518 .mode = 0644,
518 .proc_handler = &proc_do_ipc_string, 519 .proc_handler = &proc_ipc_dointvec,
520 .strategy = sysctl_ipc_data,
519 }, 521 },
520 { 522 {
521 .ctl_name = KERN_MSGMNB, 523 .ctl_name = KERN_MSGMNB,
522 .procname = "msgmnb", 524 .procname = "msgmnb",
523 .data = NULL, 525 .data = &init_ipc_ns.msg_ctlmnb,
524 .maxlen = sizeof (int), 526 .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
525 .mode = 0644, 527 .mode = 0644,
526 .proc_handler = &proc_do_ipc_string, 528 .proc_handler = &proc_ipc_dointvec,
529 .strategy = sysctl_ipc_data,
527 }, 530 },
528 { 531 {
529 .ctl_name = KERN_SEM, 532 .ctl_name = KERN_SEM,
530 .procname = "sem", 533 .procname = "sem",
531 .data = NULL, 534 .data = &init_ipc_ns.sem_ctls,
532 .maxlen = 4*sizeof (int), 535 .maxlen = 4*sizeof (int),
533 .mode = 0644, 536 .mode = 0644,
534 .proc_handler = &proc_do_ipc_string, 537 .proc_handler = &proc_ipc_dointvec,
538 .strategy = sysctl_ipc_data,
535 }, 539 },
536#endif 540#endif
537#ifdef CONFIG_MAGIC_SYSRQ 541#ifdef CONFIG_MAGIC_SYSRQ
538 { 542 {
539 .ctl_name = KERN_SYSRQ, 543 .ctl_name = KERN_SYSRQ,
540 .procname = "sysrq", 544 .procname = "sysrq",
541 .data = &sysrq_enabled, 545 .data = &__sysrq_enabled,
542 .maxlen = sizeof (int), 546 .maxlen = sizeof (int),
543 .mode = 0644, 547 .mode = 0644,
544 .proc_handler = &proc_dointvec, 548 .proc_handler = &proc_dointvec,
@@ -707,6 +711,14 @@ static ctl_table kern_table[] = {
707 .mode = 0444, 711 .mode = 0444,
708 .proc_handler = &proc_dointvec, 712 .proc_handler = &proc_dointvec,
709 }, 713 },
714 {
715 .ctl_name = CTL_UNNUMBERED,
716 .procname = "kstack_depth_to_print",
717 .data = &kstack_depth_to_print,
718 .maxlen = sizeof(int),
719 .mode = 0644,
720 .proc_handler = &proc_dointvec,
721 },
710#endif 722#endif
711#if defined(CONFIG_MMU) 723#if defined(CONFIG_MMU)
712 { 724 {
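
The new CTL_UNNUMBERED entry above exposes kstack_depth_to_print under /proc/sys/kernel/. A hedged userspace sketch of tuning it through procfs (the value 32 is only an example):

    #include <stdio.h>

    int main(void)
    {
            /* Path follows from kern_table plus the .procname above. */
            FILE *f = fopen("/proc/sys/kernel/kstack_depth_to_print", "w");

            if (!f)
                    return 1;
            fprintf(f, "%d\n", 32);   /* stack words shown in dumps */
            return fclose(f) ? 1 : 0;
    }
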
@@ -977,17 +989,6 @@ static ctl_table vm_table[] = {
977 .extra1 = &zero, 989 .extra1 = &zero,
978 }, 990 },
979#endif 991#endif
980#ifdef CONFIG_SWAP
981 {
982 .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
983 .procname = "swap_token_timeout",
984 .data = &swap_token_default_timeout,
985 .maxlen = sizeof(swap_token_default_timeout),
986 .mode = 0644,
987 .proc_handler = &proc_dointvec_jiffies,
988 .strategy = &sysctl_jiffies,
989 },
990#endif
991#ifdef CONFIG_NUMA 992#ifdef CONFIG_NUMA
992 { 993 {
993 .ctl_name = VM_ZONE_RECLAIM_MODE, 994 .ctl_name = VM_ZONE_RECLAIM_MODE,
@@ -1241,7 +1242,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1241 do { 1242 do {
1242 struct ctl_table_header *head = 1243 struct ctl_table_header *head =
1243 list_entry(tmp, struct ctl_table_header, ctl_entry); 1244 list_entry(tmp, struct ctl_table_header, ctl_entry);
1244 void *context = NULL;
1245 1245
1246 if (!use_table(head)) 1246 if (!use_table(head))
1247 continue; 1247 continue;
@@ -1249,9 +1249,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1249 spin_unlock(&sysctl_lock); 1249 spin_unlock(&sysctl_lock);
1250 1250
1251 error = parse_table(name, nlen, oldval, oldlenp, 1251 error = parse_table(name, nlen, oldval, oldlenp,
1252 newval, newlen, head->ctl_table, 1252 newval, newlen, head->ctl_table);
1253 &context);
1254 kfree(context);
1255 1253
1256 spin_lock(&sysctl_lock); 1254 spin_lock(&sysctl_lock);
1257 unuse_table(head); 1255 unuse_table(head);
@@ -1307,7 +1305,7 @@ static inline int ctl_perm(ctl_table *table, int op)
1307static int parse_table(int __user *name, int nlen, 1305static int parse_table(int __user *name, int nlen,
1308 void __user *oldval, size_t __user *oldlenp, 1306 void __user *oldval, size_t __user *oldlenp,
1309 void __user *newval, size_t newlen, 1307 void __user *newval, size_t newlen,
1310 ctl_table *table, void **context) 1308 ctl_table *table)
1311{ 1309{
1312 int n; 1310 int n;
1313repeat: 1311repeat:
@@ -1315,7 +1313,9 @@ repeat:
1315 return -ENOTDIR; 1313 return -ENOTDIR;
1316 if (get_user(n, name)) 1314 if (get_user(n, name))
1317 return -EFAULT; 1315 return -EFAULT;
1318 for ( ; table->ctl_name; table++) { 1316 for ( ; table->ctl_name || table->procname; table++) {
1317 if (!table->ctl_name)
1318 continue;
1319 if (n == table->ctl_name || table->ctl_name == CTL_ANY) { 1319 if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
1320 int error; 1320 int error;
1321 if (table->child) { 1321 if (table->child) {
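
The loop changes here and in the register/unregister helpers below allow table entries that carry a procname but no binary ctl_name: they no longer terminate the table, are skipped by the binary sys_sysctl path, and stay reachable through /proc/sys. A sketch of such an entry, with illustrative names:

    static int example_knob;

    static ctl_table example_table[] = {
            {
                    /* .ctl_name left 0: no binary sysctl number, so
                     * parse_table() above skips it; procfs-only. */
                    .procname     = "example_knob",
                    .data         = &example_knob,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = &proc_dointvec,
            },
            { .ctl_name = 0 }       /* terminator: no name, no procname */
    };
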
@@ -1325,7 +1325,7 @@ repeat:
1325 error = table->strategy( 1325 error = table->strategy(
1326 table, name, nlen, 1326 table, name, nlen,
1327 oldval, oldlenp, 1327 oldval, oldlenp,
1328 newval, newlen, context); 1328 newval, newlen);
1329 if (error) 1329 if (error)
1330 return error; 1330 return error;
1331 } 1331 }
@@ -1336,7 +1336,7 @@ repeat:
1336 } 1336 }
1337 error = do_sysctl_strategy(table, name, nlen, 1337 error = do_sysctl_strategy(table, name, nlen,
1338 oldval, oldlenp, 1338 oldval, oldlenp,
1339 newval, newlen, context); 1339 newval, newlen);
1340 return error; 1340 return error;
1341 } 1341 }
1342 } 1342 }
@@ -1347,7 +1347,7 @@ repeat:
1347int do_sysctl_strategy (ctl_table *table, 1347int do_sysctl_strategy (ctl_table *table,
1348 int __user *name, int nlen, 1348 int __user *name, int nlen,
1349 void __user *oldval, size_t __user *oldlenp, 1349 void __user *oldval, size_t __user *oldlenp,
1350 void __user *newval, size_t newlen, void **context) 1350 void __user *newval, size_t newlen)
1351{ 1351{
1352 int op = 0, rc; 1352 int op = 0, rc;
1353 size_t len; 1353 size_t len;
@@ -1361,7 +1361,7 @@ int do_sysctl_strategy (ctl_table *table,
1361 1361
1362 if (table->strategy) { 1362 if (table->strategy) {
1363 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1363 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1364 newval, newlen, context); 1364 newval, newlen);
1365 if (rc < 0) 1365 if (rc < 0)
1366 return rc; 1366 return rc;
1367 if (rc > 0) 1367 if (rc > 0)
@@ -1532,7 +1532,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
1532 int len; 1532 int len;
1533 mode_t mode; 1533 mode_t mode;
1534 1534
1535 for (; table->ctl_name; table++) { 1535 for (; table->ctl_name || table->procname; table++) {
1536 /* Can't do anything without a proc name. */ 1536 /* Can't do anything without a proc name. */
1537 if (!table->procname) 1537 if (!table->procname)
1538 continue; 1538 continue;
@@ -1579,7 +1579,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
1579static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) 1579static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
1580{ 1580{
1581 struct proc_dir_entry *de; 1581 struct proc_dir_entry *de;
1582 for (; table->ctl_name; table++) { 1582 for (; table->ctl_name || table->procname; table++) {
1583 if (!(de = table->de)) 1583 if (!(de = table->de))
1584 continue; 1584 continue;
1585 if (de->mode & S_IFDIR) { 1585 if (de->mode & S_IFDIR) {
@@ -1614,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1614 size_t count, loff_t *ppos) 1614 size_t count, loff_t *ppos)
1615{ 1615{
1616 int op; 1616 int op;
1617 struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); 1617 struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
1618 struct ctl_table *table; 1618 struct ctl_table *table;
1619 size_t res; 1619 size_t res;
1620 ssize_t error = -ENOTDIR; 1620 ssize_t error = -ENOTDIR;
@@ -1753,66 +1753,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1753 * Special case of dostring for the UTS structure. This has locks 1753 * Special case of dostring for the UTS structure. This has locks
1754 * to observe. Should this be in kernel/sys.c ???? 1754 * to observe. Should this be in kernel/sys.c ????
1755 */ 1755 */
1756
1757#ifndef CONFIG_UTS_NS
1758static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1759 void __user *buffer, size_t *lenp, loff_t *ppos)
1760{
1761 int r;
1762 1756
1763 if (!write) {
1764 down_read(&uts_sem);
1765 r=proc_dostring(table,0,filp,buffer,lenp, ppos);
1766 up_read(&uts_sem);
1767 } else {
1768 down_write(&uts_sem);
1769 r=proc_dostring(table,1,filp,buffer,lenp, ppos);
1770 up_write(&uts_sem);
1771 }
1772 return r;
1773}
1774#else /* !CONFIG_UTS_NS */
1775static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 1757static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1776 void __user *buffer, size_t *lenp, loff_t *ppos) 1758 void __user *buffer, size_t *lenp, loff_t *ppos)
1777{ 1759{
1778 int r; 1760 int r;
1779 struct uts_namespace* uts_ns = current->nsproxy->uts_ns; 1761 void *which;
1780 char* which; 1762 which = get_uts(table, write);
1781 1763 r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
1782 switch (table->ctl_name) { 1764 put_uts(table, write, which);
1783 case KERN_OSTYPE:
1784 which = uts_ns->name.sysname;
1785 break;
1786 case KERN_NODENAME:
1787 which = uts_ns->name.nodename;
1788 break;
1789 case KERN_OSRELEASE:
1790 which = uts_ns->name.release;
1791 break;
1792 case KERN_VERSION:
1793 which = uts_ns->name.version;
1794 break;
1795 case KERN_DOMAINNAME:
1796 which = uts_ns->name.domainname;
1797 break;
1798 default:
1799 r = -EINVAL;
1800 goto out;
1801 }
1802
1803 if (!write) {
1804 down_read(&uts_sem);
1805 r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
1806 up_read(&uts_sem);
1807 } else {
1808 down_write(&uts_sem);
1809 r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
1810 up_write(&uts_sem);
1811 }
1812 out:
1813 return r; 1765 return r;
1814} 1766}
1815#endif /* !CONFIG_UTS_NS */
1816 1767
1817static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 1768static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1818 int *valp, 1769 int *valp,
@@ -1884,7 +1835,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1884 p = buf; 1835 p = buf;
1885 if (*p == '-' && left > 1) { 1836 if (*p == '-' && left > 1) {
1886 neg = 1; 1837 neg = 1;
1887 left--, p++; 1838 p++;
1888 } 1839 }
1889 if (*p < '0' || *p > '9') 1840 if (*p < '0' || *p > '9')
1890 break; 1841 break;
@@ -1976,9 +1927,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
1976 1927
1977#define OP_SET 0 1928#define OP_SET 0
1978#define OP_AND 1 1929#define OP_AND 1
1979#define OP_OR 2
1980#define OP_MAX 3
1981#define OP_MIN 4
1982 1930
1983static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, 1931static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1984 int *valp, 1932 int *valp,
@@ -1990,13 +1938,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1990 switch(op) { 1938 switch(op) {
1991 case OP_SET: *valp = val; break; 1939 case OP_SET: *valp = val; break;
1992 case OP_AND: *valp &= val; break; 1940 case OP_AND: *valp &= val; break;
1993 case OP_OR: *valp |= val; break;
1994 case OP_MAX: if(*valp < val)
1995 *valp = val;
1996 break;
1997 case OP_MIN: if(*valp > val)
1998 *valp = val;
1999 break;
2000 } 1941 }
2001 } else { 1942 } else {
2002 int val = *valp; 1943 int val = *valp;
@@ -2135,7 +2076,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2135 p = buf; 2076 p = buf;
2136 if (*p == '-' && left > 1) { 2077 if (*p == '-' && left > 1) {
2137 neg = 1; 2078 neg = 1;
2138 left--, p++; 2079 p++;
2139 } 2080 }
2140 if (*p < '0' || *p > '9') 2081 if (*p < '0' || *p > '9')
2141 break; 2082 break;
@@ -2391,46 +2332,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2391} 2332}
2392 2333
2393#ifdef CONFIG_SYSVIPC 2334#ifdef CONFIG_SYSVIPC
2394static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 2335static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2395 void __user *buffer, size_t *lenp, loff_t *ppos) 2336 void __user *buffer, size_t *lenp, loff_t *ppos)
2396{ 2337{
2397 void *data; 2338 void *which;
2398 struct ipc_namespace *ns; 2339 which = get_ipc(table, write);
2399 2340 return __do_proc_dointvec(which, table, write, filp, buffer,
2400 ns = current->nsproxy->ipc_ns;
2401
2402 switch (table->ctl_name) {
2403 case KERN_SHMMAX:
2404 data = &ns->shm_ctlmax;
2405 goto proc_minmax;
2406 case KERN_SHMALL:
2407 data = &ns->shm_ctlall;
2408 goto proc_minmax;
2409 case KERN_SHMMNI:
2410 data = &ns->shm_ctlmni;
2411 break;
2412 case KERN_MSGMAX:
2413 data = &ns->msg_ctlmax;
2414 break;
2415 case KERN_MSGMNI:
2416 data = &ns->msg_ctlmni;
2417 break;
2418 case KERN_MSGMNB:
2419 data = &ns->msg_ctlmnb;
2420 break;
2421 case KERN_SEM:
2422 data = &ns->sem_ctls;
2423 break;
2424 default:
2425 return -EINVAL;
2426 }
2427
2428 return __do_proc_dointvec(data, table, write, filp, buffer,
2429 lenp, ppos, NULL, NULL); 2341 lenp, ppos, NULL, NULL);
2430proc_minmax: 2342}
2431 return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, 2343
2344static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2345 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
2346{
2347 void *which;
2348 which = get_ipc(table, write);
2349 return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
2432 lenp, ppos, 1l, 1l); 2350 lenp, ppos, 1l, 1l);
2433} 2351}
2352
2434#endif 2353#endif
2435 2354
2436static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2355static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -2475,6 +2394,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2475{ 2394{
2476 return -ENOSYS; 2395 return -ENOSYS;
2477} 2396}
2397static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2398 void __user *buffer, size_t *lenp, loff_t *ppos)
2399{
2400 return -ENOSYS;
2401}
2402static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2403 struct file *filp, void __user *buffer,
2404 size_t *lenp, loff_t *ppos)
2405{
2406 return -ENOSYS;
2407}
2478#endif 2408#endif
2479 2409
2480int proc_dointvec(ctl_table *table, int write, struct file *filp, 2410int proc_dointvec(ctl_table *table, int write, struct file *filp,
@@ -2539,7 +2469,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2539/* The generic string strategy routine: */ 2469/* The generic string strategy routine: */
2540int sysctl_string(ctl_table *table, int __user *name, int nlen, 2470int sysctl_string(ctl_table *table, int __user *name, int nlen,
2541 void __user *oldval, size_t __user *oldlenp, 2471 void __user *oldval, size_t __user *oldlenp,
2542 void __user *newval, size_t newlen, void **context) 2472 void __user *newval, size_t newlen)
2543{ 2473{
2544 if (!table->data || !table->maxlen) 2474 if (!table->data || !table->maxlen)
2545 return -ENOTDIR; 2475 return -ENOTDIR;
@@ -2585,7 +2515,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2585 */ 2515 */
2586int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2516int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2587 void __user *oldval, size_t __user *oldlenp, 2517 void __user *oldval, size_t __user *oldlenp,
2588 void __user *newval, size_t newlen, void **context) 2518 void __user *newval, size_t newlen)
2589{ 2519{
2590 2520
2591 if (newval && newlen) { 2521 if (newval && newlen) {
@@ -2621,7 +2551,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2621/* Strategy function to convert jiffies to seconds */ 2551/* Strategy function to convert jiffies to seconds */
2622int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2552int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2623 void __user *oldval, size_t __user *oldlenp, 2553 void __user *oldval, size_t __user *oldlenp,
2624 void __user *newval, size_t newlen, void **context) 2554 void __user *newval, size_t newlen)
2625{ 2555{
2626 if (oldval) { 2556 if (oldval) {
2627 size_t olen; 2557 size_t olen;
@@ -2649,7 +2579,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2649/* Strategy function to convert jiffies to seconds */ 2579/* Strategy function to convert jiffies to seconds */
2650int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2580int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2651 void __user *oldval, size_t __user *oldlenp, 2581 void __user *oldval, size_t __user *oldlenp,
2652 void __user *newval, size_t newlen, void **context) 2582 void __user *newval, size_t newlen)
2653{ 2583{
2654 if (oldval) { 2584 if (oldval) {
2655 size_t olen; 2585 size_t olen;
@@ -2674,50 +2604,140 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2674 return 1; 2604 return 1;
2675} 2605}
2676 2606
2607
2608/* The generic string strategy routine: */
2609static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2610 void __user *oldval, size_t __user *oldlenp,
2611 void __user *newval, size_t newlen)
2612{
2613 struct ctl_table uts_table;
2614 int r, write;
2615 write = newval && newlen;
2616 memcpy(&uts_table, table, sizeof(uts_table));
2617 uts_table.data = get_uts(table, write);
2618 r = sysctl_string(&uts_table, name, nlen,
2619 oldval, oldlenp, newval, newlen);
2620 put_uts(table, write, uts_table.data);
2621 return r;
2622}
2623
2624#ifdef CONFIG_SYSVIPC
2625/* The generic sysctl ipc data routine. */
2626static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2627 void __user *oldval, size_t __user *oldlenp,
2628 void __user *newval, size_t newlen)
2629{
2630 size_t len;
2631 void *data;
2632
2633 /* Get out of I don't have a variable */
2634 if (!table->data || !table->maxlen)
2635 return -ENOTDIR;
2636
2637 data = get_ipc(table, 1);
2638 if (!data)
2639 return -ENOTDIR;
2640
2641 if (oldval && oldlenp) {
2642 if (get_user(len, oldlenp))
2643 return -EFAULT;
2644 if (len) {
2645 if (len > table->maxlen)
2646 len = table->maxlen;
2647 if (copy_to_user(oldval, data, len))
2648 return -EFAULT;
2649 if (put_user(len, oldlenp))
2650 return -EFAULT;
2651 }
2652 }
2653
2654 if (newval && newlen) {
2655 if (newlen > table->maxlen)
2656 newlen = table->maxlen;
2657
2658 if (copy_from_user(data, newval, newlen))
2659 return -EFAULT;
2660 }
2661 return 1;
2662}
2663#endif
2664
2677#else /* CONFIG_SYSCTL_SYSCALL */ 2665#else /* CONFIG_SYSCTL_SYSCALL */
2678 2666
2679 2667
2680asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 2668asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2681{ 2669{
2682 static int msg_count; 2670 static int msg_count;
2671 struct __sysctl_args tmp;
2672 int name[CTL_MAXNAME];
2673 int i;
2674
2675 /* Read in the sysctl name for better debug message logging */
2676 if (copy_from_user(&tmp, args, sizeof(tmp)))
2677 return -EFAULT;
2678 if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME)
2679 return -ENOTDIR;
2680 for (i = 0; i < tmp.nlen; i++)
2681 if (get_user(name[i], tmp.name + i))
2682 return -EFAULT;
2683
2684 /* Ignore accesses to kernel.version */
2685 if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
2686 goto out;
2683 2687
2684 if (msg_count < 5) { 2688 if (msg_count < 5) {
2685 msg_count++; 2689 msg_count++;
2686 printk(KERN_INFO 2690 printk(KERN_INFO
2687 "warning: process `%s' used the removed sysctl " 2691 "warning: process `%s' used the removed sysctl "
2688 "system call\n", current->comm); 2692 "system call with ", current->comm);
2693 for (i = 0; i < tmp.nlen; i++)
2694 printk("%d.", name[i]);
2695 printk("\n");
2689 } 2696 }
2697out:
2690 return -ENOSYS; 2698 return -ENOSYS;
2691} 2699}
2692 2700
2693int sysctl_string(ctl_table *table, int __user *name, int nlen, 2701int sysctl_string(ctl_table *table, int __user *name, int nlen,
2694 void __user *oldval, size_t __user *oldlenp, 2702 void __user *oldval, size_t __user *oldlenp,
2695 void __user *newval, size_t newlen, void **context) 2703 void __user *newval, size_t newlen)
2696{ 2704{
2697 return -ENOSYS; 2705 return -ENOSYS;
2698} 2706}
2699 2707
2700int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2708int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2701 void __user *oldval, size_t __user *oldlenp, 2709 void __user *oldval, size_t __user *oldlenp,
2702 void __user *newval, size_t newlen, void **context) 2710 void __user *newval, size_t newlen)
2703{ 2711{
2704 return -ENOSYS; 2712 return -ENOSYS;
2705} 2713}
2706 2714
2707int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2715int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2708 void __user *oldval, size_t __user *oldlenp, 2716 void __user *oldval, size_t __user *oldlenp,
2709 void __user *newval, size_t newlen, void **context) 2717 void __user *newval, size_t newlen)
2710{ 2718{
2711 return -ENOSYS; 2719 return -ENOSYS;
2712} 2720}
2713 2721
2714int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2722int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2715 void __user *oldval, size_t __user *oldlenp, 2723 void __user *oldval, size_t __user *oldlenp,
2716 void __user *newval, size_t newlen, void **context) 2724 void __user *newval, size_t newlen)
2717{ 2725{
2718 return -ENOSYS; 2726 return -ENOSYS;
2719} 2727}
2720 2728
2729static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2730 void __user *oldval, size_t __user *oldlenp,
2731 void __user *newval, size_t newlen)
2732{
2733 return -ENOSYS;
2734}
2735static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2736 void __user *oldval, size_t __user *oldlenp,
2737 void __user *newval, size_t newlen)
2738{
2739 return -ENOSYS;
2740}
2721#endif /* CONFIG_SYSCTL_SYSCALL */ 2741#endif /* CONFIG_SYSCTL_SYSCALL */
2722 2742
2723/* 2743/*
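
A hedged userspace sketch of the binary sysctl(2) path that the sysctl_ipc_data() strategy above serves: reading kernel.shmmax for the caller's IPC namespace via the raw, long-deprecated _sysctl syscall. Error handling is minimal, and the call only works where the architecture provides __NR__sysctl and CONFIG_SYSCTL_SYSCALL is enabled.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/sysctl.h>

    int main(void)
    {
            int name[] = { CTL_KERN, KERN_SHMMAX };
            unsigned long shmmax = 0;
            size_t len = sizeof(shmmax);
            struct __sysctl_args args;

            memset(&args, 0, sizeof(args));
            args.name    = name;
            args.nlen    = 2;
            args.oldval  = &shmmax;
            args.oldlenp = &len;

            if (syscall(SYS__sysctl, &args) < 0)
                    return 1;
            printf("shmmax = %lu\n", shmmax);
            return 0;
    }
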
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2039585ec5e1..4c3476fa058d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
34 34
35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
36static int family_registered; 36static int family_registered;
37kmem_cache_t *taskstats_cache; 37struct kmem_cache *taskstats_cache;
38 38
39static struct genl_family family = { 39static struct genl_family family = {
40 .id = GENL_ID_GENERATE, 40 .id = GENL_ID_GENERATE,
@@ -69,7 +69,7 @@ enum actions {
69}; 69};
70 70
71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
72 void **replyp, size_t size) 72 size_t size)
73{ 73{
74 struct sk_buff *skb; 74 struct sk_buff *skb;
75 void *reply; 75 void *reply;
@@ -77,8 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
77 /* 77 /*
78 * If new attributes are added, please revisit this allocation 78 * If new attributes are added, please revisit this allocation
79 */ 79 */
80 size = nlmsg_total_size(genlmsg_total_size(size)); 80 skb = genlmsg_new(size, GFP_KERNEL);
81 skb = nlmsg_new(size, GFP_KERNEL);
82 if (!skb) 81 if (!skb)
83 return -ENOMEM; 82 return -ENOMEM;
84 83
@@ -86,20 +85,15 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
86 int seq = get_cpu_var(taskstats_seqnum)++; 85 int seq = get_cpu_var(taskstats_seqnum)++;
87 put_cpu_var(taskstats_seqnum); 86 put_cpu_var(taskstats_seqnum);
88 87
89 reply = genlmsg_put(skb, 0, seq, 88 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
90 family.id, 0, 0,
91 cmd, family.version);
92 } else 89 } else
93 reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, 90 reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
94 family.id, 0, 0,
95 cmd, family.version);
96 if (reply == NULL) { 91 if (reply == NULL) {
97 nlmsg_free(skb); 92 nlmsg_free(skb);
98 return -EINVAL; 93 return -EINVAL;
99 } 94 }
100 95
101 *skbp = skb; 96 *skbp = skb;
102 *replyp = reply;
103 return 0; 97 return 0;
104} 98}
105 99
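
With genlmsg_new() above, callers hand over only the attribute payload size and the helper is expected to add the netlink and genetlink header space itself. A sketch of how the callers in this file compute that payload (mirroring the expressions used later in the patch):

    size_t size;

    size = nla_total_size(sizeof(u32))                /* PID or TGID   */
         + nla_total_size(sizeof(struct taskstats))   /* STATS payload */
         + nla_total_size(0);                         /* AGGR nest hdr */
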
@@ -124,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
124/* 118/*
125 * Send taskstats data in @skb to listeners registered for @cpu's exit data 119 * Send taskstats data in @skb to listeners registered for @cpu's exit data
126 */ 120 */
127static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 121static void send_cpu_listeners(struct sk_buff *skb,
122 struct listener_list *listeners)
128{ 123{
129 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
130 struct listener_list *listeners;
131 struct listener *s, *tmp; 125 struct listener *s, *tmp;
132 struct sk_buff *skb_next, *skb_cur = skb; 126 struct sk_buff *skb_next, *skb_cur = skb;
133 void *reply = genlmsg_data(genlhdr); 127 void *reply = genlmsg_data(genlhdr);
@@ -140,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
140 } 134 }
141 135
142 rc = 0; 136 rc = 0;
143 listeners = &per_cpu(listener_array, cpu);
144 down_read(&listeners->sem); 137 down_read(&listeners->sem);
145 list_for_each_entry(s, &listeners->list, list) { 138 list_for_each_entry(s, &listeners->list, list) {
146 skb_next = NULL; 139 skb_next = NULL;
@@ -191,6 +184,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
191 } else 184 } else
192 get_task_struct(tsk); 185 get_task_struct(tsk);
193 186
187 memset(stats, 0, sizeof(*stats));
194 /* 188 /*
195 * Each accounting subsystem adds calls to its functions to 189 * Each accounting subsystem adds calls to its functions to
196 * fill in relevant parts of struct taskstsats as follows 190 * fill in relevant parts of struct taskstsats as follows
@@ -233,6 +227,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
233 227
234 if (first->signal->stats) 228 if (first->signal->stats)
235 memcpy(stats, first->signal->stats, sizeof(*stats)); 229 memcpy(stats, first->signal->stats, sizeof(*stats));
230 else
231 memset(stats, 0, sizeof(*stats));
236 232
237 tsk = first; 233 tsk = first;
238 do { 234 do {
@@ -349,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask)
349 return ret; 345 return ret;
350} 346}
351 347
348static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
349{
350 struct nlattr *na, *ret;
351 int aggr;
352
353 aggr = (type == TASKSTATS_TYPE_PID)
354 ? TASKSTATS_TYPE_AGGR_PID
355 : TASKSTATS_TYPE_AGGR_TGID;
356
357 na = nla_nest_start(skb, aggr);
358 if (!na)
359 goto err;
360 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
361 goto err;
362 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
363 if (!ret)
364 goto err;
365 nla_nest_end(skb, na);
366
367 return nla_data(ret);
368err:
369 return NULL;
370}
371
352static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 372static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
353{ 373{
354 int rc = 0; 374 int rc = 0;
355 struct sk_buff *rep_skb; 375 struct sk_buff *rep_skb;
356 struct taskstats stats; 376 struct taskstats *stats;
357 void *reply;
358 size_t size; 377 size_t size;
359 struct nlattr *na;
360 cpumask_t mask; 378 cpumask_t mask;
361 379
362 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 380 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
@@ -377,141 +395,122 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
377 size = nla_total_size(sizeof(u32)) + 395 size = nla_total_size(sizeof(u32)) +
378 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 396 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
379 397
380 memset(&stats, 0, sizeof(stats)); 398 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
381 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
382 if (rc < 0) 399 if (rc < 0)
383 return rc; 400 return rc;
384 401
402 rc = -EINVAL;
385 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 403 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
386 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 404 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
387 rc = fill_pid(pid, NULL, &stats); 405 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
388 if (rc < 0) 406 if (!stats)
389 goto err; 407 goto err;
390 408
391 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 409 rc = fill_pid(pid, NULL, stats);
392 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); 410 if (rc < 0)
393 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 411 goto err;
394 stats);
395 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 412 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
396 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 413 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
397 rc = fill_tgid(tgid, NULL, &stats); 414 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
398 if (rc < 0) 415 if (!stats)
399 goto err; 416 goto err;
400 417
401 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 418 rc = fill_tgid(tgid, NULL, stats);
402 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); 419 if (rc < 0)
403 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 420 goto err;
404 stats); 421 } else
405 } else {
406 rc = -EINVAL;
407 goto err; 422 goto err;
408 }
409
410 nla_nest_end(rep_skb, na);
411 423
412 return send_reply(rep_skb, info->snd_pid); 424 return send_reply(rep_skb, info->snd_pid);
413
414nla_put_failure:
415 rc = genlmsg_cancel(rep_skb, reply);
416err: 425err:
417 nlmsg_free(rep_skb); 426 nlmsg_free(rep_skb);
418 return rc; 427 return rc;
419} 428}
420 429
421void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 430static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
422{ 431{
423 struct listener_list *listeners; 432 struct signal_struct *sig = tsk->signal;
424 struct taskstats *tmp; 433 struct taskstats *stats;
425 /*
426 * This is the cpu on which the task is exiting currently and will
427 * be the one for which the exit event is sent, even if the cpu
428 * on which this function is running changes later.
429 */
430 *mycpu = raw_smp_processor_id();
431 434
432 *ptidstats = NULL; 435 if (sig->stats || thread_group_empty(tsk))
433 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 436 goto ret;
434 if (!tmp)
435 return;
436 437
437 listeners = &per_cpu(listener_array, *mycpu); 438 /* No problem if kmem_cache_zalloc() fails */
438 down_read(&listeners->sem); 439 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
439 if (!list_empty(&listeners->list)) { 440
440 *ptidstats = tmp; 441 spin_lock_irq(&tsk->sighand->siglock);
441 tmp = NULL; 442 if (!sig->stats) {
443 sig->stats = stats;
444 stats = NULL;
442 } 445 }
443 up_read(&listeners->sem); 446 spin_unlock_irq(&tsk->sighand->siglock);
444 kfree(tmp); 447
448 if (stats)
449 kmem_cache_free(taskstats_cache, stats);
450ret:
451 return sig->stats;
445} 452}
446 453
447/* Send pid data out on exit */ 454/* Send pid data out on exit */
448void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 455void taskstats_exit(struct task_struct *tsk, int group_dead)
449 int group_dead, unsigned int mycpu)
450{ 456{
451 int rc; 457 int rc;
458 struct listener_list *listeners;
459 struct taskstats *stats;
452 struct sk_buff *rep_skb; 460 struct sk_buff *rep_skb;
453 void *reply;
454 size_t size; 461 size_t size;
455 int is_thread_group; 462 int is_thread_group;
456 struct nlattr *na;
457 463
458 if (!family_registered || !tidstats) 464 if (!family_registered)
459 return; 465 return;
460 466
461 rc = 0;
462 /* 467 /*
463 * Size includes space for nested attributes 468 * Size includes space for nested attributes
464 */ 469 */
465 size = nla_total_size(sizeof(u32)) + 470 size = nla_total_size(sizeof(u32)) +
466 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 471 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
467 472
468 is_thread_group = (tsk->signal->stats != NULL); 473 is_thread_group = !!taskstats_tgid_alloc(tsk);
469 if (is_thread_group) 474 if (is_thread_group) {
470 size = 2 * size; /* PID + STATS + TGID + STATS */ 475 /* PID + STATS + TGID + STATS */
476 size = 2 * size;
477 /* fill the tsk->signal->stats structure */
478 fill_tgid_exit(tsk);
479 }
471 480
472 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); 481 listeners = &__raw_get_cpu_var(listener_array);
473 if (rc < 0) 482 if (list_empty(&listeners->list))
474 goto ret; 483 return;
475 484
476 rc = fill_pid(tsk->pid, tsk, tidstats); 485 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
477 if (rc < 0) 486 if (rc < 0)
478 goto err_skb; 487 return;
479 488
480 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 489 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
481 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); 490 if (!stats)
482 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 491 goto err;
483 *tidstats);
484 nla_nest_end(rep_skb, na);
485 492
486 if (!is_thread_group) 493 rc = fill_pid(tsk->pid, tsk, stats);
487 goto send; 494 if (rc < 0)
495 goto err;
488 496
489 /* 497 /*
490 * tsk has/had a thread group so fill the tsk->signal->stats structure
491 * Doesn't matter if tsk is the leader or the last group member leaving 498 * Doesn't matter if tsk is the leader or the last group member leaving
492 */ 499 */
493 500 if (!is_thread_group || !group_dead)
494 fill_tgid_exit(tsk);
495 if (!group_dead)
496 goto send; 501 goto send;
497 502
498 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 503 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
499 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); 504 if (!stats)
500 /* No locking needed for tsk->signal->stats since group is dead */ 505 goto err;
501 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 506
502 *tsk->signal->stats); 507 memcpy(stats, tsk->signal->stats, sizeof(*stats));
503 nla_nest_end(rep_skb, na);
504 508
505send: 509send:
506 send_cpu_listeners(rep_skb, mycpu); 510 send_cpu_listeners(rep_skb, listeners);
507 return; 511 return;
508 512err:
509nla_put_failure:
510 genlmsg_cancel(rep_skb, reply);
511err_skb:
512 nlmsg_free(rep_skb); 513 nlmsg_free(rep_skb);
513ret:
514 return;
515} 514}
516 515
517static struct genl_ops taskstats_ops = { 516static struct genl_ops taskstats_ops = {
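
taskstats_tgid_alloc() above uses a lazy, race-tolerant allocation: allocate outside the lock, install under sighand->siglock only if nobody else already did, and free the spare copy otherwise. A generic sketch of that pattern with illustrative types and names:

    #include <linux/slab.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct my_stats { u64 counters[8]; };

    struct owner {
            spinlock_t lock;
            struct my_stats *stats;
    };

    static struct my_stats *lazy_install(struct owner *o)
    {
            struct my_stats *new = kzalloc(sizeof(*new), GFP_KERNEL);

            spin_lock(&o->lock);
            if (!o->stats) {
                    o->stats = new;    /* won the race (new may be NULL) */
                    new = NULL;
            }
            spin_unlock(&o->lock);

            kfree(new);                /* lost the race: drop the spare */
            return o->stats;
    }
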
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 74eca5939bd9..22504afc0d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c)
156 /* check if clocksource is already registered */ 156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) { 157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. " 158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name); 159 "Already registered!", c->name);
160 ret = -EBUSY; 160 ret = -EBUSY;
161 } else { 161 } else {
162 /* register it */ 162 /* register it */
@@ -186,6 +186,7 @@ void clocksource_reselect(void)
186} 186}
187EXPORT_SYMBOL(clocksource_reselect); 187EXPORT_SYMBOL(clocksource_reselect);
188 188
189#ifdef CONFIG_SYSFS
189/** 190/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource 191 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused 192 * @dev: unused
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
275 * Sysfs setup bits: 276 * Sysfs setup bits:
276 */ 277 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, 278static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource); 279 sysfs_override_clocksource);
279 280
280static SYSDEV_ATTR(available_clocksource, 0600, 281static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL); 282 sysfs_show_available_clocksources, NULL);
282 283
283static struct sysdev_class clocksource_sysclass = { 284static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"), 285 set_kset_name("clocksource"),
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void)
307} 308}
308 309
309device_initcall(init_clocksource_sysfs); 310device_initcall(init_clocksource_sysfs);
311#endif /* CONFIG_SYSFS */
310 312
311/** 313/**
312 * boot_override_clocksource - boot clock override 314 * boot_override_clocksource - boot clock override
diff --git a/kernel/timer.c b/kernel/timer.c
index c1c7fbcffec1..feddf817baa5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases;
80EXPORT_SYMBOL(boot_tvec_bases); 80EXPORT_SYMBOL(boot_tvec_bases);
81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
82 82
83/**
84 * __round_jiffies - function to round jiffies to a full second
85 * @j: the time in (absolute) jiffies that should be rounded
86 * @cpu: the processor number on which the timeout will happen
87 *
88 * __round_jiffies rounds an absolute time in the future (in jiffies)
89 * up or down to (approximately) full seconds. This is useful for timers
90 * for which the exact time they fire does not matter too much, as long as
91 * they fire approximately every X seconds.
92 *
93 * By rounding these timers to whole seconds, all such timers will fire
94 * at the same time, rather than at various times spread out. The goal
95 * of this is to have the CPU wake up less, which saves power.
96 *
97 * The exact rounding is skewed for each processor to avoid all
98 * processors firing at the exact same time, which could lead
99 * to lock contention or spurious cache line bouncing.
100 *
101 * The return value is the rounded version of the "j" parameter.
102 */
103unsigned long __round_jiffies(unsigned long j, int cpu)
104{
105 int rem;
106 unsigned long original = j;
107
108 /*
109 * We don't want all cpus firing their timers at once hitting the
110 * same lock or cachelines, so we skew each extra cpu with an extra
111 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
112 * already did this.
113 * The skew is done by adding 3*cpunr, then round, then subtract this
114 * extra offset again.
115 */
116 j += cpu * 3;
117
118 rem = j % HZ;
119
120 /*
121 * If the target jiffie is just after a whole second (which can happen
122 * due to delays of the timer irq, long irq off times etc etc) then
123 * we should round down to the whole second, not up. Use 1/4th second
124 * as cutoff for this rounding as an extreme upper bound for this.
125 */
126 if (rem < HZ/4) /* round down */
127 j = j - rem;
128 else /* round up */
129 j = j - rem + HZ;
130
131 /* now that we have rounded, subtract the extra skew again */
132 j -= cpu * 3;
133
134 if (j <= jiffies) /* rounding ate our timeout entirely; */
135 return original;
136 return j;
137}
138EXPORT_SYMBOL_GPL(__round_jiffies);
139
140/**
141 * __round_jiffies_relative - function to round jiffies to a full second
142 * @j: the time in (relative) jiffies that should be rounded
143 * @cpu: the processor number on which the timeout will happen
144 *
145 * __round_jiffies_relative rounds a time delta in the future (in jiffies)
146 * up or down to (approximately) full seconds. This is useful for timers
147 * for which the exact time they fire does not matter too much, as long as
148 * they fire approximately every X seconds.
149 *
150 * By rounding these timers to whole seconds, all such timers will fire
151 * at the same time, rather than at various times spread out. The goal
152 * of this is to have the CPU wake up less, which saves power.
153 *
154 * The exact rounding is skewed for each processor to avoid all
155 * processors firing at the exact same time, which could lead
156 * to lock contention or spurious cache line bouncing.
157 *
158 * The return value is the rounded version of the "j" parameter.
159 */
160unsigned long __round_jiffies_relative(unsigned long j, int cpu)
161{
162 /*
163 * In theory the following code can skip a jiffy in case jiffies
164 * increments right between the addition and the later subtraction.
165 * However since the entire point of this function is to use approximate
166 * timeouts, it's entirely ok to not handle that.
167 */
168 return __round_jiffies(j + jiffies, cpu) - jiffies;
169}
170EXPORT_SYMBOL_GPL(__round_jiffies_relative);
171
172/**
173 * round_jiffies - function to round jiffies to a full second
174 * @j: the time in (absolute) jiffies that should be rounded
175 *
176 * round_jiffies rounds an absolute time in the future (in jiffies)
177 * up or down to (approximately) full seconds. This is useful for timers
178 * for which the exact time they fire does not matter too much, as long as
179 * they fire approximately every X seconds.
180 *
181 * By rounding these timers to whole seconds, all such timers will fire
182 * at the same time, rather than at various times spread out. The goal
183 * of this is to have the CPU wake up less, which saves power.
184 *
185 * The return value is the rounded version of the "j" parameter.
186 */
187unsigned long round_jiffies(unsigned long j)
188{
189 return __round_jiffies(j, raw_smp_processor_id());
190}
191EXPORT_SYMBOL_GPL(round_jiffies);
192
193/**
194 * round_jiffies_relative - function to round jiffies to a full second
195 * @j: the time in (relative) jiffies that should be rounded
196 *
197 * round_jiffies_relative rounds a time delta in the future (in jiffies)
198 * up or down to (approximately) full seconds. This is useful for timers
199 * for which the exact time they fire does not matter too much, as long as
200 * they fire approximately every X seconds.
201 *
202 * By rounding these timers to whole seconds, all such timers will fire
203 * at the same time, rather than at various times spread out. The goal
204 * of this is to have the CPU wake up less, which saves power.
205 *
206 * The return value is the rounded version of the "j" parameter.
207 */
208unsigned long round_jiffies_relative(unsigned long j)
209{
210 return __round_jiffies_relative(j, raw_smp_processor_id());
211}
212EXPORT_SYMBOL_GPL(round_jiffies_relative);
213
214
83static inline void set_running_timer(tvec_base_t *base, 215static inline void set_running_timer(tvec_base_t *base,
84 struct timer_list *timer) 216 struct timer_list *timer)
85{ 217{
@@ -714,7 +846,7 @@ static int change_clocksource(void)
714 clock = new; 846 clock = new;
715 clock->cycle_last = now; 847 clock->cycle_last = now;
716 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 848 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
717 clock->name); 849 clock->name);
718 return 1; 850 return 1;
719 } else if (clock->update_callback) { 851 } else if (clock->update_callback) {
720 return clock->update_callback(); 852 return clock->update_callback();
@@ -722,7 +854,10 @@ static int change_clocksource(void)
722 return 0; 854 return 0;
723} 855}
724#else 856#else
725#define change_clocksource() (0) 857static inline int change_clocksource(void)
858{
859 return 0;
860}
726#endif 861#endif
727 862
728/** 863/**
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device);
820 * If the error is already larger, we look ahead even further 955 * If the error is already larger, we look ahead even further
821 * to compensate for late or lost adjustments. 956 * to compensate for late or lost adjustments.
822 */ 957 */
823static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) 958static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
959 s64 *offset)
824{ 960{
825 s64 tick_error, i; 961 s64 tick_error, i;
826 u32 look_ahead, adj; 962 u32 look_ahead, adj;
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *
844 * Now calculate the error in (1 << look_ahead) ticks, but first 980 * Now calculate the error in (1 << look_ahead) ticks, but first
845 * remove the single look ahead already included in the error. 981 * remove the single look ahead already included in the error.
846 */ 982 */
847 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); 983 tick_error = current_tick_length() >>
984 (TICK_LENGTH_SHIFT - clock->shift + 1);
848 tick_error -= clock->xtime_interval >> 1; 985 tick_error -= clock->xtime_interval >> 1;
849 error = ((error - tick_error) >> look_ahead) + tick_error; 986 error = ((error - tick_error) >> look_ahead) + tick_error;
850 987
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
896 clock->mult += adj; 1033 clock->mult += adj;
897 clock->xtime_interval += interval; 1034 clock->xtime_interval += interval;
898 clock->xtime_nsec -= offset; 1035 clock->xtime_nsec -= offset;
899 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); 1036 clock->error -= (interval - offset) <<
1037 (TICK_LENGTH_SHIFT - clock->shift);
900} 1038}
901 1039
902/** 1040/**
@@ -1008,11 +1146,15 @@ static inline void calc_load(unsigned long ticks)
1008 unsigned long active_tasks; /* fixed-point */ 1146 unsigned long active_tasks; /* fixed-point */
1009 static int count = LOAD_FREQ; 1147 static int count = LOAD_FREQ;
1010 1148
1011 active_tasks = count_active_tasks(); 1149 count -= ticks;
1012 for (count -= ticks; count < 0; count += LOAD_FREQ) { 1150 if (unlikely(count < 0)) {
1013 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 1151 active_tasks = count_active_tasks();
1014 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 1152 do {
1015 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 1153 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1154 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1155 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1156 count += LOAD_FREQ;
1157 } while (count < 0);
1016 } 1158 }
1017} 1159}
1018 1160
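
The calc_load() rework above still applies the same fixed-point exponential decay once per elapsed LOAD_FREQ interval; it merely skips the now-common case where no interval has elapsed. A sketch of a single decay step, using stand-in constants instead of the kernel's FSHIFT/FIXED_1/EXP_* macros:

    #define MY_FSHIFT   11                    /* fixed-point precision */
    #define MY_FIXED_1  (1UL << MY_FSHIFT)    /* 1.0 in fixed point    */

    /* One EWMA step: load' = load*exp + active*(1 - exp), everything
     * scaled by MY_FIXED_1, matching the shape of CALC_LOAD() above. */
    static unsigned long ewma_step(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (MY_FIXED_1 - exp);
            return load >> MY_FSHIFT;
    }
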
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 65a5036a3d95..baacc3691415 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -80,18 +80,31 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
80 */ 80 */
81void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) 81void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
82{ 82{
83 struct mm_struct *mm;
84
83 /* convert pages-jiffies to Mbyte-usec */ 85 /* convert pages-jiffies to Mbyte-usec */
84 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; 86 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
85 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; 87 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
86 if (p->mm) { 88 mm = get_task_mm(p);
89 if (mm) {
87 /* adjust to KB unit */ 90 /* adjust to KB unit */
88 stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; 91 stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB;
89 stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; 92 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB;
93 mmput(mm);
90 } 94 }
91 stats->read_char = p->rchar; 95 stats->read_char = p->rchar;
92 stats->write_char = p->wchar; 96 stats->write_char = p->wchar;
93 stats->read_syscalls = p->syscr; 97 stats->read_syscalls = p->syscr;
94 stats->write_syscalls = p->syscw; 98 stats->write_syscalls = p->syscw;
99#ifdef CONFIG_TASK_IO_ACCOUNTING
100 stats->read_bytes = p->ioac.read_bytes;
101 stats->write_bytes = p->ioac.write_bytes;
102 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
103#else
104 stats->read_bytes = 0;
105 stats->write_bytes = 0;
106 stats->cancelled_write_bytes = 0;
107#endif
95} 108}
96#undef KB 109#undef KB
97#undef MB 110#undef MB
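
The xacct_add_tsk() hunk above replaces direct p->mm access with get_task_mm()/mmput(), taking a reference so the mm cannot be torn down while its hiwater fields are read. A minimal sketch of the pattern (the helper name is illustrative):

    #include <linux/mm.h>
    #include <linux/sched.h>

    static unsigned long task_hiwater_rss_kb(struct task_struct *p)
    {
            struct mm_struct *mm = get_task_mm(p);   /* takes a reference */
            unsigned long kb = 0;

            if (mm) {
                    kb = mm->hiwater_rss * PAGE_SIZE / 1024;
                    mmput(mm);                        /* drops the reference */
            }
            return kb;
    }
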
diff --git a/kernel/unwind.c b/kernel/unwind.c
index f7e50d16dbf6..09c261329249 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -14,11 +14,12 @@
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/sort.h> 15#include <linux/sort.h>
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
17#include <linux/uaccess.h>
17#include <asm/sections.h> 18#include <asm/sections.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include <asm/unaligned.h> 20#include <asm/unaligned.h>
20 21
21extern char __start_unwind[], __end_unwind[]; 22extern const char __start_unwind[], __end_unwind[];
22extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; 23extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
23 24
24#define MAX_STACK_DEPTH 8 25#define MAX_STACK_DEPTH 8
@@ -94,6 +95,7 @@ static const struct {
94 95
95typedef unsigned long uleb128_t; 96typedef unsigned long uleb128_t;
96typedef signed long sleb128_t; 97typedef signed long sleb128_t;
98#define sleb128abs __builtin_labs
97 99
98static struct unwind_table { 100static struct unwind_table {
99 struct { 101 struct {
@@ -135,6 +137,17 @@ struct unwind_state {
135 137
136static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; 138static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
137 139
140static unsigned unwind_debug;
141static int __init unwind_debug_setup(char *s)
142{
143 unwind_debug = simple_strtoul(s, NULL, 0);
144 return 1;
145}
146__setup("unwind_debug=", unwind_debug_setup);
147#define dprintk(lvl, fmt, args...) \
148 ((void)(lvl > unwind_debug \
149 || printk(KERN_DEBUG "unwind: " fmt "\n", ##args)))
150
138static struct unwind_table *find_table(unsigned long pc) 151static struct unwind_table *find_table(unsigned long pc)
139{ 152{
140 struct unwind_table *table; 153 struct unwind_table *table;
@@ -151,7 +164,9 @@ static struct unwind_table *find_table(unsigned long pc)
151 164
152static unsigned long read_pointer(const u8 **pLoc, 165static unsigned long read_pointer(const u8 **pLoc,
153 const void *end, 166 const void *end,
154 signed ptrType); 167 signed ptrType,
168 unsigned long text_base,
169 unsigned long data_base);
155 170
156static void init_unwind_table(struct unwind_table *table, 171static void init_unwind_table(struct unwind_table *table,
157 const char *name, 172 const char *name,
@@ -176,10 +191,13 @@ static void init_unwind_table(struct unwind_table *table,
176 /* See if the linker provided table looks valid. */ 191 /* See if the linker provided table looks valid. */
177 if (header_size <= 4 192 if (header_size <= 4
178 || header_start[0] != 1 193 || header_start[0] != 1
179 || (void *)read_pointer(&ptr, end, header_start[1]) != table_start 194 || (void *)read_pointer(&ptr, end, header_start[1], 0, 0)
180 || header_start[2] == DW_EH_PE_omit 195 != table_start
181 || read_pointer(&ptr, end, header_start[2]) <= 0 196 || !read_pointer(&ptr, end, header_start[2], 0, 0)
182 || header_start[3] == DW_EH_PE_omit) 197 || !read_pointer(&ptr, end, header_start[3], 0,
198 (unsigned long)header_start)
199 || !read_pointer(&ptr, end, header_start[3], 0,
200 (unsigned long)header_start))
183 header_start = NULL; 201 header_start = NULL;
184 table->hdrsz = header_size; 202 table->hdrsz = header_size;
185 smp_wmb(); 203 smp_wmb();
@@ -269,7 +287,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
269 ptr = (const u8 *)(fde + 2); 287 ptr = (const u8 *)(fde + 2);
270 if (!read_pointer(&ptr, 288 if (!read_pointer(&ptr,
271 (const u8 *)(fde + 1) + *fde, 289 (const u8 *)(fde + 1) + *fde,
272 ptrType)) 290 ptrType, 0, 0))
273 return; 291 return;
274 ++n; 292 ++n;
275 } 293 }
@@ -279,6 +297,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
279 297
280 hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) 298 hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
281 + 2 * n * sizeof(unsigned long); 299 + 2 * n * sizeof(unsigned long);
300 dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize);
282 header = alloc(hdrSize); 301 header = alloc(hdrSize);
283 if (!header) 302 if (!header)
284 return; 303 return;
@@ -303,7 +322,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
303 ptr = (const u8 *)(fde + 2); 322 ptr = (const u8 *)(fde + 2);
304 header->table[n].start = read_pointer(&ptr, 323 header->table[n].start = read_pointer(&ptr,
305 (const u8 *)(fde + 1) + *fde, 324 (const u8 *)(fde + 1) + *fde,
306 fde_pointer_type(cie)); 325 fde_pointer_type(cie), 0, 0);
307 header->table[n].fde = (unsigned long)fde; 326 header->table[n].fde = (unsigned long)fde;
308 ++n; 327 ++n;
309 } 328 }
@@ -486,7 +505,9 @@ static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
486 505
487static unsigned long read_pointer(const u8 **pLoc, 506static unsigned long read_pointer(const u8 **pLoc,
488 const void *end, 507 const void *end,
489 signed ptrType) 508 signed ptrType,
509 unsigned long text_base,
510 unsigned long data_base)
490{ 511{
491 unsigned long value = 0; 512 unsigned long value = 0;
492 union { 513 union {
@@ -498,13 +519,17 @@ static unsigned long read_pointer(const u8 **pLoc,
498 const unsigned long *pul; 519 const unsigned long *pul;
499 } ptr; 520 } ptr;
500 521
501 if (ptrType < 0 || ptrType == DW_EH_PE_omit) 522 if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
523 dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end);
502 return 0; 524 return 0;
525 }
503 ptr.p8 = *pLoc; 526 ptr.p8 = *pLoc;
504 switch(ptrType & DW_EH_PE_FORM) { 527 switch(ptrType & DW_EH_PE_FORM) {
505 case DW_EH_PE_data2: 528 case DW_EH_PE_data2:
506 if (end < (const void *)(ptr.p16u + 1)) 529 if (end < (const void *)(ptr.p16u + 1)) {
530 dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end);
507 return 0; 531 return 0;
532 }
508 if(ptrType & DW_EH_PE_signed) 533 if(ptrType & DW_EH_PE_signed)
509 value = get_unaligned(ptr.p16s++); 534 value = get_unaligned(ptr.p16s++);
510 else 535 else
@@ -512,8 +537,10 @@ static unsigned long read_pointer(const u8 **pLoc,
512 break; 537 break;
513 case DW_EH_PE_data4: 538 case DW_EH_PE_data4:
514#ifdef CONFIG_64BIT 539#ifdef CONFIG_64BIT
515 if (end < (const void *)(ptr.p32u + 1)) 540 if (end < (const void *)(ptr.p32u + 1)) {
541 dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end);
516 return 0; 542 return 0;
543 }
517 if(ptrType & DW_EH_PE_signed) 544 if(ptrType & DW_EH_PE_signed)
518 value = get_unaligned(ptr.p32s++); 545 value = get_unaligned(ptr.p32s++);
519 else 546 else
@@ -525,8 +552,10 @@ static unsigned long read_pointer(const u8 **pLoc,
525 BUILD_BUG_ON(sizeof(u32) != sizeof(value)); 552 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
526#endif 553#endif
527 case DW_EH_PE_native: 554 case DW_EH_PE_native:
528 if (end < (const void *)(ptr.pul + 1)) 555 if (end < (const void *)(ptr.pul + 1)) {
556 dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end);
529 return 0; 557 return 0;
558 }
530 value = get_unaligned(ptr.pul++); 559 value = get_unaligned(ptr.pul++);
531 break; 560 break;
532 case DW_EH_PE_leb128: 561 case DW_EH_PE_leb128:
@@ -534,10 +563,14 @@ static unsigned long read_pointer(const u8 **pLoc,
534 value = ptrType & DW_EH_PE_signed 563 value = ptrType & DW_EH_PE_signed
535 ? get_sleb128(&ptr.p8, end) 564 ? get_sleb128(&ptr.p8, end)
536 : get_uleb128(&ptr.p8, end); 565 : get_uleb128(&ptr.p8, end);
537 if ((const void *)ptr.p8 > end) 566 if ((const void *)ptr.p8 > end) {
567 dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end);
538 return 0; 568 return 0;
569 }
539 break; 570 break;
540 default: 571 default:
572 dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
573 ptrType, ptr.p8, end);
541 return 0; 574 return 0;
542 } 575 }
543 switch(ptrType & DW_EH_PE_ADJUST) { 576 switch(ptrType & DW_EH_PE_ADJUST) {
@@ -546,12 +579,33 @@ static unsigned long read_pointer(const u8 **pLoc,
546 case DW_EH_PE_pcrel: 579 case DW_EH_PE_pcrel:
547 value += (unsigned long)*pLoc; 580 value += (unsigned long)*pLoc;
548 break; 581 break;
582 case DW_EH_PE_textrel:
583 if (likely(text_base)) {
584 value += text_base;
585 break;
586 }
587 dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
588 ptrType, *pLoc, end);
589 return 0;
590 case DW_EH_PE_datarel:
591 if (likely(data_base)) {
592 value += data_base;
593 break;
594 }
595 dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
596 ptrType, *pLoc, end);
597 return 0;
549 default: 598 default:
599 dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
600 ptrType, *pLoc, end);
550 return 0; 601 return 0;
551 } 602 }
552 if ((ptrType & DW_EH_PE_indirect) 603 if ((ptrType & DW_EH_PE_indirect)
553 && __get_user(value, (unsigned long *)value)) 604 && probe_kernel_address((unsigned long *)value, value)) {
605 dprintk(1, "Cannot read indirect value %lx (%p,%p).",
606 value, *pLoc, end);
554 return 0; 607 return 0;
608 }
555 *pLoc = ptr.p8; 609 *pLoc = ptr.p8;
556 610
557 return value; 611 return value;
@@ -594,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie)
594 case 'P': { 648 case 'P': {
595 signed ptrType = *ptr++; 649 signed ptrType = *ptr++;
596 650
597 if (!read_pointer(&ptr, end, ptrType) || ptr > end) 651 if (!read_pointer(&ptr, end, ptrType, 0, 0)
652 || ptr > end)
598 return -1; 653 return -1;
599 } 654 }
600 break; 655 break;
@@ -654,7 +709,8 @@ static int processCFI(const u8 *start,
654 case DW_CFA_nop: 709 case DW_CFA_nop:
655 break; 710 break;
656 case DW_CFA_set_loc: 711 case DW_CFA_set_loc:
657 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) 712 state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0);
713 if (state->loc == 0)
658 result = 0; 714 result = 0;
659 break; 715 break;
660 case DW_CFA_advance_loc1: 716 case DW_CFA_advance_loc1:
@@ -700,8 +756,10 @@ static int processCFI(const u8 *start,
700 state->label = NULL; 756 state->label = NULL;
701 return 1; 757 return 1;
702 } 758 }
703 if (state->stackDepth >= MAX_STACK_DEPTH) 759 if (state->stackDepth >= MAX_STACK_DEPTH) {
760 dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end);
704 return 0; 761 return 0;
762 }
705 state->stack[state->stackDepth++] = ptr.p8; 763 state->stack[state->stackDepth++] = ptr.p8;
706 break; 764 break;
707 case DW_CFA_restore_state: 765 case DW_CFA_restore_state:
@@ -716,8 +774,10 @@ static int processCFI(const u8 *start,
716 result = processCFI(start, end, 0, ptrType, state); 774 result = processCFI(start, end, 0, ptrType, state);
717 state->loc = loc; 775 state->loc = loc;
718 state->label = label; 776 state->label = label;
719 } else 777 } else {
778 dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end);
720 return 0; 779 return 0;
780 }
721 break; 781 break;
722 case DW_CFA_def_cfa: 782 case DW_CFA_def_cfa:
723 state->cfa.reg = get_uleb128(&ptr.p8, end); 783 state->cfa.reg = get_uleb128(&ptr.p8, end);
@@ -749,6 +809,7 @@ static int processCFI(const u8 *start,
749 break; 809 break;
750 case DW_CFA_GNU_window_save: 810 case DW_CFA_GNU_window_save:
751 default: 811 default:
812 dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end);
752 result = 0; 813 result = 0;
753 break; 814 break;
754 } 815 }
@@ -764,12 +825,17 @@ static int processCFI(const u8 *start,
764 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); 825 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
765 break; 826 break;
766 } 827 }
767 if (ptr.p8 > end) 828 if (ptr.p8 > end) {
829 dprintk(1, "Data overrun (%p,%p).", ptr.p8, end);
768 result = 0; 830 result = 0;
831 }
769 if (result && targetLoc != 0 && targetLoc < state->loc) 832 if (result && targetLoc != 0 && targetLoc < state->loc)
770 return 1; 833 return 1;
771 } 834 }
772 835
836 if (result && ptr.p8 < end)
837 dprintk(1, "Data underrun (%p,%p).", ptr.p8, end);
838
773 return result 839 return result
774 && ptr.p8 == end 840 && ptr.p8 == end
775 && (targetLoc == 0 841 && (targetLoc == 0
@@ -786,7 +852,7 @@ int unwind(struct unwind_frame_info *frame)
786#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) 852#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
787 const u32 *fde = NULL, *cie = NULL; 853 const u32 *fde = NULL, *cie = NULL;
788 const u8 *ptr = NULL, *end = NULL; 854 const u8 *ptr = NULL, *end = NULL;
789 unsigned long pc = UNW_PC(frame) - frame->call_frame; 855 unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
790 unsigned long startLoc = 0, endLoc = 0, cfa; 856 unsigned long startLoc = 0, endLoc = 0, cfa;
791 unsigned i; 857 unsigned i;
792 signed ptrType = -1; 858 signed ptrType = -1;
@@ -813,9 +879,9 @@ int unwind(struct unwind_frame_info *frame)
813 ptr = hdr + 4; 879 ptr = hdr + 4;
814 end = hdr + table->hdrsz; 880 end = hdr + table->hdrsz;
815 if (tableSize 881 if (tableSize
816 && read_pointer(&ptr, end, hdr[1]) 882 && read_pointer(&ptr, end, hdr[1], 0, 0)
817 == (unsigned long)table->address 883 == (unsigned long)table->address
818 && (i = read_pointer(&ptr, end, hdr[2])) > 0 884 && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0
819 && i == (end - ptr) / (2 * tableSize) 885 && i == (end - ptr) / (2 * tableSize)
820 && !((end - ptr) % (2 * tableSize))) { 886 && !((end - ptr) % (2 * tableSize))) {
821 do { 887 do {
@@ -823,7 +889,8 @@ int unwind(struct unwind_frame_info *frame)
823 889
824 startLoc = read_pointer(&cur, 890 startLoc = read_pointer(&cur,
825 cur + tableSize, 891 cur + tableSize,
826 hdr[3]); 892 hdr[3], 0,
893 (unsigned long)hdr);
827 if (pc < startLoc) 894 if (pc < startLoc)
828 i /= 2; 895 i /= 2;
829 else { 896 else {
@@ -834,13 +901,17 @@ int unwind(struct unwind_frame_info *frame)
834 if (i == 1 901 if (i == 1
835 && (startLoc = read_pointer(&ptr, 902 && (startLoc = read_pointer(&ptr,
836 ptr + tableSize, 903 ptr + tableSize,
837 hdr[3])) != 0 904 hdr[3], 0,
905 (unsigned long)hdr)) != 0
838 && pc >= startLoc) 906 && pc >= startLoc)
839 fde = (void *)read_pointer(&ptr, 907 fde = (void *)read_pointer(&ptr,
840 ptr + tableSize, 908 ptr + tableSize,
841 hdr[3]); 909 hdr[3], 0,
910 (unsigned long)hdr);
842 } 911 }
843 } 912 }
913 if(hdr && !fde)
914 dprintk(3, "Binary lookup for %lx failed.", pc);
844 915
845 if (fde != NULL) { 916 if (fde != NULL) {
846 cie = cie_for_fde(fde, table); 917 cie = cie_for_fde(fde, table);
@@ -851,17 +922,19 @@ int unwind(struct unwind_frame_info *frame)
851 && (ptrType = fde_pointer_type(cie)) >= 0 922 && (ptrType = fde_pointer_type(cie)) >= 0
852 && read_pointer(&ptr, 923 && read_pointer(&ptr,
853 (const u8 *)(fde + 1) + *fde, 924 (const u8 *)(fde + 1) + *fde,
854 ptrType) == startLoc) { 925 ptrType, 0, 0) == startLoc) {
855 if (!(ptrType & DW_EH_PE_indirect)) 926 if (!(ptrType & DW_EH_PE_indirect))
856 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; 927 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
857 endLoc = startLoc 928 endLoc = startLoc
858 + read_pointer(&ptr, 929 + read_pointer(&ptr,
859 (const u8 *)(fde + 1) + *fde, 930 (const u8 *)(fde + 1) + *fde,
860 ptrType); 931 ptrType, 0, 0);
861 if(pc >= endLoc) 932 if(pc >= endLoc)
862 fde = NULL; 933 fde = NULL;
863 } else 934 } else
864 fde = NULL; 935 fde = NULL;
936 if(!fde)
937 dprintk(1, "Binary lookup result for %lx discarded.", pc);
865 } 938 }
866 if (fde == NULL) { 939 if (fde == NULL) {
867 for (fde = table->address, tableSize = table->size; 940 for (fde = table->address, tableSize = table->size;
@@ -881,7 +954,7 @@ int unwind(struct unwind_frame_info *frame)
881 ptr = (const u8 *)(fde + 2); 954 ptr = (const u8 *)(fde + 2);
882 startLoc = read_pointer(&ptr, 955 startLoc = read_pointer(&ptr,
883 (const u8 *)(fde + 1) + *fde, 956 (const u8 *)(fde + 1) + *fde,
884 ptrType); 957 ptrType, 0, 0);
885 if (!startLoc) 958 if (!startLoc)
886 continue; 959 continue;
887 if (!(ptrType & DW_EH_PE_indirect)) 960 if (!(ptrType & DW_EH_PE_indirect))
@@ -889,10 +962,12 @@ int unwind(struct unwind_frame_info *frame)
889 endLoc = startLoc 962 endLoc = startLoc
890 + read_pointer(&ptr, 963 + read_pointer(&ptr,
891 (const u8 *)(fde + 1) + *fde, 964 (const u8 *)(fde + 1) + *fde,
892 ptrType); 965 ptrType, 0, 0);
893 if (pc >= startLoc && pc < endLoc) 966 if (pc >= startLoc && pc < endLoc)
894 break; 967 break;
895 } 968 }
969 if(!fde)
970 dprintk(3, "Linear lookup for %lx failed.", pc);
896 } 971 }
897 } 972 }
898 if (cie != NULL) { 973 if (cie != NULL) {
@@ -926,6 +1001,8 @@ int unwind(struct unwind_frame_info *frame)
926 if (ptr >= end || *ptr) 1001 if (ptr >= end || *ptr)
927 cie = NULL; 1002 cie = NULL;
928 } 1003 }
1004 if(!cie)
1005 dprintk(1, "CIE unusable (%p,%p).", ptr, end);
929 ++ptr; 1006 ++ptr;
930 } 1007 }
931 if (cie != NULL) { 1008 if (cie != NULL) {
@@ -935,17 +1012,27 @@ int unwind(struct unwind_frame_info *frame)
935 state.dataAlign = get_sleb128(&ptr, end); 1012 state.dataAlign = get_sleb128(&ptr, end);
936 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) 1013 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
937 cie = NULL; 1014 cie = NULL;
938 else { 1015 else if (UNW_PC(frame) % state.codeAlign
1016 || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
1017 dprintk(1, "Input pointer(s) misaligned (%lx,%lx).",
1018 UNW_PC(frame), UNW_SP(frame));
1019 return -EPERM;
1020 } else {
939 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); 1021 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
940 /* skip augmentation */ 1022 /* skip augmentation */
941 if (((const char *)(cie + 2))[1] == 'z') 1023 if (((const char *)(cie + 2))[1] == 'z') {
942 ptr += get_uleb128(&ptr, end); 1024 uleb128_t augSize = get_uleb128(&ptr, end);
1025
1026 ptr += augSize;
1027 }
943 if (ptr > end 1028 if (ptr > end
944 || retAddrReg >= ARRAY_SIZE(reg_info) 1029 || retAddrReg >= ARRAY_SIZE(reg_info)
945 || REG_INVALID(retAddrReg) 1030 || REG_INVALID(retAddrReg)
946 || reg_info[retAddrReg].width != sizeof(unsigned long)) 1031 || reg_info[retAddrReg].width != sizeof(unsigned long))
947 cie = NULL; 1032 cie = NULL;
948 } 1033 }
1034 if(!cie)
1035 dprintk(1, "CIE validation failed (%p,%p).", ptr, end);
949 } 1036 }
950 if (cie != NULL) { 1037 if (cie != NULL) {
951 state.cieStart = ptr; 1038 state.cieStart = ptr;
@@ -959,13 +1046,15 @@ int unwind(struct unwind_frame_info *frame)
959 if ((ptr += augSize) > end) 1046 if ((ptr += augSize) > end)
960 fde = NULL; 1047 fde = NULL;
961 } 1048 }
1049 if(!fde)
1050 dprintk(1, "FDE validation failed (%p,%p).", ptr, end);
962 } 1051 }
963 if (cie == NULL || fde == NULL) { 1052 if (cie == NULL || fde == NULL) {
964#ifdef CONFIG_FRAME_POINTER 1053#ifdef CONFIG_FRAME_POINTER
965 unsigned long top, bottom; 1054 unsigned long top, bottom;
966#endif
967 1055
968#ifdef CONFIG_FRAME_POINTER 1056 if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long))
1057 return -EPERM;
969 top = STACK_TOP(frame->task); 1058 top = STACK_TOP(frame->task);
970 bottom = STACK_BOTTOM(frame->task); 1059 bottom = STACK_BOTTOM(frame->task);
971# if FRAME_RETADDR_OFFSET < 0 1060# if FRAME_RETADDR_OFFSET < 0
@@ -981,18 +1070,19 @@ int unwind(struct unwind_frame_info *frame)
981 & (sizeof(unsigned long) - 1))) { 1070 & (sizeof(unsigned long) - 1))) {
982 unsigned long link; 1071 unsigned long link;
983 1072
984 if (!__get_user(link, 1073 if (!probe_kernel_address(
985 (unsigned long *)(UNW_FP(frame) 1074 (unsigned long *)(UNW_FP(frame)
986 + FRAME_LINK_OFFSET)) 1075 + FRAME_LINK_OFFSET),
1076 link)
987# if FRAME_RETADDR_OFFSET < 0 1077# if FRAME_RETADDR_OFFSET < 0
988 && link > bottom && link < UNW_FP(frame) 1078 && link > bottom && link < UNW_FP(frame)
989# else 1079# else
990 && link > UNW_FP(frame) && link < bottom 1080 && link > UNW_FP(frame) && link < bottom
991# endif 1081# endif
992 && !(link & (sizeof(link) - 1)) 1082 && !(link & (sizeof(link) - 1))
993 && !__get_user(UNW_PC(frame), 1083 && !probe_kernel_address(
994 (unsigned long *)(UNW_FP(frame) 1084 (unsigned long *)(UNW_FP(frame)
995 + FRAME_RETADDR_OFFSET))) { 1085 + FRAME_RETADDR_OFFSET), UNW_PC(frame))) {
996 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET 1086 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
997# if FRAME_RETADDR_OFFSET < 0 1087# if FRAME_RETADDR_OFFSET < 0
998 - 1088 -
@@ -1015,8 +1105,11 @@ int unwind(struct unwind_frame_info *frame)
1015 || state.regs[retAddrReg].where == Nowhere 1105 || state.regs[retAddrReg].where == Nowhere
1016 || state.cfa.reg >= ARRAY_SIZE(reg_info) 1106 || state.cfa.reg >= ARRAY_SIZE(reg_info)
1017 || reg_info[state.cfa.reg].width != sizeof(unsigned long) 1107 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
1018 || state.cfa.offs % sizeof(unsigned long)) 1108 || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
1109 || state.cfa.offs % sizeof(unsigned long)) {
1110 dprintk(1, "Unusable unwind info (%p,%p).", ptr, end);
1019 return -EIO; 1111 return -EIO;
1112 }
1020 /* update frame */ 1113 /* update frame */
1021#ifndef CONFIG_AS_CFI_SIGNAL_FRAME 1114#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
1022 if(frame->call_frame 1115 if(frame->call_frame
@@ -1035,10 +1128,14 @@ int unwind(struct unwind_frame_info *frame)
1035#else 1128#else
1036# define CASES CASE(8); CASE(16); CASE(32); CASE(64) 1129# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
1037#endif 1130#endif
1131 pc = UNW_PC(frame);
1132 sp = UNW_SP(frame);
1038 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { 1133 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
1039 if (REG_INVALID(i)) { 1134 if (REG_INVALID(i)) {
1040 if (state.regs[i].where == Nowhere) 1135 if (state.regs[i].where == Nowhere)
1041 continue; 1136 continue;
1137 dprintk(1, "Cannot restore register %u (%d).",
1138 i, state.regs[i].where);
1042 return -EIO; 1139 return -EIO;
1043 } 1140 }
1044 switch(state.regs[i].where) { 1141 switch(state.regs[i].where) {
@@ -1047,8 +1144,11 @@ int unwind(struct unwind_frame_info *frame)
1047 case Register: 1144 case Register:
1048 if (state.regs[i].value >= ARRAY_SIZE(reg_info) 1145 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
1049 || REG_INVALID(state.regs[i].value) 1146 || REG_INVALID(state.regs[i].value)
1050 || reg_info[i].width > reg_info[state.regs[i].value].width) 1147 || reg_info[i].width > reg_info[state.regs[i].value].width) {
1148 dprintk(1, "Cannot restore register %u from register %lu.",
1149 i, state.regs[i].value);
1051 return -EIO; 1150 return -EIO;
1151 }
1052 switch(reg_info[state.regs[i].value].width) { 1152 switch(reg_info[state.regs[i].value].width) {
1053#define CASE(n) \ 1153#define CASE(n) \
1054 case sizeof(u##n): \ 1154 case sizeof(u##n): \
@@ -1058,6 +1158,9 @@ int unwind(struct unwind_frame_info *frame)
1058 CASES; 1158 CASES;
1059#undef CASE 1159#undef CASE
1060 default: 1160 default:
1161 dprintk(1, "Unsupported register size %u (%lu).",
1162 reg_info[state.regs[i].value].width,
1163 state.regs[i].value);
1061 return -EIO; 1164 return -EIO;
1062 } 1165 }
1063 break; 1166 break;
@@ -1082,12 +1185,17 @@ int unwind(struct unwind_frame_info *frame)
1082 CASES; 1185 CASES;
1083#undef CASE 1186#undef CASE
1084 default: 1187 default:
1188 dprintk(1, "Unsupported register size %u (%u).",
1189 reg_info[i].width, i);
1085 return -EIO; 1190 return -EIO;
1086 } 1191 }
1087 break; 1192 break;
1088 case Value: 1193 case Value:
1089 if (reg_info[i].width != sizeof(unsigned long)) 1194 if (reg_info[i].width != sizeof(unsigned long)) {
1195 dprintk(1, "Unsupported value size %u (%u).",
1196 reg_info[i].width, i);
1090 return -EIO; 1197 return -EIO;
1198 }
1091 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value 1199 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
1092 * state.dataAlign; 1200 * state.dataAlign;
1093 break; 1201 break;
@@ -1099,15 +1207,20 @@ int unwind(struct unwind_frame_info *frame)
1099 % sizeof(unsigned long) 1207 % sizeof(unsigned long)
1100 || addr < startLoc 1208 || addr < startLoc
1101 || addr + sizeof(unsigned long) < addr 1209 || addr + sizeof(unsigned long) < addr
1102 || addr + sizeof(unsigned long) > endLoc) 1210 || addr + sizeof(unsigned long) > endLoc) {
1211 dprintk(1, "Bad memory location %lx (%lx).",
1212 addr, state.regs[i].value);
1103 return -EIO; 1213 return -EIO;
1214 }
1104 switch(reg_info[i].width) { 1215 switch(reg_info[i].width) {
1105#define CASE(n) case sizeof(u##n): \ 1216#define CASE(n) case sizeof(u##n): \
1106 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ 1217 probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
1107 break 1218 break
1108 CASES; 1219 CASES;
1109#undef CASE 1220#undef CASE
1110 default: 1221 default:
1222 dprintk(1, "Unsupported memory size %u (%u).",
1223 reg_info[i].width, i);
1111 return -EIO; 1224 return -EIO;
1112 } 1225 }
1113 } 1226 }
@@ -1115,6 +1228,17 @@ int unwind(struct unwind_frame_info *frame)
1115 } 1228 }
1116 } 1229 }
1117 1230
1231 if (UNW_PC(frame) % state.codeAlign
1232 || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
1233 dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
1234 UNW_PC(frame), UNW_SP(frame));
1235 return -EIO;
1236 }
1237 if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) {
1238 dprintk(1, "No progress (%lx,%lx).", pc, sp);
1239 return -EIO;
1240 }
1241
1118 return 0; 1242 return 0;
1119#undef CASES 1243#undef CASES
1120#undef FRAME_REG 1244#undef FRAME_REG
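
Two themes run through the unwind.c changes: read_pointer() gains text_base/data_base arguments so DW_EH_PE_textrel/datarel encodings can be resolved (or rejected with a dprintk), and every raw __get_user() on a possibly bogus address becomes probe_kernel_address(), which reads with page faults disabled and fails gracefully. A hedged sketch of the latter, relying only on the probe_kernel_address() helper from <linux/uaccess.h> that this patch starts including:

	unsigned long val;

	/* Returns 0 on success, non-zero if 'addr' is not a readable kernel
	 * address; the unwinder treats failure as "no usable value" rather
	 * than oopsing on a corrupt frame. */
	if (probe_kernel_address((unsigned long *)addr, val))
		return 0;
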
diff --git a/kernel/user.c b/kernel/user.c
index 6408c0424291..4869563080e9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
28 28
29static kmem_cache_t *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ]; 30static struct list_head uidhash_table[UIDHASH_SZ];
31 31
32/* 32/*
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
132 if (!up) { 132 if (!up) {
133 struct user_struct *new; 133 struct user_struct *new;
134 134
135 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); 135 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
136 if (!new) 136 if (!new)
137 return NULL; 137 return NULL;
138 new->uid = uid; 138 new->uid = uid;
@@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user)
187 atomic_dec(&old_user->processes); 187 atomic_dec(&old_user->processes);
188 switch_uid_keyring(new_user); 188 switch_uid_keyring(new_user);
189 current->user = new_user; 189 current->user = new_user;
190
191 /*
192 * We need to synchronize with __sigqueue_alloc()
193 * doing a get_uid(p->user).. If that saw the old
194 * user value, we need to wait until it has exited
195 * its critical region before we can free the old
196 * structure.
197 */
198 smp_mb();
199 spin_unlock_wait(&current->sighand->siglock);
200
190 free_uid(old_user); 201 free_uid(old_user);
191 suid_keys(current); 202 suid_keys(current);
192} 203}
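
The new barrier in switch_uid() is an instance of a publish-then-drain pattern. A generic sketch with hypothetical names (shared_ptr, reader_lock, put_obj are illustrative, not the signal-path code itself); it assumes readers take reader_lock around reading the pointer and taking their reference:

	struct obj *old = shared_ptr;

	shared_ptr = new;			/* 1. publish the replacement      */
	smp_mb();				/* 2. order the store before step 3 */
	spin_unlock_wait(&reader_lock);		/* 3. wait out any reader still
						 *    inside its locked section
						 *    that may have seen 'old'    */
	put_obj(old);				/* 4. only now drop the old ref    */
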
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 17c2f03d2c27..db49886bfae1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,9 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h> 30#include <linux/hardirq.h>
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/freezer.h>
33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h>
32 35
33/* 36/*
34 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct {
55 struct task_struct *thread; 58 struct task_struct *thread;
56 59
57 int run_depth; /* Detect run_workqueue() recursion depth */ 60 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
58} ____cacheline_aligned; 63} ____cacheline_aligned;
59 64
60/* 65/*
@@ -80,6 +85,99 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
80 return list_empty(&wq->list); 85 return list_empty(&wq->list);
81} 86}
82 87
88/*
89 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set
91 */
92static inline void set_wq_data(struct work_struct *work, void *wq)
93{
94 unsigned long new;
95
96 BUG_ON(!work_pending(work));
97
98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
99 new |= work->management & WORK_STRUCT_FLAG_MASK;
100 work->management = new;
101}
102
103static inline void *get_wq_data(struct work_struct *work)
104{
105 return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
106}
107
108static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
109{
110 int ret = 0;
111 unsigned long flags;
112
113 spin_lock_irqsave(&cwq->lock, flags);
114 /*
115 * We need to re-validate the work info after we've gotten
116 * the cpu_workqueue lock. We can run the work now iff:
117 *
118 * - the wq_data still matches the cpu_workqueue_struct
119 * - AND the work is still marked pending
120 * - AND the work is still on a list (which will be this
121 * workqueue_struct list)
122 *
123 * All these conditions are important, because we
124 * need to protect against the work being run right
125 * now on another CPU (all but the last one might be
126 * true if it's currently running and has not been
127 * released yet, for example).
128 */
129 if (get_wq_data(work) == cwq
130 && work_pending(work)
131 && !list_empty(&work->entry)) {
132 work_func_t f = work->func;
133 list_del_init(&work->entry);
134 spin_unlock_irqrestore(&cwq->lock, flags);
135
136 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
137 work_release(work);
138 f(work);
139
140 spin_lock_irqsave(&cwq->lock, flags);
141 cwq->remove_sequence++;
142 wake_up(&cwq->work_done);
143 ret = 1;
144 }
145 spin_unlock_irqrestore(&cwq->lock, flags);
146 return ret;
147}
148
149/**
150 * run_scheduled_work - run scheduled work synchronously
151 * @work: work to run
152 *
153 * This checks if the work was pending, and runs it
154 * synchronously if so. It returns a boolean to indicate
155 * whether it had any scheduled work to run or not.
156 *
157 * NOTE! This _only_ works for normal work_structs. You
158 * CANNOT use this for delayed work, because the wq data
159 * for delayed work will not point properly to the per-
160 * CPU workqueue struct, but will change!
161 */
162int fastcall run_scheduled_work(struct work_struct *work)
163{
164 for (;;) {
165 struct cpu_workqueue_struct *cwq;
166
167 if (!work_pending(work))
168 return 0;
169 if (list_empty(&work->entry))
170 return 0;
171 /* NOTE! This depends intimately on __queue_work! */
172 cwq = get_wq_data(work);
173 if (!cwq)
174 return 0;
175 if (__run_work(cwq, work))
176 return 1;
177 }
178}
179EXPORT_SYMBOL(run_scheduled_work);
180
83/* Preempt must be disabled. */ 181/* Preempt must be disabled. */
84static void __queue_work(struct cpu_workqueue_struct *cwq, 182static void __queue_work(struct cpu_workqueue_struct *cwq,
85 struct work_struct *work) 183 struct work_struct *work)
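
set_wq_data()/get_wq_data() above rely on the reworked struct work_struct, whose first word carries both the per-CPU workqueue pointer and the flag bits (possible because the pointer is at least word-aligned). Paraphrased companion declarations from include/linux/workqueue.h in this series; the masks match those used above, while the exact bit numbers are shown only for illustration:

	typedef void (*work_func_t)(struct work_struct *work);

	struct work_struct {
		unsigned long management;	/* cwq pointer | flag bits */
	#define WORK_STRUCT_PENDING	0	/* bit: item pending execution */
	#define WORK_STRUCT_NOAUTOREL	1	/* bit: no auto-release before run */
	#define WORK_STRUCT_FLAG_MASK	(3UL)
	#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
		struct list_head entry;
		work_func_t func;
	};
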
@@ -87,7 +185,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
87 unsigned long flags; 185 unsigned long flags;
88 186
89 spin_lock_irqsave(&cwq->lock, flags); 187 spin_lock_irqsave(&cwq->lock, flags);
90 work->wq_data = cwq; 188 set_wq_data(work, cwq);
91 list_add_tail(&work->entry, &cwq->worklist); 189 list_add_tail(&work->entry, &cwq->worklist);
92 cwq->insert_sequence++; 190 cwq->insert_sequence++;
93 wake_up(&cwq->more_work); 191 wake_up(&cwq->more_work);
@@ -108,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
108{ 206{
109 int ret = 0, cpu = get_cpu(); 207 int ret = 0, cpu = get_cpu();
110 208
111 if (!test_and_set_bit(0, &work->pending)) { 209 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
112 if (unlikely(is_single_threaded(wq))) 210 if (unlikely(is_single_threaded(wq)))
113 cpu = singlethread_cpu; 211 cpu = singlethread_cpu;
114 BUG_ON(!list_empty(&work->entry)); 212 BUG_ON(!list_empty(&work->entry));
@@ -122,38 +220,42 @@ EXPORT_SYMBOL_GPL(queue_work);
122 220
123static void delayed_work_timer_fn(unsigned long __data) 221static void delayed_work_timer_fn(unsigned long __data)
124{ 222{
125 struct work_struct *work = (struct work_struct *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
126 struct workqueue_struct *wq = work->wq_data; 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
127 int cpu = smp_processor_id(); 225 int cpu = smp_processor_id();
128 226
129 if (unlikely(is_single_threaded(wq))) 227 if (unlikely(is_single_threaded(wq)))
130 cpu = singlethread_cpu; 228 cpu = singlethread_cpu;
131 229
132 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 230 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
133} 231}
134 232
135/** 233/**
136 * queue_delayed_work - queue work on a workqueue after delay 234 * queue_delayed_work - queue work on a workqueue after delay
137 * @wq: workqueue to use 235 * @wq: workqueue to use
138 * @work: work to queue 236 * @work: delayable work to queue
139 * @delay: number of jiffies to wait before queueing 237 * @delay: number of jiffies to wait before queueing
140 * 238 *
141 * Returns 0 if @work was already on a queue, non-zero otherwise. 239 * Returns 0 if @work was already on a queue, non-zero otherwise.
142 */ 240 */
143int fastcall queue_delayed_work(struct workqueue_struct *wq, 241int fastcall queue_delayed_work(struct workqueue_struct *wq,
144 struct work_struct *work, unsigned long delay) 242 struct delayed_work *dwork, unsigned long delay)
145{ 243{
146 int ret = 0; 244 int ret = 0;
147 struct timer_list *timer = &work->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work;
247
248 if (delay == 0)
249 return queue_work(wq, work);
148 250
149 if (!test_and_set_bit(0, &work->pending)) { 251 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
150 BUG_ON(timer_pending(timer)); 252 BUG_ON(timer_pending(timer));
151 BUG_ON(!list_empty(&work->entry)); 253 BUG_ON(!list_empty(&work->entry));
152 254
153 /* This stores wq for the moment, for the timer_fn */ 255 /* This stores wq for the moment, for the timer_fn */
154 work->wq_data = wq; 256 set_wq_data(work, wq);
155 timer->expires = jiffies + delay; 257 timer->expires = jiffies + delay;
156 timer->data = (unsigned long)work; 258 timer->data = (unsigned long)dwork;
157 timer->function = delayed_work_timer_fn; 259 timer->function = delayed_work_timer_fn;
158 add_timer(timer); 260 add_timer(timer);
159 ret = 1; 261 ret = 1;
@@ -172,19 +274,20 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
172 * Returns 0 if @work was already on a queue, non-zero otherwise. 274 * Returns 0 if @work was already on a queue, non-zero otherwise.
173 */ 275 */
174int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 276int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
175 struct work_struct *work, unsigned long delay) 277 struct delayed_work *dwork, unsigned long delay)
176{ 278{
177 int ret = 0; 279 int ret = 0;
178 struct timer_list *timer = &work->timer; 280 struct timer_list *timer = &dwork->timer;
281 struct work_struct *work = &dwork->work;
179 282
180 if (!test_and_set_bit(0, &work->pending)) { 283 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
181 BUG_ON(timer_pending(timer)); 284 BUG_ON(timer_pending(timer));
182 BUG_ON(!list_empty(&work->entry)); 285 BUG_ON(!list_empty(&work->entry));
183 286
184 /* This stores wq for the moment, for the timer_fn */ 287 /* This stores wq for the moment, for the timer_fn */
185 work->wq_data = wq; 288 set_wq_data(work, wq);
186 timer->expires = jiffies + delay; 289 timer->expires = jiffies + delay;
187 timer->data = (unsigned long)work; 290 timer->data = (unsigned long)dwork;
188 timer->function = delayed_work_timer_fn; 291 timer->function = delayed_work_timer_fn;
189 add_timer_on(timer, cpu); 292 add_timer_on(timer, cpu);
190 ret = 1; 293 ret = 1;
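
With queue_delayed_work()/queue_delayed_work_on() now taking a struct delayed_work (a work_struct plus its timer), callers embed the delayed_work in their own object and recover it in the handler via container_of(). A sketch of the resulting caller-side idiom; my_dev and my_poll are hypothetical, only the workqueue calls shown are from this series:

	struct my_dev {
		struct delayed_work	poll_work;
		/* ... */
	};

	static void my_poll(struct work_struct *work)
	{
		struct my_dev *dev =
			container_of(work, struct my_dev, poll_work.work);

		/* ... do the periodic job ... */
		schedule_delayed_work(&dev->poll_work, HZ);	/* re-arm in 1s */
	}

	/* setup, e.g. at probe time: */
	INIT_DELAYED_WORK(&dev->poll_work, my_poll);
	schedule_delayed_work(&dev->poll_work, HZ);

Note that delay == 0 now short-circuits straight into queue_work(), so a zero delay no longer bounces through the timer.
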
@@ -212,15 +315,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
212 while (!list_empty(&cwq->worklist)) { 315 while (!list_empty(&cwq->worklist)) {
213 struct work_struct *work = list_entry(cwq->worklist.next, 316 struct work_struct *work = list_entry(cwq->worklist.next,
214 struct work_struct, entry); 317 struct work_struct, entry);
215 void (*f) (void *) = work->func; 318 work_func_t f = work->func;
216 void *data = work->data;
217 319
218 list_del_init(cwq->worklist.next); 320 list_del_init(cwq->worklist.next);
219 spin_unlock_irqrestore(&cwq->lock, flags); 321 spin_unlock_irqrestore(&cwq->lock, flags);
220 322
221 BUG_ON(work->wq_data != cwq); 323 BUG_ON(get_wq_data(work) != cwq);
222 clear_bit(0, &work->pending); 324 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
223 f(data); 325 work_release(work);
326 f(work);
327
328 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
329 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
330 "%s/0x%08x/%d\n",
331 current->comm, preempt_count(),
332 current->pid);
333 printk(KERN_ERR " last function: ");
334 print_symbol("%s\n", (unsigned long)f);
335 debug_show_held_locks(current);
336 dump_stack();
337 }
224 338
225 spin_lock_irqsave(&cwq->lock, flags); 339 spin_lock_irqsave(&cwq->lock, flags);
226 cwq->remove_sequence++; 340 cwq->remove_sequence++;
@@ -237,7 +351,8 @@ static int worker_thread(void *__cwq)
237 struct k_sigaction sa; 351 struct k_sigaction sa;
238 sigset_t blocked; 352 sigset_t blocked;
239 353
240 current->flags |= PF_NOFREEZE; 354 if (!cwq->freezeable)
355 current->flags |= PF_NOFREEZE;
241 356
242 set_user_nice(current, -5); 357 set_user_nice(current, -5);
243 358
@@ -260,6 +375,9 @@ static int worker_thread(void *__cwq)
260 375
261 set_current_state(TASK_INTERRUPTIBLE); 376 set_current_state(TASK_INTERRUPTIBLE);
262 while (!kthread_should_stop()) { 377 while (!kthread_should_stop()) {
378 if (cwq->freezeable)
379 try_to_freeze();
380
263 add_wait_queue(&cwq->more_work, &wait); 381 add_wait_queue(&cwq->more_work, &wait);
264 if (list_empty(&cwq->worklist)) 382 if (list_empty(&cwq->worklist))
265 schedule(); 383 schedule();
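
A freezeable worker must poll the freezer itself; otherwise the task freezer would time out waiting for it during suspend, whereas PF_NOFREEZE simply exempted the thread altogether. A generic sketch of the loop shape (nothing_to_do() and do_pending_work() are hypothetical stand-ins for the worklist checks above):

	while (!kthread_should_stop()) {
		try_to_freeze();		/* park here if the freezer asks */

		set_current_state(TASK_INTERRUPTIBLE);
		if (nothing_to_do())
			schedule();
		__set_current_state(TASK_RUNNING);

		do_pending_work();
	}
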
@@ -336,7 +454,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
336EXPORT_SYMBOL_GPL(flush_workqueue); 454EXPORT_SYMBOL_GPL(flush_workqueue);
337 455
338static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 456static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
339 int cpu) 457 int cpu, int freezeable)
340{ 458{
341 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 459 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
342 struct task_struct *p; 460 struct task_struct *p;
@@ -346,6 +464,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
346 cwq->thread = NULL; 464 cwq->thread = NULL;
347 cwq->insert_sequence = 0; 465 cwq->insert_sequence = 0;
348 cwq->remove_sequence = 0; 466 cwq->remove_sequence = 0;
467 cwq->freezeable = freezeable;
349 INIT_LIST_HEAD(&cwq->worklist); 468 INIT_LIST_HEAD(&cwq->worklist);
350 init_waitqueue_head(&cwq->more_work); 469 init_waitqueue_head(&cwq->more_work);
351 init_waitqueue_head(&cwq->work_done); 470 init_waitqueue_head(&cwq->work_done);
@@ -361,7 +480,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
361} 480}
362 481
363struct workqueue_struct *__create_workqueue(const char *name, 482struct workqueue_struct *__create_workqueue(const char *name,
364 int singlethread) 483 int singlethread, int freezeable)
365{ 484{
366 int cpu, destroy = 0; 485 int cpu, destroy = 0;
367 struct workqueue_struct *wq; 486 struct workqueue_struct *wq;
@@ -381,7 +500,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
381 mutex_lock(&workqueue_mutex); 500 mutex_lock(&workqueue_mutex);
382 if (singlethread) { 501 if (singlethread) {
383 INIT_LIST_HEAD(&wq->list); 502 INIT_LIST_HEAD(&wq->list);
384 p = create_workqueue_thread(wq, singlethread_cpu); 503 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
385 if (!p) 504 if (!p)
386 destroy = 1; 505 destroy = 1;
387 else 506 else
@@ -389,7 +508,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
389 } else { 508 } else {
390 list_add(&wq->list, &workqueues); 509 list_add(&wq->list, &workqueues);
391 for_each_online_cpu(cpu) { 510 for_each_online_cpu(cpu) {
392 p = create_workqueue_thread(wq, cpu); 511 p = create_workqueue_thread(wq, cpu, freezeable);
393 if (p) { 512 if (p) {
394 kthread_bind(p, cpu); 513 kthread_bind(p, cpu);
395 wake_up_process(p); 514 wake_up_process(p);
@@ -468,38 +587,37 @@ EXPORT_SYMBOL(schedule_work);
468 587
469/** 588/**
470 * schedule_delayed_work - put work task in global workqueue after delay 589 * schedule_delayed_work - put work task in global workqueue after delay
471 * @work: job to be done 590 * @dwork: job to be done
472 * @delay: number of jiffies to wait 591 * @delay: number of jiffies to wait or 0 for immediate execution
473 * 592 *
474 * After waiting for a given time this puts a job in the kernel-global 593 * After waiting for a given time this puts a job in the kernel-global
475 * workqueue. 594 * workqueue.
476 */ 595 */
477int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
478{ 597{
479 return queue_delayed_work(keventd_wq, work, delay); 598 return queue_delayed_work(keventd_wq, dwork, delay);
480} 599}
481EXPORT_SYMBOL(schedule_delayed_work); 600EXPORT_SYMBOL(schedule_delayed_work);
482 601
483/** 602/**
484 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 603 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
485 * @cpu: cpu to use 604 * @cpu: cpu to use
486 * @work: job to be done 605 * @dwork: job to be done
487 * @delay: number of jiffies to wait 606 * @delay: number of jiffies to wait
488 * 607 *
489 * After waiting for a given time this puts a job in the kernel-global 608 * After waiting for a given time this puts a job in the kernel-global
490 * workqueue on the specified CPU. 609 * workqueue on the specified CPU.
491 */ 610 */
492int schedule_delayed_work_on(int cpu, 611int schedule_delayed_work_on(int cpu,
493 struct work_struct *work, unsigned long delay) 612 struct delayed_work *dwork, unsigned long delay)
494{ 613{
495 return queue_delayed_work_on(cpu, keventd_wq, work, delay); 614 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
496} 615}
497EXPORT_SYMBOL(schedule_delayed_work_on); 616EXPORT_SYMBOL(schedule_delayed_work_on);
498 617
499/** 618/**
500 * schedule_on_each_cpu - call a function on each online CPU from keventd 619 * schedule_on_each_cpu - call a function on each online CPU from keventd
501 * @func: the function to call 620 * @func: the function to call
502 * @info: a pointer to pass to func()
503 * 621 *
504 * Returns zero on success. 622 * Returns zero on success.
505 * Returns -ve errno on failure. 623 * Returns -ve errno on failure.
@@ -508,7 +626,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
508 * 626 *
509 * schedule_on_each_cpu() is very slow. 627 * schedule_on_each_cpu() is very slow.
510 */ 628 */
511int schedule_on_each_cpu(void (*func)(void *info), void *info) 629int schedule_on_each_cpu(work_func_t func)
512{ 630{
513 int cpu; 631 int cpu;
514 struct work_struct *works; 632 struct work_struct *works;
@@ -519,7 +637,7 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
519 637
520 mutex_lock(&workqueue_mutex); 638 mutex_lock(&workqueue_mutex);
521 for_each_online_cpu(cpu) { 639 for_each_online_cpu(cpu) {
522 INIT_WORK(per_cpu_ptr(works, cpu), func, info); 640 INIT_WORK(per_cpu_ptr(works, cpu), func);
523 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 641 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
524 per_cpu_ptr(works, cpu)); 642 per_cpu_ptr(works, cpu));
525 } 643 }
@@ -539,12 +657,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
539 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 657 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
540 * work whose handler rearms the delayed work. 658 * work whose handler rearms the delayed work.
541 * @wq: the controlling workqueue structure 659 * @wq: the controlling workqueue structure
542 * @work: the delayed work struct 660 * @dwork: the delayed work struct
543 */ 661 */
544void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, 662void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
545 struct work_struct *work) 663 struct delayed_work *dwork)
546{ 664{
547 while (!cancel_delayed_work(work)) 665 while (!cancel_delayed_work(dwork))
548 flush_workqueue(wq); 666 flush_workqueue(wq);
549} 667}
550EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); 668EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
@@ -552,18 +670,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
552/** 670/**
553 * cancel_rearming_delayed_work - reliably kill off a delayed keventd 671 * cancel_rearming_delayed_work - reliably kill off a delayed keventd
554 * work whose handler rearms the delayed work. 672 * work whose handler rearms the delayed work.
555 * @work: the delayed work struct 673 * @dwork: the delayed work struct
556 */ 674 */
557void cancel_rearming_delayed_work(struct work_struct *work) 675void cancel_rearming_delayed_work(struct delayed_work *dwork)
558{ 676{
559 cancel_rearming_delayed_workqueue(keventd_wq, work); 677 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
560} 678}
561EXPORT_SYMBOL(cancel_rearming_delayed_work); 679EXPORT_SYMBOL(cancel_rearming_delayed_work);
562 680
563/** 681/**
564 * execute_in_process_context - reliably execute the routine with user context 682 * execute_in_process_context - reliably execute the routine with user context
565 * @fn: the function to execute 683 * @fn: the function to execute
566 * @data: data to pass to the function
567 * @ew: guaranteed storage for the execute work structure (must 684 * @ew: guaranteed storage for the execute work structure (must
568 * be available when the work executes) 685 * be available when the work executes)
569 * 686 *
@@ -573,15 +690,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
573 * Returns: 0 - function was executed 690 * Returns: 0 - function was executed
574 * 1 - function was scheduled for execution 691 * 1 - function was scheduled for execution
575 */ 692 */
576int execute_in_process_context(void (*fn)(void *data), void *data, 693int execute_in_process_context(work_func_t fn, struct execute_work *ew)
577 struct execute_work *ew)
578{ 694{
579 if (!in_interrupt()) { 695 if (!in_interrupt()) {
580 fn(data); 696 fn(&ew->work);
581 return 0; 697 return 0;
582 } 698 }
583 699
584 INIT_WORK(&ew->work, fn, data); 700 INIT_WORK(&ew->work, fn);
585 schedule_work(&ew->work); 701 schedule_work(&ew->work);
586 702
587 return 1; 703 return 1;
@@ -609,7 +725,6 @@ int current_is_keventd(void)
609 725
610} 726}
611 727
612#ifdef CONFIG_HOTPLUG_CPU
613/* Take the work from this (downed) CPU. */ 728/* Take the work from this (downed) CPU. */
614static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 729static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
615{ 730{
@@ -642,7 +757,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
642 mutex_lock(&workqueue_mutex); 757 mutex_lock(&workqueue_mutex);
643 /* Create a new workqueue thread for it. */ 758 /* Create a new workqueue thread for it. */
644 list_for_each_entry(wq, &workqueues, list) { 759 list_for_each_entry(wq, &workqueues, list) {
645 if (!create_workqueue_thread(wq, hotcpu)) { 760 if (!create_workqueue_thread(wq, hotcpu, 0)) {
646 printk("workqueue for %i failed\n", hotcpu); 761 printk("workqueue for %i failed\n", hotcpu);
647 return NOTIFY_BAD; 762 return NOTIFY_BAD;
648 } 763 }
@@ -692,7 +807,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
692 807
693 return NOTIFY_OK; 808 return NOTIFY_OK;
694} 809}
695#endif
696 810
697void init_workqueues(void) 811void init_workqueues(void)
698{ 812{