path: root/kernel
author	David Woodhouse <dwmw2@infradead.org>	2007-01-17 18:34:51 -0500
committer	David Woodhouse <dwmw2@infradead.org>	2007-01-17 18:34:51 -0500
commit	9cdf083f981b8d37b3212400a359368661385099 (patch)
tree	aa15a6a08ad87e650dea40fb59b3180bef0d345b /kernel
parent	e499e01d234a31d59679b7b1e1cf628d917ba49a (diff)
parent	a8b3485287731978899ced11f24628c927890e78 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/Kconfig.hz	20
-rw-r--r--	kernel/Makefile	1
-rw-r--r--	kernel/acct.c	29
-rw-r--r--	kernel/audit.c	1
-rw-r--r--	kernel/auditfilter.c	7
-rw-r--r--	kernel/auditsc.c	13
-rw-r--r--	kernel/configs.c	2
-rw-r--r--	kernel/cpu.c	14
-rw-r--r--	kernel/cpuset.c	124
-rw-r--r--	kernel/delayacct.c	4
-rw-r--r--	kernel/dma.c	2
-rw-r--r--	kernel/exit.c	110
-rw-r--r--	kernel/fork.c	127
-rw-r--r--	kernel/futex.c	55
-rw-r--r--	kernel/irq/chip.c	5
-rw-r--r--	kernel/irq/handle.c	2
-rw-r--r--	kernel/irq/proc.c	3
-rw-r--r--	kernel/irq/spurious.c	2
-rw-r--r--	kernel/kallsyms.c	33
-rw-r--r--	kernel/kexec.c	60
-rw-r--r--	kernel/kmod.c	18
-rw-r--r--	kernel/kprobes.c	117
-rw-r--r--	kernel/kthread.c	13
-rw-r--r--	kernel/latency.c	1
-rw-r--r--	kernel/lockdep.c	245
-rw-r--r--	kernel/lockdep_internals.h	2
-rw-r--r--	kernel/lockdep_proc.c	6
-rw-r--r--	kernel/module.c	103
-rw-r--r--	kernel/mutex-debug.c	3
-rw-r--r--	kernel/mutex.c	9
-rw-r--r--	kernel/nsproxy.c	38
-rw-r--r--	kernel/params.c	6
-rw-r--r--	kernel/pid.c	77
-rw-r--r--	kernel/posix-timers.c	2
-rw-r--r--	kernel/power/Kconfig	11
-rw-r--r--	kernel/power/disk.c	72
-rw-r--r--	kernel/power/main.c	16
-rw-r--r--	kernel/power/power.h	32
-rw-r--r--	kernel/power/poweroff.c	4
-rw-r--r--	kernel/power/process.c	143
-rw-r--r--	kernel/power/snapshot.c	860
-rw-r--r--	kernel/power/swap.c	348
-rw-r--r--	kernel/power/swsusp.c	98
-rw-r--r--	kernel/power/user.c	103
-rw-r--r--	kernel/printk.c	24
-rw-r--r--	kernel/profile.c	61
-rw-r--r--	kernel/rcupdate.c	4
-rw-r--r--	kernel/rcutorture.c	7
-rw-r--r--	kernel/relay.c	37
-rw-r--r--	kernel/resource.c	6
-rw-r--r--	kernel/rtmutex-tester.c	1
-rw-r--r--	kernel/sched.c	577
-rw-r--r--	kernel/signal.c	23
-rw-r--r--	kernel/softirq.c	2
-rw-r--r--	kernel/sys.c	31
-rw-r--r--	kernel/sysctl.c	416
-rw-r--r--	kernel/taskstats.c	180
-rw-r--r--	kernel/time/clocksource.c	8
-rw-r--r--	kernel/timer.c	169
-rw-r--r--	kernel/tsacct.c	9
-rw-r--r--	kernel/unwind.c	1182
-rw-r--r--	kernel/user.c	4
-rw-r--r--	kernel/workqueue.c	222
63 files changed, 3078 insertions, 2826 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
7 default HZ_250 7 default HZ_250
8 help 8 help
9 Allows the configuration of the timer frequency. It is customary 9 Allows the configuration of the timer frequency. It is customary
10 to have the timer interrupt run at 1000 HZ but 100 HZ may be more 10 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
11 beneficial for servers and NUMA systems that do not need to have 11 beneficial for servers and NUMA systems that do not need to have
12 a fast response for user interaction and that may experience bus 12 a fast response for user interaction and that may experience bus
13 contention and cacheline bounces as a result of timer interrupts. 13 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
19 config HZ_100 19 config HZ_100
20 bool "100 HZ" 20 bool "100 HZ"
21 help 21 help
22 100 HZ is a typical choice for servers, SMP and NUMA systems 22 100 Hz is a typical choice for servers, SMP and NUMA systems
23 with lots of processors that may show reduced performance if 23 with lots of processors that may show reduced performance if
24 too many timer interrupts are occurring. 24 too many timer interrupts are occurring.
25 25
26 config HZ_250 26 config HZ_250
27 bool "250 HZ" 27 bool "250 HZ"
28 help 28 help
29 250 HZ is a good compromise choice allowing server performance 29 250 Hz is a good compromise choice allowing server performance
30 while also showing good interactive responsiveness even 30 while also showing good interactive responsiveness even
31 on SMP and NUMA systems. 31 on SMP and NUMA systems. If you are going to be using NTSC video
32 or multimedia, selected 300Hz instead.
33
34 config HZ_300
35 bool "300 HZ"
36 help
37 300 Hz is a good compromise choice allowing server performance
38 while also showing good interactive responsiveness even
39 on SMP and NUMA systems and exactly dividing by both PAL and
40 NTSC frame rates for video and multimedia work.
32 41
33 config HZ_1000 42 config HZ_1000
34 bool "1000 HZ" 43 bool "1000 HZ"
35 help 44 help
36 1000 HZ is the preferred choice for desktop systems and other 45 1000 Hz is the preferred choice for desktop systems and other
37 systems requiring fast interactive responses to events. 46 systems requiring fast interactive responses to events.
38 47
39endchoice 48endchoice
@@ -42,5 +51,6 @@ config HZ
42 int 51 int
43 default 100 if HZ_100 52 default 100 if HZ_100
44 default 250 if HZ_250 53 default 250 if HZ_250
54 default 300 if HZ_300
45 default 1000 if HZ_1000 55 default 1000 if HZ_1000
46 56
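
The new HZ_300 help text above leans on a small piece of arithmetic: 300 ticks per second lands exactly on a frame boundary for PAL (25 fps) and for NTSC at its nominal 30 fps (the true NTSC rate is 30000/1001, so "exactly" is approximate in that case). A quick standalone check, written as plain userspace C rather than kernel code:

/* Divisibility behind the HZ_300 help text: PAL at 25 fps and nominal
 * NTSC at 30 fps both divide 300 evenly; 250 Hz does not divide evenly
 * for NTSC. */
#include <stdio.h>

int main(void)
{
	const int hz = 300;
	const int pal_fps = 25, ntsc_fps = 30;

	printf("ticks per PAL frame:  %d (remainder %d)\n",
	       hz / pal_fps, hz % pal_fps);		/* 12, remainder 0 */
	printf("ticks per NTSC frame: %d (remainder %d)\n",
	       hz / ntsc_fps, hz % ntsc_fps);		/* 10, remainder 0 */
	printf("250 Hz remainder for NTSC: %d\n", 250 % ntsc_fps);	/* 10 */
	return 0;
}
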
diff --git a/kernel/Makefile b/kernel/Makefile
index 5e3f3b75563a..14f4d45e0ae9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -31,7 +31,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
31obj-$(CONFIG_UID16) += uid16.o 31obj-$(CONFIG_UID16) += uid16.o
32obj-$(CONFIG_MODULES) += module.o 32obj-$(CONFIG_MODULES) += module.o
33obj-$(CONFIG_KALLSYMS) += kallsyms.o 33obj-$(CONFIG_KALLSYMS) += kallsyms.o
34obj-$(CONFIG_STACK_UNWIND) += unwind.o
35obj-$(CONFIG_PM) += power/ 34obj-$(CONFIG_PM) += power/
36obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 35obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
37obj-$(CONFIG_KEXEC) += kexec.o 36obj-$(CONFIG_KEXEC) += kexec.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..70d0d88e5554 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
89 struct timer_list timer; 89 struct timer_list timer;
90}; 90};
91 91
92static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; 92static struct acct_glbs acct_globals __cacheline_aligned =
93 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
93 94
94/* 95/*
95 * Called whenever the timer says to check the free space. 96 * Called whenever the timer says to check the free space.
@@ -117,7 +118,7 @@ static int check_free_space(struct file *file)
117 spin_unlock(&acct_globals.lock); 118 spin_unlock(&acct_globals.lock);
118 119
119 /* May block */ 120 /* May block */
120 if (vfs_statfs(file->f_dentry, &sbuf)) 121 if (vfs_statfs(file->f_path.dentry, &sbuf))
121 return res; 122 return res;
122 suspend = sbuf.f_blocks * SUSPEND; 123 suspend = sbuf.f_blocks * SUSPEND;
123 resume = sbuf.f_blocks * RESUME; 124 resume = sbuf.f_blocks * RESUME;
@@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file)
193 add_timer(&acct_globals.timer); 194 add_timer(&acct_globals.timer);
194 } 195 }
195 if (old_acct) { 196 if (old_acct) {
196 mnt_unpin(old_acct->f_vfsmnt); 197 mnt_unpin(old_acct->f_path.mnt);
197 spin_unlock(&acct_globals.lock); 198 spin_unlock(&acct_globals.lock);
198 do_acct_process(old_acct); 199 do_acct_process(old_acct);
199 filp_close(old_acct, NULL); 200 filp_close(old_acct, NULL);
@@ -211,7 +212,7 @@ static int acct_on(char *name)
211 if (IS_ERR(file)) 212 if (IS_ERR(file))
212 return PTR_ERR(file); 213 return PTR_ERR(file);
213 214
214 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { 215 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
215 filp_close(file, NULL); 216 filp_close(file, NULL);
216 return -EACCES; 217 return -EACCES;
217 } 218 }
@@ -228,11 +229,11 @@ static int acct_on(char *name)
228 } 229 }
229 230
230 spin_lock(&acct_globals.lock); 231 spin_lock(&acct_globals.lock);
231 mnt_pin(file->f_vfsmnt); 232 mnt_pin(file->f_path.mnt);
232 acct_file_reopen(file); 233 acct_file_reopen(file);
233 spin_unlock(&acct_globals.lock); 234 spin_unlock(&acct_globals.lock);
234 235
235 mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ 236 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
236 237
237 return 0; 238 return 0;
238} 239}
@@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
282void acct_auto_close_mnt(struct vfsmount *m) 283void acct_auto_close_mnt(struct vfsmount *m)
283{ 284{
284 spin_lock(&acct_globals.lock); 285 spin_lock(&acct_globals.lock);
285 if (acct_globals.file && acct_globals.file->f_vfsmnt == m) 286 if (acct_globals.file && acct_globals.file->f_path.mnt == m)
286 acct_file_reopen(NULL); 287 acct_file_reopen(NULL);
287 spin_unlock(&acct_globals.lock); 288 spin_unlock(&acct_globals.lock);
288} 289}
@@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
298{ 299{
299 spin_lock(&acct_globals.lock); 300 spin_lock(&acct_globals.lock);
300 if (acct_globals.file && 301 if (acct_globals.file &&
301 acct_globals.file->f_vfsmnt->mnt_sb == sb) { 302 acct_globals.file->f_path.mnt->mnt_sb == sb) {
302 acct_file_reopen(NULL); 303 acct_file_reopen(NULL);
303 } 304 }
304 spin_unlock(&acct_globals.lock); 305 spin_unlock(&acct_globals.lock);
@@ -427,6 +428,7 @@ static void do_acct_process(struct file *file)
427 u64 elapsed; 428 u64 elapsed;
428 u64 run_time; 429 u64 run_time;
429 struct timespec uptime; 430 struct timespec uptime;
431 struct tty_struct *tty;
430 432
431 /* 433 /*
432 * First check to see if there is enough free_space to continue 434 * First check to see if there is enough free_space to continue
@@ -483,16 +485,9 @@ static void do_acct_process(struct file *file)
483 ac.ac_ppid = current->parent->tgid; 485 ac.ac_ppid = current->parent->tgid;
484#endif 486#endif
485 487
486 mutex_lock(&tty_mutex);
487 /* FIXME: Whoever is responsible for current->signal locking needs
488 to use the same locking all over the kernel and document it */
489 read_lock(&tasklist_lock);
490 ac.ac_tty = current->signal->tty ?
491 old_encode_dev(tty_devnum(current->signal->tty)) : 0;
492 read_unlock(&tasklist_lock);
493 mutex_unlock(&tty_mutex);
494
495 spin_lock_irq(&current->sighand->siglock); 488 spin_lock_irq(&current->sighand->siglock);
489 tty = current->signal->tty;
490 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
496 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 491 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
497 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 492 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
498 ac.ac_flag = pacct->ac_flag; 493 ac.ac_flag = pacct->ac_flag;
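
The acct.c hunks above are part of a wider conversion from the paired file->f_dentry / file->f_vfsmnt fields to the combined file->f_path.dentry / file->f_path.mnt. As a minimal sketch of that refactoring pattern, with simplified stand-in types rather than the kernel's real struct file and struct path:

/* Two fields that always travel together (dentry + mount) become one
 * embedded struct, so callers copy and pass them as a unit.  The types
 * below are hypothetical stand-ins, not kernel definitions. */
#include <stdio.h>

struct dentry   { const char *name; };
struct vfsmount { const char *mnt_root; };

struct path {				/* the combined pair */
	struct dentry   *dentry;
	struct vfsmount *mnt;
};

struct file {
	struct path f_path;		/* replaces separate f_dentry / f_vfsmnt */
};

static void show(const struct file *f)
{
	printf("%s on %s\n", f->f_path.dentry->name, f->f_path.mnt->mnt_root);
}

int main(void)
{
	struct dentry   d = { "pacct" };
	struct vfsmount m = { "/var/account" };
	struct file     f = { .f_path = { .dentry = &d, .mnt = &m } };

	show(&f);
	return 0;
}
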
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h> 59#include <linux/inotify.h>
60#include <linux/freezer.h>
60 61
61#include "audit.h" 62#include "audit.h"
62 63
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..9c8c23227c7f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
636 struct audit_rule *rule; 636 struct audit_rule *rule;
637 int i; 637 int i;
638 638
639 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 639 rule = kzalloc(sizeof(*rule), GFP_KERNEL);
640 if (unlikely(!rule)) 640 if (unlikely(!rule))
641 return NULL; 641 return NULL;
642 memset(rule, 0, sizeof(*rule));
643 642
644 rule->flags = krule->flags | krule->listnr; 643 rule->flags = krule->flags | krule->listnr;
645 rule->action = krule->action; 644 rule->action = krule->action;
@@ -801,8 +800,8 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
801 800
802 /* our own copy of se_str */ 801 /* our own copy of se_str */
803 se_str = kstrdup(sf->se_str, GFP_KERNEL); 802 se_str = kstrdup(sf->se_str, GFP_KERNEL);
804 if (unlikely(IS_ERR(se_str))) 803 if (unlikely(!se_str))
805 return -ENOMEM; 804 return -ENOMEM;
806 df->se_str = se_str; 805 df->se_str = se_str;
807 806
808 /* our own (refreshed) copy of se_rule */ 807 /* our own (refreshed) copy of se_rule */
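
Two small fixes sit in the auditfilter.c hunks above: kzalloc() folds the kmalloc()+memset() pair into a single zeroing allocation, and the kstrdup() failure test changes from IS_ERR() to a plain NULL check, because kstrdup() reports failure by returning NULL rather than an encoded error pointer. A userspace analogy of both points, with calloc/strdup standing in for kzalloc/kstrdup:

/* Userspace analogy for the two fixes above:
 *  - calloc() replaces malloc()+memset(), as kzalloc() does for kmalloc();
 *  - strdup(), like kstrdup(), signals failure with NULL, so NULL is
 *    what must be checked; there is no error pointer to decode. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rule {
	unsigned flags;
	char *se_str;
};

static struct rule *make_rule(const char *se_str)
{
	struct rule *r = calloc(1, sizeof(*r));	/* allocated already zeroed */
	if (!r)
		return NULL;

	r->se_str = strdup(se_str);
	if (!r->se_str) {			/* NULL check, not IS_ERR() */
		free(r);
		return NULL;
	}
	return r;
}

int main(void)
{
	struct rule *r = make_rule("system_u:object_r:etc_t");

	if (!r)
		return 1;
	printf("flags=%u se_str=%s\n", r->flags, r->se_str);
	free(r->se_str);
	free(r);
	return 0;
}
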
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 42f2f1179711..298897559ca4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
64#include <linux/tty.h> 64#include <linux/tty.h>
65#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h> 66#include <linux/binfmts.h>
67#include <linux/highmem.h>
67#include <linux/syscalls.h> 68#include <linux/syscalls.h>
68 69
69#include "audit.h" 70#include "audit.h"
@@ -730,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context)
730 printk(KERN_ERR "audit: freed %d contexts\n", count); 731 printk(KERN_ERR "audit: freed %d contexts\n", count);
731} 732}
732 733
733static void audit_log_task_context(struct audit_buffer *ab) 734void audit_log_task_context(struct audit_buffer *ab)
734{ 735{
735 char *ctx = NULL; 736 char *ctx = NULL;
736 ssize_t len = 0; 737 ssize_t len = 0;
@@ -759,6 +760,8 @@ error_path:
759 return; 760 return;
760} 761}
761 762
763EXPORT_SYMBOL(audit_log_task_context);
764
762static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 765static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
763{ 766{
764 char name[sizeof(tsk->comm)]; 767 char name[sizeof(tsk->comm)];
@@ -778,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
778 if ((vma->vm_flags & VM_EXECUTABLE) && 781 if ((vma->vm_flags & VM_EXECUTABLE) &&
779 vma->vm_file) { 782 vma->vm_file) {
780 audit_log_d_path(ab, "exe=", 783 audit_log_d_path(ab, "exe=",
781 vma->vm_file->f_dentry, 784 vma->vm_file->f_path.dentry,
782 vma->vm_file->f_vfsmnt); 785 vma->vm_file->f_path.mnt);
783 break; 786 break;
784 } 787 }
785 vma = vma->vm_next; 788 vma = vma->vm_next;
@@ -823,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
823 context->return_code); 826 context->return_code);
824 827
825 mutex_lock(&tty_mutex); 828 mutex_lock(&tty_mutex);
829 read_lock(&tasklist_lock);
826 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 830 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
827 tty = tsk->signal->tty->name; 831 tty = tsk->signal->tty->name;
828 else 832 else
829 tty = "(none)"; 833 tty = "(none)";
834 read_unlock(&tasklist_lock);
830 audit_log_format(ab, 835 audit_log_format(ab,
831 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 836 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
832 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 837 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1487,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1487 return ctx ? ctx->loginuid : -1; 1492 return ctx ? ctx->loginuid : -1;
1488} 1493}
1489 1494
1495EXPORT_SYMBOL(audit_get_loginuid);
1496
1490/** 1497/**
1491 * __audit_mq_open - record audit data for a POSIX MQ open 1498 * __audit_mq_open - record audit data for a POSIX MQ open
1492 * @oflag: open flag 1499 * @oflag: open flag
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
75 return count; 75 return count;
76} 76}
77 77
78static struct file_operations ikconfig_file_ops = { 78static const struct file_operations ikconfig_file_ops = {
79 .owner = THIS_MODULE, 79 .owner = THIS_MODULE,
80 .read = ikconfig_read_current, 80 .read = ikconfig_read_current,
81}; 81};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 272254f20d97..7406fe6966f9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -204,7 +204,7 @@ int cpu_down(unsigned int cpu)
204#endif /*CONFIG_HOTPLUG_CPU*/ 204#endif /*CONFIG_HOTPLUG_CPU*/
205 205
206/* Requires cpu_add_remove_lock to be held */ 206/* Requires cpu_add_remove_lock to be held */
207static int __devinit _cpu_up(unsigned int cpu) 207static int __cpuinit _cpu_up(unsigned int cpu)
208{ 208{
209 int ret; 209 int ret;
210 void *hcpu = (void *)(long)cpu; 210 void *hcpu = (void *)(long)cpu;
@@ -239,7 +239,7 @@ out_notify:
239 return ret; 239 return ret;
240} 240}
241 241
242int __devinit cpu_up(unsigned int cpu) 242int __cpuinit cpu_up(unsigned int cpu)
243{ 243{
244 int err = 0; 244 int err = 0;
245 245
@@ -258,7 +258,7 @@ static cpumask_t frozen_cpus;
258 258
259int disable_nonboot_cpus(void) 259int disable_nonboot_cpus(void)
260{ 260{
261 int cpu, first_cpu, error; 261 int cpu, first_cpu, error = 0;
262 262
263 mutex_lock(&cpu_add_remove_lock); 263 mutex_lock(&cpu_add_remove_lock);
264 first_cpu = first_cpu(cpu_present_map); 264 first_cpu = first_cpu(cpu_present_map);
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
270 goto out; 270 goto out;
271 } 271 }
272 } 272 }
273 error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); 273
274 if (error) {
275 printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
276 goto out;
277 }
278 /* We take down all of the non-boot CPUs in one shot to avoid races 274 /* We take down all of the non-boot CPUs in one shot to avoid races
279 * with the userspace trying to use the CPU hotplug at the same time 275 * with the userspace trying to use the CPU hotplug at the same time
280 */ 276 */
@@ -298,7 +294,7 @@ int disable_nonboot_cpus(void)
298 /* Make sure the CPUs won't be enabled by someone else */ 294 /* Make sure the CPUs won't be enabled by someone else */
299 cpu_hotplug_disabled = 1; 295 cpu_hotplug_disabled = 1;
300 } else { 296 } else {
301 printk(KERN_ERR "Non-boot CPUs are not disabled"); 297 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
302 } 298 }
303out: 299out:
304 mutex_unlock(&cpu_add_remove_lock); 300 mutex_unlock(&cpu_add_remove_lock);
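
One detail in the cpu.c hunks above is the change from declaring error uninitialized to error = 0 in disable_nonboot_cpus(): with the set_cpus_allowed() call removed, the initializer guards the path where the loop never assigns error, for instance when there is no non-boot CPU to take down. A compact illustration of that bug class, in plain C with hypothetical helper names:

/* If "error" were left uninitialized and the loop body never ran, the
 * function would return garbage; initializing it to 0 keeps the
 * empty-loop / all-success case well defined. */
#include <stdio.h>

static int take_down(int cpu)
{
	(void)cpu;		/* pretend the CPU went offline cleanly */
	return 0;
}

static int disable_all(const int *cpus, int n)
{
	int error = 0;		/* defined value for the success path */

	for (int i = 0; i < n; i++) {
		error = take_down(cpus[i]);
		if (error)
			break;
	}
	return error;
}

int main(void)
{
	printf("%d\n", disable_all(NULL, 0));	/* prints 0, not garbage */
	return 0;
}
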
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..6b05dc69c959 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
413 * 413 *
414 * 414 *
415 * When reading/writing to a file: 415 * When reading/writing to a file:
416 * - the cpuset to use in file->f_dentry->d_parent->d_fsdata 416 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
417 * - the 'cftype' of the file is file->f_dentry->d_fsdata 417 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
418 */ 418 */
419 419
420struct cftype { 420struct cftype {
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
729 } 729 }
730 730
731 /* Remaining checks don't apply to root cpuset */ 731 /* Remaining checks don't apply to root cpuset */
732 if ((par = cur->parent) == NULL) 732 if (cur == &top_cpuset)
733 return 0; 733 return 0;
734 734
735 par = cur->parent;
736
735 /* We must be a subset of our parent cpuset */ 737 /* We must be a subset of our parent cpuset */
736 if (!is_cpuset_subset(trial, par)) 738 if (!is_cpuset_subset(trial, par))
737 return -EACCES; 739 return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1060 cpu_exclusive_changed = 1062 cpu_exclusive_changed =
1061 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1063 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
1062 mutex_lock(&callback_mutex); 1064 mutex_lock(&callback_mutex);
1063 if (turning_on) 1065 cs->flags = trialcs.flags;
1064 set_bit(bit, &cs->flags);
1065 else
1066 clear_bit(bit, &cs->flags);
1067 mutex_unlock(&callback_mutex); 1066 mutex_unlock(&callback_mutex);
1068 1067
1069 if (cpu_exclusive_changed) 1068 if (cpu_exclusive_changed)
@@ -1281,18 +1280,19 @@ typedef enum {
1281 FILE_TASKLIST, 1280 FILE_TASKLIST,
1282} cpuset_filetype_t; 1281} cpuset_filetype_t;
1283 1282
1284static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, 1283static ssize_t cpuset_common_file_write(struct file *file,
1284 const char __user *userbuf,
1285 size_t nbytes, loff_t *unused_ppos) 1285 size_t nbytes, loff_t *unused_ppos)
1286{ 1286{
1287 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1287 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1288 struct cftype *cft = __d_cft(file->f_dentry); 1288 struct cftype *cft = __d_cft(file->f_path.dentry);
1289 cpuset_filetype_t type = cft->private; 1289 cpuset_filetype_t type = cft->private;
1290 char *buffer; 1290 char *buffer;
1291 char *pathbuf = NULL; 1291 char *pathbuf = NULL;
1292 int retval = 0; 1292 int retval = 0;
1293 1293
1294 /* Crude upper limit on largest legitimate cpulist user might write. */ 1294 /* Crude upper limit on largest legitimate cpulist user might write. */
1295 if (nbytes > 100 + 6 * NR_CPUS) 1295 if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
1296 return -E2BIG; 1296 return -E2BIG;
1297 1297
1298 /* +1 for nul-terminator */ 1298 /* +1 for nul-terminator */
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1367 size_t nbytes, loff_t *ppos) 1367 size_t nbytes, loff_t *ppos)
1368{ 1368{
1369 ssize_t retval = 0; 1369 ssize_t retval = 0;
1370 struct cftype *cft = __d_cft(file->f_dentry); 1370 struct cftype *cft = __d_cft(file->f_path.dentry);
1371 if (!cft) 1371 if (!cft)
1372 return -ENODEV; 1372 return -ENODEV;
1373 1373
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1418 size_t nbytes, loff_t *ppos) 1418 size_t nbytes, loff_t *ppos)
1419{ 1419{
1420 struct cftype *cft = __d_cft(file->f_dentry); 1420 struct cftype *cft = __d_cft(file->f_path.dentry);
1421 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1421 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1422 cpuset_filetype_t type = cft->private; 1422 cpuset_filetype_t type = cft->private;
1423 char *page; 1423 char *page;
1424 ssize_t retval = 0; 1424 ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
1476 loff_t *ppos) 1476 loff_t *ppos)
1477{ 1477{
1478 ssize_t retval = 0; 1478 ssize_t retval = 0;
1479 struct cftype *cft = __d_cft(file->f_dentry); 1479 struct cftype *cft = __d_cft(file->f_path.dentry);
1480 if (!cft) 1480 if (!cft)
1481 return -ENODEV; 1481 return -ENODEV;
1482 1482
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1498 if (err) 1498 if (err)
1499 return err; 1499 return err;
1500 1500
1501 cft = __d_cft(file->f_dentry); 1501 cft = __d_cft(file->f_path.dentry);
1502 if (!cft) 1502 if (!cft)
1503 return -ENODEV; 1503 return -ENODEV;
1504 if (cft->open) 1504 if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1511 1511
1512static int cpuset_file_release(struct inode *inode, struct file *file) 1512static int cpuset_file_release(struct inode *inode, struct file *file)
1513{ 1513{
1514 struct cftype *cft = __d_cft(file->f_dentry); 1514 struct cftype *cft = __d_cft(file->f_path.dentry);
1515 if (cft->release) 1515 if (cft->release)
1516 return cft->release(inode, file); 1516 return cft->release(inode, file);
1517 return 0; 1517 return 0;
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1533} 1533}
1534 1534
1535static struct file_operations cpuset_file_operations = { 1535static const struct file_operations cpuset_file_operations = {
1536 .read = cpuset_file_read, 1536 .read = cpuset_file_read,
1537 .write = cpuset_file_write, 1537 .write = cpuset_file_write,
1538 .llseek = generic_file_llseek, 1538 .llseek = generic_file_llseek,
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1700 */ 1700 */
1701static int cpuset_tasks_open(struct inode *unused, struct file *file) 1701static int cpuset_tasks_open(struct inode *unused, struct file *file)
1702{ 1702{
1703 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1703 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1704 struct ctr_struct *ctr; 1704 struct ctr_struct *ctr;
1705 pid_t *pidarray; 1705 pid_t *pidarray;
1706 int npids; 1706 int npids;
@@ -2045,7 +2045,6 @@ out:
2045 return err; 2045 return err;
2046} 2046}
2047 2047
2048#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
2049/* 2048/*
2050 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 2049 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
2051 * or memory nodes, we need to walk over the cpuset hierarchy, 2050 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void)
2109 mutex_unlock(&callback_mutex); 2108 mutex_unlock(&callback_mutex);
2110 mutex_unlock(&manage_mutex); 2109 mutex_unlock(&manage_mutex);
2111} 2110}
2112#endif
2113 2111
2114#ifdef CONFIG_HOTPLUG_CPU
2115/* 2112/*
2116 * The top_cpuset tracks what CPUs and Memory Nodes are online, 2113 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2117 * period. This is necessary in order to make cpusets transparent 2114 * period. This is necessary in order to make cpusets transparent
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
2128 common_cpu_mem_hotplug_unplug(); 2125 common_cpu_mem_hotplug_unplug();
2129 return 0; 2126 return 0;
2130} 2127}
2131#endif
2132 2128
2133#ifdef CONFIG_MEMORY_HOTPLUG 2129#ifdef CONFIG_MEMORY_HOTPLUG
2134/* 2130/*
@@ -2346,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2346} 2342}
2347 2343
2348/** 2344/**
2349 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? 2345 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
2350 * @z: is this zone on an allowed node? 2346 * @z: is this zone on an allowed node?
2351 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) 2347 * @gfp_mask: memory allocation flags
2352 * 2348 *
2353 * If we're in interrupt, yes, we can always allocate. If zone 2349 * If we're in interrupt, yes, we can always allocate. If
2350 * __GFP_THISNODE is set, yes, we can always allocate. If zone
2354 * z's node is in our tasks mems_allowed, yes. If it's not a 2351 * z's node is in our tasks mems_allowed, yes. If it's not a
2355 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2352 * __GFP_HARDWALL request and this zone's nodes is in the nearest
2356 * mem_exclusive cpuset ancestor to this tasks cpuset, yes. 2353 * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
2357 * Otherwise, no. 2354 * Otherwise, no.
2358 * 2355 *
2356 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
2357 * reduces to cpuset_zone_allowed_hardwall(). Otherwise,
2358 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
2359 * from an enclosing cpuset.
2360 *
2361 * cpuset_zone_allowed_hardwall() only handles the simpler case of
2362 * hardwall cpusets, and never sleeps.
2363 *
2364 * The __GFP_THISNODE placement logic is really handled elsewhere,
2365 * by forcibly using a zonelist starting at a specified node, and by
2366 * (in get_page_from_freelist()) refusing to consider the zones for
2367 * any node on the zonelist except the first. By the time any such
2368 * calls get to this routine, we should just shut up and say 'yes'.
2369 *
2359 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 2370 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2360 * and do not allow allocations outside the current tasks cpuset. 2371 * and do not allow allocations outside the current tasks cpuset.
2361 * GFP_KERNEL allocations are not so marked, so can escape to the 2372 * GFP_KERNEL allocations are not so marked, so can escape to the
2362 * nearest mem_exclusive ancestor cpuset. 2373 * nearest enclosing mem_exclusive ancestor cpuset.
2363 * 2374 *
2364 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() 2375 * Scanning up parent cpusets requires callback_mutex. The
2365 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2376 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2366 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2377 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2367 * mems_allowed came up empty on the first pass over the zonelist. 2378 * current tasks mems_allowed came up empty on the first pass over
2368 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2379 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
2369 * short of memory, might require taking the callback_mutex mutex. 2380 * cpuset are short of memory, might require taking the callback_mutex
2381 * mutex.
2370 * 2382 *
2371 * The first call here from mm/page_alloc:get_page_from_freelist() 2383 * The first call here from mm/page_alloc:get_page_from_freelist()
2372 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so 2384 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
2373 * no allocation on a node outside the cpuset is allowed (unless in 2385 * so no allocation on a node outside the cpuset is allowed (unless
2374 * interrupt, of course). 2386 * in interrupt, of course).
2375 * 2387 *
2376 * The second pass through get_page_from_freelist() doesn't even call 2388 * The second pass through get_page_from_freelist() doesn't even call
2377 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() 2389 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
@@ -2384,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2384 * GFP_USER - only nodes in current tasks mems allowed ok. 2396 * GFP_USER - only nodes in current tasks mems allowed ok.
2385 * 2397 *
2386 * Rule: 2398 * Rule:
2387 * Don't call cpuset_zone_allowed() if you can't sleep, unless you 2399 * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
2388 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables 2400 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2389 * the code that might scan up ancestor cpusets and sleep. 2401 * the code that might scan up ancestor cpusets and sleep.
2390 **/ 2402 */
2391 2403
2392int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2404int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2393{ 2405{
2394 int node; /* node that zone z is on */ 2406 int node; /* node that zone z is on */
2395 const struct cpuset *cs; /* current cpuset ancestors */ 2407 const struct cpuset *cs; /* current cpuset ancestors */
@@ -2419,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2419 return allowed; 2431 return allowed;
2420} 2432}
2421 2433
2434/*
2435 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
2436 * @z: is this zone on an allowed node?
2437 * @gfp_mask: memory allocation flags
2438 *
2439 * If we're in interrupt, yes, we can always allocate.
2440 * If __GFP_THISNODE is set, yes, we can always allocate. If zone
2441 * z's node is in our tasks mems_allowed, yes. Otherwise, no.
2442 *
2443 * The __GFP_THISNODE placement logic is really handled elsewhere,
2444 * by forcibly using a zonelist starting at a specified node, and by
2445 * (in get_page_from_freelist()) refusing to consider the zones for
2446 * any node on the zonelist except the first. By the time any such
2447 * calls get to this routine, we should just shut up and say 'yes'.
2448 *
2449 * Unlike the cpuset_zone_allowed_softwall() variant, above,
2450 * this variant requires that the zone be in the current tasks
2451 * mems_allowed or that we're in interrupt. It does not scan up the
2452 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2453 * It never sleeps.
2454 */
2455
2456int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2457{
2458 int node; /* node that zone z is on */
2459
2460 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2461 return 1;
2462 node = zone_to_nid(z);
2463 if (node_isset(node, current->mems_allowed))
2464 return 1;
2465 return 0;
2466}
2467
2422/** 2468/**
2423 * cpuset_lock - lock out any changes to cpuset structures 2469 * cpuset_lock - lock out any changes to cpuset structures
2424 * 2470 *
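
The comments added to cpuset.c above split the zone check into a softwall variant (may scan ancestor cpusets and may sleep unless __GFP_HARDWALL is passed) and a hardwall variant (allowed in interrupt, with __GFP_THISNODE, or when the node is in the task's mems_allowed; never sleeps). The hardwall rule is simple enough to restate as a self-contained userspace sketch, with simplified stand-ins for the kernel's node masks and gfp flags:

/* Userspace sketch of the hardwall rule documented above: allowed if we
 * are in interrupt, if __GFP_THISNODE-style placement is requested, or
 * if the node is in the task's mems_allowed.  No ancestor scan. */
#include <stdbool.h>
#include <stdio.h>

#define GFP_THISNODE	(1u << 0)	/* stand-in for __GFP_THISNODE */

struct task_ctx {
	bool in_interrupt;
	unsigned long mems_allowed;	/* bit n set => node n allowed */
};

static bool zone_allowed_hardwall(const struct task_ctx *t, int node,
				  unsigned gfp_mask)
{
	if (t->in_interrupt || (gfp_mask & GFP_THISNODE))
		return true;
	return (t->mems_allowed >> node) & 1ul;
}

int main(void)
{
	struct task_ctx t = { .in_interrupt = false, .mems_allowed = 0x3 };

	printf("node 1: %d\n", zone_allowed_hardwall(&t, 1, 0));	/* 1 */
	printf("node 4: %d\n", zone_allowed_hardwall(&t, 4, 0));	/* 0 */
	printf("node 4 + THISNODE: %d\n",
	       zone_allowed_hardwall(&t, 4, GFP_THISNODE));		/* 1 */
	return 0;
}
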
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 66a0ea48751d..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
20#include <linux/delayacct.h> 20#include <linux/delayacct.h>
21 21
22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache; 23struct kmem_cache *delayacct_cache;
24 24
25static int __init delayacct_setup_disable(char *str) 25static int __init delayacct_setup_disable(char *str)
26{ 26{
@@ -41,7 +41,7 @@ void delayacct_init(void)
41 41
42void __delayacct_tsk_init(struct task_struct *tsk) 42void __delayacct_tsk_init(struct task_struct *tsk)
43{ 43{
44 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); 44 tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
45 if (tsk->delays) 45 if (tsk->delays)
46 spin_lock_init(&tsk->delays->lock); 46 spin_lock_init(&tsk->delays->lock);
47} 47}
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
140 return single_open(file, proc_dma_show, NULL); 140 return single_open(file, proc_dma_show, NULL);
141} 141}
142 142
143static struct file_operations proc_dma_operations = { 143static const struct file_operations proc_dma_operations = {
144 .open = proc_dma_open, 144 .open = proc_dma_open,
145 .read = seq_read, 145 .read = seq_read,
146 .llseek = seq_lseek, 146 .llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca3..35401720635b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/tty.h> 15#include <linux/tty.h>
16#include <linux/namespace.h> 16#include <linux/mnt_namespace.h>
17#include <linux/key.h> 17#include <linux/key.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -22,6 +22,7 @@
22#include <linux/file.h> 22#include <linux/file.h>
23#include <linux/binfmts.h> 23#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h>
25#include <linux/ptrace.h> 26#include <linux/ptrace.h>
26#include <linux/profile.h> 27#include <linux/profile.h>
27#include <linux/mount.h> 28#include <linux/mount.h>
@@ -48,7 +49,6 @@
48#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
49 50
50extern void sem_exit (void); 51extern void sem_exit (void);
51extern struct task_struct *child_reaper;
52 52
53static void exit_mm(struct task_struct * tsk); 53static void exit_mm(struct task_struct * tsk);
54 54
@@ -189,21 +189,18 @@ repeat:
189int session_of_pgrp(int pgrp) 189int session_of_pgrp(int pgrp)
190{ 190{
191 struct task_struct *p; 191 struct task_struct *p;
192 int sid = -1; 192 int sid = 0;
193 193
194 read_lock(&tasklist_lock); 194 read_lock(&tasklist_lock);
195 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 195
196 if (p->signal->session > 0) { 196 p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
197 sid = p->signal->session; 197 if (p == NULL)
198 goto out; 198 p = find_task_by_pid(pgrp);
199 } 199 if (p != NULL)
200 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 200 sid = process_session(p);
201 p = find_task_by_pid(pgrp); 201
202 if (p)
203 sid = p->signal->session;
204out:
205 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
206 203
207 return sid; 204 return sid;
208} 205}
209 206
@@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
225 || p->exit_state 222 || p->exit_state
226 || is_init(p->real_parent)) 223 || is_init(p->real_parent))
227 continue; 224 continue;
228 if (process_group(p->real_parent) != pgrp 225 if (process_group(p->real_parent) != pgrp &&
229 && p->real_parent->signal->session == p->signal->session) { 226 process_session(p->real_parent) == process_session(p)) {
230 ret = 0; 227 ret = 0;
231 break; 228 break;
232 } 229 }
@@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp)
260} 257}
261 258
262/** 259/**
263 * reparent_to_init - Reparent the calling kernel thread to the init task. 260 * reparent_to_init - Reparent the calling kernel thread to the init task
261 * of the pid space that the thread belongs to.
264 * 262 *
265 * If a kernel thread is launched as a result of a system call, or if 263 * If a kernel thread is launched as a result of a system call, or if
266 * it ever exits, it should generally reparent itself to init so that 264 * it ever exits, it should generally reparent itself to init so that
@@ -278,8 +276,8 @@ static void reparent_to_init(void)
278 ptrace_unlink(current); 276 ptrace_unlink(current);
279 /* Reparent to init */ 277 /* Reparent to init */
280 remove_parent(current); 278 remove_parent(current);
281 current->parent = child_reaper; 279 current->parent = child_reaper(current);
282 current->real_parent = child_reaper; 280 current->real_parent = child_reaper(current);
283 add_parent(current); 281 add_parent(current);
284 282
285 /* Set the exit signal to SIGCHLD so we signal init on exit */ 283 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
302{ 300{
303 struct task_struct *curr = current->group_leader; 301 struct task_struct *curr = current->group_leader;
304 302
305 if (curr->signal->session != session) { 303 if (process_session(curr) != session) {
306 detach_pid(curr, PIDTYPE_SID); 304 detach_pid(curr, PIDTYPE_SID);
307 curr->signal->session = session; 305 set_signal_session(curr->signal, session);
308 attach_pid(curr, PIDTYPE_SID, session); 306 attach_pid(curr, PIDTYPE_SID, session);
309 } 307 }
310 if (process_group(curr) != pgrp) { 308 if (process_group(curr) != pgrp) {
@@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
314 } 312 }
315} 313}
316 314
317void set_special_pids(pid_t session, pid_t pgrp) 315static void set_special_pids(pid_t session, pid_t pgrp)
318{ 316{
319 write_lock_irq(&tasklist_lock); 317 write_lock_irq(&tasklist_lock);
320 __set_special_pids(session, pgrp); 318 __set_special_pids(session, pgrp);
@@ -384,9 +382,7 @@ void daemonize(const char *name, ...)
384 exit_mm(current); 382 exit_mm(current);
385 383
386 set_special_pids(1, 1); 384 set_special_pids(1, 1);
387 mutex_lock(&tty_mutex); 385 proc_clear_tty(current);
388 current->signal->tty = NULL;
389 mutex_unlock(&tty_mutex);
390 386
391 /* Block and flush all signals */ 387 /* Block and flush all signals */
392 sigfillset(&blocked); 388 sigfillset(&blocked);
@@ -429,7 +425,7 @@ static void close_files(struct files_struct * files)
429 for (;;) { 425 for (;;) {
430 unsigned long set; 426 unsigned long set;
431 i = j * __NFDBITS; 427 i = j * __NFDBITS;
432 if (i >= fdt->max_fdset || i >= fdt->max_fds) 428 if (i >= fdt->max_fds)
433 break; 429 break;
434 set = fdt->open_fds->fds_bits[j++]; 430 set = fdt->open_fds->fds_bits[j++];
435 while (set) { 431 while (set) {
@@ -470,9 +466,7 @@ void fastcall put_files_struct(struct files_struct *files)
470 * you can free files immediately. 466 * you can free files immediately.
471 */ 467 */
472 fdt = files_fdtable(files); 468 fdt = files_fdtable(files);
473 if (fdt == &files->fdtab) 469 if (fdt != &files->fdtab)
474 fdt->free_files = files;
475 else
476 kmem_cache_free(files_cachep, files); 470 kmem_cache_free(files_cachep, files);
477 free_fdtable(fdt); 471 free_fdtable(fdt);
478 } 472 }
@@ -603,10 +597,6 @@ choose_new_parent(struct task_struct *p, struct task_struct *reaper)
603static void 597static void
604reparent_thread(struct task_struct *p, struct task_struct *father, int traced) 598reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
605{ 599{
606 /* We don't want people slaying init. */
607 if (p->exit_signal != -1)
608 p->exit_signal = SIGCHLD;
609
610 if (p->pdeath_signal) 600 if (p->pdeath_signal)
611 /* We already hold the tasklist_lock here. */ 601 /* We already hold the tasklist_lock here. */
612 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); 602 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
@@ -626,13 +616,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
626 p->parent = p->real_parent; 616 p->parent = p->real_parent;
627 add_parent(p); 617 add_parent(p);
628 618
629 /* If we'd notified the old parent about this child's death, 619 if (p->state == TASK_TRACED) {
630 * also notify the new parent.
631 */
632 if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
633 thread_group_empty(p))
634 do_notify_parent(p, p->exit_signal);
635 else if (p->state == TASK_TRACED) {
636 /* 620 /*
637 * If it was at a trace stop, turn it into 621 * If it was at a trace stop, turn it into
638 * a normal stop since it's no longer being 622 * a normal stop since it's no longer being
@@ -642,6 +626,23 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
642 } 626 }
643 } 627 }
644 628
629 /* If this is a threaded reparent there is no need to
630 * notify anyone anything has happened.
631 */
632 if (p->real_parent->group_leader == father->group_leader)
633 return;
634
635 /* We don't want people slaying init. */
636 if (p->exit_signal != -1)
637 p->exit_signal = SIGCHLD;
638
639 /* If we'd notified the old parent about this child's death,
640 * also notify the new parent.
641 */
642 if (!traced && p->exit_state == EXIT_ZOMBIE &&
643 p->exit_signal != -1 && thread_group_empty(p))
644 do_notify_parent(p, p->exit_signal);
645
645 /* 646 /*
646 * process group orphan check 647 * process group orphan check
647 * Case ii: Our child is in a different pgrp 648 * Case ii: Our child is in a different pgrp
@@ -649,10 +650,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
649 * outside, so the child pgrp is now orphaned. 650 * outside, so the child pgrp is now orphaned.
650 */ 651 */
651 if ((process_group(p) != process_group(father)) && 652 if ((process_group(p) != process_group(father)) &&
652 (p->signal->session == father->signal->session)) { 653 (process_session(p) == process_session(father))) {
653 int pgrp = process_group(p); 654 int pgrp = process_group(p);
654 655
655 if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { 656 if (will_become_orphaned_pgrp(pgrp, NULL) &&
657 has_stopped_jobs(pgrp)) {
656 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); 658 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
657 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); 659 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
658 } 660 }
@@ -663,7 +665,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
663 * When we die, we re-parent all our children. 665 * When we die, we re-parent all our children.
664 * Try to give them to another thread in our thread 666 * Try to give them to another thread in our thread
665 * group, and if no such member exists, give it to 667 * group, and if no such member exists, give it to
666 * the global child reaper process (ie "init") 668 * the child reaper process (ie "init") in our pid
669 * space.
667 */ 670 */
668static void 671static void
669forget_original_parent(struct task_struct *father, struct list_head *to_release) 672forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -674,7 +677,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
674 do { 677 do {
675 reaper = next_thread(reaper); 678 reaper = next_thread(reaper);
676 if (reaper == father) { 679 if (reaper == father) {
677 reaper = child_reaper; 680 reaper = child_reaper(father);
678 break; 681 break;
679 } 682 }
680 } while (reaper->exit_state); 683 } while (reaper->exit_state);
@@ -786,7 +789,7 @@ static void exit_notify(struct task_struct *tsk)
786 t = tsk->real_parent; 789 t = tsk->real_parent;
787 790
788 if ((process_group(t) != process_group(tsk)) && 791 if ((process_group(t) != process_group(tsk)) &&
789 (t->signal->session == tsk->signal->session) && 792 (process_session(t) == process_session(tsk)) &&
790 will_become_orphaned_pgrp(process_group(tsk), tsk) && 793 will_become_orphaned_pgrp(process_group(tsk), tsk) &&
791 has_stopped_jobs(process_group(tsk))) { 794 has_stopped_jobs(process_group(tsk))) {
792 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); 795 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
@@ -850,9 +853,7 @@ static void exit_notify(struct task_struct *tsk)
850fastcall NORET_TYPE void do_exit(long code) 853fastcall NORET_TYPE void do_exit(long code)
851{ 854{
852 struct task_struct *tsk = current; 855 struct task_struct *tsk = current;
853 struct taskstats *tidstats;
854 int group_dead; 856 int group_dead;
855 unsigned int mycpu;
856 857
857 profile_task_exit(tsk); 858 profile_task_exit(tsk);
858 859
@@ -862,8 +863,13 @@ fastcall NORET_TYPE void do_exit(long code)
862 panic("Aiee, killing interrupt handler!"); 863 panic("Aiee, killing interrupt handler!");
863 if (unlikely(!tsk->pid)) 864 if (unlikely(!tsk->pid))
864 panic("Attempted to kill the idle task!"); 865 panic("Attempted to kill the idle task!");
865 if (unlikely(tsk == child_reaper)) 866 if (unlikely(tsk == child_reaper(tsk))) {
866 panic("Attempted to kill init!"); 867 if (tsk->nsproxy->pid_ns != &init_pid_ns)
868 tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
869 else
870 panic("Attempted to kill init!");
871 }
872
867 873
868 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 874 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
869 current->ptrace_message = code; 875 current->ptrace_message = code;
@@ -890,8 +896,6 @@ fastcall NORET_TYPE void do_exit(long code)
890 current->comm, current->pid, 896 current->comm, current->pid,
891 preempt_count()); 897 preempt_count());
892 898
893 taskstats_exit_alloc(&tidstats, &mycpu);
894
895 acct_update_integrals(tsk); 899 acct_update_integrals(tsk);
896 if (tsk->mm) { 900 if (tsk->mm) {
897 update_hiwater_rss(tsk->mm); 901 update_hiwater_rss(tsk->mm);
@@ -911,8 +915,8 @@ fastcall NORET_TYPE void do_exit(long code)
911#endif 915#endif
912 if (unlikely(tsk->audit_context)) 916 if (unlikely(tsk->audit_context))
913 audit_free(tsk); 917 audit_free(tsk);
914 taskstats_exit_send(tsk, tidstats, group_dead, mycpu); 918
915 taskstats_exit_free(tidstats); 919 taskstats_exit(tsk, group_dead);
916 920
917 exit_mm(tsk); 921 exit_mm(tsk);
918 922
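
The reworked session_of_pgrp() near the top of the exit.c diff has a small, self-contained shape: look up a task by process-group id, fall back to a lookup by pid, and report the session (now 0 rather than -1 when nothing matches). A userspace sketch of that lookup order, with a hypothetical in-memory task table standing in for the kernel's pid hash:

/* Lookup order used by the rewritten session_of_pgrp(): prefer a task
 * found by pgid, fall back to pid, and return 0 when neither exists. */
#include <stdio.h>

struct task {
	int pid, pgid, session;
};

static struct task tasks[] = {
	{ .pid = 100, .pgid = 100, .session = 7 },
	{ .pid = 101, .pgid = 100, .session = 7 },
};
#define NTASKS (sizeof(tasks) / sizeof(tasks[0]))

static struct task *find_by_pgid(int pgrp)
{
	for (unsigned i = 0; i < NTASKS; i++)
		if (tasks[i].pgid == pgrp)
			return &tasks[i];
	return NULL;
}

static struct task *find_by_pid(int pid)
{
	for (unsigned i = 0; i < NTASKS; i++)
		if (tasks[i].pid == pid)
			return &tasks[i];
	return NULL;
}

static int session_of_pgrp(int pgrp)
{
	struct task *p = find_by_pgid(pgrp);

	if (!p)
		p = find_by_pid(pgrp);
	return p ? p->session : 0;
}

int main(void)
{
	printf("%d\n", session_of_pgrp(100));	/* 7 */
	printf("%d\n", session_of_pgrp(999));	/* 0 */
	return 0;
}
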
diff --git a/kernel/fork.c b/kernel/fork.c
index 8cdd3e72ba55..fc723e595cd5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/completion.h> 20#include <linux/completion.h>
21#include <linux/namespace.h> 21#include <linux/mnt_namespace.h>
22#include <linux/personality.h> 22#include <linux/personality.h>
23#include <linux/mempolicy.h> 23#include <linux/mempolicy.h>
24#include <linux/sem.h> 24#include <linux/sem.h>
@@ -36,6 +36,7 @@
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/jiffies.h> 37#include <linux/jiffies.h>
38#include <linux/futex.h> 38#include <linux/futex.h>
39#include <linux/task_io_accounting_ops.h>
39#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
40#include <linux/ptrace.h> 41#include <linux/ptrace.h>
41#include <linux/mount.h> 42#include <linux/mount.h>
@@ -82,26 +83,26 @@ int nr_processes(void)
82#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 83#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
83# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 84# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
84# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 85# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
85static kmem_cache_t *task_struct_cachep; 86static struct kmem_cache *task_struct_cachep;
86#endif 87#endif
87 88
88/* SLAB cache for signal_struct structures (tsk->signal) */ 89/* SLAB cache for signal_struct structures (tsk->signal) */
89static kmem_cache_t *signal_cachep; 90static struct kmem_cache *signal_cachep;
90 91
91/* SLAB cache for sighand_struct structures (tsk->sighand) */ 92/* SLAB cache for sighand_struct structures (tsk->sighand) */
92kmem_cache_t *sighand_cachep; 93struct kmem_cache *sighand_cachep;
93 94
94/* SLAB cache for files_struct structures (tsk->files) */ 95/* SLAB cache for files_struct structures (tsk->files) */
95kmem_cache_t *files_cachep; 96struct kmem_cache *files_cachep;
96 97
97/* SLAB cache for fs_struct structures (tsk->fs) */ 98/* SLAB cache for fs_struct structures (tsk->fs) */
98kmem_cache_t *fs_cachep; 99struct kmem_cache *fs_cachep;
99 100
100/* SLAB cache for vm_area_struct structures */ 101/* SLAB cache for vm_area_struct structures */
101kmem_cache_t *vm_area_cachep; 102struct kmem_cache *vm_area_cachep;
102 103
103/* SLAB cache for mm_struct structures (tsk->mm) */ 104/* SLAB cache for mm_struct structures (tsk->mm) */
104static kmem_cache_t *mm_cachep; 105static struct kmem_cache *mm_cachep;
105 106
106void free_task(struct task_struct *tsk) 107void free_task(struct task_struct *tsk)
107{ 108{
@@ -202,7 +203,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
202 struct mempolicy *pol; 203 struct mempolicy *pol;
203 204
204 down_write(&oldmm->mmap_sem); 205 down_write(&oldmm->mmap_sem);
205 flush_cache_mm(oldmm); 206 flush_cache_dup_mm(oldmm);
206 /* 207 /*
207 * Not linked in yet - no deadlock potential: 208 * Not linked in yet - no deadlock potential:
208 */ 209 */
@@ -237,7 +238,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
237 goto fail_nomem; 238 goto fail_nomem;
238 charge = len; 239 charge = len;
239 } 240 }
240 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 241 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
241 if (!tmp) 242 if (!tmp)
242 goto fail_nomem; 243 goto fail_nomem;
243 *tmp = *mpnt; 244 *tmp = *mpnt;
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
252 anon_vma_link(tmp); 253 anon_vma_link(tmp);
253 file = tmp->vm_file; 254 file = tmp->vm_file;
254 if (file) { 255 if (file) {
255 struct inode *inode = file->f_dentry->d_inode; 256 struct inode *inode = file->f_path.dentry->d_inode;
256 get_file(file); 257 get_file(file);
257 if (tmp->vm_flags & VM_DENYWRITE) 258 if (tmp->vm_flags & VM_DENYWRITE)
258 atomic_dec(&inode->i_writecount); 259 atomic_dec(&inode->i_writecount);
@@ -319,7 +320,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
319 320
320 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 321 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
321 322
322#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) 323#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
323#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 324#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
324 325
325#include <linux/init_task.h> 326#include <linux/init_task.h>
@@ -448,7 +449,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
448 tsk->vfork_done = NULL; 449 tsk->vfork_done = NULL;
449 complete(vfork_done); 450 complete(vfork_done);
450 } 451 }
451 if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { 452
453 /*
454 * If we're exiting normally, clear a user-space tid field if
455 * requested. We leave this alone when dying by signal, to leave
456 * the value intact in a core dump, and to save the unnecessary
457 * trouble otherwise. Userland only wants this done for a sys_exit.
458 */
459 if (tsk->clear_child_tid
460 && !(tsk->flags & PF_SIGNALED)
461 && atomic_read(&mm->mm_users) > 1) {
452 u32 __user * tidptr = tsk->clear_child_tid; 462 u32 __user * tidptr = tsk->clear_child_tid;
453 tsk->clear_child_tid = NULL; 463 tsk->clear_child_tid = NULL;
454 464
@@ -479,6 +489,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
479 489
480 memcpy(mm, oldmm, sizeof(*mm)); 490 memcpy(mm, oldmm, sizeof(*mm));
481 491
492 /* Initializing for Swap token stuff */
493 mm->token_priority = 0;
494 mm->last_interval = 0;
495
482 if (!mm_init(mm)) 496 if (!mm_init(mm))
483 goto fail_nomem; 497 goto fail_nomem;
484 498
@@ -542,6 +556,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
542 goto fail_nomem; 556 goto fail_nomem;
543 557
544good_mm: 558good_mm:
559 /* Initializing for Swap token stuff */
560 mm->token_priority = 0;
561 mm->last_interval = 0;
562
545 tsk->mm = mm; 563 tsk->mm = mm;
546 tsk->active_mm = mm; 564 tsk->active_mm = mm;
547 return 0; 565 return 0;
@@ -596,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
596 614
597static int count_open_files(struct fdtable *fdt) 615static int count_open_files(struct fdtable *fdt)
598{ 616{
599 int size = fdt->max_fdset; 617 int size = fdt->max_fds;
600 int i; 618 int i;
601 619
602 /* Find the last open fd */ 620 /* Find the last open fd */
@@ -613,7 +631,7 @@ static struct files_struct *alloc_files(void)
613 struct files_struct *newf; 631 struct files_struct *newf;
614 struct fdtable *fdt; 632 struct fdtable *fdt;
615 633
616 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 634 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
617 if (!newf) 635 if (!newf)
618 goto out; 636 goto out;
619 637
@@ -623,12 +641,10 @@ static struct files_struct *alloc_files(void)
623 newf->next_fd = 0; 641 newf->next_fd = 0;
624 fdt = &newf->fdtab; 642 fdt = &newf->fdtab;
625 fdt->max_fds = NR_OPEN_DEFAULT; 643 fdt->max_fds = NR_OPEN_DEFAULT;
626 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
627 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 644 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
628 fdt->open_fds = (fd_set *)&newf->open_fds_init; 645 fdt->open_fds = (fd_set *)&newf->open_fds_init;
629 fdt->fd = &newf->fd_array[0]; 646 fdt->fd = &newf->fd_array[0];
630 INIT_RCU_HEAD(&fdt->rcu); 647 INIT_RCU_HEAD(&fdt->rcu);
631 fdt->free_files = NULL;
632 fdt->next = NULL; 648 fdt->next = NULL;
633 rcu_assign_pointer(newf->fdt, fdt); 649 rcu_assign_pointer(newf->fdt, fdt);
634out: 650out:
@@ -644,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
644{ 660{
645 struct files_struct *newf; 661 struct files_struct *newf;
646 struct file **old_fds, **new_fds; 662 struct file **old_fds, **new_fds;
647 int open_files, size, i, expand; 663 int open_files, size, i;
648 struct fdtable *old_fdt, *new_fdt; 664 struct fdtable *old_fdt, *new_fdt;
649 665
650 *errorp = -ENOMEM; 666 *errorp = -ENOMEM;
@@ -655,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
655 spin_lock(&oldf->file_lock); 671 spin_lock(&oldf->file_lock);
656 old_fdt = files_fdtable(oldf); 672 old_fdt = files_fdtable(oldf);
657 new_fdt = files_fdtable(newf); 673 new_fdt = files_fdtable(newf);
658 size = old_fdt->max_fdset;
659 open_files = count_open_files(old_fdt); 674 open_files = count_open_files(old_fdt);
660 expand = 0;
661 675
662 /* 676 /*
663 * Check whether we need to allocate a larger fd array or fd set. 677 * Check whether we need to allocate a larger fd array and fd set.
664 * Note: we're not a clone task, so the open count won't change. 678 * Note: we're not a clone task, so the open count won't change.
665 */ 679 */
666 if (open_files > new_fdt->max_fdset) {
667 new_fdt->max_fdset = 0;
668 expand = 1;
669 }
670 if (open_files > new_fdt->max_fds) { 680 if (open_files > new_fdt->max_fds) {
671 new_fdt->max_fds = 0; 681 new_fdt->max_fds = 0;
672 expand = 1;
673 }
674
675 /* if the old fdset gets grown now, we'll only copy up to "size" fds */
676 if (expand) {
677 spin_unlock(&oldf->file_lock); 682 spin_unlock(&oldf->file_lock);
678 spin_lock(&newf->file_lock); 683 spin_lock(&newf->file_lock);
679 *errorp = expand_files(newf, open_files-1); 684 *errorp = expand_files(newf, open_files-1);
@@ -693,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
693 old_fds = old_fdt->fd; 698 old_fds = old_fdt->fd;
694 new_fds = new_fdt->fd; 699 new_fds = new_fdt->fd;
695 700
696 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); 701 memcpy(new_fdt->open_fds->fds_bits,
697 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); 702 old_fdt->open_fds->fds_bits, open_files/8);
703 memcpy(new_fdt->close_on_exec->fds_bits,
704 old_fdt->close_on_exec->fds_bits, open_files/8);
698 705
699 for (i = open_files; i != 0; i--) { 706 for (i = open_files; i != 0; i--) {
700 struct file *f = *old_fds++; 707 struct file *f = *old_fds++;
@@ -719,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 719 /* This is long word aligned thus could use an optimized version */ 726 /* This is long word aligned thus could use an optimized version */
720 memset(new_fds, 0, size); 727 memset(new_fds, 0, size);
721 728
722 if (new_fdt->max_fdset > open_files) { 729 if (new_fdt->max_fds > open_files) {
723 int left = (new_fdt->max_fdset-open_files)/8; 730 int left = (new_fdt->max_fds-open_files)/8;
724 int start = open_files / (8 * sizeof(unsigned long)); 731 int start = open_files / (8 * sizeof(unsigned long));
725 732
726 memset(&new_fdt->open_fds->fds_bits[start], 0, left); 733 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
727 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); 734 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
728 } 735 }
729 736
730out:
731 return newf; 737 return newf;
732 738
733out_release: 739out_release:
734 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
735 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
736 free_fd_array(new_fdt->fd, new_fdt->max_fds);
737 kmem_cache_free(files_cachep, newf); 740 kmem_cache_free(files_cachep, newf);
741out:
738 return NULL; 742 return NULL;
739} 743}
740 744
@@ -830,7 +834,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
830 if (clone_flags & CLONE_THREAD) { 834 if (clone_flags & CLONE_THREAD) {
831 atomic_inc(&current->signal->count); 835 atomic_inc(&current->signal->count);
832 atomic_inc(&current->signal->live); 836 atomic_inc(&current->signal->live);
833 taskstats_tgid_alloc(current);
834 return 0; 837 return 0;
835 } 838 }
836 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 839 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -1039,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1039 p->wchar = 0; /* I/O counter: bytes written */ 1042 p->wchar = 0; /* I/O counter: bytes written */
1040 p->syscr = 0; /* I/O counter: read syscalls */ 1043 p->syscr = 0; /* I/O counter: read syscalls */
1041 p->syscw = 0; /* I/O counter: write syscalls */ 1044 p->syscw = 0; /* I/O counter: write syscalls */
1045 task_io_accounting_init(p);
1042 acct_clear_integrals(p); 1046 acct_clear_integrals(p);
1043 1047
1044 p->it_virt_expires = cputime_zero; 1048 p->it_virt_expires = cputime_zero;
@@ -1243,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1243 if (thread_group_leader(p)) { 1247 if (thread_group_leader(p)) {
1244 p->signal->tty = current->signal->tty; 1248 p->signal->tty = current->signal->tty;
1245 p->signal->pgrp = process_group(current); 1249 p->signal->pgrp = process_group(current);
1246 p->signal->session = current->signal->session; 1250 set_signal_session(p->signal, process_session(current));
1247 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1251 attach_pid(p, PIDTYPE_PGID, process_group(p));
1248 attach_pid(p, PIDTYPE_SID, p->signal->session); 1252 attach_pid(p, PIDTYPE_SID, process_session(p));
1249 1253
1250 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1254 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1251 __get_cpu_var(process_counts)++; 1255 __get_cpu_var(process_counts)++;
@@ -1303,7 +1307,7 @@ fork_out:
1303 return ERR_PTR(retval); 1307 return ERR_PTR(retval);
1304} 1308}
1305 1309
1306struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1310noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1307{ 1311{
1308 memset(regs, 0, sizeof(struct pt_regs)); 1312 memset(regs, 0, sizeof(struct pt_regs));
1309 return regs; 1313 return regs;
@@ -1413,7 +1417,7 @@ long do_fork(unsigned long clone_flags,
1413#define ARCH_MIN_MMSTRUCT_ALIGN 0 1417#define ARCH_MIN_MMSTRUCT_ALIGN 0
1414#endif 1418#endif
1415 1419
1416static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 1420static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
1417{ 1421{
1418 struct sighand_struct *sighand = data; 1422 struct sighand_struct *sighand = data;
1419 1423
@@ -1509,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1509} 1513}
1510 1514
1511/* 1515/*
1512 * Unshare the namespace structure if it is being shared 1516 * Unshare the mnt_namespace structure if it is being shared
1513 */ 1517 */
1514static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) 1518static int unshare_mnt_namespace(unsigned long unshare_flags,
1519 struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
1515{ 1520{
1516 struct namespace *ns = current->nsproxy->namespace; 1521 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
1517 1522
1518 if ((unshare_flags & CLONE_NEWNS) && ns) { 1523 if ((unshare_flags & CLONE_NEWNS) && ns) {
1519 if (!capable(CAP_SYS_ADMIN)) 1524 if (!capable(CAP_SYS_ADMIN))
1520 return -EPERM; 1525 return -EPERM;
1521 1526
1522 *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); 1527 *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
1523 if (!*new_nsp) 1528 if (!*new_nsp)
1524 return -ENOMEM; 1529 return -ENOMEM;
1525 } 1530 }
@@ -1528,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
1528} 1533}
1529 1534
1530/* 1535/*
1531 * Unsharing of sighand for tasks created with CLONE_SIGHAND is not 1536 * Unsharing of sighand is not supported yet
1532 * supported yet
1533 */ 1537 */
1534static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) 1538static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1535{ 1539{
1536 struct sighand_struct *sigh = current->sighand; 1540 struct sighand_struct *sigh = current->sighand;
1537 1541
1538 if ((unshare_flags & CLONE_SIGHAND) && 1542 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1539 (sigh && atomic_read(&sigh->count) > 1))
1540 return -EINVAL; 1543 return -EINVAL;
1541 else 1544 else
1542 return 0; 1545 return 0;
@@ -1609,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1609{ 1612{
1610 int err = 0; 1613 int err = 0;
1611 struct fs_struct *fs, *new_fs = NULL; 1614 struct fs_struct *fs, *new_fs = NULL;
1612 struct namespace *ns, *new_ns = NULL; 1615 struct mnt_namespace *ns, *new_ns = NULL;
1613 struct sighand_struct *sigh, *new_sigh = NULL; 1616 struct sighand_struct *new_sigh = NULL;
1614 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1617 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1615 struct files_struct *fd, *new_fd = NULL; 1618 struct files_struct *fd, *new_fd = NULL;
1616 struct sem_undo_list *new_ulist = NULL; 1619 struct sem_undo_list *new_ulist = NULL;
@@ -1631,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1631 goto bad_unshare_out; 1634 goto bad_unshare_out;
1632 if ((err = unshare_fs(unshare_flags, &new_fs))) 1635 if ((err = unshare_fs(unshare_flags, &new_fs)))
1633 goto bad_unshare_cleanup_thread; 1636 goto bad_unshare_cleanup_thread;
1634 if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) 1637 if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
1635 goto bad_unshare_cleanup_fs; 1638 goto bad_unshare_cleanup_fs;
1636 if ((err = unshare_sighand(unshare_flags, &new_sigh))) 1639 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1637 goto bad_unshare_cleanup_ns; 1640 goto bad_unshare_cleanup_ns;
@@ -1655,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1655 } 1658 }
1656 } 1659 }
1657 1660
1658 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || 1661 if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
1659 new_uts || new_ipc) { 1662 new_uts || new_ipc) {
1660 1663
1661 task_lock(current); 1664 task_lock(current);
@@ -1672,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1672 } 1675 }
1673 1676
1674 if (new_ns) { 1677 if (new_ns) {
1675 ns = current->nsproxy->namespace; 1678 ns = current->nsproxy->mnt_ns;
1676 current->nsproxy->namespace = new_ns; 1679 current->nsproxy->mnt_ns = new_ns;
1677 new_ns = ns; 1680 new_ns = ns;
1678 } 1681 }
1679 1682
1680 if (new_sigh) {
1681 sigh = current->sighand;
1682 rcu_assign_pointer(current->sighand, new_sigh);
1683 new_sigh = sigh;
1684 }
1685
1686 if (new_mm) { 1683 if (new_mm) {
1687 mm = current->mm; 1684 mm = current->mm;
1688 active_mm = current->active_mm; 1685 active_mm = current->active_mm;
@@ -1740,7 +1737,7 @@ bad_unshare_cleanup_sigh:
1740 1737
1741bad_unshare_cleanup_ns: 1738bad_unshare_cleanup_ns:
1742 if (new_ns) 1739 if (new_ns)
1743 put_namespace(new_ns); 1740 put_mnt_ns(new_ns);
1744 1741
1745bad_unshare_cleanup_fs: 1742bad_unshare_cleanup_fs:
1746 if (new_fs) 1743 if (new_fs)
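The unshare_mnt_namespace() path above duplicates the mount namespace only when CLONE_NEWNS is requested and the caller holds CAP_SYS_ADMIN, returning -EPERM otherwise. A minimal userspace sketch of exercising that path follows; the program and its error handling are illustrative, not part of this patch.

/* Hypothetical userspace sketch: detach into a private mount namespace.
 * Requires CAP_SYS_ADMIN, matching the capable() check above. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>

int main(void)
{
	if (unshare(CLONE_NEWNS) < 0) {
		/* EPERM without CAP_SYS_ADMIN, as enforced in the kernel */
		fprintf(stderr, "unshare(CLONE_NEWNS): %s\n", strerror(errno));
		return 1;
	}
	/* Mounts made from here on no longer affect the parent namespace. */
	printf("now in a private mount namespace\n");
	return 0;
}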
diff --git a/kernel/futex.c b/kernel/futex.c
index 93ef30ba209f..5a737de857d3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
166/* 166/*
167 * Get parameters which are the keys for a futex. 167 * Get parameters which are the keys for a futex.
168 * 168 *
169 * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, 169 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
170 * offset_within_page). For private mappings, it's (uaddr, current->mm). 170 * offset_within_page). For private mappings, it's (uaddr, current->mm).
171 * We can usually work out the index without swapping in the page. 171 * We can usually work out the index without swapping in the page.
172 * 172 *
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
223 /* 223 /*
224 * Linear file mappings are also simple. 224 * Linear file mappings are also simple.
225 */ 225 */
226 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
282{ 282{
283 int ret; 283 int ret;
284 284
285 inc_preempt_count(); 285 pagefault_disable();
286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
287 dec_preempt_count(); 287 pagefault_enable();
288 288
289 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
290} 290}
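The pattern above — wrapping an *_inatomic user access in pagefault_disable()/pagefault_enable() instead of raw preempt-count twiddling — recurs throughout this patch. A minimal kernel-style sketch of the idiom; the helper name is illustrative, the called APIs are the real ones.

#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

/* Illustrative helper: read a u32 from user space without sleeping.
 * pagefault_disable() makes the fault handler bail out instead of
 * blocking, so __copy_from_user_inatomic() fails fast with -EFAULT. */
static int read_user_u32_atomic(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}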
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
324 if (likely(current->pi_state_cache)) 324 if (likely(current->pi_state_cache))
325 return 0; 325 return 0;
326 326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); 327 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
328 328
329 if (!pi_state) 329 if (!pi_state)
330 return -ENOMEM; 330 return -ENOMEM;
331 331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list); 332 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */ 333 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL; 334 pi_state->owner = NULL;
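kzalloc() here folds the kmalloc()+memset() pair into a single call; the same substitution appears again in do_kimage_alloc() further down. A tiny sketch of the before/after shape, with an illustrative struct name.

#include <linux/slab.h>

struct foo { int a; void *p; };		/* illustrative payload */

static struct foo *alloc_foo(void)
{
	/*
	 * Old style:
	 *	struct foo *f = kmalloc(sizeof(*f), GFP_KERNEL);
	 *	if (f)
	 *		memset(f, 0, sizeof(*f));
	 */

	/* New style: kzalloc() returns already-zeroed memory. */
	return kzalloc(sizeof(struct foo), GFP_KERNEL);
}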
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q)
553 * at the end of wake_up_all() does not prevent this store from 552 * at the end of wake_up_all() does not prevent this store from
554 * moving. 553 * moving.
555 */ 554 */
556 wmb(); 555 smp_wmb();
557 q->lock_ptr = NULL; 556 q->lock_ptr = NULL;
558} 557}
559 558
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
585 if (!(uval & FUTEX_OWNER_DIED)) { 584 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid; 585 newval = FUTEX_WAITERS | new_owner->pid;
587 586
588 inc_preempt_count(); 587 pagefault_disable();
589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 588 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
590 dec_preempt_count(); 589 pagefault_enable();
591 if (curval == -EFAULT) 590 if (curval == -EFAULT)
592 return -EFAULT; 591 return -EFAULT;
593 if (curval != uval) 592 if (curval != uval)
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
618 * There is no waiter, so we unlock the futex. The owner died 617 * There is no waiter, so we unlock the futex. The owner died
619 * bit has not to be preserved here. We are the owner: 618 * bit has not to be preserved here. We are the owner:
620 */ 619 */
621 inc_preempt_count(); 620 pagefault_disable();
622 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); 621 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
623 dec_preempt_count(); 622 pagefault_enable();
624 623
625 if (oldval == -EFAULT) 624 if (oldval == -EFAULT)
626 return oldval; 625 return oldval;
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1158 */ 1157 */
1159 newval = current->pid; 1158 newval = current->pid;
1160 1159
1161 inc_preempt_count(); 1160 pagefault_disable();
1162 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); 1161 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1163 dec_preempt_count(); 1162 pagefault_enable();
1164 1163
1165 if (unlikely(curval == -EFAULT)) 1164 if (unlikely(curval == -EFAULT))
1166 goto uaddr_faulted; 1165 goto uaddr_faulted;
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1183 uval = curval; 1182 uval = curval;
1184 newval = uval | FUTEX_WAITERS; 1183 newval = uval | FUTEX_WAITERS;
1185 1184
1186 inc_preempt_count(); 1185 pagefault_disable();
1187 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1186 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1188 dec_preempt_count(); 1187 pagefault_enable();
1189 1188
1190 if (unlikely(curval == -EFAULT)) 1189 if (unlikely(curval == -EFAULT))
1191 goto uaddr_faulted; 1190 goto uaddr_faulted;
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1215 newval = current->pid | 1214 newval = current->pid |
1216 FUTEX_OWNER_DIED | FUTEX_WAITERS; 1215 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1217 1216
1218 inc_preempt_count(); 1217 pagefault_disable();
1219 curval = futex_atomic_cmpxchg_inatomic(uaddr, 1218 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1220 uval, newval); 1219 uval, newval);
1221 dec_preempt_count(); 1220 pagefault_enable();
1222 1221
1223 if (unlikely(curval == -EFAULT)) 1222 if (unlikely(curval == -EFAULT))
1224 goto uaddr_faulted; 1223 goto uaddr_faulted;
@@ -1390,9 +1389,9 @@ retry_locked:
1390 * anyone else up: 1389 * anyone else up:
1391 */ 1390 */
1392 if (!(uval & FUTEX_OWNER_DIED)) { 1391 if (!(uval & FUTEX_OWNER_DIED)) {
1393 inc_preempt_count(); 1392 pagefault_disable();
1394 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1393 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1395 dec_preempt_count(); 1394 pagefault_enable();
1396 } 1395 }
1397 1396
1398 if (unlikely(uval == -EFAULT)) 1397 if (unlikely(uval == -EFAULT))
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
1493 return ret; 1492 return ret;
1494} 1493}
1495 1494
1496static struct file_operations futex_fops = { 1495static const struct file_operations futex_fops = {
1497 .release = futex_close, 1496 .release = futex_close,
1498 .poll = futex_poll, 1497 .poll = futex_poll,
1499}; 1498};
@@ -1529,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
1529 goto out; 1528 goto out;
1530 } 1529 }
1531 filp->f_op = &futex_fops; 1530 filp->f_op = &futex_fops;
1532 filp->f_vfsmnt = mntget(futex_mnt); 1531 filp->f_path.mnt = mntget(futex_mnt);
1533 filp->f_dentry = dget(futex_mnt->mnt_root); 1532 filp->f_path.dentry = dget(futex_mnt->mnt_root);
1534 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 1533 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1535 1534
1536 if (signal) { 1535 if (signal) {
1537 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); 1536 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
@@ -1858,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
1858 1857
1859static int __init init(void) 1858static int __init init(void)
1860{ 1859{
1861 unsigned int i; 1860 int i = register_filesystem(&futex_fs_type);
1861
1862 if (i)
1863 return i;
1862 1864
1863 register_filesystem(&futex_fs_type);
1864 futex_mnt = kern_mount(&futex_fs_type); 1865 futex_mnt = kern_mount(&futex_fs_type);
1866 if (IS_ERR(futex_mnt)) {
1867 unregister_filesystem(&futex_fs_type);
1868 return PTR_ERR(futex_mnt);
1869 }
1865 1870
1866 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 1871 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1867 INIT_LIST_HEAD(&futex_queues[i].chain); 1872 INIT_LIST_HEAD(&futex_queues[i].chain);
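The reworked init() above now checks both register_filesystem() and kern_mount() and unwinds the registration when the mount fails. The same register-then-mount-then-unregister-on-error shape, as a standalone sketch; the example_* names are illustrative and the file_system_type is assumed to be filled in elsewhere (name, get_sb, kill_sb).

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/err.h>

static struct vfsmount *example_mnt;			/* illustrative */
static struct file_system_type example_fs_type;	/* illustrative, assumed populated */

static int __init example_init(void)
{
	int err = register_filesystem(&example_fs_type);

	if (err)
		return err;

	example_mnt = kern_mount(&example_fs_type);
	if (IS_ERR(example_mnt)) {
		/* Undo the registration so nothing is left half-initialized. */
		unregister_filesystem(&example_fs_type);
		return PTR_ERR(example_mnt);
	}
	return 0;
}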
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ebfd24a41858..d27b25855743 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -517,10 +517,9 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
517 517
518 if (!handle) 518 if (!handle)
519 handle = handle_bad_irq; 519 handle = handle_bad_irq;
520 520 else if (desc->chip == &no_irq_chip) {
521 if (desc->chip == &no_irq_chip) {
522 printk(KERN_WARNING "Trying to install %sinterrupt handler " 521 printk(KERN_WARNING "Trying to install %sinterrupt handler "
523 "for IRQ%d\n", is_chained ? "chained " : " ", irq); 522 "for IRQ%d\n", is_chained ? "chained " : "", irq);
524 /* 523 /*
525 * Some ARM implementations install a handler for really dumb 524 * Some ARM implementations install a handler for really dumb
526 * interrupt hardware without setting an irq_chip. This worked 525 * interrupt hardware without setting an irq_chip. This worked
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a681912bc89a..aff1f0fabb0d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
54 .chip = &no_irq_chip, 54 .chip = &no_irq_chip,
55 .handle_irq = handle_bad_irq, 55 .handle_irq = handle_bad_irq,
56 .depth = 1, 56 .depth = 1,
57 .lock = SPIN_LOCK_UNLOCKED, 57 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
58#ifdef CONFIG_SMP 58#ifdef CONFIG_SMP
59 .affinity = CPU_MASK_ALL 59 .affinity = CPU_MASK_ALL
60#endif 60#endif
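Switching the static initializer from SPIN_LOCK_UNLOCKED to __SPIN_LOCK_UNLOCKED(irq_desc->lock) gives lockdep a distinct name and key for the lock. A minimal sketch of the two initializer forms; my_lock is an illustrative name.

#include <linux/spinlock.h>

/* Old style: anonymous initializer, indistinguishable to lockdep. */
static spinlock_t legacy_lock = SPIN_LOCK_UNLOCKED;

/* New style: the initializer carries a name, so lockdep output is readable.
 * DEFINE_SPINLOCK(my_lock) is the usual shorthand for the same thing. */
static spinlock_t my_lock = __SPIN_LOCK_UNLOCKED(my_lock);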
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a352667007c..61f5c717a8f5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
54 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
55 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
56 56
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status))
58 return -EIO; 59 return -EIO;
59 60
60 err = cpumask_parse_user(buffer, count, new_value); 61 err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 543ea2e5ad93..9d8c79b48823 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -176,7 +176,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
176 176
177int noirqdebug __read_mostly; 177int noirqdebug __read_mostly;
178 178
179int __init noirqdebug_setup(char *str) 179int noirqdebug_setup(char *str)
180{ 180{
181 noirqdebug = 1; 181 noirqdebug = 1;
182 printk(KERN_INFO "IRQ lockup detection disabled\n"); 182 printk(KERN_INFO "IRQ lockup detection disabled\n");
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2b..6f294ff4f9ee 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h>
23 24
24#include <asm/sections.h> 25#include <asm/sections.h>
25 26
@@ -30,14 +31,14 @@
30#endif 31#endif
31 32
32/* These will be re-linked against their real values during the second link stage */ 33/* These will be re-linked against their real values during the second link stage */
33extern unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
34extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); 35extern const unsigned long kallsyms_num_syms __attribute__((weak));
35extern u8 kallsyms_names[] __attribute__((weak)); 36extern const u8 kallsyms_names[] __attribute__((weak));
36 37
37extern u8 kallsyms_token_table[] __attribute__((weak)); 38extern const u8 kallsyms_token_table[] __attribute__((weak));
38extern u16 kallsyms_token_index[] __attribute__((weak)); 39extern const u16 kallsyms_token_index[] __attribute__((weak));
39 40
40extern unsigned long kallsyms_markers[] __attribute__((weak)); 41extern const unsigned long kallsyms_markers[] __attribute__((weak));
41 42
42static inline int is_kernel_inittext(unsigned long addr) 43static inline int is_kernel_inittext(unsigned long addr)
43{ 44{
@@ -83,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
83static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 84static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
84{ 85{
85 int len, skipped_first = 0; 86 int len, skipped_first = 0;
86 u8 *tptr, *data; 87 const u8 *tptr, *data;
87 88
88 /* get the compressed symbol length from the first symbol byte */ 89 /* get the compressed symbol length from the first symbol byte */
89 data = &kallsyms_names[off]; 90 data = &kallsyms_names[off];
@@ -131,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
131 * kallsyms array */ 132 * kallsyms array */
132static unsigned int get_symbol_offset(unsigned long pos) 133static unsigned int get_symbol_offset(unsigned long pos)
133{ 134{
134 u8 *name; 135 const u8 *name;
135 int i; 136 int i;
136 137
137 /* use the closest marker we have. We have markers every 256 positions, 138 /* use the closest marker we have. We have markers every 256 positions,
@@ -301,13 +302,6 @@ struct kallsym_iter
301 char name[KSYM_NAME_LEN+1]; 302 char name[KSYM_NAME_LEN+1];
302}; 303};
303 304
304/* Only label it "global" if it is exported. */
305static void upcase_if_global(struct kallsym_iter *iter)
306{
307 if (is_exported(iter->name, iter->owner))
308 iter->type += 'A' - 'a';
309}
310
311static int get_ksymbol_mod(struct kallsym_iter *iter) 305static int get_ksymbol_mod(struct kallsym_iter *iter)
312{ 306{
313 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 307 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
316 if (iter->owner == NULL) 310 if (iter->owner == NULL)
317 return 0; 311 return 0;
318 312
319 upcase_if_global(iter); 313 /* Label it "global" if it is exported, "local" if not exported. */
314 iter->type = is_exported(iter->name, iter->owner)
315 ? toupper(iter->type) : tolower(iter->type);
316
320 return 1; 317 return 1;
321} 318}
322 319
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
401 return 0; 398 return 0;
402} 399}
403 400
404static struct seq_operations kallsyms_op = { 401static const struct seq_operations kallsyms_op = {
405 .start = s_start, 402 .start = s_start,
406 .next = s_next, 403 .next = s_next,
407 .stop = s_stop, 404 .stop = s_stop,
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
436 return seq_release(inode, file); 433 return seq_release(inode, file);
437} 434}
438 435
439static struct file_operations kallsyms_operations = { 436static const struct file_operations kallsyms_operations = {
440 .open = kallsyms_open, 437 .open = kallsyms_open,
441 .read = seq_read, 438 .read = seq_read,
442 .llseek = seq_lseek, 439 .llseek = seq_lseek,
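The upcase_if_global() helper is replaced by an unconditional toupper()/tolower() on the symbol type, so exported module symbols print as upper-case letters and unexported ones as lower-case, mirroring nm(1). A compact sketch of that mapping; the helper name is illustrative.

#include <linux/ctype.h>

/* Illustrative: derive the /proc/kallsyms type letter for a module symbol.
 * Exported symbols become 'T'/'D'/..., unexported ones 't'/'d'/... */
static char ksym_type_letter(char raw_type, int exported)
{
	return exported ? toupper(raw_type) : tolower(raw_type);
}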
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f4..2a59c8a01ae0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/elf.h>
24#include <linux/elfcore.h>
23 25
24#include <asm/page.h> 26#include <asm/page.h>
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
108 110
109 /* Allocate a controlling structure */ 111 /* Allocate a controlling structure */
110 result = -ENOMEM; 112 result = -ENOMEM;
111 image = kmalloc(sizeof(*image), GFP_KERNEL); 113 image = kzalloc(sizeof(*image), GFP_KERNEL);
112 if (!image) 114 if (!image)
113 goto out; 115 goto out;
114 116
115 memset(image, 0, sizeof(*image));
116 image->head = 0; 117 image->head = 0;
117 image->entry = &image->head; 118 image->entry = &image->head;
118 image->last_entry = &image->head; 119 image->last_entry = &image->head;
@@ -851,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image,
851 memset(ptr + uchunk, 0, mchunk - uchunk); 852 memset(ptr + uchunk, 0, mchunk - uchunk);
852 } 853 }
853 result = copy_from_user(ptr, buf, uchunk); 854 result = copy_from_user(ptr, buf, uchunk);
855 kexec_flush_icache_page(page);
854 kunmap(page); 856 kunmap(page);
855 if (result) { 857 if (result) {
856 result = (result < 0) ? result : -EIO; 858 result = (result < 0) ? result : -EIO;
@@ -1067,6 +1069,60 @@ void crash_kexec(struct pt_regs *regs)
1067 } 1069 }
1068} 1070}
1069 1071
1072static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1073 size_t data_len)
1074{
1075 struct elf_note note;
1076
1077 note.n_namesz = strlen(name) + 1;
1078 note.n_descsz = data_len;
1079 note.n_type = type;
1080 memcpy(buf, &note, sizeof(note));
1081 buf += (sizeof(note) + 3)/4;
1082 memcpy(buf, name, note.n_namesz);
1083 buf += (note.n_namesz + 3)/4;
1084 memcpy(buf, data, note.n_descsz);
1085 buf += (note.n_descsz + 3)/4;
1086
1087 return buf;
1088}
1089
1090static void final_note(u32 *buf)
1091{
1092 struct elf_note note;
1093
1094 note.n_namesz = 0;
1095 note.n_descsz = 0;
1096 note.n_type = 0;
1097 memcpy(buf, &note, sizeof(note));
1098}
1099
1100void crash_save_cpu(struct pt_regs *regs, int cpu)
1101{
1102 struct elf_prstatus prstatus;
1103 u32 *buf;
1104
1105 if ((cpu < 0) || (cpu >= NR_CPUS))
1106 return;
1107
1108 /* Using ELF notes here is opportunistic.
1109 * I need a well defined structure format
1110 * for the data I pass, and I need tags
1111 * on the data to indicate what information I have
1112 * squirrelled away. ELF notes happen to provide
1113 * all of that, so there is no need to invent something new.
1114 */
1115 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1116 if (!buf)
1117 return;
1118 memset(&prstatus, 0, sizeof(prstatus));
1119 prstatus.pr_pid = current->pid;
1120 elf_core_copy_regs(&prstatus.pr_reg, regs);
1121 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
1122 sizeof(prstatus));
1123 final_note(buf);
1124}
1125
1070static int __init crash_notes_memory_init(void) 1126static int __init crash_notes_memory_init(void)
1071{ 1127{
1072 /* Allocate memory for saving cpu registers. */ 1128 /* Allocate memory for saving cpu registers. */
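append_elf_note() packs each note as header, name and descriptor, rounding every piece up to a 4-byte boundary ((len + 3)/4 in u32 units), and final_note() terminates the list with an all-zero header. A userspace sketch of the same layout arithmetic; the buffer size and the note payload are illustrative.

/* Illustrative userspace sketch of the ELF note packing used above. */
#include <elf.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t *append_note(uint32_t *buf, const char *name,
			     uint32_t type, const void *data, size_t len)
{
	Elf64_Nhdr note = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = len,
		.n_type   = type,
	};

	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3) / 4;		/* round header to u32 units */
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3) / 4;		/* round the name up too */
	memcpy(buf, data, len);
	buf += (len + 3) / 4;			/* and the descriptor */
	return buf;
}

int main(void)
{
	uint32_t buf[64] = { 0 };		/* illustrative buffer */
	uint32_t payload = 0xdeadbeef;		/* illustrative descriptor */
	uint32_t *end = append_note(buf, "CORE", NT_PRSTATUS,
				    &payload, sizeof(payload));

	/* final_note() equivalent: the zeroed header already sitting at *end
	 * (buf was zero-initialized) terminates the note list. */
	printf("note occupies %zu bytes\n", (end - buf) * sizeof(uint32_t));
	return 0;
}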
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2b76dee28496..3a7379aa31ca 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/namespace.h> 28#include <linux/mnt_namespace.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
30#include <linux/file.h> 30#include <linux/file.h>
31#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
114#endif /* CONFIG_KMOD */ 114#endif /* CONFIG_KMOD */
115 115
116struct subprocess_info { 116struct subprocess_info {
117 struct work_struct work;
117 struct completion *complete; 118 struct completion *complete;
118 char *path; 119 char *path;
119 char **argv; 120 char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
221} 222}
222 223
223/* This is run by khelper thread */ 224/* This is run by khelper thread */
224static void __call_usermodehelper(void *data) 225static void __call_usermodehelper(struct work_struct *work)
225{ 226{
226 struct subprocess_info *sub_info = data; 227 struct subprocess_info *sub_info =
228 container_of(work, struct subprocess_info, work);
227 pid_t pid; 229 pid_t pid;
228 int wait = sub_info->wait; 230 int wait = sub_info->wait;
229 231
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
264{ 266{
265 DECLARE_COMPLETION_ONSTACK(done); 267 DECLARE_COMPLETION_ONSTACK(done);
266 struct subprocess_info sub_info = { 268 struct subprocess_info sub_info = {
269 .work = __WORK_INITIALIZER(sub_info.work,
270 __call_usermodehelper),
267 .complete = &done, 271 .complete = &done,
268 .path = path, 272 .path = path,
269 .argv = argv, 273 .argv = argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
272 .wait = wait, 276 .wait = wait,
273 .retval = 0, 277 .retval = 0,
274 }; 278 };
275 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
276 279
277 if (!khelper_wq) 280 if (!khelper_wq)
278 return -EBUSY; 281 return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
280 if (path[0] == '\0') 283 if (path[0] == '\0')
281 return 0; 284 return 0;
282 285
283 queue_work(khelper_wq, &work); 286 queue_work(khelper_wq, &sub_info.work);
284 wait_for_completion(&done); 287 wait_for_completion(&done);
285 return sub_info.retval; 288 return sub_info.retval;
286} 289}
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
291{ 294{
292 DECLARE_COMPLETION(done); 295 DECLARE_COMPLETION(done);
293 struct subprocess_info sub_info = { 296 struct subprocess_info sub_info = {
297 .work = __WORK_INITIALIZER(sub_info.work,
298 __call_usermodehelper),
294 .complete = &done, 299 .complete = &done,
295 .path = path, 300 .path = path,
296 .argv = argv, 301 .argv = argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
298 .retval = 0, 303 .retval = 0,
299 }; 304 };
300 struct file *f; 305 struct file *f;
301 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
302 306
303 if (!khelper_wq) 307 if (!khelper_wq)
304 return -EBUSY; 308 return -EBUSY;
@@ -318,7 +322,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
318 } 322 }
319 sub_info.stdin = f; 323 sub_info.stdin = f;
320 324
321 queue_work(khelper_wq, &work); 325 queue_work(khelper_wq, &sub_info.work);
322 wait_for_completion(&done); 326 wait_for_completion(&done);
323 return sub_info.retval; 327 return sub_info.retval;
324} 328}
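This is the new workqueue convention applied throughout the patch: the work_struct is embedded in the payload structure, and the handler recovers its context with container_of() instead of taking a void * argument. A self-contained kernel-style sketch of the pattern; the frob_* names are illustrative.

#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/kernel.h>

struct frob_request {			/* illustrative payload */
	struct work_struct work;	/* embedded, not a separate DECLARE_WORK */
	int value;
};

static void frob_worker(struct work_struct *work)
{
	/* Recover the enclosing structure from the work pointer. */
	struct frob_request *req = container_of(work, struct frob_request, work);

	printk(KERN_DEBUG "frobbing %d\n", req->value);
	kfree(req);
}

static int queue_frob(int value)
{
	struct frob_request *req = kzalloc(sizeof(*req), GFP_KERNEL);

	if (!req)
		return -ENOMEM;
	req->value = value;
	INIT_WORK(&req->work, frob_worker);	/* two-argument INIT_WORK */
	schedule_work(&req->work);
	return 0;
}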
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e0..17ec4afb0994 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
41#include <linux/freezer.h>
41#include <asm-generic/sections.h> 42#include <asm-generic/sections.h>
42#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
43#include <asm/errno.h> 44#include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
83 kprobe_opcode_t *insns; /* Page of instruction slots */ 84 kprobe_opcode_t *insns; /* Page of instruction slots */
84 char slot_used[INSNS_PER_PAGE]; 85 char slot_used[INSNS_PER_PAGE];
85 int nused; 86 int nused;
87 int ngarbage;
86}; 88};
87 89
88static struct hlist_head kprobe_insn_pages; 90static struct hlist_head kprobe_insn_pages;
91static int kprobe_garbage_slots;
92static int collect_garbage_slots(void);
93
94static int __kprobes check_safety(void)
95{
96 int ret = 0;
97#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
98 ret = freeze_processes();
99 if (ret == 0) {
100 struct task_struct *p, *q;
101 do_each_thread(p, q) {
102 if (p != current && p->state == TASK_RUNNING &&
103 p->pid != 0) {
104 printk("Check failed: %s is running\n",p->comm);
105 ret = -1;
106 goto loop_end;
107 }
108 } while_each_thread(p, q);
109 }
110loop_end:
111 thaw_processes();
112#else
113 synchronize_sched();
114#endif
115 return ret;
116}
89 117
90/** 118/**
91 * get_insn_slot() - Find a slot on an executable page for an instruction. 119 * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
96 struct kprobe_insn_page *kip; 124 struct kprobe_insn_page *kip;
97 struct hlist_node *pos; 125 struct hlist_node *pos;
98 126
127 retry:
99 hlist_for_each(pos, &kprobe_insn_pages) { 128 hlist_for_each(pos, &kprobe_insn_pages) {
100 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 129 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
101 if (kip->nused < INSNS_PER_PAGE) { 130 if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
112 } 141 }
113 } 142 }
114 143
115 /* All out of space. Need to allocate a new page. Use slot 0.*/ 144 /* If there are any garbage slots, collect it and try again. */
145 if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
146 goto retry;
147 }
148 /* All out of space. Need to allocate a new page. Use slot 0. */
116 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 149 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
117 if (!kip) { 150 if (!kip) {
118 return NULL; 151 return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
133 memset(kip->slot_used, 0, INSNS_PER_PAGE); 166 memset(kip->slot_used, 0, INSNS_PER_PAGE);
134 kip->slot_used[0] = 1; 167 kip->slot_used[0] = 1;
135 kip->nused = 1; 168 kip->nused = 1;
169 kip->ngarbage = 0;
136 return kip->insns; 170 return kip->insns;
137} 171}
138 172
139void __kprobes free_insn_slot(kprobe_opcode_t *slot) 173/* Return 1 if all garbages are collected, otherwise 0. */
174static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
175{
176 kip->slot_used[idx] = 0;
177 kip->nused--;
178 if (kip->nused == 0) {
179 /*
180 * Page is no longer in use. Free it unless
181 * it's the last one. We keep the last one
182 * so as not to have to set it up again the
183 * next time somebody inserts a probe.
184 */
185 hlist_del(&kip->hlist);
186 if (hlist_empty(&kprobe_insn_pages)) {
187 INIT_HLIST_NODE(&kip->hlist);
188 hlist_add_head(&kip->hlist,
189 &kprobe_insn_pages);
190 } else {
191 module_free(NULL, kip->insns);
192 kfree(kip);
193 }
194 return 1;
195 }
196 return 0;
197}
198
199static int __kprobes collect_garbage_slots(void)
200{
201 struct kprobe_insn_page *kip;
202 struct hlist_node *pos, *next;
203
 204 /* Ensure no one is preempted on the garbage slots */
205 if (check_safety() != 0)
206 return -EAGAIN;
207
208 hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
209 int i;
210 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
211 if (kip->ngarbage == 0)
212 continue;
213 kip->ngarbage = 0; /* we will collect all garbages */
214 for (i = 0; i < INSNS_PER_PAGE; i++) {
215 if (kip->slot_used[i] == -1 &&
216 collect_one_slot(kip, i))
217 break;
218 }
219 }
220 kprobe_garbage_slots = 0;
221 return 0;
222}
223
224void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
140{ 225{
141 struct kprobe_insn_page *kip; 226 struct kprobe_insn_page *kip;
142 struct hlist_node *pos; 227 struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
146 if (kip->insns <= slot && 231 if (kip->insns <= slot &&
147 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 232 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
148 int i = (slot - kip->insns) / MAX_INSN_SIZE; 233 int i = (slot - kip->insns) / MAX_INSN_SIZE;
149 kip->slot_used[i] = 0; 234 if (dirty) {
150 kip->nused--; 235 kip->slot_used[i] = -1;
151 if (kip->nused == 0) { 236 kip->ngarbage++;
152 /* 237 } else {
153 * Page is no longer in use. Free it unless 238 collect_one_slot(kip, i);
154 * it's the last one. We keep the last one
155 * so as not to have to set it up again the
156 * next time somebody inserts a probe.
157 */
158 hlist_del(&kip->hlist);
159 if (hlist_empty(&kprobe_insn_pages)) {
160 INIT_HLIST_NODE(&kip->hlist);
161 hlist_add_head(&kip->hlist,
162 &kprobe_insn_pages);
163 } else {
164 module_free(NULL, kip->insns);
165 kfree(kip);
166 }
167 } 239 }
168 return; 240 break;
169 } 241 }
170 } 242 }
243 if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
244 collect_garbage_slots();
245 }
171} 246}
172#endif 247#endif
173 248
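free_insn_slot() now takes a dirty flag: slots that may still be executed are only marked (-1 in slot_used) and counted, and collect_garbage_slots() reclaims them later once check_safety() says no task can be preempted inside them. A simplified userspace analog of the mark-then-collect idea; this is just the shape of the scheme, not the kernel code.

/* Simplified analog of the deferred slot reclamation above (userspace C). */
#include <stdio.h>

#define NSLOTS 8

static signed char slot_used[NSLOTS];	/* 0 = free, 1 = in use, -1 = dirty */
static int ngarbage;

static void free_slot(int idx, int dirty)
{
	if (dirty) {
		/* Might still be referenced: just mark it and count it. */
		slot_used[idx] = -1;
		ngarbage++;
	} else {
		slot_used[idx] = 0;
	}
}

static void collect_garbage(void)
{
	/* In the kernel this runs only after check_safety() succeeds. */
	for (int i = 0; i < NSLOTS; i++)
		if (slot_used[i] == -1)
			slot_used[i] = 0;
	ngarbage = 0;
}

int main(void)
{
	slot_used[3] = 1;
	free_slot(3, 1);		/* deferred: slot 3 becomes dirty */
	printf("garbage before collect: %d\n", ngarbage);
	collect_garbage();
	printf("garbage after collect: %d\n", ngarbage);
	return 0;
}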
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e8..1db8c72d0d38 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
31 /* Result passed back to kthread_create() from keventd. */ 31 /* Result passed back to kthread_create() from keventd. */
32 struct task_struct *result; 32 struct task_struct *result;
33 struct completion done; 33 struct completion done;
34
35 struct work_struct work;
34}; 36};
35 37
36struct kthread_stop_info 38struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
111} 113}
112 114
113/* We are keventd: create a thread. */ 115/* We are keventd: create a thread. */
114static void keventd_create_kthread(void *_create) 116static void keventd_create_kthread(struct work_struct *work)
115{ 117{
116 struct kthread_create_info *create = _create; 118 struct kthread_create_info *create =
119 container_of(work, struct kthread_create_info, work);
117 int pid; 120 int pid;
118 121
119 /* We want our own signal handler (we take no signals by default). */ 122 /* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
154 ...) 157 ...)
155{ 158{
156 struct kthread_create_info create; 159 struct kthread_create_info create;
157 DECLARE_WORK(work, keventd_create_kthread, &create);
158 160
159 create.threadfn = threadfn; 161 create.threadfn = threadfn;
160 create.data = data; 162 create.data = data;
161 init_completion(&create.started); 163 init_completion(&create.started);
162 init_completion(&create.done); 164 init_completion(&create.done);
165 INIT_WORK(&create.work, keventd_create_kthread);
163 166
164 /* 167 /*
165 * The workqueue needs to start up first: 168 * The workqueue needs to start up first:
166 */ 169 */
167 if (!helper_wq) 170 if (!helper_wq)
168 work.func(work.data); 171 create.work.func(&create.work);
169 else { 172 else {
170 queue_work(helper_wq, &work); 173 queue_work(helper_wq, &create.work);
171 wait_for_completion(&create.done); 174 wait_for_completion(&create.done);
172 } 175 }
173 if (!IS_ERR(create.result)) { 176 if (!IS_ERR(create.result)) {
diff --git a/kernel/latency.c b/kernel/latency.c
index 258f2555abbc..e63fcacb61a7 100644
--- a/kernel/latency.c
+++ b/kernel/latency.c
@@ -36,6 +36,7 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/jiffies.h>
39#include <asm/atomic.h> 40#include <asm/atomic.h>
40 41
41struct latency_info { 42struct latency_info {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c9fefdb1a7db..509efd49540f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,13 +43,49 @@
43#include "lockdep_internals.h" 43#include "lockdep_internals.h"
44 44
45/* 45/*
46 * hash_lock: protects the lockdep hashes and class/list/hash allocators. 46 * lockdep_lock: protects the lockdep graph, the hashes and the
47 * class/list/hash allocators.
47 * 48 *
48 * This is one of the rare exceptions where it's justified 49 * This is one of the rare exceptions where it's justified
49 * to use a raw spinlock - we really dont want the spinlock 50 * to use a raw spinlock - we really dont want the spinlock
50 * code to recurse back into the lockdep code. 51 * code to recurse back into the lockdep code...
51 */ 52 */
52static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 53static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
54
55static int graph_lock(void)
56{
57 __raw_spin_lock(&lockdep_lock);
58 /*
59 * Make sure that if another CPU detected a bug while
60 * walking the graph we dont change it (while the other
61 * CPU is busy printing out stuff with the graph lock
62 * dropped already)
63 */
64 if (!debug_locks) {
65 __raw_spin_unlock(&lockdep_lock);
66 return 0;
67 }
68 return 1;
69}
70
71static inline int graph_unlock(void)
72{
73 __raw_spin_unlock(&lockdep_lock);
74 return 0;
75}
76
77/*
78 * Turn lock debugging off and return with 0 if it was off already,
79 * and also release the graph lock:
80 */
81static inline int debug_locks_off_graph_unlock(void)
82{
83 int ret = debug_locks_off();
84
85 __raw_spin_unlock(&lockdep_lock);
86
87 return ret;
88}
53 89
54static int lockdep_initialized; 90static int lockdep_initialized;
55 91
@@ -57,14 +93,15 @@ unsigned long nr_list_entries;
57static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; 93static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
58 94
59/* 95/*
60 * Allocate a lockdep entry. (assumes hash_lock held, returns 96 * Allocate a lockdep entry. (assumes the graph_lock held, returns
61 * with NULL on failure) 97 * with NULL on failure)
62 */ 98 */
63static struct lock_list *alloc_list_entry(void) 99static struct lock_list *alloc_list_entry(void)
64{ 100{
65 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { 101 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
66 __raw_spin_unlock(&hash_lock); 102 if (!debug_locks_off_graph_unlock())
67 debug_locks_off(); 103 return NULL;
104
68 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 105 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
69 printk("turning off the locking correctness validator.\n"); 106 printk("turning off the locking correctness validator.\n");
70 return NULL; 107 return NULL;
@@ -140,21 +177,12 @@ void lockdep_on(void)
140 177
141EXPORT_SYMBOL(lockdep_on); 178EXPORT_SYMBOL(lockdep_on);
142 179
143int lockdep_internal(void)
144{
145 return current->lockdep_recursion != 0;
146}
147
148EXPORT_SYMBOL(lockdep_internal);
149
150/* 180/*
151 * Debugging switches: 181 * Debugging switches:
152 */ 182 */
153 183
154#define VERBOSE 0 184#define VERBOSE 0
155#ifdef VERBOSE 185#define VERY_VERBOSE 0
156# define VERY_VERBOSE 0
157#endif
158 186
159#if VERBOSE 187#if VERBOSE
160# define HARDIRQ_VERBOSE 1 188# define HARDIRQ_VERBOSE 1
@@ -179,8 +207,8 @@ static int class_filter(struct lock_class *class)
179 !strcmp(class->name, "&struct->lockfield")) 207 !strcmp(class->name, "&struct->lockfield"))
180 return 1; 208 return 1;
181#endif 209#endif
182 /* Allow everything else. 0 would be filter everything else */ 210 /* Filter everything else. 1 would be to allow everything else */
183 return 1; 211 return 0;
184} 212}
185#endif 213#endif
186 214
@@ -214,7 +242,7 @@ static int softirq_verbose(struct lock_class *class)
214 242
215/* 243/*
216 * Stack-trace: tightly packed array of stack backtrace 244 * Stack-trace: tightly packed array of stack backtrace
217 * addresses. Protected by the hash_lock. 245 * addresses. Protected by the graph_lock.
218 */ 246 */
219unsigned long nr_stack_trace_entries; 247unsigned long nr_stack_trace_entries;
220static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; 248static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
@@ -228,25 +256,20 @@ static int save_trace(struct stack_trace *trace)
228 trace->skip = 3; 256 trace->skip = 3;
229 trace->all_contexts = 0; 257 trace->all_contexts = 0;
230 258
 231 /* Make sure to not recurse in case the unwinder
 232 needs to take locks. */
233 lockdep_off();
234 save_stack_trace(trace, NULL); 259 save_stack_trace(trace, NULL);
235 lockdep_on();
236 260
237 trace->max_entries = trace->nr_entries; 261 trace->max_entries = trace->nr_entries;
238 262
239 nr_stack_trace_entries += trace->nr_entries; 263 nr_stack_trace_entries += trace->nr_entries;
240 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
241 return 0;
242 264
243 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 265 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
244 __raw_spin_unlock(&hash_lock); 266 if (!debug_locks_off_graph_unlock())
245 if (debug_locks_off()) { 267 return 0;
246 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); 268
247 printk("turning off the locking correctness validator.\n"); 269 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
248 dump_stack(); 270 printk("turning off the locking correctness validator.\n");
249 } 271 dump_stack();
272
250 return 0; 273 return 0;
251 } 274 }
252 275
@@ -357,7 +380,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
357 380
358static void print_lock_name(struct lock_class *class) 381static void print_lock_name(struct lock_class *class)
359{ 382{
360 char str[128], c1, c2, c3, c4; 383 char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
361 const char *name; 384 const char *name;
362 385
363 get_usage_chars(class, &c1, &c2, &c3, &c4); 386 get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -379,7 +402,7 @@ static void print_lock_name(struct lock_class *class)
379static void print_lockdep_cache(struct lockdep_map *lock) 402static void print_lockdep_cache(struct lockdep_map *lock)
380{ 403{
381 const char *name; 404 const char *name;
382 char str[128]; 405 char str[KSYM_NAME_LEN + 1];
383 406
384 name = lock->name; 407 name = lock->name;
385 if (!name) 408 if (!name)
@@ -449,7 +472,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
449 print_lock_class_header(class, depth); 472 print_lock_class_header(class, depth);
450 473
451 list_for_each_entry(entry, &class->locks_after, entry) { 474 list_for_each_entry(entry, &class->locks_after, entry) {
452 DEBUG_LOCKS_WARN_ON(!entry->class); 475 if (DEBUG_LOCKS_WARN_ON(!entry->class))
476 return;
477
453 print_lock_dependencies(entry->class, depth + 1); 478 print_lock_dependencies(entry->class, depth + 1);
454 479
455 printk("%*s ... acquired at:\n",depth,""); 480 printk("%*s ... acquired at:\n",depth,"");
@@ -474,7 +499,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
474 return 0; 499 return 0;
475 500
476 entry->class = this; 501 entry->class = this;
477 save_trace(&entry->trace); 502 if (!save_trace(&entry->trace))
503 return 0;
478 504
479 /* 505 /*
480 * Since we never remove from the dependency list, the list can 506 * Since we never remove from the dependency list, the list can
@@ -532,9 +558,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
532{ 558{
533 struct task_struct *curr = current; 559 struct task_struct *curr = current;
534 560
535 __raw_spin_unlock(&hash_lock); 561 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
536 debug_locks_off();
537 if (debug_locks_silent)
538 return 0; 562 return 0;
539 563
540 printk("\n=======================================================\n"); 564 printk("\n=======================================================\n");
@@ -563,7 +587,9 @@ static noinline int print_circular_bug_tail(void)
563 return 0; 587 return 0;
564 588
565 this.class = check_source->class; 589 this.class = check_source->class;
566 save_trace(&this.trace); 590 if (!save_trace(&this.trace))
591 return 0;
592
567 print_circular_bug_entry(&this, 0); 593 print_circular_bug_entry(&this, 0);
568 594
569 printk("\nother info that might help us debug this:\n\n"); 595 printk("\nother info that might help us debug this:\n\n");
@@ -579,8 +605,10 @@ static noinline int print_circular_bug_tail(void)
579 605
580static int noinline print_infinite_recursion_bug(void) 606static int noinline print_infinite_recursion_bug(void)
581{ 607{
582 __raw_spin_unlock(&hash_lock); 608 if (!debug_locks_off_graph_unlock())
583 DEBUG_LOCKS_WARN_ON(1); 609 return 0;
610
611 WARN_ON(1);
584 612
585 return 0; 613 return 0;
586} 614}
@@ -715,9 +743,7 @@ print_bad_irq_dependency(struct task_struct *curr,
715 enum lock_usage_bit bit2, 743 enum lock_usage_bit bit2,
716 const char *irqclass) 744 const char *irqclass)
717{ 745{
718 __raw_spin_unlock(&hash_lock); 746 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
719 debug_locks_off();
720 if (debug_locks_silent)
721 return 0; 747 return 0;
722 748
723 printk("\n======================================================\n"); 749 printk("\n======================================================\n");
@@ -798,9 +824,7 @@ static int
798print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 824print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
799 struct held_lock *next) 825 struct held_lock *next)
800{ 826{
801 debug_locks_off(); 827 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
802 __raw_spin_unlock(&hash_lock);
803 if (debug_locks_silent)
804 return 0; 828 return 0;
805 829
806 printk("\n=============================================\n"); 830 printk("\n=============================================\n");
@@ -966,27 +990,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
966 &prev->class->locks_after, next->acquire_ip); 990 &prev->class->locks_after, next->acquire_ip);
967 if (!ret) 991 if (!ret)
968 return 0; 992 return 0;
969 /* 993
970 * Return value of 2 signals 'dependency already added',
971 * in that case we dont have to add the backlink either.
972 */
973 if (ret == 2)
974 return 2;
975 ret = add_lock_to_list(next->class, prev->class, 994 ret = add_lock_to_list(next->class, prev->class,
976 &next->class->locks_before, next->acquire_ip); 995 &next->class->locks_before, next->acquire_ip);
996 if (!ret)
997 return 0;
977 998
978 /* 999 /*
979 * Debugging printouts: 1000 * Debugging printouts:
980 */ 1001 */
981 if (verbose(prev->class) || verbose(next->class)) { 1002 if (verbose(prev->class) || verbose(next->class)) {
982 __raw_spin_unlock(&hash_lock); 1003 graph_unlock();
983 printk("\n new dependency: "); 1004 printk("\n new dependency: ");
984 print_lock_name(prev->class); 1005 print_lock_name(prev->class);
985 printk(" => "); 1006 printk(" => ");
986 print_lock_name(next->class); 1007 print_lock_name(next->class);
987 printk("\n"); 1008 printk("\n");
988 dump_stack(); 1009 dump_stack();
989 __raw_spin_lock(&hash_lock); 1010 return graph_lock();
990 } 1011 }
991 return 1; 1012 return 1;
992} 1013}
@@ -1025,7 +1046,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1025 * added: 1046 * added:
1026 */ 1047 */
1027 if (hlock->read != 2) { 1048 if (hlock->read != 2) {
1028 check_prev_add(curr, hlock, next); 1049 if (!check_prev_add(curr, hlock, next))
1050 return 0;
1029 /* 1051 /*
1030 * Stop after the first non-trylock entry, 1052 * Stop after the first non-trylock entry,
1031 * as non-trylock entries have added their 1053 * as non-trylock entries have added their
@@ -1050,8 +1072,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1050 } 1072 }
1051 return 1; 1073 return 1;
1052out_bug: 1074out_bug:
1053 __raw_spin_unlock(&hash_lock); 1075 if (!debug_locks_off_graph_unlock())
1054 DEBUG_LOCKS_WARN_ON(1); 1076 return 0;
1077
1078 WARN_ON(1);
1055 1079
1056 return 0; 1080 return 0;
1057} 1081}
@@ -1182,6 +1206,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1182 struct lockdep_subclass_key *key; 1206 struct lockdep_subclass_key *key;
1183 struct list_head *hash_head; 1207 struct list_head *hash_head;
1184 struct lock_class *class; 1208 struct lock_class *class;
1209 unsigned long flags;
1185 1210
1186 class = look_up_lock_class(lock, subclass); 1211 class = look_up_lock_class(lock, subclass);
1187 if (likely(class)) 1212 if (likely(class))
@@ -1203,7 +1228,11 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1203 key = lock->key->subkeys + subclass; 1228 key = lock->key->subkeys + subclass;
1204 hash_head = classhashentry(key); 1229 hash_head = classhashentry(key);
1205 1230
1206 __raw_spin_lock(&hash_lock); 1231 raw_local_irq_save(flags);
1232 if (!graph_lock()) {
1233 raw_local_irq_restore(flags);
1234 return NULL;
1235 }
1207 /* 1236 /*
1208 * We have to do the hash-walk again, to avoid races 1237 * We have to do the hash-walk again, to avoid races
1209 * with another CPU: 1238 * with another CPU:
@@ -1216,8 +1245,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1216 * the hash: 1245 * the hash:
1217 */ 1246 */
1218 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { 1247 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1219 __raw_spin_unlock(&hash_lock); 1248 if (!debug_locks_off_graph_unlock()) {
1220 debug_locks_off(); 1249 raw_local_irq_restore(flags);
1250 return NULL;
1251 }
1252 raw_local_irq_restore(flags);
1253
1221 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 1254 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1222 printk("turning off the locking correctness validator.\n"); 1255 printk("turning off the locking correctness validator.\n");
1223 return NULL; 1256 return NULL;
@@ -1238,16 +1271,24 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1238 list_add_tail_rcu(&class->hash_entry, hash_head); 1271 list_add_tail_rcu(&class->hash_entry, hash_head);
1239 1272
1240 if (verbose(class)) { 1273 if (verbose(class)) {
1241 __raw_spin_unlock(&hash_lock); 1274 graph_unlock();
1275 raw_local_irq_restore(flags);
1276
1242 printk("\nnew class %p: %s", class->key, class->name); 1277 printk("\nnew class %p: %s", class->key, class->name);
1243 if (class->name_version > 1) 1278 if (class->name_version > 1)
1244 printk("#%d", class->name_version); 1279 printk("#%d", class->name_version);
1245 printk("\n"); 1280 printk("\n");
1246 dump_stack(); 1281 dump_stack();
1247 __raw_spin_lock(&hash_lock); 1282
1283 raw_local_irq_save(flags);
1284 if (!graph_lock()) {
1285 raw_local_irq_restore(flags);
1286 return NULL;
1287 }
1248 } 1288 }
1249out_unlock_set: 1289out_unlock_set:
1250 __raw_spin_unlock(&hash_lock); 1290 graph_unlock();
1291 raw_local_irq_restore(flags);
1251 1292
1252 if (!subclass || force) 1293 if (!subclass || force)
1253 lock->class_cache = class; 1294 lock->class_cache = class;
@@ -1262,7 +1303,7 @@ out_unlock_set:
1262 * add it and return 0 - in this case the new dependency chain is 1303 * add it and return 0 - in this case the new dependency chain is
1263 * validated. If the key is already hashed, return 1. 1304 * validated. If the key is already hashed, return 1.
1264 */ 1305 */
1265static inline int lookup_chain_cache(u64 chain_key) 1306static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
1266{ 1307{
1267 struct list_head *hash_head = chainhashentry(chain_key); 1308 struct list_head *hash_head = chainhashentry(chain_key);
1268 struct lock_chain *chain; 1309 struct lock_chain *chain;
@@ -1276,34 +1317,36 @@ static inline int lookup_chain_cache(u64 chain_key)
1276 if (chain->chain_key == chain_key) { 1317 if (chain->chain_key == chain_key) {
1277cache_hit: 1318cache_hit:
1278 debug_atomic_inc(&chain_lookup_hits); 1319 debug_atomic_inc(&chain_lookup_hits);
1279 /* 1320 if (very_verbose(class))
1280 * In the debugging case, force redundant checking 1321 printk("\nhash chain already cached, key: "
1281 * by returning 1: 1322 "%016Lx tail class: [%p] %s\n",
1282 */ 1323 (unsigned long long)chain_key,
1283#ifdef CONFIG_DEBUG_LOCKDEP 1324 class->key, class->name);
1284 __raw_spin_lock(&hash_lock);
1285 return 1;
1286#endif
1287 return 0; 1325 return 0;
1288 } 1326 }
1289 } 1327 }
1328 if (very_verbose(class))
1329 printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
1330 (unsigned long long)chain_key, class->key, class->name);
1290 /* 1331 /*
1291 * Allocate a new chain entry from the static array, and add 1332 * Allocate a new chain entry from the static array, and add
1292 * it to the hash: 1333 * it to the hash:
1293 */ 1334 */
1294 __raw_spin_lock(&hash_lock); 1335 if (!graph_lock())
1336 return 0;
1295 /* 1337 /*
1296 * We have to walk the chain again locked - to avoid duplicates: 1338 * We have to walk the chain again locked - to avoid duplicates:
1297 */ 1339 */
1298 list_for_each_entry(chain, hash_head, entry) { 1340 list_for_each_entry(chain, hash_head, entry) {
1299 if (chain->chain_key == chain_key) { 1341 if (chain->chain_key == chain_key) {
1300 __raw_spin_unlock(&hash_lock); 1342 graph_unlock();
1301 goto cache_hit; 1343 goto cache_hit;
1302 } 1344 }
1303 } 1345 }
1304 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { 1346 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
1305 __raw_spin_unlock(&hash_lock); 1347 if (!debug_locks_off_graph_unlock())
1306 debug_locks_off(); 1348 return 0;
1349
1307 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 1350 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1308 printk("turning off the locking correctness validator.\n"); 1351 printk("turning off the locking correctness validator.\n");
1309 return 0; 1352 return 0;
@@ -1379,9 +1422,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1379 struct held_lock *this, int forwards, 1422 struct held_lock *this, int forwards,
1380 const char *irqclass) 1423 const char *irqclass)
1381{ 1424{
1382 __raw_spin_unlock(&hash_lock); 1425 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1383 debug_locks_off();
1384 if (debug_locks_silent)
1385 return 0; 1426 return 0;
1386 1427
1387 printk("\n=========================================================\n"); 1428 printk("\n=========================================================\n");
@@ -1451,7 +1492,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1451 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 1492 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
1452} 1493}
1453 1494
1454static inline void print_irqtrace_events(struct task_struct *curr) 1495void print_irqtrace_events(struct task_struct *curr)
1455{ 1496{
1456 printk("irq event stamp: %u\n", curr->irq_events); 1497 printk("irq event stamp: %u\n", curr->irq_events);
1457 printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); 1498 printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
@@ -1464,19 +1505,13 @@ static inline void print_irqtrace_events(struct task_struct *curr)
1464 print_ip_sym(curr->softirq_disable_ip); 1505 print_ip_sym(curr->softirq_disable_ip);
1465} 1506}
1466 1507
1467#else
1468static inline void print_irqtrace_events(struct task_struct *curr)
1469{
1470}
1471#endif 1508#endif
1472 1509
1473static int 1510static int
1474print_usage_bug(struct task_struct *curr, struct held_lock *this, 1511print_usage_bug(struct task_struct *curr, struct held_lock *this,
1475 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 1512 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1476{ 1513{
1477 __raw_spin_unlock(&hash_lock); 1514 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1478 debug_locks_off();
1479 if (debug_locks_silent)
1480 return 0; 1515 return 0;
1481 1516
1482 printk("\n=================================\n"); 1517 printk("\n=================================\n");
@@ -1537,12 +1572,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1537 if (likely(this->class->usage_mask & new_mask)) 1572 if (likely(this->class->usage_mask & new_mask))
1538 return 1; 1573 return 1;
1539 1574
1540 __raw_spin_lock(&hash_lock); 1575 if (!graph_lock())
1576 return 0;
1541 /* 1577 /*
1542 * Make sure we didnt race: 1578 * Make sure we didnt race:
1543 */ 1579 */
1544 if (unlikely(this->class->usage_mask & new_mask)) { 1580 if (unlikely(this->class->usage_mask & new_mask)) {
1545 __raw_spin_unlock(&hash_lock); 1581 graph_unlock();
1546 return 1; 1582 return 1;
1547 } 1583 }
1548 1584
@@ -1728,15 +1764,16 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1728 debug_atomic_dec(&nr_unused_locks); 1764 debug_atomic_dec(&nr_unused_locks);
1729 break; 1765 break;
1730 default: 1766 default:
1731 debug_locks_off(); 1767 if (!debug_locks_off_graph_unlock())
1768 return 0;
1732 WARN_ON(1); 1769 WARN_ON(1);
1733 return 0; 1770 return 0;
1734 } 1771 }
1735 1772
1736 __raw_spin_unlock(&hash_lock); 1773 graph_unlock();
1737 1774
1738 /* 1775 /*
1739 * We must printk outside of the hash_lock: 1776 * We must printk outside of the graph_lock:
1740 */ 1777 */
1741 if (ret == 2) { 1778 if (ret == 2) {
1742 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); 1779 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
@@ -2134,9 +2171,9 @@ out_calc_hash:
2134 * We look up the chain_key and do the O(N^2) check and update of 2171 * We look up the chain_key and do the O(N^2) check and update of
2135 * the dependencies only if this is a new dependency chain. 2172 * the dependencies only if this is a new dependency chain.
2136 * (If lookup_chain_cache() returns with 1 it acquires 2173 * (If lookup_chain_cache() returns with 1 it acquires
2137 * hash_lock for us) 2174 * graph_lock for us)
2138 */ 2175 */
2139 if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { 2176 if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) {
2140 /* 2177 /*
2141 * Check whether last held lock: 2178 * Check whether last held lock:
2142 * 2179 *
@@ -2167,7 +2204,7 @@ out_calc_hash:
2167 if (!chain_head && ret != 2) 2204 if (!chain_head && ret != 2)
2168 if (!check_prevs_add(curr, hlock)) 2205 if (!check_prevs_add(curr, hlock))
2169 return 0; 2206 return 0;
2170 __raw_spin_unlock(&hash_lock); 2207 graph_unlock();
2171 } 2208 }
2172 curr->lockdep_depth++; 2209 curr->lockdep_depth++;
2173 check_chain_key(curr); 2210 check_chain_key(curr);
@@ -2430,6 +2467,7 @@ EXPORT_SYMBOL_GPL(lock_release);
2430void lockdep_reset(void) 2467void lockdep_reset(void)
2431{ 2468{
2432 unsigned long flags; 2469 unsigned long flags;
2470 int i;
2433 2471
2434 raw_local_irq_save(flags); 2472 raw_local_irq_save(flags);
2435 current->curr_chain_key = 0; 2473 current->curr_chain_key = 0;
@@ -2440,6 +2478,8 @@ void lockdep_reset(void)
2440 nr_softirq_chains = 0; 2478 nr_softirq_chains = 0;
2441 nr_process_chains = 0; 2479 nr_process_chains = 0;
2442 debug_locks = 1; 2480 debug_locks = 1;
2481 for (i = 0; i < CHAINHASH_SIZE; i++)
2482 INIT_LIST_HEAD(chainhash_table + i);
2443 raw_local_irq_restore(flags); 2483 raw_local_irq_restore(flags);
2444} 2484}
2445 2485
@@ -2476,7 +2516,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
2476 int i; 2516 int i;
2477 2517
2478 raw_local_irq_save(flags); 2518 raw_local_irq_save(flags);
2479 __raw_spin_lock(&hash_lock); 2519 graph_lock();
2480 2520
2481 /* 2521 /*
2482 * Unhash all classes that were created by this module: 2522 * Unhash all classes that were created by this module:
@@ -2490,7 +2530,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
2490 zap_class(class); 2530 zap_class(class);
2491 } 2531 }
2492 2532
2493 __raw_spin_unlock(&hash_lock); 2533 graph_unlock();
2494 raw_local_irq_restore(flags); 2534 raw_local_irq_restore(flags);
2495} 2535}
2496 2536
@@ -2518,20 +2558,20 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2518 * Debug check: in the end all mapped classes should 2558 * Debug check: in the end all mapped classes should
2519 * be gone. 2559 * be gone.
2520 */ 2560 */
2521 __raw_spin_lock(&hash_lock); 2561 graph_lock();
2522 for (i = 0; i < CLASSHASH_SIZE; i++) { 2562 for (i = 0; i < CLASSHASH_SIZE; i++) {
2523 head = classhash_table + i; 2563 head = classhash_table + i;
2524 if (list_empty(head)) 2564 if (list_empty(head))
2525 continue; 2565 continue;
2526 list_for_each_entry_safe(class, next, head, hash_entry) { 2566 list_for_each_entry_safe(class, next, head, hash_entry) {
2527 if (unlikely(class == lock->class_cache)) { 2567 if (unlikely(class == lock->class_cache)) {
2528 __raw_spin_unlock(&hash_lock); 2568 if (debug_locks_off_graph_unlock())
2529 DEBUG_LOCKS_WARN_ON(1); 2569 WARN_ON(1);
2530 goto out_restore; 2570 goto out_restore;
2531 } 2571 }
2532 } 2572 }
2533 } 2573 }
2534 __raw_spin_unlock(&hash_lock); 2574 graph_unlock();
2535 2575
2536out_restore: 2576out_restore:
2537 raw_local_irq_restore(flags); 2577 raw_local_irq_restore(flags);
@@ -2645,6 +2685,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2645 } 2685 }
2646 local_irq_restore(flags); 2686 local_irq_restore(flags);
2647} 2687}
2688EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
2648 2689
2649static void print_held_locks_bug(struct task_struct *curr) 2690static void print_held_locks_bug(struct task_struct *curr)
2650{ 2691{
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb2..8ce09bc4613d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
20#define MAX_LOCKDEP_KEYS_BITS 11 20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) 21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22 22
23#define MAX_LOCKDEP_CHAINS_BITS 13 23#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 25
26/* 26/*
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3fa..b554b40a4aa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
113 return 0; 113 return 0;
114} 114}
115 115
116static struct seq_operations lockdep_ops = { 116static const struct seq_operations lockdep_ops = {
117 .start = l_start, 117 .start = l_start,
118 .next = l_next, 118 .next = l_next,
119 .stop = l_stop, 119 .stop = l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
135 return res; 135 return res;
136} 136}
137 137
138static struct file_operations proc_lockdep_operations = { 138static const struct file_operations proc_lockdep_operations = {
139 .open = lockdep_open, 139 .open = lockdep_open,
140 .read = seq_read, 140 .read = seq_read,
141 .llseek = seq_lseek, 141 .llseek = seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
319 return single_open(file, lockdep_stats_show, NULL); 319 return single_open(file, lockdep_stats_show, NULL);
320} 320}
321 321
322static struct file_operations proc_lockdep_stats_operations = { 322static const struct file_operations proc_lockdep_stats_operations = {
323 .open = lockdep_stats_open, 323 .open = lockdep_stats_open,
324 .read = seq_read, 324 .read = seq_read,
325 .llseek = seq_lseek, 325 .llseek = seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index f0166563c602..d0f2260a0210 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -34,10 +34,10 @@
34#include <linux/err.h> 34#include <linux/err.h>
35#include <linux/vermagic.h> 35#include <linux/vermagic.h>
36#include <linux/notifier.h> 36#include <linux/notifier.h>
37#include <linux/sched.h>
37#include <linux/stop_machine.h> 38#include <linux/stop_machine.h>
38#include <linux/device.h> 39#include <linux/device.h>
39#include <linux/string.h> 40#include <linux/string.h>
40#include <linux/sched.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/unwind.h> 42#include <linux/unwind.h>
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -790,6 +790,19 @@ static struct module_attribute refcnt = {
790 .show = show_refcnt, 790 .show = show_refcnt,
791}; 791};
792 792
793void module_put(struct module *module)
794{
795 if (module) {
796 unsigned int cpu = get_cpu();
797 local_dec(&module->ref[cpu].count);
798 /* Maybe they're waiting for us to drop reference? */
799 if (unlikely(!module_is_live(module)))
800 wake_up_process(module->waiter);
801 put_cpu();
802 }
803}
804EXPORT_SYMBOL(module_put);
805
793#else /* !CONFIG_MODULE_UNLOAD */ 806#else /* !CONFIG_MODULE_UNLOAD */
794static void print_unload_info(struct seq_file *m, struct module *mod) 807static void print_unload_info(struct seq_file *m, struct module *mod)
795{ 808{
@@ -811,9 +824,34 @@ static inline void module_unload_init(struct module *mod)
811} 824}
812#endif /* CONFIG_MODULE_UNLOAD */ 825#endif /* CONFIG_MODULE_UNLOAD */
813 826
827static ssize_t show_initstate(struct module_attribute *mattr,
828 struct module *mod, char *buffer)
829{
830 const char *state = "unknown";
831
832 switch (mod->state) {
833 case MODULE_STATE_LIVE:
834 state = "live";
835 break;
836 case MODULE_STATE_COMING:
837 state = "coming";
838 break;
839 case MODULE_STATE_GOING:
840 state = "going";
841 break;
842 }
843 return sprintf(buffer, "%s\n", state);
844}
845
846static struct module_attribute initstate = {
847 .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE },
848 .show = show_initstate,
849};
850
814static struct module_attribute *modinfo_attrs[] = { 851static struct module_attribute *modinfo_attrs[] = {
815 &modinfo_version, 852 &modinfo_version,
816 &modinfo_srcversion, 853 &modinfo_srcversion,
854 &initstate,
817#ifdef CONFIG_MODULE_UNLOAD 855#ifdef CONFIG_MODULE_UNLOAD
818 &refcnt, 856 &refcnt,
819#endif 857#endif
@@ -1086,22 +1124,37 @@ static int mod_sysfs_setup(struct module *mod,
1086 goto out; 1124 goto out;
1087 kobj_set_kset_s(&mod->mkobj, module_subsys); 1125 kobj_set_kset_s(&mod->mkobj, module_subsys);
1088 mod->mkobj.mod = mod; 1126 mod->mkobj.mod = mod;
1089 err = kobject_register(&mod->mkobj.kobj); 1127
1128 /* delay uevent until full sysfs population */
1129 kobject_init(&mod->mkobj.kobj);
1130 err = kobject_add(&mod->mkobj.kobj);
1090 if (err) 1131 if (err)
1091 goto out; 1132 goto out;
1092 1133
1134 mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers");
1135 if (!mod->drivers_dir) {
1136 err = -ENOMEM;
1137 goto out_unreg;
1138 }
1139
1093 err = module_param_sysfs_setup(mod, kparam, num_params); 1140 err = module_param_sysfs_setup(mod, kparam, num_params);
1094 if (err) 1141 if (err)
1095 goto out_unreg; 1142 goto out_unreg_drivers;
1096 1143
1097 err = module_add_modinfo_attrs(mod); 1144 err = module_add_modinfo_attrs(mod);
1098 if (err) 1145 if (err)
1099 goto out_unreg; 1146 goto out_unreg_param;
1100 1147
1148 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1101 return 0; 1149 return 0;
1102 1150
1151out_unreg_param:
1152 module_param_sysfs_remove(mod);
1153out_unreg_drivers:
1154 kobject_unregister(mod->drivers_dir);
1103out_unreg: 1155out_unreg:
1104 kobject_unregister(&mod->mkobj.kobj); 1156 kobject_del(&mod->mkobj.kobj);
1157 kobject_put(&mod->mkobj.kobj);
1105out: 1158out:
1106 return err; 1159 return err;
1107} 1160}
@@ -1110,6 +1163,7 @@ static void mod_kobject_remove(struct module *mod)
1110{ 1163{
1111 module_remove_modinfo_attrs(mod); 1164 module_remove_modinfo_attrs(mod);
1112 module_param_sysfs_remove(mod); 1165 module_param_sysfs_remove(mod);
1166 kobject_unregister(mod->drivers_dir);
1113 1167
1114 kobject_unregister(&mod->mkobj.kobj); 1168 kobject_unregister(&mod->mkobj.kobj);
1115} 1169}
@@ -2182,7 +2236,7 @@ static int m_show(struct seq_file *m, void *p)
2182 Where refcount is a number or -, and deps is a comma-separated list 2236 Where refcount is a number or -, and deps is a comma-separated list
2183 of depends or -. 2237 of depends or -.
2184*/ 2238*/
2185struct seq_operations modules_op = { 2239const struct seq_operations modules_op = {
2186 .start = m_start, 2240 .start = m_start,
2187 .next = m_next, 2241 .next = m_next,
2188 .stop = m_stop, 2242 .stop = m_stop,
@@ -2273,21 +2327,54 @@ void print_modules(void)
2273 printk("\n"); 2327 printk("\n");
2274} 2328}
2275 2329
2330static char *make_driver_name(struct device_driver *drv)
2331{
2332 char *driver_name;
2333
2334 driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2,
2335 GFP_KERNEL);
2336 if (!driver_name)
2337 return NULL;
2338
2339 sprintf(driver_name, "%s:%s", drv->bus->name, drv->name);
2340 return driver_name;
2341}
2342
2276void module_add_driver(struct module *mod, struct device_driver *drv) 2343void module_add_driver(struct module *mod, struct device_driver *drv)
2277{ 2344{
2345 char *driver_name;
2346 int no_warn;
2347
2278 if (!mod || !drv) 2348 if (!mod || !drv)
2279 return; 2349 return;
2280 2350
2281 /* Don't check return code; this call is idempotent */ 2351 /* Don't check return codes; these calls are idempotent */
2282 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); 2352 no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
2353 driver_name = make_driver_name(drv);
2354 if (driver_name) {
2355 no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj,
2356 driver_name);
2357 kfree(driver_name);
2358 }
2283} 2359}
2284EXPORT_SYMBOL(module_add_driver); 2360EXPORT_SYMBOL(module_add_driver);
2285 2361
2286void module_remove_driver(struct device_driver *drv) 2362void module_remove_driver(struct device_driver *drv)
2287{ 2363{
2364 char *driver_name;
2365
2288 if (!drv) 2366 if (!drv)
2289 return; 2367 return;
2368
2290 sysfs_remove_link(&drv->kobj, "module"); 2369 sysfs_remove_link(&drv->kobj, "module");
2370 if (drv->owner && drv->owner->drivers_dir) {
2371 driver_name = make_driver_name(drv);
2372 if (driver_name) {
2373 sysfs_remove_link(drv->owner->drivers_dir,
2374 driver_name);
2375 kfree(driver_name);
2376 }
2377 }
2291} 2378}
2292EXPORT_SYMBOL(module_remove_driver); 2379EXPORT_SYMBOL(module_remove_driver);
2293 2380
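
The module.c hunks above make module_put() an out-of-line exported function, add an "initstate" attribute and a per-module drivers/ directory in sysfs. As a usage reminder only (the function name and the 'owner' parameter below are invented for the example; only the module API calls are real), the exported pair is normally used like this:

#include <linux/module.h>

/* Illustration only: pin another module while calling into it. */
static int call_into(struct module *owner)
{
	if (!try_module_get(owner))	/* fails while 'owner' is being unloaded */
		return -ENODEV;

	/* ... safe to use code or data owned by 'owner' here ... */

	module_put(owner);		/* may wake a waiting rmmod, as shown above */
	return 0;
}
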
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 18651641a7b5..841539d72c55 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
77 77
78void debug_mutex_unlock(struct mutex *lock) 78void debug_mutex_unlock(struct mutex *lock)
79{ 79{
80 if (unlikely(!debug_locks))
81 return;
82
80 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
81 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 84 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 85 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a497..e7cbbb82765b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
206} 206}
207 207
208EXPORT_SYMBOL_GPL(mutex_lock_nested); 208EXPORT_SYMBOL_GPL(mutex_lock_nested);
209
210int __sched
211mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
212{
213 might_sleep();
214 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
215}
216
217EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
209#endif 218#endif
210 219
211/* 220/*
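
The new mutex_lock_interruptible_nested() gives interruptible lockers the same subclass annotation that mutex_lock_nested() already provides, so lockdep does not flag legitimate nesting of two locks from the same class. A hedged usage sketch follows; the function and parameter names are invented, only the mutex API calls are real.

#include <linux/mutex.h>

/* Example: take two mutexes of the same lock class without upsetting lockdep. */
static int lock_parent_and_child(struct mutex *parent, struct mutex *child)
{
	if (mutex_lock_interruptible(parent))
		return -EINTR;

	/* Second lock of the same class: annotate the nesting level. */
	if (mutex_lock_interruptible_nested(child, SINGLE_DEPTH_NESTING)) {
		mutex_unlock(parent);
		return -EINTR;
	}

	/* ... critical section covering both objects ... */

	mutex_unlock(child);
	mutex_unlock(parent);
	return 0;
}
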
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 674aceb7335a..f5b9ee6f6bbb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,8 +17,9 @@
17#include <linux/version.h> 17#include <linux/version.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/namespace.h> 20#include <linux/mnt_namespace.h>
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h>
22 23
23struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 24struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
24 25
@@ -60,12 +61,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
60 struct nsproxy *ns = clone_namespaces(orig); 61 struct nsproxy *ns = clone_namespaces(orig);
61 62
62 if (ns) { 63 if (ns) {
63 if (ns->namespace) 64 if (ns->mnt_ns)
64 get_namespace(ns->namespace); 65 get_mnt_ns(ns->mnt_ns);
65 if (ns->uts_ns) 66 if (ns->uts_ns)
66 get_uts_ns(ns->uts_ns); 67 get_uts_ns(ns->uts_ns);
67 if (ns->ipc_ns) 68 if (ns->ipc_ns)
68 get_ipc_ns(ns->ipc_ns); 69 get_ipc_ns(ns->ipc_ns);
70 if (ns->pid_ns)
71 get_pid_ns(ns->pid_ns);
69 } 72 }
70 73
71 return ns; 74 return ns;
@@ -97,7 +100,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
97 100
98 tsk->nsproxy = new_ns; 101 tsk->nsproxy = new_ns;
99 102
100 err = copy_namespace(flags, tsk); 103 err = copy_mnt_ns(flags, tsk);
101 if (err) 104 if (err)
102 goto out_ns; 105 goto out_ns;
103 106
@@ -109,16 +112,23 @@ int copy_namespaces(int flags, struct task_struct *tsk)
109 if (err) 112 if (err)
110 goto out_ipc; 113 goto out_ipc;
111 114
115 err = copy_pid_ns(flags, tsk);
116 if (err)
117 goto out_pid;
118
112out: 119out:
113 put_nsproxy(old_ns); 120 put_nsproxy(old_ns);
114 return err; 121 return err;
115 122
123out_pid:
124 if (new_ns->ipc_ns)
125 put_ipc_ns(new_ns->ipc_ns);
116out_ipc: 126out_ipc:
117 if (new_ns->uts_ns) 127 if (new_ns->uts_ns)
118 put_uts_ns(new_ns->uts_ns); 128 put_uts_ns(new_ns->uts_ns);
119out_uts: 129out_uts:
120 if (new_ns->namespace) 130 if (new_ns->mnt_ns)
121 put_namespace(new_ns->namespace); 131 put_mnt_ns(new_ns->mnt_ns);
122out_ns: 132out_ns:
123 tsk->nsproxy = old_ns; 133 tsk->nsproxy = old_ns;
124 kfree(new_ns); 134 kfree(new_ns);
@@ -127,11 +137,13 @@ out_ns:
127 137
128void free_nsproxy(struct nsproxy *ns) 138void free_nsproxy(struct nsproxy *ns)
129{ 139{
130 if (ns->namespace) 140 if (ns->mnt_ns)
131 put_namespace(ns->namespace); 141 put_mnt_ns(ns->mnt_ns);
132 if (ns->uts_ns) 142 if (ns->uts_ns)
133 put_uts_ns(ns->uts_ns); 143 put_uts_ns(ns->uts_ns);
134 if (ns->ipc_ns) 144 if (ns->ipc_ns)
135 put_ipc_ns(ns->ipc_ns); 145 put_ipc_ns(ns->ipc_ns);
136 kfree(ns); 146 if (ns->pid_ns)
147 put_pid_ns(ns->pid_ns);
148 kfree(ns);
137} 149}
diff --git a/kernel/params.c b/kernel/params.c
index f406655d6653..718945da8f58 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -143,9 +143,15 @@ int parse_args(const char *name,
143 143
144 while (*args) { 144 while (*args) {
145 int ret; 145 int ret;
146 int irq_was_disabled;
146 147
147 args = next_arg(args, &param, &val); 148 args = next_arg(args, &param, &val);
149 irq_was_disabled = irqs_disabled();
148 ret = parse_one(param, val, params, num, unknown); 150 ret = parse_one(param, val, params, num, unknown);
151 if (irq_was_disabled && !irqs_disabled()) {
152 printk(KERN_WARNING "parse_args(): option '%s' enabled "
153 "irq's!\n", param);
154 }
149 switch (ret) { 155 switch (ret) {
150 case -ENOENT: 156 case -ENOENT:
151 printk(KERN_ERR "%s: Unknown parameter `%s'\n", 157 printk(KERN_ERR "%s: Unknown parameter `%s'\n",
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f9..2efe9d8d367b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,12 +26,12 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/hash.h> 28#include <linux/hash.h>
29#include <linux/pspace.h> 29#include <linux/pid_namespace.h>
30 30
31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
32static struct hlist_head *pid_hash; 32static struct hlist_head *pid_hash;
33static int pidhash_shift; 33static int pidhash_shift;
34static kmem_cache_t *pid_cachep; 34static struct kmem_cache *pid_cachep;
35 35
36int pid_max = PID_MAX_DEFAULT; 36int pid_max = PID_MAX_DEFAULT;
37 37
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT;
43#define BITS_PER_PAGE (PAGE_SIZE*8) 43#define BITS_PER_PAGE (PAGE_SIZE*8)
44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
45 45
46static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) 46static inline int mk_pid(struct pid_namespace *pid_ns,
47 struct pidmap *map, int off)
47{ 48{
48 return (map - pspace->pidmap)*BITS_PER_PAGE + off; 49 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
49} 50}
50 51
51#define find_next_offset(map, off) \ 52#define find_next_offset(map, off) \
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
57 * value does not cause lots of bitmaps to be allocated, but 58 * value does not cause lots of bitmaps to be allocated, but
58 * the scheme scales to up to 4 million PIDs, runtime. 59 * the scheme scales to up to 4 million PIDs, runtime.
59 */ 60 */
60struct pspace init_pspace = { 61struct pid_namespace init_pid_ns = {
62 .kref = {
63 .refcount = ATOMIC_INIT(2),
64 },
61 .pidmap = { 65 .pidmap = {
62 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 66 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
63 }, 67 },
64 .last_pid = 0 68 .last_pid = 0,
69 .child_reaper = &init_task
65}; 70};
66 71
67/* 72/*
@@ -80,25 +85,25 @@ struct pspace init_pspace = {
80 85
81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 86static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
82 87
83static fastcall void free_pidmap(struct pspace *pspace, int pid) 88static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
84{ 89{
85 struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; 90 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
86 int offset = pid & BITS_PER_PAGE_MASK; 91 int offset = pid & BITS_PER_PAGE_MASK;
87 92
88 clear_bit(offset, map->page); 93 clear_bit(offset, map->page);
89 atomic_inc(&map->nr_free); 94 atomic_inc(&map->nr_free);
90} 95}
91 96
92static int alloc_pidmap(struct pspace *pspace) 97static int alloc_pidmap(struct pid_namespace *pid_ns)
93{ 98{
94 int i, offset, max_scan, pid, last = pspace->last_pid; 99 int i, offset, max_scan, pid, last = pid_ns->last_pid;
95 struct pidmap *map; 100 struct pidmap *map;
96 101
97 pid = last + 1; 102 pid = last + 1;
98 if (pid >= pid_max) 103 if (pid >= pid_max)
99 pid = RESERVED_PIDS; 104 pid = RESERVED_PIDS;
100 offset = pid & BITS_PER_PAGE_MASK; 105 offset = pid & BITS_PER_PAGE_MASK;
101 map = &pspace->pidmap[pid/BITS_PER_PAGE]; 106 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
102 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 107 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
103 for (i = 0; i <= max_scan; ++i) { 108 for (i = 0; i <= max_scan; ++i) {
104 if (unlikely(!map->page)) { 109 if (unlikely(!map->page)) {
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace)
120 do { 125 do {
121 if (!test_and_set_bit(offset, map->page)) { 126 if (!test_and_set_bit(offset, map->page)) {
122 atomic_dec(&map->nr_free); 127 atomic_dec(&map->nr_free);
123 pspace->last_pid = pid; 128 pid_ns->last_pid = pid;
124 return pid; 129 return pid;
125 } 130 }
126 offset = find_next_offset(map, offset); 131 offset = find_next_offset(map, offset);
127 pid = mk_pid(pspace, map, offset); 132 pid = mk_pid(pid_ns, map, offset);
128 /* 133 /*
129 * find_next_offset() found a bit, the pid from it 134 * find_next_offset() found a bit, the pid from it
130 * is in-bounds, and if we fell back to the last 135 * is in-bounds, and if we fell back to the last
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace)
135 (i != max_scan || pid < last || 140 (i != max_scan || pid < last ||
136 !((last+1) & BITS_PER_PAGE_MASK))); 141 !((last+1) & BITS_PER_PAGE_MASK)));
137 } 142 }
138 if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 143 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
139 ++map; 144 ++map;
140 offset = 0; 145 offset = 0;
141 } else { 146 } else {
142 map = &pspace->pidmap[0]; 147 map = &pid_ns->pidmap[0];
143 offset = RESERVED_PIDS; 148 offset = RESERVED_PIDS;
144 if (unlikely(last == offset)) 149 if (unlikely(last == offset))
145 break; 150 break;
146 } 151 }
147 pid = mk_pid(pspace, map, offset); 152 pid = mk_pid(pid_ns, map, offset);
148 } 153 }
149 return -1; 154 return -1;
150} 155}
151 156
152static int next_pidmap(struct pspace *pspace, int last) 157static int next_pidmap(struct pid_namespace *pid_ns, int last)
153{ 158{
154 int offset; 159 int offset;
155 struct pidmap *map, *end; 160 struct pidmap *map, *end;
156 161
157 offset = (last + 1) & BITS_PER_PAGE_MASK; 162 offset = (last + 1) & BITS_PER_PAGE_MASK;
158 map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; 163 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
159 end = &pspace->pidmap[PIDMAP_ENTRIES]; 164 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
160 for (; map < end; map++, offset = 0) { 165 for (; map < end; map++, offset = 0) {
161 if (unlikely(!map->page)) 166 if (unlikely(!map->page))
162 continue; 167 continue;
163 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); 168 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
164 if (offset < BITS_PER_PAGE) 169 if (offset < BITS_PER_PAGE)
165 return mk_pid(pspace, map, offset); 170 return mk_pid(pid_ns, map, offset);
166 } 171 }
167 return -1; 172 return -1;
168} 173}
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid)
192 hlist_del_rcu(&pid->pid_chain); 197 hlist_del_rcu(&pid->pid_chain);
193 spin_unlock_irqrestore(&pidmap_lock, flags); 198 spin_unlock_irqrestore(&pidmap_lock, flags);
194 199
195 free_pidmap(&init_pspace, pid->nr); 200 free_pidmap(current->nsproxy->pid_ns, pid->nr);
196 call_rcu(&pid->rcu, delayed_put_pid); 201 call_rcu(&pid->rcu, delayed_put_pid);
197} 202}
198 203
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void)
206 if (!pid) 211 if (!pid)
207 goto out; 212 goto out;
208 213
209 nr = alloc_pidmap(&init_pspace); 214 nr = alloc_pidmap(current->nsproxy->pid_ns);
210 if (nr < 0) 215 if (nr < 0)
211 goto out_free; 216 goto out_free;
212 217
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr)
348 pid = find_pid(nr); 353 pid = find_pid(nr);
349 if (pid) 354 if (pid)
350 break; 355 break;
351 nr = next_pidmap(&init_pspace, nr); 356 nr = next_pidmap(current->nsproxy->pid_ns, nr);
352 } while (nr > 0); 357 } while (nr > 0);
353 358
354 return pid; 359 return pid;
355} 360}
356EXPORT_SYMBOL_GPL(find_get_pid); 361EXPORT_SYMBOL_GPL(find_get_pid);
357 362
363int copy_pid_ns(int flags, struct task_struct *tsk)
364{
365 struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
366 int err = 0;
367
368 if (!old_ns)
369 return 0;
370
371 get_pid_ns(old_ns);
372 return err;
373}
374
375void free_pid_ns(struct kref *kref)
376{
377 struct pid_namespace *ns;
378
379 ns = container_of(kref, struct pid_namespace, kref);
380 kfree(ns);
381}
382
358/* 383/*
359 * The pid hash table is scaled according to the amount of memory in the 384 * The pid hash table is scaled according to the amount of memory in the
360 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 385 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -382,10 +407,10 @@ void __init pidhash_init(void)
382 407
383void __init pidmap_init(void) 408void __init pidmap_init(void)
384{ 409{
385 init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 410 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
386 /* Reserve PID 0. We never call free_pidmap(0) */ 411 /* Reserve PID 0. We never call free_pidmap(0) */
387 set_bit(0, init_pspace.pidmap[0].page); 412 set_bit(0, init_pid_ns.pidmap[0].page);
388 atomic_dec(&init_pspace.pidmap[0].nr_free); 413 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
389 414
390 pid_cachep = kmem_cache_create("pid", sizeof(struct pid), 415 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
391 __alignof__(struct pid), 416 __alignof__(struct pid),
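
The pidmap arithmetic above (mk_pid(), free_pidmap(), alloc_pidmap()) simply splits a pid into a bitmap-page index and a bit offset within that page. A standalone illustration of the round trip, assuming 4 KiB pages (the kernel uses PAGE_SIZE, so the constant below is an assumption):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL			/* assumption: 4 KiB pages */
#define BITS_PER_PAGE	(PAGE_SIZE * 8)		/* pids covered by one pidmap page */

/* pid -> (pidmap page, bit offset), mirroring free_pidmap()/alloc_pidmap() */
static void split_pid(int pid, unsigned long *page, unsigned long *off)
{
	*page = pid / BITS_PER_PAGE;
	*off  = pid & (BITS_PER_PAGE - 1);
}

/* (page, offset) -> pid, mirroring mk_pid() */
static int make_pid(unsigned long page, unsigned long off)
{
	return page * BITS_PER_PAGE + off;
}

int main(void)
{
	unsigned long page, off;

	split_pid(40000, &page, &off);
	assert(make_pid(page, off) == 40000);
	printf("pid 40000 -> page %lu, bit %lu\n", page, off);	/* page 1, bit 7232 */
	return 0;
}
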
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06f..5fe87de10ff0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
70/* 70/*
71 * Lets keep our timers in a slab cache :-) 71 * Lets keep our timers in a slab cache :-)
72 */ 72 */
73static kmem_cache_t *posix_timers_cache; 73static struct kmem_cache *posix_timers_cache;
74static struct idr posix_timers_id; 74static struct idr posix_timers_id;
75static DEFINE_SPINLOCK(idr_lock); 75static DEFINE_SPINLOCK(idr_lock);
76 76
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca3479..ed296225dcd4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -20,13 +20,14 @@ config PM
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY 22config PM_LEGACY
23 bool "Legacy Power Management API" 23 bool "Legacy Power Management API (DEPRECATED)"
24 depends on PM 24 depends on PM
25 default y 25 default n
26 ---help--- 26 ---help---
27 Support for pm_register() and friends. 27 Support for pm_register() and friends. This old API is obsoleted
28 by the driver model.
28 29
29 If unsure, say Y. 30 If unsure, say N.
30 31
31config PM_DEBUG 32config PM_DEBUG
32 bool "Power Management Debug Support" 33 bool "Power Management Debug Support"
@@ -78,7 +79,7 @@ config PM_SYSFS_DEPRECATED
78 79
79config SOFTWARE_SUSPEND 80config SOFTWARE_SUSPEND
80 bool "Software Suspend" 81 bool "Software Suspend"
81 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) 82 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
82 ---help--- 83 ---help---
83 Enable the possibility of suspending the machine. 84 Enable the possibility of suspending the machine.
84 It doesn't need ACPI or APM. 85 It doesn't need ACPI or APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b1fb7866b0b3..88fc5d7ac737 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -27,6 +28,23 @@
27static int noresume = 0; 28static int noresume = 0;
28char resume_file[256] = CONFIG_PM_STD_PARTITION; 29char resume_file[256] = CONFIG_PM_STD_PARTITION;
29dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block;
32
33/**
34 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails
36 */
37
38static inline int platform_prepare(void)
39{
40 int error = 0;
41
42 if (pm_disk_mode == PM_DISK_PLATFORM) {
43 if (pm_ops && pm_ops->prepare)
44 error = pm_ops->prepare(PM_SUSPEND_DISK);
45 }
46 return error;
47}
30 48
31/** 49/**
32 * power_down - Shut machine down for hibernate. 50 * power_down - Shut machine down for hibernate.
@@ -40,13 +58,13 @@ dev_t swsusp_resume_device;
40 58
41static void power_down(suspend_disk_method_t mode) 59static void power_down(suspend_disk_method_t mode)
42{ 60{
43 int error = 0;
44
45 switch(mode) { 61 switch(mode) {
46 case PM_DISK_PLATFORM: 62 case PM_DISK_PLATFORM:
47 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 63 if (pm_ops && pm_ops->enter) {
48 error = pm_ops->enter(PM_SUSPEND_DISK); 64 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
49 break; 65 pm_ops->enter(PM_SUSPEND_DISK);
66 break;
67 }
50 case PM_DISK_SHUTDOWN: 68 case PM_DISK_SHUTDOWN:
51 kernel_power_off(); 69 kernel_power_off();
52 break; 70 break;
@@ -90,12 +108,18 @@ static int prepare_processes(void)
90 goto thaw; 108 goto thaw;
91 } 109 }
92 110
111 error = platform_prepare();
112 if (error)
113 goto thaw;
114
93 /* Free memory before shutting down devices. */ 115 /* Free memory before shutting down devices. */
94 if (!(error = swsusp_shrink_memory())) 116 if (!(error = swsusp_shrink_memory()))
95 return 0; 117 return 0;
96thaw: 118
119 platform_finish();
120 thaw:
97 thaw_processes(); 121 thaw_processes();
98enable_cpus: 122 enable_cpus:
99 enable_nonboot_cpus(); 123 enable_nonboot_cpus();
100 pm_restore_console(); 124 pm_restore_console();
101 return error; 125 return error;
@@ -127,7 +151,7 @@ int pm_suspend_disk(void)
127 return error; 151 return error;
128 152
129 if (pm_disk_mode == PM_DISK_TESTPROC) 153 if (pm_disk_mode == PM_DISK_TESTPROC)
130 goto Thaw; 154 return 0;
131 155
132 suspend_console(); 156 suspend_console();
133 error = device_suspend(PMSG_FREEZE); 157 error = device_suspend(PMSG_FREEZE);
@@ -189,10 +213,10 @@ static int software_resume(void)
189{ 213{
190 int error; 214 int error;
191 215
192 down(&pm_sem); 216 mutex_lock(&pm_mutex);
193 if (!swsusp_resume_device) { 217 if (!swsusp_resume_device) {
194 if (!strlen(resume_file)) { 218 if (!strlen(resume_file)) {
195 up(&pm_sem); 219 mutex_unlock(&pm_mutex);
196 return -ENOENT; 220 return -ENOENT;
197 } 221 }
198 swsusp_resume_device = name_to_dev_t(resume_file); 222 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -207,7 +231,7 @@ static int software_resume(void)
207 * FIXME: If noresume is specified, we need to find the partition 231 * FIXME: If noresume is specified, we need to find the partition
208 * and reset it back to normal swap space. 232 * and reset it back to normal swap space.
209 */ 233 */
210 up(&pm_sem); 234 mutex_unlock(&pm_mutex);
211 return 0; 235 return 0;
212 } 236 }
213 237
@@ -251,7 +275,7 @@ static int software_resume(void)
251 unprepare_processes(); 275 unprepare_processes();
252 Done: 276 Done:
253 /* For success case, the suspend path will release the lock */ 277 /* For success case, the suspend path will release the lock */
254 up(&pm_sem); 278 mutex_unlock(&pm_mutex);
255 pr_debug("PM: Resume from disk failed.\n"); 279 pr_debug("PM: Resume from disk failed.\n");
256 return 0; 280 return 0;
257} 281}
@@ -312,7 +336,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
312 p = memchr(buf, '\n', n); 336 p = memchr(buf, '\n', n);
313 len = p ? p - buf : n; 337 len = p ? p - buf : n;
314 338
315 down(&pm_sem); 339 mutex_lock(&pm_mutex);
316 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { 340 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
317 if (!strncmp(buf, pm_disk_modes[i], len)) { 341 if (!strncmp(buf, pm_disk_modes[i], len)) {
318 mode = i; 342 mode = i;
@@ -336,7 +360,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
336 360
337 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 361 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
338 pm_disk_modes[mode]); 362 pm_disk_modes[mode]);
339 up(&pm_sem); 363 mutex_unlock(&pm_mutex);
340 return error ? error : n; 364 return error ? error : n;
341} 365}
342 366
@@ -361,14 +385,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
361 if (maj != MAJOR(res) || min != MINOR(res)) 385 if (maj != MAJOR(res) || min != MINOR(res))
362 goto out; 386 goto out;
363 387
364 down(&pm_sem); 388 mutex_lock(&pm_mutex);
365 swsusp_resume_device = res; 389 swsusp_resume_device = res;
366 up(&pm_sem); 390 mutex_unlock(&pm_mutex);
367 printk("Attempting manual resume\n"); 391 printk("Attempting manual resume\n");
368 noresume = 0; 392 noresume = 0;
369 software_resume(); 393 software_resume();
370 ret = n; 394 ret = n;
371out: 395 out:
372 return ret; 396 return ret;
373} 397}
374 398
@@ -423,6 +447,19 @@ static int __init resume_setup(char *str)
423 return 1; 447 return 1;
424} 448}
425 449
450static int __init resume_offset_setup(char *str)
451{
452 unsigned long long offset;
453
454 if (noresume)
455 return 1;
456
457 if (sscanf(str, "%llu", &offset) == 1)
458 swsusp_resume_block = offset;
459
460 return 1;
461}
462
426static int __init noresume_setup(char *str) 463static int __init noresume_setup(char *str)
427{ 464{
428 noresume = 1; 465 noresume = 1;
@@ -430,4 +467,5 @@ static int __init noresume_setup(char *str)
430} 467}
431 468
432__setup("noresume", noresume_setup); 469__setup("noresume", noresume_setup);
470__setup("resume_offset=", resume_offset_setup);
433__setup("resume=", resume_setup); 471__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 873228c71dab..ff3a6182f5f0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
11#include <linux/suspend.h> 12#include <linux/suspend.h>
12#include <linux/kobject.h> 13#include <linux/kobject.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -18,16 +19,17 @@
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/resume-trace.h> 21#include <linux/resume-trace.h>
22#include <linux/freezer.h>
21 23
22#include "power.h" 24#include "power.h"
23 25
24/*This is just an arbitrary number */ 26/*This is just an arbitrary number */
25#define FREE_PAGE_NUMBER (100) 27#define FREE_PAGE_NUMBER (100)
26 28
27DECLARE_MUTEX(pm_sem); 29DEFINE_MUTEX(pm_mutex);
28 30
29struct pm_ops *pm_ops; 31struct pm_ops *pm_ops;
30suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; 32suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
31 33
32/** 34/**
33 * pm_set_ops - Set the global power method table. 35 * pm_set_ops - Set the global power method table.
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
36 38
37void pm_set_ops(struct pm_ops * ops) 39void pm_set_ops(struct pm_ops * ops)
38{ 40{
39 down(&pm_sem); 41 mutex_lock(&pm_mutex);
40 pm_ops = ops; 42 pm_ops = ops;
41 up(&pm_sem); 43 mutex_unlock(&pm_mutex);
42} 44}
43 45
44 46
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state)
182 184
183 if (!valid_state(state)) 185 if (!valid_state(state))
184 return -ENODEV; 186 return -ENODEV;
185 if (down_trylock(&pm_sem)) 187 if (!mutex_trylock(&pm_mutex))
186 return -EBUSY; 188 return -EBUSY;
187 189
188 if (state == PM_SUSPEND_DISK) { 190 if (state == PM_SUSPEND_DISK) {
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state)
200 pr_debug("PM: Finishing wakeup.\n"); 202 pr_debug("PM: Finishing wakeup.\n");
201 suspend_finish(state); 203 suspend_finish(state);
202 Unlock: 204 Unlock:
203 up(&pm_sem); 205 mutex_unlock(&pm_mutex);
204 return error; 206 return error;
205} 207}
206 208
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state)
229 return -EINVAL; 231 return -EINVAL;
230} 232}
231 233
232 234EXPORT_SYMBOL(pm_suspend);
233 235
234decl_subsys(power,NULL,NULL); 236decl_subsys(power,NULL,NULL);
235 237
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b272..eb461b816bf4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
22 return -EPERM; 22 return -EPERM;
23} 23}
24#endif 24#endif
25extern struct semaphore pm_sem; 25
26extern struct mutex pm_mutex;
27
26#define power_attr(_name) \ 28#define power_attr(_name) \
27static struct subsys_attribute _name##_attr = { \ 29static struct subsys_attribute _name##_attr = { \
28 .attr = { \ 30 .attr = { \
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end;
42extern unsigned long image_size; 44extern unsigned long image_size;
43extern int in_suspend; 45extern int in_suspend;
44extern dev_t swsusp_resume_device; 46extern dev_t swsusp_resume_device;
47extern sector_t swsusp_resume_block;
45 48
46extern asmlinkage int swsusp_arch_suspend(void); 49extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 50extern asmlinkage int swsusp_arch_resume(void);
@@ -102,8 +105,18 @@ struct snapshot_handle {
102extern unsigned int snapshot_additional_pages(struct zone *zone); 105extern unsigned int snapshot_additional_pages(struct zone *zone);
103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 106extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 107extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
108extern void snapshot_write_finalize(struct snapshot_handle *handle);
105extern int snapshot_image_loaded(struct snapshot_handle *handle); 109extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle); 110
111/*
112 * This structure is used to pass the values needed for the identification
113 * of the resume swap area from a user space to the kernel via the
114 * SNAPSHOT_SET_SWAP_AREA ioctl
115 */
116struct resume_swap_area {
117 loff_t offset;
118 u_int32_t dev;
119} __attribute__((packed));
107 120
108#define SNAPSHOT_IOC_MAGIC '3' 121#define SNAPSHOT_IOC_MAGIC '3'
109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 122#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
117#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) 130#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
118#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) 131#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
119#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) 132#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
120#define SNAPSHOT_IOC_MAXNR 11 133#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
134#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
135 struct resume_swap_area)
136#define SNAPSHOT_IOC_MAXNR 13
137
138#define PMOPS_PREPARE 1
139#define PMOPS_ENTER 2
140#define PMOPS_FINISH 3
121 141
122/** 142/**
123 * The bitmap is used for tracing allocated swap pages 143 * The bitmap is used for tracing allocated swap pages
@@ -141,7 +161,7 @@ struct bitmap_page {
141 161
142extern void free_bitmap(struct bitmap_page *bitmap); 162extern void free_bitmap(struct bitmap_page *bitmap);
143extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); 163extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
144extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); 164extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
145extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); 165extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
146 166
147extern int swsusp_check(void); 167extern int swsusp_check(void);
@@ -153,3 +173,7 @@ extern int swsusp_read(void);
153extern int swsusp_write(void); 173extern int swsusp_write(void);
154extern void swsusp_close(void); 174extern void swsusp_close(void);
155extern int suspend_enter(suspend_state_t state); 175extern int suspend_enter(suspend_state_t state);
176
177struct timeval;
178extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 unsigned int, char *);
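
The new SNAPSHOT_SET_SWAP_AREA ioctl lets a userland hibernation tool pass the resume device and the swap offset to the kernel in a single call, instead of the older SNAPSHOT_SET_SWAP_FILE. A hedged user-space sketch: the struct and ioctl number mirror the definitions above, but the device numbers, the offset value and the /dev/snapshot path are placeholders, and the definitions are copied locally because power.h is not an exported header.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/sysmacros.h>	/* makedev(); assumes a glibc userland */
#include <sys/types.h>
#include <unistd.h>
#include <linux/ioctl.h>	/* _IOW() */

/* Local copy of the definitions added to kernel/power/power.h above. */
struct resume_swap_area {
	loff_t   offset;
	uint32_t dev;
} __attribute__((packed));

#define SNAPSHOT_IOC_MAGIC	'3'
#define SNAPSHOT_SET_SWAP_AREA	_IOW(SNAPSHOT_IOC_MAGIC, 13, struct resume_swap_area)

int main(void)
{
	struct resume_swap_area swap = {
		.offset = 0,			/* 0 for a plain swap partition; placeholder */
		.dev    = makedev(8, 2),	/* e.g. /dev/sda2; placeholder */
	};
	int fd = open("/dev/snapshot", O_RDONLY);	/* typical snapshot device node */

	if (fd < 0 || ioctl(fd, SNAPSHOT_SET_SWAP_AREA, &swap) < 0) {
		perror("SNAPSHOT_SET_SWAP_AREA");
		return 1;
	}
	close(fd);
	return 0;
}
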
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac3164..678ec736076b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
16 * callback we use. 16 * callback we use.
17 */ 17 */
18 18
19static void do_poweroff(void *dummy) 19static void do_poweroff(struct work_struct *dummy)
20{ 20{
21 kernel_power_off(); 21 kernel_power_off();
22} 22}
23 23
24static DECLARE_WORK(poweroff_work, do_poweroff, NULL); 24static DECLARE_WORK(poweroff_work, do_poweroff);
25 25
26static void handle_poweroff(int key, struct tty_struct *tty) 26static void handle_poweroff(int key, struct tty_struct *tty)
27{ 27{
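
The poweroff.c hunk reflects the 2.6.20 workqueue API change: a work callback now receives the struct work_struct pointer itself instead of an opaque data pointer, and DECLARE_WORK() loses its third argument. A hedged sketch of the general pattern (the frob_* names are invented) where the callback recovers per-object context via container_of():

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct frob_device {
	int			id;
	struct work_struct	work;
};

/* New-style callback: context comes from the embedded work_struct. */
static void frob_work_fn(struct work_struct *work)
{
	struct frob_device *dev = container_of(work, struct frob_device, work);

	printk(KERN_DEBUG "frobbing device %d\n", dev->id);
}

static void frob_later(struct frob_device *dev)
{
	INIT_WORK(&dev->work, frob_work_fn);	/* no 'data' argument any more */
	schedule_work(&dev->work);
}
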
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e6..6d566bf7085c 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,20 +13,22 @@
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h>
16 17
17/* 18/*
18 * Timeout for stopping processes 19 * Timeout for stopping processes
19 */ 20 */
20#define TIMEOUT (20 * HZ) 21#define TIMEOUT (20 * HZ)
21 22
23#define FREEZER_KERNEL_THREADS 0
24#define FREEZER_USER_SPACE 1
22 25
23static inline int freezeable(struct task_struct * p) 26static inline int freezeable(struct task_struct * p)
24{ 27{
25 if ((p == current) || 28 if ((p == current) ||
26 (p->flags & PF_NOFREEZE) || 29 (p->flags & PF_NOFREEZE) ||
27 (p->exit_state == EXIT_ZOMBIE) || 30 (p->exit_state == EXIT_ZOMBIE) ||
28 (p->exit_state == EXIT_DEAD) || 31 (p->exit_state == EXIT_DEAD))
29 (p->state == TASK_STOPPED))
30 return 0; 32 return 0;
31 return 1; 33 return 1;
32} 34}
@@ -39,7 +41,6 @@ void refrigerator(void)
39 long save; 41 long save;
40 save = current->state; 42 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 43 pr_debug("%s entered refrigerator\n", current->comm);
42 printk("=");
43 44
44 frozen_process(current); 45 frozen_process(current);
45 spin_lock_irq(&current->sighand->siglock); 46 spin_lock_irq(&current->sighand->siglock);
@@ -59,10 +60,16 @@ static inline void freeze_process(struct task_struct *p)
59 unsigned long flags; 60 unsigned long flags;
60 61
61 if (!freezing(p)) { 62 if (!freezing(p)) {
62 freeze(p); 63 rmb();
63 spin_lock_irqsave(&p->sighand->siglock, flags); 64 if (!frozen(p)) {
64 signal_wake_up(p, 0); 65 if (p->state == TASK_STOPPED)
65 spin_unlock_irqrestore(&p->sighand->siglock, flags); 66 force_sig_specific(SIGSTOP, p);
67
68 freeze(p);
69 spin_lock_irqsave(&p->sighand->siglock, flags);
70 signal_wake_up(p, p->state == TASK_STOPPED);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags);
72 }
66 } 73 }
67} 74}
68 75
@@ -79,96 +86,134 @@ static void cancel_freezing(struct task_struct *p)
79 } 86 }
80} 87}
81 88
82/* 0 = success, else # of processes that we failed to stop */ 89static inline int is_user_space(struct task_struct *p)
83int freeze_processes(void) 90{
91 return p->mm && !(p->flags & PF_BORROWED_MM);
92}
93
94static unsigned int try_to_freeze_tasks(int freeze_user_space)
84{ 95{
85 int todo, nr_user, user_frozen;
86 unsigned long start_time;
87 struct task_struct *g, *p; 96 struct task_struct *g, *p;
97 unsigned long end_time;
98 unsigned int todo;
88 99
89 printk( "Stopping tasks: " ); 100 end_time = jiffies + TIMEOUT;
90 start_time = jiffies;
91 user_frozen = 0;
92 do { 101 do {
93 nr_user = todo = 0; 102 todo = 0;
94 read_lock(&tasklist_lock); 103 read_lock(&tasklist_lock);
95 do_each_thread(g, p) { 104 do_each_thread(g, p) {
96 if (!freezeable(p)) 105 if (!freezeable(p))
97 continue; 106 continue;
107
98 if (frozen(p)) 108 if (frozen(p))
99 continue; 109 continue;
110
100 if (p->state == TASK_TRACED && frozen(p->parent)) { 111 if (p->state == TASK_TRACED && frozen(p->parent)) {
101 cancel_freezing(p); 112 cancel_freezing(p);
102 continue; 113 continue;
103 } 114 }
104 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 115 if (is_user_space(p)) {
105 /* The task is a user-space one. 116 if (!freeze_user_space)
106 * Freeze it unless there's a vfork completion 117 continue;
107 * pending 118
119 /* Freeze the task unless there is a vfork
120 * completion pending
108 */ 121 */
109 if (!p->vfork_done) 122 if (!p->vfork_done)
110 freeze_process(p); 123 freeze_process(p);
111 nr_user++;
112 } else { 124 } else {
113 /* Freeze only if the user space is frozen */ 125 if (freeze_user_space)
114 if (user_frozen) 126 continue;
115 freeze_process(p); 127
116 todo++; 128 freeze_process(p);
117 } 129 }
130 todo++;
118 } while_each_thread(g, p); 131 } while_each_thread(g, p);
119 read_unlock(&tasklist_lock); 132 read_unlock(&tasklist_lock);
120 todo += nr_user;
121 if (!user_frozen && !nr_user) {
122 sys_sync();
123 start_time = jiffies;
124 }
125 user_frozen = !nr_user;
126 yield(); /* Yield is okay here */ 133 yield(); /* Yield is okay here */
127 if (todo && time_after(jiffies, start_time + TIMEOUT)) 134 if (todo && time_after(jiffies, end_time))
128 break; 135 break;
129 } while(todo); 136 } while (todo);
130 137
131 /* This does not unfreeze processes that are already frozen
132 * (we have slightly ugly calling convention in that respect,
133 * and caller must call thaw_processes() if something fails),
134 * but it cleans up leftover PF_FREEZE requests.
135 */
136 if (todo) { 138 if (todo) {
137 printk( "\n" ); 139 /* This does not unfreeze processes that are already frozen
138 printk(KERN_ERR " stopping tasks timed out " 140 * (we have slightly ugly calling convention in that respect,
139 "after %d seconds (%d tasks remaining):\n", 141 * and caller must call thaw_processes() if something fails),
140 TIMEOUT / HZ, todo); 142 * but it cleans up leftover PF_FREEZE requests.
143 */
144 printk("\n");
145 printk(KERN_ERR "Stopping %s timed out after %d seconds "
146 "(%d tasks refusing to freeze):\n",
147 freeze_user_space ? "user space processes" :
148 "kernel threads",
149 TIMEOUT / HZ, todo);
141 read_lock(&tasklist_lock); 150 read_lock(&tasklist_lock);
142 do_each_thread(g, p) { 151 do_each_thread(g, p) {
152 if (is_user_space(p) == !freeze_user_space)
153 continue;
154
143 if (freezeable(p) && !frozen(p)) 155 if (freezeable(p) && !frozen(p))
144 printk(KERN_ERR " %s\n", p->comm); 156 printk(KERN_ERR " %s\n", p->comm);
157
145 cancel_freezing(p); 158 cancel_freezing(p);
146 } while_each_thread(g, p); 159 } while_each_thread(g, p);
147 read_unlock(&tasklist_lock); 160 read_unlock(&tasklist_lock);
148 return todo;
149 } 161 }
150 162
151 printk( "|\n" ); 163 return todo;
164}
165
166/**
167 * freeze_processes - tell processes to enter the refrigerator
168 *
169 * Returns 0 on success, or the number of processes that didn't freeze,
170 * although they were told to.
171 */
172int freeze_processes(void)
173{
174 unsigned int nr_unfrozen;
175
176 printk("Stopping tasks ... ");
177 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
178 if (nr_unfrozen)
179 return nr_unfrozen;
180
181 sys_sync();
182 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
183 if (nr_unfrozen)
184 return nr_unfrozen;
185
186 printk("done.\n");
152 BUG_ON(in_atomic()); 187 BUG_ON(in_atomic());
153 return 0; 188 return 0;
154} 189}
155 190
156void thaw_processes(void) 191static void thaw_tasks(int thaw_user_space)
157{ 192{
158 struct task_struct *g, *p; 193 struct task_struct *g, *p;
159 194
160 printk( "Restarting tasks..." );
161 read_lock(&tasklist_lock); 195 read_lock(&tasklist_lock);
162 do_each_thread(g, p) { 196 do_each_thread(g, p) {
163 if (!freezeable(p)) 197 if (!freezeable(p))
164 continue; 198 continue;
199
200 if (is_user_space(p) == !thaw_user_space)
201 continue;
202
165 if (!thaw_process(p)) 203 if (!thaw_process(p))
166 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 204 printk(KERN_WARNING " Strange, %s not stopped\n",
205 p->comm );
167 } while_each_thread(g, p); 206 } while_each_thread(g, p);
168
169 read_unlock(&tasklist_lock); 207 read_unlock(&tasklist_lock);
208}
209
210void thaw_processes(void)
211{
212 printk("Restarting tasks ... ");
213 thaw_tasks(FREEZER_KERNEL_THREADS);
214 thaw_tasks(FREEZER_USER_SPACE);
170 schedule(); 215 schedule();
171 printk( " done\n" ); 216 printk("done.\n");
172} 217}
173 218
174EXPORT_SYMBOL(refrigerator); 219EXPORT_SYMBOL(refrigerator);
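
With the rework above, freeze_processes() runs in two passes: user space first (FREEZER_USER_SPACE), then sys_sync(), then kernel threads (FREEZER_KERNEL_THREADS). Freezable kernel threads still cooperate through the usual try_to_freeze() pattern; a minimal sketch follows, with the thread body and its name invented for the example.

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Illustrative kernel thread that cooperates with the freezer. */
static int frob_thread(void *unused)
{
	while (!kthread_should_stop()) {
		try_to_freeze();	/* enters refrigerator() when freezing() is set */

		/* ... do one unit of work, then sleep for a second ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
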
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d6..c024606221c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
1/* 1/*
2 * linux/kernel/power/snapshot.c 2 * linux/kernel/power/snapshot.c
3 * 3 *
4 * This file provide system snapshot/restore functionality. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 * 8 *
8 * This file is released under the GPLv2, and is based on swsusp.c. 9 * This file is released under the GPLv2.
9 * 10 *
10 */ 11 */
11 12
12
13#include <linux/version.h> 13#include <linux/version.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
@@ -34,137 +34,24 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37/* List of PBEs used for creating and restoring the suspend image */ 37/* List of PBEs needed for restoring the pages that were allocated before
38 * the suspend and included in the suspend image, but have also been
39 * allocated by the "resume" kernel, so their contents cannot be written
40 * directly to their "original" page frames.
41 */
38struct pbe *restore_pblist; 42struct pbe *restore_pblist;
39 43
40static unsigned int nr_copy_pages; 44/* Pointer to an auxiliary buffer (1 page) */
41static unsigned int nr_meta_pages;
42static void *buffer; 45static void *buffer;
43 46
44#ifdef CONFIG_HIGHMEM
45unsigned int count_highmem_pages(void)
46{
47 struct zone *zone;
48 unsigned long zone_pfn;
49 unsigned int n = 0;
50
51 for_each_zone (zone)
52 if (is_highmem(zone)) {
53 mark_free_pages(zone);
54 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
55 struct page *page;
56 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 if (PageReserved(page))
61 continue;
62 if (PageNosaveFree(page))
63 continue;
64 n++;
65 }
66 }
67 return n;
68}
69
70struct highmem_page {
71 char *data;
72 struct page *page;
73 struct highmem_page *next;
74};
75
76static struct highmem_page *highmem_copy;
77
78static int save_highmem_zone(struct zone *zone)
79{
80 unsigned long zone_pfn;
81 mark_free_pages(zone);
82 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
83 struct page *page;
84 struct highmem_page *save;
85 void *kaddr;
86 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
87
88 if (!(pfn%10000))
89 printk(".");
90 if (!pfn_valid(pfn))
91 continue;
92 page = pfn_to_page(pfn);
93 /*
94 * This condition results from rvmalloc() sans vmalloc_32()
95 * and architectural memory reservations. This should be
96 * corrected eventually when the cases giving rise to this
97 * are better understood.
98 */
99 if (PageReserved(page))
100 continue;
101 BUG_ON(PageNosave(page));
102 if (PageNosaveFree(page))
103 continue;
104 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
105 if (!save)
106 return -ENOMEM;
107 save->next = highmem_copy;
108 save->page = page;
109 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
110 if (!save->data) {
111 kfree(save);
112 return -ENOMEM;
113 }
114 kaddr = kmap_atomic(page, KM_USER0);
115 memcpy(save->data, kaddr, PAGE_SIZE);
116 kunmap_atomic(kaddr, KM_USER0);
117 highmem_copy = save;
118 }
119 return 0;
120}
121
122int save_highmem(void)
123{
124 struct zone *zone;
125 int res = 0;
126
127 pr_debug("swsusp: Saving Highmem");
128 drain_local_pages();
129 for_each_zone (zone) {
130 if (is_highmem(zone))
131 res = save_highmem_zone(zone);
132 if (res)
133 return res;
134 }
135 printk("\n");
136 return 0;
137}
138
139int restore_highmem(void)
140{
141 printk("swsusp: Restoring Highmem\n");
142 while (highmem_copy) {
143 struct highmem_page *save = highmem_copy;
144 void *kaddr;
145 highmem_copy = save->next;
146
147 kaddr = kmap_atomic(save->page, KM_USER0);
148 memcpy(kaddr, save->data, PAGE_SIZE);
149 kunmap_atomic(kaddr, KM_USER0);
150 free_page((long) save->data);
151 kfree(save);
152 }
153 return 0;
154}
155#else
156static inline unsigned int count_highmem_pages(void) {return 0;}
157static inline int save_highmem(void) {return 0;}
158static inline int restore_highmem(void) {return 0;}
159#endif
160
161/** 47/**
162 * @safe_needed - on resume, for storing the PBE list and the image, 48 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages 49 * we can only use memory pages that do not conflict with the pages
164 * used before suspend. 50 * used before suspend. The unsafe pages have PageNosaveFree set
51 * and we count them using unsafe_pages.
165 * 52 *
166 * The unsafe pages are marked with the PG_nosave_free flag 53 * Each allocated image page is marked as PageNosave and PageNosaveFree
167 * and we count them using unsafe_pages 54 * so that swsusp_free() can release it.
168 */ 55 */
169 56
170#define PG_ANY 0 57#define PG_ANY 0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
174 61
175static unsigned int allocated_unsafe_pages; 62static unsigned int allocated_unsafe_pages;
176 63
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 64static void *get_image_page(gfp_t gfp_mask, int safe_needed)
178{ 65{
179 void *res; 66 void *res;
180 67
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
195 82
196unsigned long get_safe_page(gfp_t gfp_mask) 83unsigned long get_safe_page(gfp_t gfp_mask)
197{ 84{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); 85 return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
86}
87
88static struct page *alloc_image_page(gfp_t gfp_mask)
89{
90 struct page *page;
91
92 page = alloc_page(gfp_mask);
93 if (page) {
94 SetPageNosave(page);
95 SetPageNosaveFree(page);
96 }
97 return page;
199} 98}
200 99
201/** 100/**
202 * free_image_page - free page represented by @addr, allocated with 101 * free_image_page - free page represented by @addr, allocated with
203 * alloc_image_page (page flags set by it must be cleared) 102 * get_image_page (page flags set by it must be cleared)
204 */ 103 */
205 104
206static inline void free_image_page(void *addr, int clear_nosave_free) 105static inline void free_image_page(void *addr, int clear_nosave_free)
207{ 106{
208 ClearPageNosave(virt_to_page(addr)); 107 struct page *page;
108
109 BUG_ON(!virt_addr_valid(addr));
110
111 page = virt_to_page(addr);
112
113 ClearPageNosave(page);
209 if (clear_nosave_free) 114 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr)); 115 ClearPageNosaveFree(page);
211 free_page((unsigned long)addr); 116
117 __free_page(page);
212} 118}
213 119
214/* struct linked_page is used to build chains of pages */ 120/* struct linked_page is used to build chains of pages */
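Taken together, get_image_page() and free_image_page() are meant to be used in pairs; a minimal sketch based on the calls made elsewhere in this patch (snapshot_read_next(), prepare_image()):

	void *buf = get_image_page(GFP_ATOMIC, PG_ANY);	/* PG_SAFE on resume */
	if (!buf)
		return -ENOMEM;
	/* buf is marked Nosave + NosaveFree, so swsusp_free() can also find it */
	free_image_page(buf, PG_UNSAFE_CLEAR);	/* clears both flags, frees the page */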
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 175 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
270 struct linked_page *lp; 176 struct linked_page *lp;
271 177
272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); 178 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
273 if (!lp) 179 if (!lp)
274 return NULL; 180 return NULL;
275 181
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
446 352
447 /* Compute the number of zones */ 353 /* Compute the number of zones */
448 nr = 0; 354 nr = 0;
449 for_each_zone (zone) 355 for_each_zone(zone)
450 if (populated_zone(zone) && !is_highmem(zone)) 356 if (populated_zone(zone))
451 nr++; 357 nr++;
452 358
453 /* Allocate the list of zones bitmap objects */ 359 /* Allocate the list of zones bitmap objects */
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
459 } 365 }
460 366
461 /* Initialize the zone bitmap objects */ 367 /* Initialize the zone bitmap objects */
462 for_each_zone (zone) { 368 for_each_zone(zone) {
463 unsigned long pfn; 369 unsigned long pfn;
464 370
465 if (!populated_zone(zone) || is_highmem(zone)) 371 if (!populated_zone(zone))
466 continue; 372 continue;
467 373
468 zone_bm->start_pfn = zone->zone_start_pfn; 374 zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
481 while (bb) { 387 while (bb) {
482 unsigned long *ptr; 388 unsigned long *ptr;
483 389
484 ptr = alloc_image_page(gfp_mask, safe_needed); 390 ptr = get_image_page(gfp_mask, safe_needed);
485 bb->data = ptr; 391 bb->data = ptr;
486 if (!ptr) 392 if (!ptr)
487 goto Free; 393 goto Free;
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
505 memory_bm_position_reset(bm); 411 memory_bm_position_reset(bm);
506 return 0; 412 return 0;
507 413
508Free: 414 Free:
509 bm->p_list = ca.chain; 415 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR); 416 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM; 417 return -ENOMEM;
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
651 memory_bm_position_reset(bm); 557 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP; 558 return BM_END_OF_MAP;
653 559
654Return_pfn: 560 Return_pfn:
655 bm->cur.chunk = chunk; 561 bm->cur.chunk = chunk;
656 bm->cur.bit = bit; 562 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; 563 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
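The bitmap is read back through an internal cursor; the idiom used throughout the rest of this patch (for instance in count_highmem_image_pages() below) is:

	unsigned long pfn;

	memory_bm_position_reset(bm);
	for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm)) {
		/* operate on pfn_to_page(pfn) */
	}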
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone)
669 575
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 576 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 577 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res; 578 return 2 * res;
579}
580
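The doubling is worth spelling out: the estimate computed above covers one bitmap for the zone, and the suspend path now keeps two bitmaps of identical shape alive at the same time, orig_bm for the saveable pages and copy_bm for the pages holding their copies (both defined further down in this file), so the worst-case overhead per zone is roughly

	pages_needed ~ 2 * (bitmap_block_pages + block_descriptor_pages)

This reading is inferred from the memory_bm_create()/swsusp_alloc() changes in this patch rather than stated in the changelog.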
581#ifdef CONFIG_HIGHMEM
582/**
583 * count_free_highmem_pages - compute the total number of free highmem
584 * pages, system-wide.
585 */
586
587static unsigned int count_free_highmem_pages(void)
588{
589 struct zone *zone;
590 unsigned int cnt = 0;
591
592 for_each_zone(zone)
593 if (populated_zone(zone) && is_highmem(zone))
594 cnt += zone->free_pages;
595
596 return cnt;
597}
598
599/**
600 * saveable_highmem_page - Determine whether a highmem page should be
601 * included in the suspend image.
602 *
603 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
604 * and it isn't a part of a free chunk of pages.
605 */
606
607static struct page *saveable_highmem_page(unsigned long pfn)
608{
609 struct page *page;
610
611 if (!pfn_valid(pfn))
612 return NULL;
613
614 page = pfn_to_page(pfn);
615
616 BUG_ON(!PageHighMem(page));
617
618 if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
619 return NULL;
620
621 return page;
673} 622}
674 623
675/** 624/**
625 * count_highmem_pages - compute the total number of saveable highmem
626 * pages.
627 */
628
629unsigned int count_highmem_pages(void)
630{
631 struct zone *zone;
632 unsigned int n = 0;
633
634 for_each_zone(zone) {
635 unsigned long pfn, max_zone_pfn;
636
637 if (!is_highmem(zone))
638 continue;
639
640 mark_free_pages(zone);
641 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
642 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
643 if (saveable_highmem_page(pfn))
644 n++;
645 }
646 return n;
647}
648#else
649static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
650static inline unsigned int count_highmem_pages(void) { return 0; }
651#endif /* CONFIG_HIGHMEM */
652
653/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section 654 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */ 655 */
678 656
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
684} 662}
685 663
686/** 664/**
687 * saveable - Determine whether a page should be cloned or not. 665 * saveable - Determine whether a non-highmem page should be included in
688 * @pfn: The page 666 * the suspend image.
689 * 667 *
690 * We save a page if it isn't Nosave, and is not in the range of pages 668 * We should save the page if it isn't Nosave, and is not in the range
691 * statically defined as 'unsaveable', and it 669 * of pages statically defined as 'unsaveable', and it isn't a part of
692 * isn't a part of a free chunk of pages. 670 * a free chunk of pages.
693 */ 671 */
694 672
695static struct page *saveable_page(unsigned long pfn) 673static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn)
701 679
702 page = pfn_to_page(pfn); 680 page = pfn_to_page(pfn);
703 681
704 if (PageNosave(page)) 682 BUG_ON(PageHighMem(page));
683
684 if (PageNosave(page) || PageNosaveFree(page))
705 return NULL; 685 return NULL;
686
706 if (PageReserved(page) && pfn_is_nosave(pfn)) 687 if (PageReserved(page) && pfn_is_nosave(pfn))
707 return NULL; 688 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710 689
711 return page; 690 return page;
712} 691}
713 692
693/**
694 * count_data_pages - compute the total number of saveable non-highmem
695 * pages.
696 */
697
714unsigned int count_data_pages(void) 698unsigned int count_data_pages(void)
715{ 699{
716 struct zone *zone; 700 struct zone *zone;
717 unsigned long pfn, max_zone_pfn; 701 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0; 702 unsigned int n = 0;
719 703
720 for_each_zone (zone) { 704 for_each_zone(zone) {
721 if (is_highmem(zone)) 705 if (is_highmem(zone))
722 continue; 706 continue;
707
723 mark_free_pages(zone); 708 mark_free_pages(zone);
724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 709 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 710 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 726 n += !!saveable_page(pfn); 711 if (saveable_page(pfn))
712 n++;
727 } 713 }
728 return n; 714 return n;
729} 715}
730 716
731static inline void copy_data_page(long *dst, long *src) 717/* This is needed, because copy_page and memcpy are not usable for copying
718 * task structs.
719 */
720static inline void do_copy_page(long *dst, long *src)
732{ 721{
733 int n; 722 int n;
734 723
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--) 724 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++; 725 *dst++ = *src++;
738} 726}
739 727
728#ifdef CONFIG_HIGHMEM
729static inline struct page *
730page_is_saveable(struct zone *zone, unsigned long pfn)
731{
732 return is_highmem(zone) ?
733 saveable_highmem_page(pfn) : saveable_page(pfn);
734}
735
736static inline void
737copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
738{
739 struct page *s_page, *d_page;
740 void *src, *dst;
741
742 s_page = pfn_to_page(src_pfn);
743 d_page = pfn_to_page(dst_pfn);
744 if (PageHighMem(s_page)) {
745 src = kmap_atomic(s_page, KM_USER0);
746 dst = kmap_atomic(d_page, KM_USER1);
747 do_copy_page(dst, src);
748 kunmap_atomic(src, KM_USER0);
749 kunmap_atomic(dst, KM_USER1);
750 } else {
751 src = page_address(s_page);
752 if (PageHighMem(d_page)) {
753 /* Page pointed to by src may contain some kernel
754 * data modified by kmap_atomic()
755 */
756 do_copy_page(buffer, src);
757 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
758 memcpy(dst, buffer, PAGE_SIZE);
759 kunmap_atomic(dst, KM_USER0);
760 } else {
761 dst = page_address(d_page);
762 do_copy_page(dst, src);
763 }
764 }
765}
766#else
767#define page_is_saveable(zone, pfn) saveable_page(pfn)
768
769static inline void
770copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
771{
772 do_copy_page(page_address(pfn_to_page(dst_pfn)),
773 page_address(pfn_to_page(src_pfn)));
774}
775#endif /* CONFIG_HIGHMEM */
776
740static void 777static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 778copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{ 779{
743 struct zone *zone; 780 struct zone *zone;
744 unsigned long pfn; 781 unsigned long pfn;
745 782
746 for_each_zone (zone) { 783 for_each_zone(zone) {
747 unsigned long max_zone_pfn; 784 unsigned long max_zone_pfn;
748 785
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone); 786 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn)) 789 if (page_is_saveable(zone, pfn))
756 memory_bm_set_bit(orig_bm, pfn); 790 memory_bm_set_bit(orig_bm, pfn);
757 } 791 }
758 memory_bm_position_reset(orig_bm); 792 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm); 793 memory_bm_position_reset(copy_bm);
760 do { 794 do {
761 pfn = memory_bm_next_pfn(orig_bm); 795 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) { 796 if (likely(pfn != BM_END_OF_MAP))
763 struct page *page; 797 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP); 798 } while (pfn != BM_END_OF_MAP);
772} 799}
773 800
801/* Total number of image pages */
802static unsigned int nr_copy_pages;
803/* Number of pages needed for saving the original pfns of the image pages */
804static unsigned int nr_meta_pages;
805
774/** 806/**
775 * swsusp_free - free pages allocated for the suspend. 807 * swsusp_free - free pages allocated for the suspend.
776 * 808 *
@@ -792,7 +824,7 @@ void swsusp_free(void)
792 if (PageNosave(page) && PageNosaveFree(page)) { 824 if (PageNosave(page) && PageNosaveFree(page)) {
793 ClearPageNosave(page); 825 ClearPageNosave(page);
794 ClearPageNosaveFree(page); 826 ClearPageNosaveFree(page);
795 free_page((long) page_address(page)); 827 __free_page(page);
796 } 828 }
797 } 829 }
798 } 830 }
@@ -802,34 +834,108 @@ void swsusp_free(void)
802 buffer = NULL; 834 buffer = NULL;
803} 835}
804 836
837#ifdef CONFIG_HIGHMEM
838/**
839 * count_pages_for_highmem - compute the number of non-highmem pages
840 * that will be necessary for creating copies of highmem pages.
841 */
842
843static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
844{
845 unsigned int free_highmem = count_free_highmem_pages();
846
847 if (free_highmem >= nr_highmem)
848 nr_highmem = 0;
849 else
850 nr_highmem -= free_highmem;
851
852 return nr_highmem;
853}
854#else
855static unsigned int
856count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
857#endif /* CONFIG_HIGHMEM */
805 858
806/** 859/**
807 * enough_free_mem - Make sure we enough free memory to snapshot. 860 * enough_free_mem - Make sure we have enough free memory for the
808 * 861 * snapshot image.
809 * Returns TRUE or FALSE after checking the number of available
810 * free pages.
811 */ 862 */
812 863
813static int enough_free_mem(unsigned int nr_pages) 864static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
814{ 865{
815 struct zone *zone; 866 struct zone *zone;
816 unsigned int free = 0, meta = 0; 867 unsigned int free = 0, meta = 0;
817 868
818 for_each_zone (zone) 869 for_each_zone(zone) {
819 if (!is_highmem(zone)) { 870 meta += snapshot_additional_pages(zone);
871 if (!is_highmem(zone))
820 free += zone->free_pages; 872 free += zone->free_pages;
821 meta += snapshot_additional_pages(zone); 873 }
822 }
823 874
824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", 875 nr_pages += count_pages_for_highmem(nr_highmem);
876 pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
825 nr_pages, PAGES_FOR_IO, meta, free); 877 nr_pages, PAGES_FOR_IO, meta, free);
826 878
827 return free > nr_pages + PAGES_FOR_IO + meta; 879 return free > nr_pages + PAGES_FOR_IO + meta;
828} 880}
829 881
882#ifdef CONFIG_HIGHMEM
883/**
884 * get_highmem_buffer - if there are some highmem pages in the suspend
885 * image, we may need the buffer to copy them and/or load their data.
886 */
887
888static inline int get_highmem_buffer(int safe_needed)
889{
890 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
891 return buffer ? 0 : -ENOMEM;
892}
893
894/**
895 * alloc_highmem_image_pages - allocate some highmem pages for the image.
896 * Try to allocate as many pages as needed, but if the number of free
 897 * highmem pages is less than that, allocate them all.
898 */
899
900static inline unsigned int
901alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
902{
903 unsigned int to_alloc = count_free_highmem_pages();
904
905 if (to_alloc > nr_highmem)
906 to_alloc = nr_highmem;
907
908 nr_highmem -= to_alloc;
909 while (to_alloc-- > 0) {
910 struct page *page;
911
912 page = alloc_image_page(__GFP_HIGHMEM);
913 memory_bm_set_bit(bm, page_to_pfn(page));
914 }
915 return nr_highmem;
916}
917#else
918static inline int get_highmem_buffer(int safe_needed) { return 0; }
919
920static inline unsigned int
921alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
922#endif /* CONFIG_HIGHMEM */
923
924/**
925 * swsusp_alloc - allocate memory for the suspend image
926 *
927 * We first try to allocate as many highmem pages as there are
928 * saveable highmem pages in the system. If that fails, we allocate
929 * non-highmem pages for the copies of the remaining highmem ones.
930 *
931 * In this approach it is likely that the copies of highmem pages will
932 * also be located in the high memory, because of the way in which
933 * copy_data_pages() works.
934 */
935
830static int 936static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 937swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages) 938 unsigned int nr_pages, unsigned int nr_highmem)
833{ 939{
834 int error; 940 int error;
835 941
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
841 if (error) 947 if (error)
842 goto Free; 948 goto Free;
843 949
950 if (nr_highmem > 0) {
951 error = get_highmem_buffer(PG_ANY);
952 if (error)
953 goto Free;
954
955 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
956 }
844 while (nr_pages-- > 0) { 957 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); 958 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
959
846 if (!page) 960 if (!page)
847 goto Free; 961 goto Free;
848 962
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 963 memory_bm_set_bit(copy_bm, page_to_pfn(page));
852 } 964 }
853 return 0; 965 return 0;
854 966
855Free: 967 Free:
856 swsusp_free(); 968 swsusp_free();
857 return -ENOMEM; 969 return -ENOMEM;
858} 970}
859 971
860/* Memory bitmap used for marking saveable pages */ 972/* Memory bitmap used for marking saveable pages (during suspend) or the
973 * suspend image pages (during resume)
974 */
861static struct memory_bitmap orig_bm; 975static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies 976/* Memory bitmap used on suspend for marking allocated pages that will contain
863 * of saveable pages 977 * the copies of saveable pages. During resume it is initially used for
978 * marking the suspend image pages, but then its set bits are duplicated in
979 * @orig_bm and it is released. Next, on systems with high memory, it may be
980 * used for marking "safe" highmem pages, but it has to be reinitialized for
981 * this purpose.
864 */ 982 */
865static struct memory_bitmap copy_bm; 983static struct memory_bitmap copy_bm;
866 984
867asmlinkage int swsusp_save(void) 985asmlinkage int swsusp_save(void)
868{ 986{
869 unsigned int nr_pages; 987 unsigned int nr_pages, nr_highmem;
870 988
871 pr_debug("swsusp: critical section: \n"); 989 printk("swsusp: critical section: \n");
872 990
873 drain_local_pages(); 991 drain_local_pages();
874 nr_pages = count_data_pages(); 992 nr_pages = count_data_pages();
875 printk("swsusp: Need to copy %u pages\n", nr_pages); 993 nr_highmem = count_highmem_pages();
994 printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
876 995
877 if (!enough_free_mem(nr_pages)) { 996 if (!enough_free_mem(nr_pages, nr_highmem)) {
878 printk(KERN_ERR "swsusp: Not enough free memory\n"); 997 printk(KERN_ERR "swsusp: Not enough free memory\n");
879 return -ENOMEM; 998 return -ENOMEM;
880 } 999 }
881 1000
882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages)) 1001 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1002 printk(KERN_ERR "swsusp: Memory allocation failed\n");
883 return -ENOMEM; 1003 return -ENOMEM;
1004 }
884 1005
885 /* During allocating of suspend pagedir, new cold pages may appear. 1006 /* During allocating of suspend pagedir, new cold pages may appear.
886 * Kill them. 1007 * Kill them.
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void)
894 * touch swap space! Except we must write out our image of course. 1015 * touch swap space! Except we must write out our image of course.
895 */ 1016 */
896 1017
1018 nr_pages += nr_highmem;
897 nr_copy_pages = nr_pages; 1019 nr_copy_pages = nr_pages;
898 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1020 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
899 1021
900 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1022 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
1023
901 return 0; 1024 return 0;
902} 1025}
903 1026
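The DIV_ROUND_UP() form computes the same value as the old shift expression; for example, assuming 4 KiB pages and 8-byte longs (an x86-64-style configuration), an image of 40000 pages needs 40000 * 8 = 320000 bytes of pfn records, i.e. DIV_ROUND_UP(320000, 4096) = 79 metadata pages.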
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
960 1083
961 if (!buffer) { 1084 if (!buffer) {
962 /* This makes the buffer be freed by swsusp_free() */ 1085 /* This makes the buffer be freed by swsusp_free() */
963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1086 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
964 if (!buffer) 1087 if (!buffer)
965 return -ENOMEM; 1088 return -ENOMEM;
966 } 1089 }
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
975 memset(buffer, 0, PAGE_SIZE); 1098 memset(buffer, 0, PAGE_SIZE);
976 pack_pfns(buffer, &orig_bm); 1099 pack_pfns(buffer, &orig_bm);
977 } else { 1100 } else {
978 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1101 struct page *page;
979 1102
980 handle->buffer = page_address(pfn_to_page(pfn)); 1103 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1104 if (PageHighMem(page)) {
1105 /* Highmem pages are copied to the buffer,
1106 * because we can't return with a kmapped
1107 * highmem page (we may not be called again).
1108 */
1109 void *kaddr;
1110
1111 kaddr = kmap_atomic(page, KM_USER0);
1112 memcpy(buffer, kaddr, PAGE_SIZE);
1113 kunmap_atomic(kaddr, KM_USER0);
1114 handle->buffer = buffer;
1115 } else {
1116 handle->buffer = page_address(page);
1117 }
981 } 1118 }
982 handle->prev = handle->cur; 1119 handle->prev = handle->cur;
983 } 1120 }
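The handle-based interface above is consumed one page at a time; a simplified sketch of the loop that the swap writer in kernel/power/swap.c builds around it (error handling trimmed, write_one_page() is a placeholder for whatever stores a page):

	struct snapshot_handle handle;
	int ret;

	memset(&handle, 0, sizeof(handle));
	for (;;) {
		ret = snapshot_read_next(&handle, PAGE_SIZE);
		if (ret <= 0)
			break;				/* 0: image complete, < 0: error */
		write_one_page(data_of(handle));	/* hypothetical sink */
	}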
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1005 unsigned long pfn, max_zone_pfn; 1142 unsigned long pfn, max_zone_pfn;
1006 1143
1007 /* Clear page flags */ 1144 /* Clear page flags */
1008 for_each_zone (zone) { 1145 for_each_zone(zone) {
1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1146 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1147 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1011 if (pfn_valid(pfn)) 1148 if (pfn_valid(pfn))
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1101 } 1238 }
1102} 1239}
1103 1240
1241/* List of "safe" pages that may be used to store data loaded from the suspend
1242 * image
1243 */
1244static struct linked_page *safe_pages_list;
1245
1246#ifdef CONFIG_HIGHMEM
1247/* struct highmem_pbe is used for creating the list of highmem pages that
1248 * should be restored atomically during the resume from disk, because the page
1249 * frames they have occupied before the suspend are in use.
1250 */
1251struct highmem_pbe {
1252 struct page *copy_page; /* data is here now */
1253 struct page *orig_page; /* data was here before the suspend */
1254 struct highmem_pbe *next;
1255};
1256
1257/* List of highmem PBEs needed for restoring the highmem pages that were
1258 * allocated before the suspend and included in the suspend image, but have
1259 * also been allocated by the "resume" kernel, so their contents cannot be
1260 * written directly to their "original" page frames.
1261 */
1262static struct highmem_pbe *highmem_pblist;
1263
1264/**
1265 * count_highmem_image_pages - compute the number of highmem pages in the
1266 * suspend image. The bits in the memory bitmap @bm that correspond to the
1267 * image pages are assumed to be set.
1268 */
1269
1270static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
1271{
1272 unsigned long pfn;
1273 unsigned int cnt = 0;
1274
1275 memory_bm_position_reset(bm);
1276 pfn = memory_bm_next_pfn(bm);
1277 while (pfn != BM_END_OF_MAP) {
1278 if (PageHighMem(pfn_to_page(pfn)))
1279 cnt++;
1280
1281 pfn = memory_bm_next_pfn(bm);
1282 }
1283 return cnt;
1284}
1285
1286/**
1287 * prepare_highmem_image - try to allocate as many highmem pages as
1288 * there are highmem image pages (@nr_highmem_p points to the variable
1289 * containing the number of highmem image pages). The pages that are
1290 * "safe" (ie. will not be overwritten when the suspend image is
1291 * restored) have the corresponding bits set in @bm (it must be
 1292 * uninitialized).
1293 *
1294 * NOTE: This function should not be called if there are no highmem
1295 * image pages.
1296 */
1297
1298static unsigned int safe_highmem_pages;
1299
1300static struct memory_bitmap *safe_highmem_bm;
1301
1302static int
1303prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1304{
1305 unsigned int to_alloc;
1306
1307 if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
1308 return -ENOMEM;
1309
1310 if (get_highmem_buffer(PG_SAFE))
1311 return -ENOMEM;
1312
1313 to_alloc = count_free_highmem_pages();
1314 if (to_alloc > *nr_highmem_p)
1315 to_alloc = *nr_highmem_p;
1316 else
1317 *nr_highmem_p = to_alloc;
1318
1319 safe_highmem_pages = 0;
1320 while (to_alloc-- > 0) {
1321 struct page *page;
1322
1323 page = alloc_page(__GFP_HIGHMEM);
1324 if (!PageNosaveFree(page)) {
 1325 /* The page is "safe", set its bit in the bitmap */
1326 memory_bm_set_bit(bm, page_to_pfn(page));
1327 safe_highmem_pages++;
1328 }
1329 /* Mark the page as allocated */
1330 SetPageNosave(page);
1331 SetPageNosaveFree(page);
1332 }
1333 memory_bm_position_reset(bm);
1334 safe_highmem_bm = bm;
1335 return 0;
1336}
1337
1338/**
1339 * get_highmem_page_buffer - for given highmem image page find the buffer
1340 * that suspend_write_next() should set for its caller to write to.
1341 *
1342 * If the page is to be saved to its "original" page frame or a copy of
1343 * the page is to be made in the highmem, @buffer is returned. Otherwise,
1344 * the copy of the page is to be made in normal memory, so the address of
1345 * the copy is returned.
1346 *
1347 * If @buffer is returned, the caller of suspend_write_next() will write
1348 * the page's contents to @buffer, so they will have to be copied to the
1349 * right location on the next call to suspend_write_next() and it is done
1350 * with the help of copy_last_highmem_page(). For this purpose, if
 1351 * @buffer is returned, @last_highmem_page is set to the page to which
1352 * the data will have to be copied from @buffer.
1353 */
1354
1355static struct page *last_highmem_page;
1356
1357static void *
1358get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1359{
1360 struct highmem_pbe *pbe;
1361 void *kaddr;
1362
1363 if (PageNosave(page) && PageNosaveFree(page)) {
1364 /* We have allocated the "original" page frame and we can
1365 * use it directly to store the loaded page.
1366 */
1367 last_highmem_page = page;
1368 return buffer;
1369 }
1370 /* The "original" page frame has not been allocated and we have to
1371 * use a "safe" page frame to store the loaded page.
1372 */
1373 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1374 if (!pbe) {
1375 swsusp_free();
1376 return NULL;
1377 }
1378 pbe->orig_page = page;
1379 if (safe_highmem_pages > 0) {
1380 struct page *tmp;
1381
1382 /* Copy of the page will be stored in high memory */
1383 kaddr = buffer;
1384 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
1385 safe_highmem_pages--;
1386 last_highmem_page = tmp;
1387 pbe->copy_page = tmp;
1388 } else {
1389 /* Copy of the page will be stored in normal memory */
1390 kaddr = safe_pages_list;
1391 safe_pages_list = safe_pages_list->next;
1392 pbe->copy_page = virt_to_page(kaddr);
1393 }
1394 pbe->next = highmem_pblist;
1395 highmem_pblist = pbe;
1396 return kaddr;
1397}
1398
1399/**
1400 * copy_last_highmem_page - copy the contents of a highmem image from
 1401 * @buffer, where the caller of snapshot_write_next() has placed them,
 1402 * to the right location represented by @last_highmem_page.
1403 */
1404
1405static void copy_last_highmem_page(void)
1406{
1407 if (last_highmem_page) {
1408 void *dst;
1409
1410 dst = kmap_atomic(last_highmem_page, KM_USER0);
1411 memcpy(dst, buffer, PAGE_SIZE);
1412 kunmap_atomic(dst, KM_USER0);
1413 last_highmem_page = NULL;
1414 }
1415}
1416
1417static inline int last_highmem_page_copied(void)
1418{
1419 return !last_highmem_page;
1420}
1421
1422static inline void free_highmem_data(void)
1423{
1424 if (safe_highmem_bm)
1425 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
1426
1427 if (buffer)
1428 free_image_page(buffer, PG_UNSAFE_CLEAR);
1429}
1430#else
1431static inline int get_safe_write_buffer(void) { return 0; }
1432
1433static unsigned int
1434count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
1435
1436static inline int
1437prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1438{
1439 return 0;
1440}
1441
1442static inline void *
1443get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1444{
1445 return NULL;
1446}
1447
1448static inline void copy_last_highmem_page(void) {}
1449static inline int last_highmem_page_copied(void) { return 1; }
1450static inline void free_highmem_data(void) {}
1451#endif /* CONFIG_HIGHMEM */
1452
1104/** 1453/**
1105 * prepare_image - use the memory bitmap @bm to mark the pages that will 1454 * prepare_image - use the memory bitmap @bm to mark the pages that will
1106 * be overwritten in the process of restoring the system memory state 1455 * be overwritten in the process of restoring the system memory state
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1110 * The idea is to allocate a new memory bitmap first and then allocate 1459 * The idea is to allocate a new memory bitmap first and then allocate
1111 * as many pages as needed for the image data, but not to assign these 1460 * as many pages as needed for the image data, but not to assign these
1112 * pages to specific tasks initially. Instead, we just mark them as 1461 * pages to specific tasks initially. Instead, we just mark them as
 1113 * allocated and create a list of "safe" pages that will be used later. 1462 * allocated and create lists of "safe" pages that will be used
1463 * later. On systems with high memory a list of "safe" highmem pages is
1464 * also created.
1114 */ 1465 */
1115 1466
1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) 1467#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
1117 1468
1118static struct linked_page *safe_pages_list;
1119
1120static int 1469static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) 1470prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1122{ 1471{
1123 unsigned int nr_pages; 1472 unsigned int nr_pages, nr_highmem;
1124 struct linked_page *sp_list, *lp; 1473 struct linked_page *sp_list, *lp;
1125 int error; 1474 int error;
1126 1475
1476 /* If there is no highmem, the buffer will not be necessary */
1477 free_image_page(buffer, PG_UNSAFE_CLEAR);
1478 buffer = NULL;
1479
1480 nr_highmem = count_highmem_image_pages(bm);
1127 error = mark_unsafe_pages(bm); 1481 error = mark_unsafe_pages(bm);
1128 if (error) 1482 if (error)
1129 goto Free; 1483 goto Free;
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1134 1488
1135 duplicate_memory_bitmap(new_bm, bm); 1489 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP); 1490 memory_bm_free(bm, PG_UNSAFE_KEEP);
1491 if (nr_highmem > 0) {
1492 error = prepare_highmem_image(bm, &nr_highmem);
1493 if (error)
1494 goto Free;
1495 }
1137 /* Reserve some safe pages for potential later use. 1496 /* Reserve some safe pages for potential later use.
1138 * 1497 *
1139 * NOTE: This way we make sure there will be enough safe pages for the 1498 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1142 */ 1501 */
1143 sp_list = NULL; 1502 sp_list = NULL;
1144 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ 1503 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1504 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 1505 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) { 1506 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); 1507 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) { 1508 if (!lp) {
1150 error = -ENOMEM; 1509 error = -ENOMEM;
1151 goto Free; 1510 goto Free;
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1156 } 1515 }
1157 /* Preallocate memory for the image */ 1516 /* Preallocate memory for the image */
1158 safe_pages_list = NULL; 1517 safe_pages_list = NULL;
1159 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1518 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1160 while (nr_pages > 0) { 1519 while (nr_pages > 0) {
1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 1520 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
1162 if (!lp) { 1521 if (!lp) {
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1181 } 1540 }
1182 return 0; 1541 return 0;
1183 1542
1184Free: 1543 Free:
1185 swsusp_free(); 1544 swsusp_free();
1186 return error; 1545 return error;
1187} 1546}
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1196 struct pbe *pbe; 1555 struct pbe *pbe;
1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1556 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
1198 1557
1558 if (PageHighMem(page))
1559 return get_highmem_page_buffer(page, ca);
1560
1199 if (PageNosave(page) && PageNosaveFree(page)) 1561 if (PageNosave(page) && PageNosaveFree(page))
1200 /* We have allocated the "original" page frame and we can 1562 /* We have allocated the "original" page frame and we can
1201 * use it directly to store the loaded page. 1563 * use it directly to store the loaded page.
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1210 swsusp_free(); 1572 swsusp_free();
1211 return NULL; 1573 return NULL;
1212 } 1574 }
1213 pbe->orig_address = (unsigned long)page_address(page); 1575 pbe->orig_address = page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list; 1576 pbe->address = safe_pages_list;
1215 safe_pages_list = safe_pages_list->next; 1577 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist; 1578 pbe->next = restore_pblist;
1217 restore_pblist = pbe; 1579 restore_pblist = pbe;
1218 return (void *)pbe->address; 1580 return pbe->address;
1219} 1581}
1220 1582
1221/** 1583/**
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1611 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1250 return 0; 1612 return 0;
1251 1613
1252 if (!buffer) { 1614 if (handle->offset == 0) {
1253 /* This makes the buffer be freed by swsusp_free() */ 1615 if (!buffer)
1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1616 /* This makes the buffer be freed by swsusp_free() */
1617 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1618
1255 if (!buffer) 1619 if (!buffer)
1256 return -ENOMEM; 1620 return -ENOMEM;
1257 } 1621
1258 if (!handle->offset)
1259 handle->buffer = buffer; 1622 handle->buffer = buffer;
1623 }
1260 handle->sync_read = 1; 1624 handle->sync_read = 1;
1261 if (handle->prev < handle->cur) { 1625 if (handle->prev < handle->cur) {
1262 if (handle->prev == 0) { 1626 if (handle->prev == 0) {
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1284 return -ENOMEM; 1648 return -ENOMEM;
1285 } 1649 }
1286 } else { 1650 } else {
1651 copy_last_highmem_page();
1287 handle->buffer = get_buffer(&orig_bm, &ca); 1652 handle->buffer = get_buffer(&orig_bm, &ca);
1288 handle->sync_read = 0; 1653 if (handle->buffer != buffer)
1654 handle->sync_read = 0;
1289 } 1655 }
1290 handle->prev = handle->cur; 1656 handle->prev = handle->cur;
1291 } 1657 }
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1301 return count; 1667 return count;
1302} 1668}
1303 1669
1670/**
1671 * snapshot_write_finalize - must be called after the last call to
1672 * snapshot_write_next() in case the last page in the image happens
1673 * to be a highmem page and its contents should be stored in the
1674 * highmem. Additionally, it releases the memory that will not be
1675 * used any more.
1676 */
1677
1678void snapshot_write_finalize(struct snapshot_handle *handle)
1679{
1680 copy_last_highmem_page();
1681 /* Free only if we have loaded the image entirely */
1682 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
1683 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
1684 free_highmem_data();
1685 }
1686}
1687
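On the restore side the mirror-image loop looks roughly like the one in load_image() later in this patch; read_one_page() below is a placeholder for whatever fetches the next image page:

	ret = snapshot_write_next(&handle, PAGE_SIZE);
	while (ret > 0) {
		read_one_page(data_of(handle));		/* fill the returned buffer */
		ret = snapshot_write_next(&handle, PAGE_SIZE);
	}
	snapshot_write_finalize(&handle);		/* flush a pending highmem page */
	if (!snapshot_image_loaded(&handle))
		return -ENODATA;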
1304int snapshot_image_loaded(struct snapshot_handle *handle) 1688int snapshot_image_loaded(struct snapshot_handle *handle)
1305{ 1689{
1306 return !(!nr_copy_pages || 1690 return !(!nr_copy_pages || !last_highmem_page_copied() ||
1307 handle->cur <= nr_meta_pages + nr_copy_pages); 1691 handle->cur <= nr_meta_pages + nr_copy_pages);
1308} 1692}
1309 1693
1310void snapshot_free_unused_memory(struct snapshot_handle *handle) 1694#ifdef CONFIG_HIGHMEM
1695/* Assumes that @buf is ready and points to a "safe" page */
1696static inline void
1697swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
1311{ 1698{
1312 /* Free only if we have loaded the image entirely */ 1699 void *kaddr1, *kaddr2;
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1700
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 1701 kaddr1 = kmap_atomic(p1, KM_USER0);
1702 kaddr2 = kmap_atomic(p2, KM_USER1);
1703 memcpy(buf, kaddr1, PAGE_SIZE);
1704 memcpy(kaddr1, kaddr2, PAGE_SIZE);
1705 memcpy(kaddr2, buf, PAGE_SIZE);
1706 kunmap_atomic(kaddr1, KM_USER0);
1707 kunmap_atomic(kaddr2, KM_USER1);
1708}
1709
1710/**
1711 * restore_highmem - for each highmem page that was allocated before
1712 * the suspend and included in the suspend image, and also has been
 1713 * allocated by the "resume" kernel, swap its current (ie. "before
1714 * resume") contents with the previous (ie. "before suspend") one.
1715 *
1716 * If the resume eventually fails, we can call this function once
1717 * again and restore the "before resume" highmem state.
1718 */
1719
1720int restore_highmem(void)
1721{
1722 struct highmem_pbe *pbe = highmem_pblist;
1723 void *buf;
1724
1725 if (!pbe)
1726 return 0;
1727
1728 buf = get_image_page(GFP_ATOMIC, PG_SAFE);
1729 if (!buf)
1730 return -ENOMEM;
1731
1732 while (pbe) {
1733 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
1734 pbe = pbe->next;
1735 }
1736 free_image_page(buf, PG_UNSAFE_CLEAR);
1737 return 0;
1315} 1738}
1739#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd2c3fc..3581f8f86acd 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,34 +34,123 @@ extern char resume_file[];
34#define SWSUSP_SIG "S1SUSPEND" 34#define SWSUSP_SIG "S1SUSPEND"
35 35
36static struct swsusp_header { 36static struct swsusp_header {
37 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
38 swp_entry_t image; 38 sector_t image;
39 char orig_sig[10]; 39 char orig_sig[10];
40 char sig[10]; 40 char sig[10];
41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
42 42
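Assuming PAGE_SIZE = 4096 and an 8-byte sector_t (both architecture-dependent), the layout works out exactly:

	reserved: 4096 - 20 - 8 = 4068 bytes
	4068 + sizeof(sector_t) + orig_sig[10] + sig[10] = 4068 + 8 + 10 + 10 = 4096

so the signature strings and the sector of the first swap-map page always occupy the last 28 bytes of the first page of the resume device.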
43/* 43/*
44 * Saving part... 44 * General things
45 */ 45 */
46 46
47static unsigned short root_swap = 0xffff; 47static unsigned short root_swap = 0xffff;
48static struct block_device *resume_bdev;
49
50/**
51 * submit - submit BIO request.
52 * @rw: READ or WRITE.
 53 * @off: physical offset of page.
54 * @page: page we're reading or writing.
 55 * @bio_chain: list of pending bios (for async reading)
56 *
57 * Straight from the textbook - allocate and initialize the bio.
58 * If we're reading, make sure the page is marked as dirty.
59 * Then submit it and, if @bio_chain == NULL, wait.
60 */
61static int submit(int rw, pgoff_t page_off, struct page *page,
62 struct bio **bio_chain)
63{
64 struct bio *bio;
65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read;
72
73 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
74 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
75 bio_put(bio);
76 return -EFAULT;
77 }
78
79 lock_page(page);
80 bio_get(bio);
81
82 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
84 wait_on_page_locked(page);
85 if (rw == READ)
86 bio_set_pages_dirty(bio);
87 bio_put(bio);
88 } else {
89 if (rw == READ)
90 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain;
92 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
94 }
95 return 0;
96}
97
98static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
99{
100 return submit(READ, page_off, virt_to_page(addr), bio_chain);
101}
102
103static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
104{
105 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
106}
107
108static int wait_on_bio_chain(struct bio **bio_chain)
109{
110 struct bio *bio;
111 struct bio *next_bio;
112 int ret = 0;
113
114 if (bio_chain == NULL)
115 return 0;
116
117 bio = *bio_chain;
118 if (bio == NULL)
119 return 0;
120 while (bio) {
121 struct page *page;
122
123 next_bio = bio->bi_private;
124 page = bio->bi_io_vec[0].bv_page;
125 wait_on_page_locked(page);
126 if (!PageUptodate(page) || PageError(page))
127 ret = -EIO;
128 put_page(page);
129 bio_put(bio);
130 bio = next_bio;
131 }
132 *bio_chain = NULL;
133 return ret;
134}
135
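The chained form is meant to be used like this (a sketch; the offsets and buffers are placeholders, and the real batching lives in save_image()/load_image() further below):

	struct bio *bio_chain = NULL;
	int err;

	err = bio_read_page(off_a, buf_a, &bio_chain);	/* queued, returns immediately */
	if (!err)
		err = bio_read_page(off_b, buf_b, &bio_chain);
	if (!err)
		err = wait_on_bio_chain(&bio_chain);	/* -EIO if any page failed */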
136/*
137 * Saving part
138 */
48 139
49static int mark_swapfiles(swp_entry_t start) 140static int mark_swapfiles(sector_t start)
50{ 141{
51 int error; 142 int error;
52 143
53 rw_swap_page_sync(READ, swp_entry(root_swap, 0), 144 bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
54 virt_to_page((unsigned long)&swsusp_header), NULL);
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 145 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 146 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 147 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 148 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 149 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), 150 error = bio_write_page(swsusp_resume_block,
61 virt_to_page((unsigned long)&swsusp_header), 151 &swsusp_header, NULL);
62 NULL);
63 } else { 152 } else {
64 pr_debug("swsusp: Partition is not swap space.\n"); 153 printk(KERN_ERR "swsusp: Swap header not found!\n");
65 error = -ENODEV; 154 error = -ENODEV;
66 } 155 }
67 return error; 156 return error;
@@ -74,12 +163,22 @@ static int mark_swapfiles(swp_entry_t start)
74 163
75static int swsusp_swap_check(void) /* This is called before saving image */ 164static int swsusp_swap_check(void) /* This is called before saving image */
76{ 165{
77 int res = swap_type_of(swsusp_resume_device); 166 int res;
167
168 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
169 &resume_bdev);
170 if (res < 0)
171 return res;
172
173 root_swap = res;
174 res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR);
175 if (res)
176 return res;
177
178 res = set_blocksize(resume_bdev, PAGE_SIZE);
179 if (res < 0)
180 blkdev_put(resume_bdev);
78 181
79 if (res >= 0) {
80 root_swap = res;
81 return 0;
82 }
83 return res; 182 return res;
84} 183}
85 184
@@ -90,36 +189,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
90 * @bio_chain: Link the next write BIO here 189 * @bio_chain: Link the next write BIO here
91 */ 190 */
92 191
93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) 192static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
94{ 193{
95 swp_entry_t entry; 194 void *src;
96 int error = -ENOSPC; 195
97 196 if (!offset)
98 if (offset) { 197 return -ENOSPC;
99 struct page *page = virt_to_page(buf); 198
100 199 if (bio_chain) {
101 if (bio_chain) { 200 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
102 /* 201 if (src) {
103 * Whether or not we successfully allocated a copy page, 202 memcpy(src, buf, PAGE_SIZE);
104 * we take a ref on the page here. It gets undone in 203 } else {
105 * wait_on_bio_chain(). 204 WARN_ON_ONCE(1);
106 */ 205 bio_chain = NULL; /* Go synchronous */
107 struct page *page_copy; 206 src = buf;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 } 207 }
119 entry = swp_entry(root_swap, offset); 208 } else {
120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain); 209 src = buf;
121 } 210 }
122 return error; 211 return bio_write_page(offset, src, bio_chain);
123} 212}
124 213
125/* 214/*
@@ -137,11 +226,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
137 * at a time. 226 * at a time.
138 */ 227 */
139 228
140#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) 229#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
141 230
142struct swap_map_page { 231struct swap_map_page {
143 unsigned long entries[MAP_PAGE_ENTRIES]; 232 sector_t entries[MAP_PAGE_ENTRIES];
144 unsigned long next_swap; 233 sector_t next_swap;
145}; 234};
146 235
147/** 236/**
@@ -151,7 +240,7 @@ struct swap_map_page {
151 240
152struct swap_map_handle { 241struct swap_map_handle {
153 struct swap_map_page *cur; 242 struct swap_map_page *cur;
154 unsigned long cur_swap; 243 sector_t cur_swap;
155 struct bitmap_page *bitmap; 244 struct bitmap_page *bitmap;
156 unsigned int k; 245 unsigned int k;
157}; 246};
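Assuming 4 KiB pages and an 8-byte sector_t, MAP_PAGE_ENTRIES is 4096 / 8 - 1 = 511, so each swap_map_page records the sectors of 511 image pages and its last slot, next_swap, points at the following map page. Conceptually the reader walks this on-disk linked list (a sketch, not the literal code below):

	sector_t map = start;			/* from swsusp_header.image */
	while (map) {
		bio_read_page(map, cur, NULL);	/* load one swap_map_page */
		/* ... read the data pages listed in cur->entries[] ... */
		map = cur->next_swap;
	}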
@@ -166,26 +255,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
166 handle->bitmap = NULL; 255 handle->bitmap = NULL;
167} 256}
168 257
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
189static int get_swap_writer(struct swap_map_handle *handle) 258static int get_swap_writer(struct swap_map_handle *handle)
190{ 259{
191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 260 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -196,7 +265,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
196 release_swap_writer(handle); 265 release_swap_writer(handle);
197 return -ENOMEM; 266 return -ENOMEM;
198 } 267 }
199 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); 268 handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
200 if (!handle->cur_swap) { 269 if (!handle->cur_swap) {
201 release_swap_writer(handle); 270 release_swap_writer(handle);
202 return -ENOSPC; 271 return -ENOSPC;
@@ -205,43 +274,15 @@ static int get_swap_writer(struct swap_map_handle *handle)
205 return 0; 274 return 0;
206} 275}
207 276
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf, 277static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain) 278 struct bio **bio_chain)
238{ 279{
239 int error = 0; 280 int error = 0;
240 unsigned long offset; 281 sector_t offset;
241 282
242 if (!handle->cur) 283 if (!handle->cur)
243 return -EINVAL; 284 return -EINVAL;
244 offset = alloc_swap_page(root_swap, handle->bitmap); 285 offset = alloc_swapdev_block(root_swap, handle->bitmap);
245 error = write_page(buf, offset, bio_chain); 286 error = write_page(buf, offset, bio_chain);
246 if (error) 287 if (error)
247 return error; 288 return error;
@@ -250,7 +291,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
250 error = wait_on_bio_chain(bio_chain); 291 error = wait_on_bio_chain(bio_chain);
251 if (error) 292 if (error)
252 goto out; 293 goto out;
253 offset = alloc_swap_page(root_swap, handle->bitmap); 294 offset = alloc_swapdev_block(root_swap, handle->bitmap);
254 if (!offset) 295 if (!offset)
255 return -ENOSPC; 296 return -ENOSPC;
256 handle->cur->next_swap = offset; 297 handle->cur->next_swap = offset;
@@ -261,7 +302,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
261 handle->cur_swap = offset; 302 handle->cur_swap = offset;
262 handle->k = 0; 303 handle->k = 0;
263 } 304 }
264out: 305 out:
265 return error; 306 return error;
266} 307}
267 308
@@ -315,7 +356,7 @@ static int save_image(struct swap_map_handle *handle,
315 error = err2; 356 error = err2;
316 if (!error) 357 if (!error)
317 printk("\b\b\b\bdone\n"); 358 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote"); 359 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
319 return error; 360 return error;
320} 361}
321 362
@@ -350,100 +391,50 @@ int swsusp_write(void)
350 struct swsusp_info *header; 391 struct swsusp_info *header;
351 int error; 392 int error;
352 393
353 if ((error = swsusp_swap_check())) { 394 error = swsusp_swap_check();
395 if (error) {
354 printk(KERN_ERR "swsusp: Cannot find swap device, try " 396 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n"); 397 "swapon -a.\n");
356 return error; 398 return error;
357 } 399 }
358 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 400 memset(&snapshot, 0, sizeof(struct snapshot_handle));
359 error = snapshot_read_next(&snapshot, PAGE_SIZE); 401 error = snapshot_read_next(&snapshot, PAGE_SIZE);
360 if (error < PAGE_SIZE) 402 if (error < PAGE_SIZE) {
361 return error < 0 ? error : -EFAULT; 403 if (error >= 0)
404 error = -EFAULT;
405
406 goto out;
407 }
362 header = (struct swsusp_info *)data_of(snapshot); 408 header = (struct swsusp_info *)data_of(snapshot);
363 if (!enough_swap(header->pages)) { 409 if (!enough_swap(header->pages)) {
364 printk(KERN_ERR "swsusp: Not enough free swap\n"); 410 printk(KERN_ERR "swsusp: Not enough free swap\n");
365 return -ENOSPC; 411 error = -ENOSPC;
412 goto out;
366 } 413 }
367 error = get_swap_writer(&handle); 414 error = get_swap_writer(&handle);
368 if (!error) { 415 if (!error) {
369 unsigned long start = handle.cur_swap; 416 sector_t start = handle.cur_swap;
417
370 error = swap_write_page(&handle, header, NULL); 418 error = swap_write_page(&handle, header, NULL);
371 if (!error) 419 if (!error)
372 error = save_image(&handle, &snapshot, 420 error = save_image(&handle, &snapshot,
373 header->pages - 1); 421 header->pages - 1);
422
374 if (!error) { 423 if (!error) {
375 flush_swap_writer(&handle); 424 flush_swap_writer(&handle);
376 printk("S"); 425 printk("S");
377 error = mark_swapfiles(swp_entry(root_swap, start)); 426 error = mark_swapfiles(start);
378 printk("|\n"); 427 printk("|\n");
379 } 428 }
380 } 429 }
381 if (error) 430 if (error)
382 free_all_swap_pages(root_swap, handle.bitmap); 431 free_all_swap_pages(root_swap, handle.bitmap);
383 release_swap_writer(&handle); 432 release_swap_writer(&handle);
433 out:
434 swsusp_close();
384 return error; 435 return error;
385} 436}
386 437
387static struct block_device *resume_bdev;
388
389/**
390 * submit - submit BIO request.
391 * @rw: READ or WRITE.
392 * @off physical offset of page.
393 * @page: page we're reading or writing.
394 * @bio_chain: list of pending biod (for async reading)
395 *
396 * Straight from the textbook - allocate and initialize the bio.
397 * If we're reading, make sure the page is marked as dirty.
398 * Then submit it and, if @bio_chain == NULL, wait.
399 */
400static int submit(int rw, pgoff_t page_off, struct page *page,
401 struct bio **bio_chain)
402{
403 struct bio *bio;
404
405 bio = bio_alloc(GFP_ATOMIC, 1);
406 if (!bio)
407 return -ENOMEM;
408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
409 bio->bi_bdev = resume_bdev;
410 bio->bi_end_io = end_swap_bio_read;
411
412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
414 bio_put(bio);
415 return -EFAULT;
416 }
417
418 lock_page(page);
419 bio_get(bio);
420
421 if (bio_chain == NULL) {
422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
423 wait_on_page_locked(page);
424 if (rw == READ)
425 bio_set_pages_dirty(bio);
426 bio_put(bio);
427 } else {
428 if (rw == READ)
429 get_page(page); /* These pages are freed later */
430 bio->bi_private = *bio_chain;
431 *bio_chain = bio;
432 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
433 }
434 return 0;
435}
436
437static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
438{
439 return submit(READ, page_off, virt_to_page(addr), bio_chain);
440}
441
442static int bio_write_page(pgoff_t page_off, void *addr)
443{
444 return submit(WRITE, page_off, virt_to_page(addr), NULL);
445}
446
447/** 438/**
448 * The following functions allow us to read data using a swap map 439 * The following functions allow us to read data using a swap map
449 * in a file-alike way 440 * in a file-alike way
@@ -456,17 +447,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
456 handle->cur = NULL; 447 handle->cur = NULL;
457} 448}
458 449
459static int get_swap_reader(struct swap_map_handle *handle, 450static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
460 swp_entry_t start)
461{ 451{
462 int error; 452 int error;
463 453
464 if (!swp_offset(start)) 454 if (!start)
465 return -EINVAL; 455 return -EINVAL;
466 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 456
457 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
467 if (!handle->cur) 458 if (!handle->cur)
468 return -ENOMEM; 459 return -ENOMEM;
469 error = bio_read_page(swp_offset(start), handle->cur, NULL); 460
461 error = bio_read_page(start, handle->cur, NULL);
470 if (error) { 462 if (error) {
471 release_swap_reader(handle); 463 release_swap_reader(handle);
472 return error; 464 return error;
@@ -478,7 +470,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
478static int swap_read_page(struct swap_map_handle *handle, void *buf, 470static int swap_read_page(struct swap_map_handle *handle, void *buf,
479 struct bio **bio_chain) 471 struct bio **bio_chain)
480{ 472{
481 unsigned long offset; 473 sector_t offset;
482 int error; 474 int error;
483 475
484 if (!handle->cur) 476 if (!handle->cur)
@@ -547,11 +539,11 @@ static int load_image(struct swap_map_handle *handle,
547 error = err2; 539 error = err2;
548 if (!error) { 540 if (!error) {
549 printk("\b\b\b\bdone\n"); 541 printk("\b\b\b\bdone\n");
550 snapshot_free_unused_memory(snapshot); 542 snapshot_write_finalize(snapshot);
551 if (!snapshot_image_loaded(snapshot)) 543 if (!snapshot_image_loaded(snapshot))
552 error = -ENODATA; 544 error = -ENODATA;
553 } 545 }
554 show_speed(&start, &stop, nr_to_read, "Read"); 546 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
555 return error; 547 return error;
556} 548}
557 549
@@ -600,12 +592,16 @@ int swsusp_check(void)
600 if (!IS_ERR(resume_bdev)) { 592 if (!IS_ERR(resume_bdev)) {
601 set_blocksize(resume_bdev, PAGE_SIZE); 593 set_blocksize(resume_bdev, PAGE_SIZE);
602 memset(&swsusp_header, 0, sizeof(swsusp_header)); 594 memset(&swsusp_header, 0, sizeof(swsusp_header));
603 if ((error = bio_read_page(0, &swsusp_header, NULL))) 595 error = bio_read_page(swsusp_resume_block,
596 &swsusp_header, NULL);
597 if (error)
604 return error; 598 return error;
599
605 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 600 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
606 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 601 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
607 /* Reset swap signature now */ 602 /* Reset swap signature now */
608 error = bio_write_page(0, &swsusp_header); 603 error = bio_write_page(swsusp_resume_block,
604 &swsusp_header, NULL);
609 } else { 605 } else {
610 return -EINVAL; 606 return -EINVAL;
611 } 607 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc516..31aa0390c777 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
49#include <linux/bootmem.h> 49#include <linux/bootmem.h>
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h>
52 53
53#include "power.h" 54#include "power.h"
54 55
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0;
64 65
65#ifdef CONFIG_HIGHMEM 66#ifdef CONFIG_HIGHMEM
66unsigned int count_highmem_pages(void); 67unsigned int count_highmem_pages(void);
67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static inline int save_highmem(void) { return 0; }
71static inline int restore_highmem(void) { return 0; } 70static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; } 71static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 72#endif
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
134 return 0; 133 return 0;
135} 134}
136 135
137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) 136sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
138{ 137{
139 unsigned long offset; 138 unsigned long offset;
140 139
141 offset = swp_offset(get_swap_page_of_type(swap)); 140 offset = swp_offset(get_swap_page_of_type(swap));
142 if (offset) { 141 if (offset) {
143 if (bitmap_set(bitmap, offset)) { 142 if (bitmap_set(bitmap, offset))
144 swap_free(swp_entry(swap, offset)); 143 swap_free(swp_entry(swap, offset));
145 offset = 0; 144 else
146 } 145 return swapdev_block(swap, offset);
147 } 146 }
148 return offset; 147 return 0;
149} 148}
150 149
151void free_all_swap_pages(int swap, struct bitmap_page *bitmap) 150void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
166} 165}
167 166
168/** 167/**
168 * swsusp_show_speed - print the time elapsed between two events represented by
169 * @start and @stop
170 *
171 * @nr_pages - number of pages processed between @start and @stop
172 * @msg - introductory message to print
173 */
174
175void swsusp_show_speed(struct timeval *start, struct timeval *stop,
176 unsigned nr_pages, char *msg)
177{
178 s64 elapsed_centisecs64;
179 int centisecs;
180 int k;
181 int kps;
182
183 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
184 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
185 centisecs = elapsed_centisecs64;
186 if (centisecs == 0)
187 centisecs = 1; /* avoid div-by-zero */
188 k = nr_pages * (PAGE_SIZE / 1024);
189 kps = (k * 100) / centisecs;
190 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
191 centisecs / 100, centisecs % 100,
192 kps / 1000, (kps % 1000) / 10);
193}
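For reference, the conversion above works in centiseconds: k is the amount of data in kbytes and kps = (k * 100) / centisecs is simply kilobytes per second. A quick worked example (assuming 4 KiB pages, an assumption not stated in this hunk): nr_pages = 25600 and an elapsed time of 2.00 s give k = 25600 * (4096 / 1024) = 102400, centisecs = 200 and kps = 102400 * 100 / 200 = 51200, so save_image() would report "Wrote 102400 kbytes in 2.00 seconds (51.20 MB/s)".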
194
195/**
169 * swsusp_shrink_memory - Try to free as much memory as needed 196 * swsusp_shrink_memory - Try to free as much memory as needed
170 * 197 *
171 * ... but do not OOM-kill anyone 198 * ... but do not OOM-kill anyone
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp)
184 211
185int swsusp_shrink_memory(void) 212int swsusp_shrink_memory(void)
186{ 213{
187 long size, tmp; 214 long tmp;
188 struct zone *zone; 215 struct zone *zone;
189 unsigned long pages = 0; 216 unsigned long pages = 0;
190 unsigned int i = 0; 217 unsigned int i = 0;
191 char *p = "-\\|/"; 218 char *p = "-\\|/";
219 struct timeval start, stop;
192 220
193 printk("Shrinking memory... "); 221 printk("Shrinking memory... ");
222 do_gettimeofday(&start);
194 do { 223 do {
195 size = 2 * count_highmem_pages(); 224 long size, highmem_size;
196 size += size / 50 + count_data_pages() + PAGES_FOR_IO; 225
226 highmem_size = count_highmem_pages();
227 size = count_data_pages() + PAGES_FOR_IO;
197 tmp = size; 228 tmp = size;
229 size += highmem_size;
198 for_each_zone (zone) 230 for_each_zone (zone)
199 if (!is_highmem(zone) && populated_zone(zone)) { 231 if (populated_zone(zone)) {
200 tmp -= zone->free_pages; 232 if (is_highmem(zone)) {
201 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 233 highmem_size -= zone->free_pages;
202 tmp += snapshot_additional_pages(zone); 234 } else {
235 tmp -= zone->free_pages;
236 tmp += zone->lowmem_reserve[ZONE_NORMAL];
237 tmp += snapshot_additional_pages(zone);
238 }
203 } 239 }
240
241 if (highmem_size < 0)
242 highmem_size = 0;
243
244 tmp += highmem_size;
204 if (tmp > 0) { 245 if (tmp > 0) {
205 tmp = __shrink_memory(tmp); 246 tmp = __shrink_memory(tmp);
206 if (!tmp) 247 if (!tmp)
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void)
212 } 253 }
213 printk("\b%c", p[i++%4]); 254 printk("\b%c", p[i++%4]);
214 } while (tmp > 0); 255 } while (tmp > 0);
256 do_gettimeofday(&stop);
215 printk("\bdone (%lu pages freed)\n", pages); 257 printk("\bdone (%lu pages freed)\n", pages);
258 swsusp_show_speed(&start, &stop, pages, "Freed");
216 259
217 return 0; 260 return 0;
218} 261}
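In the rewritten loop above, the per-iteration shortfall is roughly

    tmp = count_data_pages() + PAGES_FOR_IO - free_lowmem
          + lowmem_reserve[ZONE_NORMAL] + snapshot_additional_pages()   (summed over lowmem zones)
          + max(count_highmem_pages() - free_highmem, 0)

and __shrink_memory() is retried while it stays positive. Compared with the old "2 * count_highmem_pages()" estimate, highmem pages are now counted once and any free highmem is credited against them.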
@@ -223,6 +266,7 @@ int swsusp_suspend(void)
223 266
224 if ((error = arch_prepare_suspend())) 267 if ((error = arch_prepare_suspend()))
225 return error; 268 return error;
269
226 local_irq_disable(); 270 local_irq_disable();
227 /* At this point, device_suspend() has been called, but *not* 271 /* At this point, device_suspend() has been called, but *not*
228 * device_power_down(). We *must* device_power_down() now. 272 * device_power_down(). We *must* device_power_down() now.
@@ -235,23 +279,16 @@ int swsusp_suspend(void)
235 goto Enable_irqs; 279 goto Enable_irqs;
236 } 280 }
237 281
238 if ((error = save_highmem())) {
239 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
240 goto Restore_highmem;
241 }
242
243 save_processor_state(); 282 save_processor_state();
244 if ((error = swsusp_arch_suspend())) 283 if ((error = swsusp_arch_suspend()))
245 printk(KERN_ERR "Error %d suspending\n", error); 284 printk(KERN_ERR "Error %d suspending\n", error);
246 /* Restore control flow magically appears here */ 285 /* Restore control flow magically appears here */
247 restore_processor_state(); 286 restore_processor_state();
248Restore_highmem:
249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices 287 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup. 288 * that suspended with irqs off ... no overall powerup.
252 */ 289 */
253 device_power_up(); 290 device_power_up();
254Enable_irqs: 291 Enable_irqs:
255 local_irq_enable(); 292 local_irq_enable();
256 return error; 293 return error;
257} 294}
@@ -268,18 +305,23 @@ int swsusp_resume(void)
268 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 305 printk(KERN_ERR "Some devices failed to power down, very bad\n");
269 /* We'll ignore saved state, but this gets preempt count (etc) right */ 306 /* We'll ignore saved state, but this gets preempt count (etc) right */
270 save_processor_state(); 307 save_processor_state();
271 error = swsusp_arch_resume(); 308 error = restore_highmem();
272 /* Code below is only ever reached in case of failure. Otherwise 309 if (!error) {
273 * execution continues at place where swsusp_arch_suspend was called 310 error = swsusp_arch_resume();
274 */ 311 /* The code below is only ever reached in case of a failure.
275 BUG_ON(!error); 312 * Otherwise execution continues at place where
313 * swsusp_arch_suspend() was called
314 */
315 BUG_ON(!error);
316 /* This call to restore_highmem() undoes the previous one */
317 restore_highmem();
318 }
276 /* The only reason why swsusp_arch_resume() can fail is memory being 319 /* The only reason why swsusp_arch_resume() can fail is memory being
277 * very tight, so we have to free it as soon as we can to avoid 320 * very tight, so we have to free it as soon as we can to avoid
278 * subsequent failures 321 * subsequent failures
279 */ 322 */
280 swsusp_free(); 323 swsusp_free();
281 restore_processor_state(); 324 restore_processor_state();
282 restore_highmem();
283 touch_softlockup_watchdog(); 325 touch_softlockup_watchdog();
284 device_power_up(); 326 device_power_up();
285 local_irq_enable(); 327 local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a4..f7b7a785a5c6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/device.h> 16#include <linux/device.h>
16#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -21,6 +22,7 @@
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/console.h> 23#include <linux/console.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h>
24 26
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
26 28
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
54 filp->private_data = data; 56 filp->private_data = data;
55 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 57 memset(&data->handle, 0, sizeof(struct snapshot_handle));
56 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 58 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
57 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; 59 data->swap = swsusp_resume_device ?
60 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
58 data->mode = O_RDONLY; 61 data->mode = O_RDONLY;
59 } else { 62 } else {
60 data->swap = -1; 63 data->swap = -1;
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
76 free_all_swap_pages(data->swap, data->bitmap); 79 free_all_swap_pages(data->swap, data->bitmap);
77 free_bitmap(data->bitmap); 80 free_bitmap(data->bitmap);
78 if (data->frozen) { 81 if (data->frozen) {
79 down(&pm_sem); 82 mutex_lock(&pm_mutex);
80 thaw_processes(); 83 thaw_processes();
81 enable_nonboot_cpus(); 84 enable_nonboot_cpus();
82 up(&pm_sem); 85 mutex_unlock(&pm_mutex);
83 } 86 }
84 atomic_inc(&device_available); 87 atomic_inc(&device_available);
85 return 0; 88 return 0;
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
124{ 127{
125 int error = 0; 128 int error = 0;
126 struct snapshot_data *data; 129 struct snapshot_data *data;
127 loff_t offset, avail; 130 loff_t avail;
131 sector_t offset;
128 132
129 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) 133 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
130 return -ENOTTY; 134 return -ENOTTY;
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
140 case SNAPSHOT_FREEZE: 144 case SNAPSHOT_FREEZE:
141 if (data->frozen) 145 if (data->frozen)
142 break; 146 break;
143 down(&pm_sem); 147 mutex_lock(&pm_mutex);
144 error = disable_nonboot_cpus(); 148 error = disable_nonboot_cpus();
145 if (!error) { 149 if (!error) {
146 error = freeze_processes(); 150 error = freeze_processes();
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
150 error = -EBUSY; 154 error = -EBUSY;
151 } 155 }
152 } 156 }
153 up(&pm_sem); 157 mutex_unlock(&pm_mutex);
154 if (!error) 158 if (!error)
155 data->frozen = 1; 159 data->frozen = 1;
156 break; 160 break;
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
158 case SNAPSHOT_UNFREEZE: 162 case SNAPSHOT_UNFREEZE:
159 if (!data->frozen) 163 if (!data->frozen)
160 break; 164 break;
161 down(&pm_sem); 165 mutex_lock(&pm_mutex);
162 thaw_processes(); 166 thaw_processes();
163 enable_nonboot_cpus(); 167 enable_nonboot_cpus();
164 up(&pm_sem); 168 mutex_unlock(&pm_mutex);
165 data->frozen = 0; 169 data->frozen = 0;
166 break; 170 break;
167 171
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
170 error = -EPERM; 174 error = -EPERM;
171 break; 175 break;
172 } 176 }
173 down(&pm_sem); 177 mutex_lock(&pm_mutex);
174 /* Free memory before shutting down devices. */ 178 /* Free memory before shutting down devices. */
175 error = swsusp_shrink_memory(); 179 error = swsusp_shrink_memory();
176 if (!error) { 180 if (!error) {
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
183 } 187 }
184 resume_console(); 188 resume_console();
185 } 189 }
186 up(&pm_sem); 190 mutex_unlock(&pm_mutex);
187 if (!error) 191 if (!error)
188 error = put_user(in_suspend, (unsigned int __user *)arg); 192 error = put_user(in_suspend, (unsigned int __user *)arg);
189 if (!error) 193 if (!error)
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
191 break; 195 break;
192 196
193 case SNAPSHOT_ATOMIC_RESTORE: 197 case SNAPSHOT_ATOMIC_RESTORE:
198 snapshot_write_finalize(&data->handle);
194 if (data->mode != O_WRONLY || !data->frozen || 199 if (data->mode != O_WRONLY || !data->frozen ||
195 !snapshot_image_loaded(&data->handle)) { 200 !snapshot_image_loaded(&data->handle)) {
196 error = -EPERM; 201 error = -EPERM;
197 break; 202 break;
198 } 203 }
199 snapshot_free_unused_memory(&data->handle); 204 mutex_lock(&pm_mutex);
200 down(&pm_sem);
201 pm_prepare_console(); 205 pm_prepare_console();
202 suspend_console(); 206 suspend_console();
203 error = device_suspend(PMSG_PRETHAW); 207 error = device_suspend(PMSG_PRETHAW);
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
207 } 211 }
208 resume_console(); 212 resume_console();
209 pm_restore_console(); 213 pm_restore_console();
210 up(&pm_sem); 214 mutex_unlock(&pm_mutex);
211 break; 215 break;
212 216
213 case SNAPSHOT_FREE: 217 case SNAPSHOT_FREE:
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
238 break; 242 break;
239 } 243 }
240 } 244 }
241 offset = alloc_swap_page(data->swap, data->bitmap); 245 offset = alloc_swapdev_block(data->swap, data->bitmap);
242 if (offset) { 246 if (offset) {
243 offset <<= PAGE_SHIFT; 247 offset <<= PAGE_SHIFT;
244 error = put_user(offset, (loff_t __user *)arg); 248 error = put_user(offset, (sector_t __user *)arg);
245 } else { 249 } else {
246 error = -ENOSPC; 250 error = -ENOSPC;
247 } 251 }
@@ -264,7 +268,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
264 * so we need to recode them 268 * so we need to recode them
265 */ 269 */
266 if (old_decode_dev(arg)) { 270 if (old_decode_dev(arg)) {
267 data->swap = swap_type_of(old_decode_dev(arg)); 271 data->swap = swap_type_of(old_decode_dev(arg),
272 0, NULL);
268 if (data->swap < 0) 273 if (data->swap < 0)
269 error = -ENODEV; 274 error = -ENODEV;
270 } else { 275 } else {
@@ -282,7 +287,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
282 break; 287 break;
283 } 288 }
284 289
285 if (down_trylock(&pm_sem)) { 290 if (!mutex_trylock(&pm_mutex)) {
286 error = -EBUSY; 291 error = -EBUSY;
287 break; 292 break;
288 } 293 }
@@ -309,8 +314,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
309 if (pm_ops->finish) 314 if (pm_ops->finish)
310 pm_ops->finish(PM_SUSPEND_MEM); 315 pm_ops->finish(PM_SUSPEND_MEM);
311 316
312OutS3: 317 OutS3:
313 up(&pm_sem); 318 mutex_unlock(&pm_mutex);
319 break;
320
321 case SNAPSHOT_PMOPS:
322 switch (arg) {
323
324 case PMOPS_PREPARE:
325 if (pm_ops->prepare) {
326 error = pm_ops->prepare(PM_SUSPEND_DISK);
327 }
328 break;
329
330 case PMOPS_ENTER:
331 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
332 error = pm_ops->enter(PM_SUSPEND_DISK);
333 break;
334
335 case PMOPS_FINISH:
336 if (pm_ops && pm_ops->finish) {
337 pm_ops->finish(PM_SUSPEND_DISK);
338 }
339 break;
340
341 default:
342 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
343 error = -EINVAL;
344
345 }
346 break;
347
348 case SNAPSHOT_SET_SWAP_AREA:
349 if (data->bitmap) {
350 error = -EPERM;
351 } else {
352 struct resume_swap_area swap_area;
353 dev_t swdev;
354
355 error = copy_from_user(&swap_area, (void __user *)arg,
356 sizeof(struct resume_swap_area));
357 if (error) {
358 error = -EFAULT;
359 break;
360 }
361
362 /*
363 * User space encodes device types as two-byte values,
364 * so we need to recode them
365 */
366 swdev = old_decode_dev(swap_area.dev);
367 if (swdev) {
368 offset = swap_area.offset;
369 data->swap = swap_type_of(swdev, offset, NULL);
370 if (data->swap < 0)
371 error = -ENODEV;
372 } else {
373 data->swap = -1;
374 error = -EINVAL;
375 }
376 }
314 break; 377 break;
315 378
316 default: 379 default:
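The new SNAPSHOT_SET_SWAP_AREA ioctl above lets a user-space suspend utility hand the kernel a swap location (device plus header offset, e.g. for a swap file) before the image is written. A minimal, hypothetical user-space sketch follows; only the ioctl name and the .dev/.offset field names are taken from the hunk, while the /dev/snapshot path and the header that defines struct resume_swap_area are assumptions:

    /* hypothetical sketch, not part of this patch */
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/types.h>
    #include <linux/suspend.h>  /* assumed to provide SNAPSHOT_* and struct resume_swap_area */

    static int set_swap_area(dev_t swap_dev, unsigned long long offset)
    {
            struct resume_swap_area area = {
                    .dev    = swap_dev, /* old-style encoded device number; the kernel applies old_decode_dev() */
                    .offset = offset,   /* 0 is the usual value for a whole swap partition */
            };
            int fd = open("/dev/snapshot", O_WRONLY);   /* assumed device node */

            if (fd < 0)
                    return -1;
            if (ioctl(fd, SNAPSHOT_SET_SWAP_AREA, &area) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;  /* keep it open for the subsequent SNAPSHOT_* calls */
    }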
@@ -321,7 +384,7 @@ OutS3:
321 return error; 384 return error;
322} 385}
323 386
324static struct file_operations snapshot_fops = { 387static const struct file_operations snapshot_fops = {
325 .open = snapshot_open, 388 .open = snapshot_open,
326 .release = snapshot_release, 389 .release = snapshot_release,
327 .read = snapshot_read, 390 .read = snapshot_read,
diff --git a/kernel/printk.c b/kernel/printk.c
index 66426552fbfe..c770e1a4e882 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,8 +53,6 @@ int console_printk[4] = {
53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
54}; 54};
55 55
56EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
57
58/* 56/*
59 * Low level drivers may need that to know if they can schedule in 57
60 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
@@ -335,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
335 } 333 }
336} 334}
337 335
336static int __read_mostly ignore_loglevel;
337
338static int __init ignore_loglevel_setup(char *str)
339{
340 ignore_loglevel = 1;
341 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
342
343 return 1;
344}
345
346__setup("ignore_loglevel", ignore_loglevel_setup);
347
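The ignore_loglevel option added above is a boot-time debugging aid: with it set, _call_console_drivers() forwards every message to the registered consoles regardless of console_loglevel. Usage is just a kernel command-line flag (bootloader syntax shown only as an example):

    linux ... ignore_loglevel

The printk in ignore_loglevel_setup() confirms at boot that the filter is being bypassed.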
338/* 348/*
339 * Write out chars from start to end - 1 inclusive 349 * Write out chars from start to end - 1 inclusive
340 */ 350 */
341static void _call_console_drivers(unsigned long start, 351static void _call_console_drivers(unsigned long start,
342 unsigned long end, int msg_log_level) 352 unsigned long end, int msg_log_level)
343{ 353{
344 if (msg_log_level < console_loglevel && 354 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
345 console_drivers && start != end) { 355 console_drivers && start != end) {
346 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 356 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
347 /* wrapped write */ 357 /* wrapped write */
@@ -631,12 +641,7 @@ EXPORT_SYMBOL(vprintk);
631 641
632asmlinkage long sys_syslog(int type, char __user *buf, int len) 642asmlinkage long sys_syslog(int type, char __user *buf, int len)
633{ 643{
634 return 0; 644 return -ENOSYS;
635}
636
637int do_syslog(int type, char __user *buf, int len)
638{
639 return 0;
640} 645}
641 646
642static void call_console_drivers(unsigned long start, unsigned long end) 647static void call_console_drivers(unsigned long start, unsigned long end)
@@ -777,7 +782,6 @@ int is_console_locked(void)
777{ 782{
778 return console_locked; 783 return console_locked;
779} 784}
780EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
781 785
782/** 786/**
783 * release_console_sem - unlock the console system 787 * release_console_sem - unlock the console system
diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec9..a6574a18514e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,10 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
40 40
41static atomic_t *prof_buffer; 41static atomic_t *prof_buffer;
42static unsigned long prof_len, prof_shift; 42static unsigned long prof_len, prof_shift;
43static int prof_on __read_mostly; 43
44int prof_on __read_mostly;
45EXPORT_SYMBOL_GPL(prof_on);
46
44static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 47static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
45#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 49static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +54,20 @@ static DEFINE_MUTEX(profile_flip_mutex);
51static int __init profile_setup(char * str) 54static int __init profile_setup(char * str)
52{ 55{
53 static char __initdata schedstr[] = "schedule"; 56 static char __initdata schedstr[] = "schedule";
57 static char __initdata sleepstr[] = "sleep";
58 static char __initdata kvmstr[] = "kvm";
54 int par; 59 int par;
55 60
56 if (!strncmp(str, schedstr, strlen(schedstr))) { 61 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
62 prof_on = SLEEP_PROFILING;
63 if (str[strlen(sleepstr)] == ',')
64 str += strlen(sleepstr) + 1;
65 if (get_option(&str, &par))
66 prof_shift = par;
67 printk(KERN_INFO
68 "kernel sleep profiling enabled (shift: %ld)\n",
69 prof_shift);
70 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
57 prof_on = SCHED_PROFILING; 71 prof_on = SCHED_PROFILING;
58 if (str[strlen(schedstr)] == ',') 72 if (str[strlen(schedstr)] == ',')
59 str += strlen(schedstr) + 1; 73 str += strlen(schedstr) + 1;
@@ -62,6 +76,15 @@ static int __init profile_setup(char * str)
62 printk(KERN_INFO 76 printk(KERN_INFO
63 "kernel schedule profiling enabled (shift: %ld)\n", 77 "kernel schedule profiling enabled (shift: %ld)\n",
64 prof_shift); 78 prof_shift);
79 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
80 prof_on = KVM_PROFILING;
81 if (str[strlen(kvmstr)] == ',')
82 str += strlen(kvmstr) + 1;
83 if (get_option(&str, &par))
84 prof_shift = par;
85 printk(KERN_INFO
86 "kernel KVM profiling enabled (shift: %ld)\n",
87 prof_shift);
65 } else if (get_option(&str, &par)) { 88 } else if (get_option(&str, &par)) {
66 prof_shift = par; 89 prof_shift = par;
67 prof_on = CPU_PROFILING; 90 prof_on = CPU_PROFILING;
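Reading profile_setup() after this change, the profile= boot parameter now accepts the following forms (shift, the histogram granularity, is optional after a comma):

    profile=schedule[,shift]    # prof_on = SCHED_PROFILING
    profile=sleep[,shift]       # prof_on = SLEEP_PROFILING  (new)
    profile=kvm[,shift]         # prof_on = KVM_PROFILING    (new)
    profile=<shift>             # plain CPU_PROFILING, as before

For example, booting with profile=sleep,2 selects sleep profiling with prof_shift = 2; the hits themselves come from the profile_hits(SLEEP_PROFILING, ...) call added to activate_task() in the sched.c hunk further down.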
@@ -204,7 +227,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
204 * positions to which hits are accounted during short intervals (e.g. 227 * positions to which hits are accounted during short intervals (e.g.
205 * several seconds) is usually very small. Exclusion from buffer 228 * several seconds) is usually very small. Exclusion from buffer
206 * flipping is provided by interrupt disablement (note that for 229 * flipping is provided by interrupt disablement (note that for
207 * SCHED_PROFILING profile_hit() may be called from process context). 230 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
231 * process context).
208 * The hash function is meant to be lightweight as opposed to strong, 232 * The hash function is meant to be lightweight as opposed to strong,
209 * and was vaguely inspired by ppc64 firmware-supported inverted 233 * and was vaguely inspired by ppc64 firmware-supported inverted
210 * pagetable hash functions, but uses a full hashtable full of finite 234 * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +281,7 @@ static void profile_discard_flip_buffers(void)
257 mutex_unlock(&profile_flip_mutex); 281 mutex_unlock(&profile_flip_mutex);
258} 282}
259 283
260void profile_hit(int type, void *__pc) 284void profile_hits(int type, void *__pc, unsigned int nr_hits)
261{ 285{
262 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 286 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
263 int i, j, cpu; 287 int i, j, cpu;
@@ -274,21 +298,31 @@ void profile_hit(int type, void *__pc)
274 put_cpu(); 298 put_cpu();
275 return; 299 return;
276 } 300 }
301 /*
302 * We buffer the global profiler buffer into a per-CPU
303 * queue and thus reduce the number of global (and possibly
304 * NUMA-alien) accesses. The write-queue is self-coalescing:
305 */
277 local_irq_save(flags); 306 local_irq_save(flags);
278 do { 307 do {
279 for (j = 0; j < PROFILE_GRPSZ; ++j) { 308 for (j = 0; j < PROFILE_GRPSZ; ++j) {
280 if (hits[i + j].pc == pc) { 309 if (hits[i + j].pc == pc) {
281 hits[i + j].hits++; 310 hits[i + j].hits += nr_hits;
282 goto out; 311 goto out;
283 } else if (!hits[i + j].hits) { 312 } else if (!hits[i + j].hits) {
284 hits[i + j].pc = pc; 313 hits[i + j].pc = pc;
285 hits[i + j].hits = 1; 314 hits[i + j].hits = nr_hits;
286 goto out; 315 goto out;
287 } 316 }
288 } 317 }
289 i = (i + secondary) & (NR_PROFILE_HIT - 1); 318 i = (i + secondary) & (NR_PROFILE_HIT - 1);
290 } while (i != primary); 319 } while (i != primary);
291 atomic_inc(&prof_buffer[pc]); 320
321 /*
322 * Add the current hit(s) and flush the write-queue out
323 * to the global buffer:
324 */
325 atomic_add(nr_hits, &prof_buffer[pc]);
292 for (i = 0; i < NR_PROFILE_HIT; ++i) { 326 for (i = 0; i < NR_PROFILE_HIT; ++i) {
293 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); 327 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
294 hits[i].pc = hits[i].hits = 0; 328 hits[i].pc = hits[i].hits = 0;
@@ -297,8 +331,8 @@ out:
297 local_irq_restore(flags); 331 local_irq_restore(flags);
298 put_cpu(); 332 put_cpu();
299} 333}
334EXPORT_SYMBOL_GPL(profile_hits);
300 335
301#ifdef CONFIG_HOTPLUG_CPU
302static int __devinit profile_cpu_callback(struct notifier_block *info, 336static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 337 unsigned long action, void *__cpu)
304{ 338{
@@ -351,19 +385,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
351 } 385 }
352 return NOTIFY_OK; 386 return NOTIFY_OK;
353} 387}
354#endif /* CONFIG_HOTPLUG_CPU */
355#else /* !CONFIG_SMP */ 388#else /* !CONFIG_SMP */
356#define profile_flip_buffers() do { } while (0) 389#define profile_flip_buffers() do { } while (0)
357#define profile_discard_flip_buffers() do { } while (0) 390#define profile_discard_flip_buffers() do { } while (0)
391#define profile_cpu_callback NULL
358 392
359void profile_hit(int type, void *__pc) 393void profile_hits(int type, void *__pc, unsigned int nr_hits)
360{ 394{
361 unsigned long pc; 395 unsigned long pc;
362 396
363 if (prof_on != type || !prof_buffer) 397 if (prof_on != type || !prof_buffer)
364 return; 398 return;
365 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 399 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
366 atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); 400 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
367} 401}
368#endif /* !CONFIG_SMP */ 402#endif /* !CONFIG_SMP */
369 403
@@ -442,7 +476,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
442 read = 0; 476 read = 0;
443 477
444 while (p < sizeof(unsigned int) && count > 0) { 478 while (p < sizeof(unsigned int) && count > 0) {
445 put_user(*((char *)(&sample_step)+p),buf); 479 if (put_user(*((char *)(&sample_step)+p),buf))
480 return -EFAULT;
446 buf++; p++; count--; read++; 481 buf++; p++; count--; read++;
447 } 482 }
448 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 483 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
@@ -480,7 +515,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
480 return count; 515 return count;
481} 516}
482 517
483static struct file_operations proc_profile_operations = { 518static const struct file_operations proc_profile_operations = {
484 .read = read_profile, 519 .read = read_profile,
485 .write = write_profile, 520 .write = write_profile,
486}; 521};
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef1..3554b76da84c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
235 235
236 list = rdp->donelist; 236 list = rdp->donelist;
237 while (list) { 237 while (list) {
238 next = rdp->donelist = list->next; 238 next = list->next;
239 prefetch(next);
239 list->func(list); 240 list->func(list);
240 list = next; 241 list = next;
241 if (++count >= rdp->blimit) 242 if (++count >= rdp->blimit)
242 break; 243 break;
243 } 244 }
245 rdp->donelist = list;
244 246
245 local_irq_disable(); 247 local_irq_disable();
246 rdp->qlen -= count; 248 rdp->qlen -= count;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f42..482b11ff65cb 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
401 cleanup_srcu_struct(&srcu_ctl); 401 cleanup_srcu_struct(&srcu_ctl);
402} 402}
403 403
404static int srcu_torture_read_lock(void) 404static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
405{ 405{
406 return srcu_read_lock(&srcu_ctl); 406 return srcu_read_lock(&srcu_ctl);
407} 407}
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
419 schedule_timeout_interruptible(longdelay); 419 schedule_timeout_interruptible(longdelay);
420} 420}
421 421
422static void srcu_torture_read_unlock(int idx) 422static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
423{ 423{
424 srcu_read_unlock(&srcu_ctl, idx); 424 srcu_read_unlock(&srcu_ctl, idx);
425} 425}
@@ -522,6 +522,7 @@ rcu_torture_writer(void *arg)
522 522
523 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 523 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
524 set_user_nice(current, 19); 524 set_user_nice(current, 19);
525 current->flags |= PF_NOFREEZE;
525 526
526 do { 527 do {
527 schedule_timeout_uninterruptible(1); 528 schedule_timeout_uninterruptible(1);
@@ -561,6 +562,7 @@ rcu_torture_fakewriter(void *arg)
561 562
562 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 563 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
563 set_user_nice(current, 19); 564 set_user_nice(current, 19);
565 current->flags |= PF_NOFREEZE;
564 566
565 do { 567 do {
566 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 568 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
@@ -591,6 +593,7 @@ rcu_torture_reader(void *arg)
591 593
592 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 594 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
593 set_user_nice(current, 19); 595 set_user_nice(current, 19);
596 current->flags |= PF_NOFREEZE;
594 597
595 do { 598 do {
596 idx = cur_ops->readlock(); 599 idx = cur_ops->readlock();
diff --git a/kernel/relay.c b/kernel/relay.c
index f04bbdb56ac2..284e2e8b4eed 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -138,7 +138,7 @@ depopulate:
138 */ 138 */
139struct rchan_buf *relay_create_buf(struct rchan *chan) 139struct rchan_buf *relay_create_buf(struct rchan *chan)
140{ 140{
141 struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL); 141 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
142 if (!buf) 142 if (!buf)
143 return NULL; 143 return NULL;
144 144
@@ -302,15 +302,16 @@ static struct rchan_callbacks default_channel_callbacks = {
302 302
303/** 303/**
304 * wakeup_readers - wake up readers waiting on a channel 304 * wakeup_readers - wake up readers waiting on a channel
 305 * @private: the channel buffer 305 * @work: work struct that contains the channel buffer
306 * 306 *
307 * This is the work function used to defer reader waking. The 307 * This is the work function used to defer reader waking. The
308 * reason waking is deferred is that calling directly from write 308 * reason waking is deferred is that calling directly from write
309 * causes problems if you're writing from say the scheduler. 309 * causes problems if you're writing from say the scheduler.
310 */ 310 */
311static void wakeup_readers(void *private) 311static void wakeup_readers(struct work_struct *work)
312{ 312{
313 struct rchan_buf *buf = private; 313 struct rchan_buf *buf =
314 container_of(work, struct rchan_buf, wake_readers.work);
314 wake_up_interruptible(&buf->read_wait); 315 wake_up_interruptible(&buf->read_wait);
315} 316}
316 317
@@ -321,14 +322,14 @@ static void wakeup_readers(void *private)
321 * 322 *
322 * See relay_reset for description of effect. 323 * See relay_reset for description of effect.
323 */ 324 */
324static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) 325static void __relay_reset(struct rchan_buf *buf, unsigned int init)
325{ 326{
326 size_t i; 327 size_t i;
327 328
328 if (init) { 329 if (init) {
329 init_waitqueue_head(&buf->read_wait); 330 init_waitqueue_head(&buf->read_wait);
330 kref_init(&buf->kref); 331 kref_init(&buf->kref);
331 INIT_WORK(&buf->wake_readers, NULL, NULL); 332 INIT_DELAYED_WORK(&buf->wake_readers, NULL);
332 } else { 333 } else {
333 cancel_delayed_work(&buf->wake_readers); 334 cancel_delayed_work(&buf->wake_readers);
334 flush_scheduled_work(); 335 flush_scheduled_work();
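The relay changes in this and the previous hunk adapt to the reworked workqueue API: a work function now takes a struct work_struct * rather than a private void *, and deferred work uses a separate struct delayed_work, so the callback recovers its context with container_of(), as wakeup_readers() now does. A minimal sketch of that pattern (names are illustrative, not from relay.c):

    #include <linux/workqueue.h>
    #include <linux/wait.h>

    struct my_buf {
            struct delayed_work wake;       /* embedded delayed work item */
            wait_queue_head_t read_wait;
    };

    static void my_wake_fn(struct work_struct *work)
    {
            /* recover the enclosing object from the embedded work_struct */
            struct my_buf *buf = container_of(work, struct my_buf, wake.work);

            wake_up_interruptible(&buf->read_wait);
    }

    static void my_buf_init(struct my_buf *buf)
    {
            init_waitqueue_head(&buf->read_wait);
            INIT_DELAYED_WORK(&buf->wake, my_wake_fn);
    }

    /* later, e.g. from the write path: defer the wakeup by one tick */
    static void my_buf_poke(struct my_buf *buf)
    {
            schedule_delayed_work(&buf->wake, 1);
    }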
@@ -417,7 +418,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan,
417 * The channel buffer and channel buffer data structure are then freed 418 * The channel buffer and channel buffer data structure are then freed
418 * automatically when the last reference is given up. 419 * automatically when the last reference is given up.
419 */ 420 */
420static inline void relay_close_buf(struct rchan_buf *buf) 421static void relay_close_buf(struct rchan_buf *buf)
421{ 422{
422 buf->finalized = 1; 423 buf->finalized = 1;
423 cancel_delayed_work(&buf->wake_readers); 424 cancel_delayed_work(&buf->wake_readers);
@@ -425,7 +426,7 @@ static inline void relay_close_buf(struct rchan_buf *buf)
425 kref_put(&buf->kref, relay_remove_buf); 426 kref_put(&buf->kref, relay_remove_buf);
426} 427}
427 428
428static inline void setup_callbacks(struct rchan *chan, 429static void setup_callbacks(struct rchan *chan,
429 struct rchan_callbacks *cb) 430 struct rchan_callbacks *cb)
430{ 431{
431 if (!cb) { 432 if (!cb) {
@@ -478,7 +479,7 @@ struct rchan *relay_open(const char *base_filename,
478 if (!(subbuf_size && n_subbufs)) 479 if (!(subbuf_size && n_subbufs))
479 return NULL; 480 return NULL;
480 481
481 chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL); 482 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
482 if (!chan) 483 if (!chan)
483 return NULL; 484 return NULL;
484 485
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
549 buf->padding[old_subbuf]; 550 buf->padding[old_subbuf];
550 smp_mb(); 551 smp_mb();
551 if (waitqueue_active(&buf->read_wait)) { 552 if (waitqueue_active(&buf->read_wait)) {
552 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); 553 PREPARE_DELAYED_WORK(&buf->wake_readers,
554 wakeup_readers);
553 schedule_delayed_work(&buf->wake_readers, 1); 555 schedule_delayed_work(&buf->wake_readers, 1);
554 } 556 }
555 } 557 }
@@ -944,11 +946,10 @@ typedef int (*subbuf_actor_t) (size_t read_start,
944/* 946/*
945 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 947 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
946 */ 948 */
947static inline ssize_t relay_file_read_subbufs(struct file *filp, 949static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
948 loff_t *ppos, 950 subbuf_actor_t subbuf_actor,
949 subbuf_actor_t subbuf_actor, 951 read_actor_t actor,
950 read_actor_t actor, 952 read_descriptor_t *desc)
951 read_descriptor_t *desc)
952{ 953{
953 struct rchan_buf *buf = filp->private_data; 954 struct rchan_buf *buf = filp->private_data;
954 size_t read_start, avail; 955 size_t read_start, avail;
@@ -957,7 +958,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
957 if (!desc->count) 958 if (!desc->count)
958 return 0; 959 return 0;
959 960
960 mutex_lock(&filp->f_dentry->d_inode->i_mutex); 961 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
961 do { 962 do {
962 if (!relay_file_read_avail(buf, *ppos)) 963 if (!relay_file_read_avail(buf, *ppos))
963 break; 964 break;
@@ -977,7 +978,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
977 *ppos = relay_file_read_end_pos(buf, read_start, ret); 978 *ppos = relay_file_read_end_pos(buf, read_start, ret);
978 } 979 }
979 } while (desc->count && ret); 980 } while (desc->count && ret);
980 mutex_unlock(&filp->f_dentry->d_inode->i_mutex); 981 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
981 982
982 return desc->written; 983 return desc->written;
983} 984}
@@ -1011,7 +1012,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
1011 actor, &desc); 1012 actor, &desc);
1012} 1013}
1013 1014
1014struct file_operations relay_file_operations = { 1015const struct file_operations relay_file_operations = {
1015 .open = relay_file_open, 1016 .open = relay_file_open,
1016 .poll = relay_file_poll, 1017 .poll = relay_file_poll,
1017 .mmap = relay_file_mmap, 1018 .mmap = relay_file_mmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143e..7b9a497419d9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
88 return 0; 88 return 0;
89} 89}
90 90
91static struct seq_operations resource_op = { 91static const struct seq_operations resource_op = {
92 .start = r_start, 92 .start = r_start,
93 .next = r_next, 93 .next = r_next,
94 .stop = r_stop, 94 .stop = r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
115 return res; 115 return res;
116} 116}
117 117
118static struct file_operations proc_ioports_operations = { 118static const struct file_operations proc_ioports_operations = {
119 .open = ioports_open, 119 .open = ioports_open,
120 .read = seq_read, 120 .read = seq_read,
121 .llseek = seq_lseek, 121 .llseek = seq_lseek,
122 .release = seq_release, 122 .release = seq_release,
123}; 123};
124 124
125static struct file_operations proc_iomem_operations = { 125static const struct file_operations proc_iomem_operations = {
126 .open = iomem_open, 126 .open = iomem_open,
127 .read = seq_read, 127 .read = seq_read,
128 .llseek = seq_lseek, 128 .llseek = seq_lseek,
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c94..015fc633c96c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/sysdev.h> 14#include <linux/sysdev.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h>
16 17
17#include "rtmutex.h" 18#include "rtmutex.h"
18 19
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701c680e..cca93cc0dd7d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/profile.h> 36#include <linux/profile.h>
37#include <linux/suspend.h> 37#include <linux/freezer.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
@@ -225,8 +225,10 @@ struct rq {
225 unsigned long nr_uninterruptible; 225 unsigned long nr_uninterruptible;
226 226
227 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
228 unsigned long long timestamp_last_tick; 228 /* Cached timestamp set by update_cpu_clock() */
229 unsigned long long most_recent_timestamp;
229 struct task_struct *curr, *idle; 230 struct task_struct *curr, *idle;
231 unsigned long next_balance;
230 struct mm_struct *prev_mm; 232 struct mm_struct *prev_mm;
231 struct prio_array *active, *expired, arrays[2]; 233 struct prio_array *active, *expired, arrays[2];
232 int best_expired_prio; 234 int best_expired_prio;
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
426 * bump this up when changing the output format or the meaning of an existing 428 * bump this up when changing the output format or the meaning of an existing
427 * format, so that tools can adapt (or abort) 429 * format, so that tools can adapt (or abort)
428 */ 430 */
429#define SCHEDSTAT_VERSION 12 431#define SCHEDSTAT_VERSION 14
430 432
431static int show_schedstat(struct seq_file *seq, void *v) 433static int show_schedstat(struct seq_file *seq, void *v)
432{ 434{
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
464 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 466 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
465 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 467 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
466 itype++) { 468 itype++) {
467 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 469 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
470 "%lu",
468 sd->lb_cnt[itype], 471 sd->lb_cnt[itype],
469 sd->lb_balanced[itype], 472 sd->lb_balanced[itype],
470 sd->lb_failed[itype], 473 sd->lb_failed[itype],
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
474 sd->lb_nobusyq[itype], 477 sd->lb_nobusyq[itype],
475 sd->lb_nobusyg[itype]); 478 sd->lb_nobusyg[itype]);
476 } 479 }
477 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", 480 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
481 " %lu %lu %lu\n",
478 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 482 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
479 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 483 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
480 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 484 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
481 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 485 sd->ttwu_wake_remote, sd->ttwu_move_affine,
486 sd->ttwu_move_balance);
482 } 487 }
483 preempt_enable(); 488 preempt_enable();
484#endif 489#endif
@@ -505,7 +510,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
505 return res; 510 return res;
506} 511}
507 512
508struct file_operations proc_schedstat_operations = { 513const struct file_operations proc_schedstat_operations = {
509 .open = schedstat_open, 514 .open = schedstat_open,
510 .read = seq_read, 515 .read = seq_read,
511 .llseek = seq_lseek, 516 .llseek = seq_lseek,
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
547#endif 552#endif
548 553
549/* 554/*
550 * rq_lock - lock a given runqueue and disable interrupts. 555 * this_rq_lock - lock this runqueue and disable interrupts.
551 */ 556 */
552static inline struct rq *this_rq_lock(void) 557static inline struct rq *this_rq_lock(void)
553 __acquires(rq->lock) 558 __acquires(rq->lock)
@@ -938,18 +943,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
938{ 943{
939 unsigned long long now; 944 unsigned long long now;
940 945
946 if (rt_task(p))
947 goto out;
948
941 now = sched_clock(); 949 now = sched_clock();
942#ifdef CONFIG_SMP 950#ifdef CONFIG_SMP
943 if (!local) { 951 if (!local) {
944 /* Compensate for drifting sched_clock */ 952 /* Compensate for drifting sched_clock */
945 struct rq *this_rq = this_rq(); 953 struct rq *this_rq = this_rq();
946 now = (now - this_rq->timestamp_last_tick) 954 now = (now - this_rq->most_recent_timestamp)
947 + rq->timestamp_last_tick; 955 + rq->most_recent_timestamp;
948 } 956 }
949#endif 957#endif
950 958
951 if (!rt_task(p)) 959 /*
952 p->prio = recalc_task_prio(p, now); 960 * Sleep time is in units of nanosecs, so shift by 20 to get a
961 * milliseconds-range estimation of the amount of time that the task
962 * spent sleeping:
963 */
964 if (unlikely(prof_on == SLEEP_PROFILING)) {
965 if (p->state == TASK_UNINTERRUPTIBLE)
966 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
967 (now - p->timestamp) >> 20);
968 }
969
970 p->prio = recalc_task_prio(p, now);
953 971
954 /* 972 /*
955 * This checks to make sure it's not an uninterruptible task 973 * This checks to make sure it's not an uninterruptible task
@@ -974,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
974 } 992 }
975 } 993 }
976 p->timestamp = now; 994 p->timestamp = now;
977 995out:
978 __activate_task(p, rq); 996 __activate_task(p, rq);
979} 997}
980 998
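A note on the ">> 20" used for SLEEP_PROFILING above: 2^20 = 1,048,576, so shifting the nanosecond delta right by 20 bits approximates a division by one million, i.e. converts to milliseconds while underestimating by roughly 5%. For instance, an uninterruptible sleep of 52,428,800 ns (about 52.4 ms) is accounted as (52,428,800 >> 20) = 50 hits against the task's wchan, which is close enough for a profiling histogram.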
@@ -1439,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1439 1457
1440 if (this_sd->flags & SD_WAKE_AFFINE) { 1458 if (this_sd->flags & SD_WAKE_AFFINE) {
1441 unsigned long tl = this_load; 1459 unsigned long tl = this_load;
1442 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); 1460 unsigned long tl_per_task;
1461
1462 tl_per_task = cpu_avg_load_per_task(this_cpu);
1443 1463
1444 /* 1464 /*
1445 * If sync wakeup then subtract the (maximum possible) 1465 * If sync wakeup then subtract the (maximum possible)
@@ -1547,6 +1567,7 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1547 return try_to_wake_up(p, state, 0); 1567 return try_to_wake_up(p, state, 0);
1548} 1568}
1549 1569
1570static void task_running_tick(struct rq *rq, struct task_struct *p);
1550/* 1571/*
1551 * Perform scheduler related setup for a newly forked process p. 1572 * Perform scheduler related setup for a newly forked process p.
1552 * p is forked by current. 1573 * p is forked by current.
@@ -1607,7 +1628,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1607 * runqueue lock is not a problem. 1628 * runqueue lock is not a problem.
1608 */ 1629 */
1609 current->time_slice = 1; 1630 current->time_slice = 1;
1610 scheduler_tick(); 1631 task_running_tick(cpu_rq(cpu), current);
1611 } 1632 }
1612 local_irq_enable(); 1633 local_irq_enable();
1613 put_cpu(); 1634 put_cpu();
@@ -1677,8 +1698,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1677 * Not the local CPU - must adjust timestamp. This should 1698 * Not the local CPU - must adjust timestamp. This should
1678 * get optimised away in the !CONFIG_SMP case. 1699 * get optimised away in the !CONFIG_SMP case.
1679 */ 1700 */
1680 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1701 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1681 + rq->timestamp_last_tick; 1702 + rq->most_recent_timestamp;
1682 __activate_task(p, rq); 1703 __activate_task(p, rq);
1683 if (TASK_PREEMPTS_CURR(p, rq)) 1704 if (TASK_PREEMPTS_CURR(p, rq))
1684 resched_task(rq->curr); 1705 resched_task(rq->curr);
@@ -1941,6 +1962,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1941 __acquires(rq1->lock) 1962 __acquires(rq1->lock)
1942 __acquires(rq2->lock) 1963 __acquires(rq2->lock)
1943{ 1964{
1965 BUG_ON(!irqs_disabled());
1944 if (rq1 == rq2) { 1966 if (rq1 == rq2) {
1945 spin_lock(&rq1->lock); 1967 spin_lock(&rq1->lock);
1946 __acquire(rq2->lock); /* Fake it out ;) */ 1968 __acquire(rq2->lock); /* Fake it out ;) */
@@ -1980,6 +2002,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1980 __acquires(busiest->lock) 2002 __acquires(busiest->lock)
1981 __acquires(this_rq->lock) 2003 __acquires(this_rq->lock)
1982{ 2004{
2005 if (unlikely(!irqs_disabled())) {
 2006 /* printk() doesn't work well under rq->lock */
2007 spin_unlock(&this_rq->lock);
2008 BUG_ON(1);
2009 }
1983 if (unlikely(!spin_trylock(&busiest->lock))) { 2010 if (unlikely(!spin_trylock(&busiest->lock))) {
1984 if (busiest < this_rq) { 2011 if (busiest < this_rq) {
1985 spin_unlock(&this_rq->lock); 2012 spin_unlock(&this_rq->lock);
@@ -2050,8 +2077,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2050 set_task_cpu(p, this_cpu); 2077 set_task_cpu(p, this_cpu);
2051 inc_nr_running(p, this_rq); 2078 inc_nr_running(p, this_rq);
2052 enqueue_task(p, this_array); 2079 enqueue_task(p, this_array);
2053 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2080 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2054 + this_rq->timestamp_last_tick; 2081 + this_rq->most_recent_timestamp;
2055 /* 2082 /*
2056 * Note that idle threads have a prio of MAX_PRIO, for this test 2083 * Note that idle threads have a prio of MAX_PRIO, for this test
2057 * to be always true for them. 2084 * to be always true for them.
@@ -2087,10 +2114,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2087 * 2) too many balance attempts have failed. 2114 * 2) too many balance attempts have failed.
2088 */ 2115 */
2089 2116
2090 if (sd->nr_balance_failed > sd->cache_nice_tries) 2117 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2118#ifdef CONFIG_SCHEDSTATS
2119 if (task_hot(p, rq->most_recent_timestamp, sd))
2120 schedstat_inc(sd, lb_hot_gained[idle]);
2121#endif
2091 return 1; 2122 return 1;
2123 }
2092 2124
2093 if (task_hot(p, rq->timestamp_last_tick, sd)) 2125 if (task_hot(p, rq->most_recent_timestamp, sd))
2094 return 0; 2126 return 0;
2095 return 1; 2127 return 1;
2096} 2128}
@@ -2188,11 +2220,6 @@ skip_queue:
2188 goto skip_bitmap; 2220 goto skip_bitmap;
2189 } 2221 }
2190 2222
2191#ifdef CONFIG_SCHEDSTATS
2192 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
2193 schedstat_inc(sd, lb_hot_gained[idle]);
2194#endif
2195
2196 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2223 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2197 pulled++; 2224 pulled++;
2198 rem_load_move -= tmp->load_weight; 2225 rem_load_move -= tmp->load_weight;
@@ -2230,7 +2257,7 @@ out:
2230static struct sched_group * 2257static struct sched_group *
2231find_busiest_group(struct sched_domain *sd, int this_cpu, 2258find_busiest_group(struct sched_domain *sd, int this_cpu,
2232 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2259 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2233 cpumask_t *cpus) 2260 cpumask_t *cpus, int *balance)
2234{ 2261{
2235 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2262 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2236 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2263 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2259,10 +2286,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2259 unsigned long load, group_capacity; 2286 unsigned long load, group_capacity;
2260 int local_group; 2287 int local_group;
2261 int i; 2288 int i;
2289 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2262 unsigned long sum_nr_running, sum_weighted_load; 2290 unsigned long sum_nr_running, sum_weighted_load;
2263 2291
2264 local_group = cpu_isset(this_cpu, group->cpumask); 2292 local_group = cpu_isset(this_cpu, group->cpumask);
2265 2293
2294 if (local_group)
2295 balance_cpu = first_cpu(group->cpumask);
2296
2266 /* Tally up the load of all CPUs in the group */ 2297 /* Tally up the load of all CPUs in the group */
2267 sum_weighted_load = sum_nr_running = avg_load = 0; 2298 sum_weighted_load = sum_nr_running = avg_load = 0;
2268 2299
@@ -2278,9 +2309,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2278 *sd_idle = 0; 2309 *sd_idle = 0;
2279 2310
2280 /* Bias balancing toward cpus of our domain */ 2311 /* Bias balancing toward cpus of our domain */
2281 if (local_group) 2312 if (local_group) {
2313 if (idle_cpu(i) && !first_idle_cpu) {
2314 first_idle_cpu = 1;
2315 balance_cpu = i;
2316 }
2317
2282 load = target_load(i, load_idx); 2318 load = target_load(i, load_idx);
2283 else 2319 } else
2284 load = source_load(i, load_idx); 2320 load = source_load(i, load_idx);
2285 2321
2286 avg_load += load; 2322 avg_load += load;
@@ -2288,6 +2324,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2288 sum_weighted_load += rq->raw_weighted_load; 2324 sum_weighted_load += rq->raw_weighted_load;
2289 } 2325 }
2290 2326
2327 /*
2328 * First idle cpu or the first cpu(busiest) in this sched group
2329 * is eligible for doing load balancing at this and above
2330 * domains.
2331 */
2332 if (local_group && balance_cpu != this_cpu && balance) {
2333 *balance = 0;
2334 goto ret;
2335 }
2336
2291 total_load += avg_load; 2337 total_load += avg_load;
2292 total_pwr += group->cpu_power; 2338 total_pwr += group->cpu_power;
2293 2339
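The effect of the new balance_cpu logic above: within each local sched group, only one CPU, the first idle CPU found, or the group's first CPU if none is idle, is allowed to continue load balancing for this domain; every other CPU sees *balance cleared and bails out of load_balance() early. For example, with a group of CPUs 0-3 where CPU 1 is idle, a call from this_cpu = 3 picks balance_cpu = 1, sets *balance to 0 and returns NULL, leaving the actual balancing to CPU 1 and avoiding several CPUs redundantly scanning the same domain.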
@@ -2447,18 +2493,21 @@ small_imbalance:
2447 pwr_now /= SCHED_LOAD_SCALE; 2493 pwr_now /= SCHED_LOAD_SCALE;
2448 2494
2449 /* Amount of load we'd subtract */ 2495 /* Amount of load we'd subtract */
2450 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; 2496 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2497 busiest->cpu_power;
2451 if (max_load > tmp) 2498 if (max_load > tmp)
2452 pwr_move += busiest->cpu_power * 2499 pwr_move += busiest->cpu_power *
2453 min(busiest_load_per_task, max_load - tmp); 2500 min(busiest_load_per_task, max_load - tmp);
2454 2501
2455 /* Amount of load we'd add */ 2502 /* Amount of load we'd add */
2456 if (max_load*busiest->cpu_power < 2503 if (max_load * busiest->cpu_power <
2457 busiest_load_per_task*SCHED_LOAD_SCALE) 2504 busiest_load_per_task * SCHED_LOAD_SCALE)
2458 tmp = max_load*busiest->cpu_power/this->cpu_power; 2505 tmp = max_load * busiest->cpu_power / this->cpu_power;
2459 else 2506 else
2460 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; 2507 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2461 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); 2508 this->cpu_power;
2509 pwr_move += this->cpu_power *
2510 min(this_load_per_task, this_load + tmp);
2462 pwr_move /= SCHED_LOAD_SCALE; 2511 pwr_move /= SCHED_LOAD_SCALE;
2463 2512
2464 /* Move if we gain throughput */ 2513 /* Move if we gain throughput */
@@ -2479,8 +2528,8 @@ out_balanced:
2479 *imbalance = min_load_per_task; 2528 *imbalance = min_load_per_task;
2480 return group_min; 2529 return group_min;
2481 } 2530 }
2482ret:
2483#endif 2531#endif
2532ret:
2484 *imbalance = 0; 2533 *imbalance = 0;
2485 return NULL; 2534 return NULL;
2486} 2535}
@@ -2529,17 +2578,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2529/* 2578/*
2530 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2579 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2531 * tasks if there is an imbalance. 2580 * tasks if there is an imbalance.
2532 *
2533 * Called with this_rq unlocked.
2534 */ 2581 */
2535static int load_balance(int this_cpu, struct rq *this_rq, 2582static int load_balance(int this_cpu, struct rq *this_rq,
2536 struct sched_domain *sd, enum idle_type idle) 2583 struct sched_domain *sd, enum idle_type idle,
2584 int *balance)
2537{ 2585{
2538 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2586 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2539 struct sched_group *group; 2587 struct sched_group *group;
2540 unsigned long imbalance; 2588 unsigned long imbalance;
2541 struct rq *busiest; 2589 struct rq *busiest;
2542 cpumask_t cpus = CPU_MASK_ALL; 2590 cpumask_t cpus = CPU_MASK_ALL;
2591 unsigned long flags;
2543 2592
2544 /* 2593 /*
2545 * When power savings policy is enabled for the parent domain, idle 2594 * When power savings policy is enabled for the parent domain, idle
@@ -2555,7 +2604,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2555 2604
2556redo: 2605redo:
2557 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2606 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2558 &cpus); 2607 &cpus, balance);
2608
2609 if (*balance == 0)
2610 goto out_balanced;
2611
2559 if (!group) { 2612 if (!group) {
2560 schedstat_inc(sd, lb_nobusyg[idle]); 2613 schedstat_inc(sd, lb_nobusyg[idle]);
2561 goto out_balanced; 2614 goto out_balanced;
@@ -2579,11 +2632,13 @@ redo:
2579 * still unbalanced. nr_moved simply stays zero, so it is 2632 * still unbalanced. nr_moved simply stays zero, so it is
2580 * correctly treated as an imbalance. 2633 * correctly treated as an imbalance.
2581 */ 2634 */
2635 local_irq_save(flags);
2582 double_rq_lock(this_rq, busiest); 2636 double_rq_lock(this_rq, busiest);
2583 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2637 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2584 minus_1_or_zero(busiest->nr_running), 2638 minus_1_or_zero(busiest->nr_running),
2585 imbalance, sd, idle, &all_pinned); 2639 imbalance, sd, idle, &all_pinned);
2586 double_rq_unlock(this_rq, busiest); 2640 double_rq_unlock(this_rq, busiest);
2641 local_irq_restore(flags);
2587 2642
2588 /* All tasks on this runqueue were pinned by CPU affinity */ 2643 /* All tasks on this runqueue were pinned by CPU affinity */
2589 if (unlikely(all_pinned)) { 2644 if (unlikely(all_pinned)) {
@@ -2600,13 +2655,13 @@ redo:
2600 2655
2601 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2656 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2602 2657
2603 spin_lock(&busiest->lock); 2658 spin_lock_irqsave(&busiest->lock, flags);
2604 2659
2605 /* don't kick the migration_thread, if the curr 2660 /* don't kick the migration_thread, if the curr
2606 * task on busiest cpu can't be moved to this_cpu 2661 * task on busiest cpu can't be moved to this_cpu
2607 */ 2662 */
2608 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2663 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2609 spin_unlock(&busiest->lock); 2664 spin_unlock_irqrestore(&busiest->lock, flags);
2610 all_pinned = 1; 2665 all_pinned = 1;
2611 goto out_one_pinned; 2666 goto out_one_pinned;
2612 } 2667 }
@@ -2616,7 +2671,7 @@ redo:
2616 busiest->push_cpu = this_cpu; 2671 busiest->push_cpu = this_cpu;
2617 active_balance = 1; 2672 active_balance = 1;
2618 } 2673 }
2619 spin_unlock(&busiest->lock); 2674 spin_unlock_irqrestore(&busiest->lock, flags);
2620 if (active_balance) 2675 if (active_balance)
2621 wake_up_process(busiest->migration_thread); 2676 wake_up_process(busiest->migration_thread);
2622 2677
@@ -2695,7 +2750,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2695 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2750 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2696redo: 2751redo:
2697 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2752 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2698 &sd_idle, &cpus); 2753 &sd_idle, &cpus, NULL);
2699 if (!group) { 2754 if (!group) {
2700 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2755 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2701 goto out_balanced; 2756 goto out_balanced;
@@ -2755,14 +2810,28 @@ out_balanced:
2755static void idle_balance(int this_cpu, struct rq *this_rq) 2810static void idle_balance(int this_cpu, struct rq *this_rq)
2756{ 2811{
2757 struct sched_domain *sd; 2812 struct sched_domain *sd;
2813 int pulled_task = 0;
2814 unsigned long next_balance = jiffies + 60 * HZ;
2758 2815
2759 for_each_domain(this_cpu, sd) { 2816 for_each_domain(this_cpu, sd) {
2760 if (sd->flags & SD_BALANCE_NEWIDLE) { 2817 if (sd->flags & SD_BALANCE_NEWIDLE) {
2761 /* If we've pulled tasks over stop searching: */ 2818 /* If we've pulled tasks over stop searching: */
2762 if (load_balance_newidle(this_cpu, this_rq, sd)) 2819 pulled_task = load_balance_newidle(this_cpu,
2820 this_rq, sd);
2821 if (time_after(next_balance,
2822 sd->last_balance + sd->balance_interval))
2823 next_balance = sd->last_balance
2824 + sd->balance_interval;
2825 if (pulled_task)
2763 break; 2826 break;
2764 } 2827 }
2765 } 2828 }
2829 if (!pulled_task)
2830 /*
2831 * We are going idle. next_balance may be set based on
2832 * a busy processor. So reset next_balance.
2833 */
2834 this_rq->next_balance = next_balance;
2766} 2835}
2767 2836
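Note: idle_balance() now also tracks the earliest moment any of the newly-idle domains will next be due and, when nothing was pulled, stores it in this_rq->next_balance, so a CPU that goes idle is not left waiting on a deadline computed while it was busy. The earliest-deadline bookkeeping relies on wrap-safe jiffies comparisons; a small sketch of the pattern, assuming ordinary unsigned longs in place of jiffies:

    #include <stdio.h>

    /* Wrap-safe comparison, same idea as the kernel's time_after(). */
    #define time_after(a, b)  ((long)((b) - (a)) < 0)

    struct domain {
        unsigned long last_balance;
        unsigned long balance_interval;
    };

    int main(void)
    {
        unsigned long jiffies = 1000;
        unsigned long next_balance = jiffies + 60 * 250;    /* "60*HZ", HZ=250 */
        struct domain doms[] = {
            { .last_balance = 990, .balance_interval = 8 },
            { .last_balance = 900, .balance_interval = 64 },
        };
        unsigned int i;

        for (i = 0; i < sizeof(doms) / sizeof(doms[0]); i++) {
            unsigned long due = doms[i].last_balance + doms[i].balance_interval;

            if (time_after(next_balance, due))
                next_balance = due;     /* keep the earliest deadline */
        }
        printf("next balance due at jiffy %lu\n", next_balance);
        return 0;
    }

time_after(a, b) stays correct across the jiffies wrap because the subtraction is evaluated as a signed difference.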
2768/* 2837/*
@@ -2815,26 +2884,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2815 spin_unlock(&target_rq->lock); 2884 spin_unlock(&target_rq->lock);
2816} 2885}
2817 2886
2818/* 2887static void update_load(struct rq *this_rq)
2819 * rebalance_tick will get called every timer tick, on every CPU.
2820 *
2821 * It checks each scheduling domain to see if it is due to be balanced,
2822 * and initiates a balancing operation if so.
2823 *
2824 * Balancing parameters are set up in arch_init_sched_domains.
2825 */
2826
2827/* Don't have all balancing operations going off at once: */
2828static inline unsigned long cpu_offset(int cpu)
2829{
2830 return jiffies + cpu * HZ / NR_CPUS;
2831}
2832
2833static void
2834rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2835{ 2888{
2836 unsigned long this_load, interval, j = cpu_offset(this_cpu); 2889 unsigned long this_load;
2837 struct sched_domain *sd;
2838 int i, scale; 2890 int i, scale;
2839 2891
2840 this_load = this_rq->raw_weighted_load; 2892 this_load = this_rq->raw_weighted_load;
@@ -2854,6 +2906,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2854 new_load += scale-1; 2906 new_load += scale-1;
2855 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2907 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2856 } 2908 }
2909}
2910
2911/*
2912 * run_rebalance_domains is triggered when needed from the scheduler tick.
2913 *
2914 * It checks each scheduling domain to see if it is due to be balanced,
2915 * and initiates a balancing operation if so.
2916 *
2917 * Balancing parameters are set up in arch_init_sched_domains.
2918 */
2919static DEFINE_SPINLOCK(balancing);
2920
2921static void run_rebalance_domains(struct softirq_action *h)
2922{
2923 int this_cpu = smp_processor_id(), balance = 1;
2924 struct rq *this_rq = cpu_rq(this_cpu);
2925 unsigned long interval;
2926 struct sched_domain *sd;
2927 /*
2928 * We are idle if there are no processes running. This
2929 * is valid even if we are the idle process (SMT).
2930 */
2931 enum idle_type idle = !this_rq->nr_running ?
2932 SCHED_IDLE : NOT_IDLE;
2933 /* Earliest time when we have to call run_rebalance_domains again */
2934 unsigned long next_balance = jiffies + 60*HZ;
2857 2935
2858 for_each_domain(this_cpu, sd) { 2936 for_each_domain(this_cpu, sd) {
2859 if (!(sd->flags & SD_LOAD_BALANCE)) 2937 if (!(sd->flags & SD_LOAD_BALANCE))
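Note: update_load(), split out of the old rebalance_tick() above, keeps the same per-runqueue load tracking: cpu_load[i] is a decaying average of the runqueue's weighted load, and because scale doubles with each index, higher indexes react more slowly. A self-contained sketch of that smoothing; the round-up-when-rising guard follows the surrounding kernel loop, which keeps the average from stalling just below its target:

    #include <stdio.h>

    #define NR_LOAD_IDX 3

    static void update_load(unsigned long cpu_load[NR_LOAD_IDX],
                            unsigned long raw_weighted_load)
    {
        int i, scale;

        for (i = 0, scale = 1; i < NR_LOAD_IDX; i++, scale += scale) {
            unsigned long old_load = cpu_load[i];
            unsigned long new_load = raw_weighted_load;

            /* round the division up while load is rising */
            if (new_load > old_load)
                new_load += scale - 1;
            cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
        }
    }

    int main(void)
    {
        unsigned long load[NR_LOAD_IDX] = { 0, 0, 0 };
        int t;

        for (t = 0; t < 5; t++) {
            update_load(load, 1024);    /* one nice-0 task's worth of load */
            printf("tick %d: %lu %lu %lu\n", t, load[0], load[1], load[2]);
        }
        return 0;
    }

Running it shows cpu_load[0] jumping straight to the new load while cpu_load[2] climbs toward it over several ticks.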
@@ -2868,8 +2946,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2868 if (unlikely(!interval)) 2946 if (unlikely(!interval))
2869 interval = 1; 2947 interval = 1;
2870 2948
2871 if (j - sd->last_balance >= interval) { 2949 if (sd->flags & SD_SERIALIZE) {
2872 if (load_balance(this_cpu, this_rq, sd, idle)) { 2950 if (!spin_trylock(&balancing))
2951 goto out;
2952 }
2953
2954 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2955 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
2873 /* 2956 /*
2874 * We've pulled tasks over so either we're no 2957 * We've pulled tasks over so either we're no
2875 * longer idle, or one of our SMT siblings is 2958 * longer idle, or one of our SMT siblings is
@@ -2877,39 +2960,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2877 */ 2960 */
2878 idle = NOT_IDLE; 2961 idle = NOT_IDLE;
2879 } 2962 }
2880 sd->last_balance += interval; 2963 sd->last_balance = jiffies;
2881 } 2964 }
2965 if (sd->flags & SD_SERIALIZE)
2966 spin_unlock(&balancing);
2967out:
2968 if (time_after(next_balance, sd->last_balance + interval))
2969 next_balance = sd->last_balance + interval;
2970
2971 /*
2972 * Stop the load balance at this level. There is another
2973 * CPU in our sched group which is doing load balancing more
2974 * actively.
2975 */
2976 if (!balance)
2977 break;
2882 } 2978 }
2979 this_rq->next_balance = next_balance;
2883} 2980}
2884#else 2981#else
2885/* 2982/*
2886 * on UP we do not need to balance between CPUs: 2983 * on UP we do not need to balance between CPUs:
2887 */ 2984 */
2888static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2889{
2890}
2891static inline void idle_balance(int cpu, struct rq *rq) 2985static inline void idle_balance(int cpu, struct rq *rq)
2892{ 2986{
2893} 2987}
2894#endif 2988#endif
2895 2989
2896static inline int wake_priority_sleeper(struct rq *rq) 2990static inline void wake_priority_sleeper(struct rq *rq)
2897{ 2991{
2898 int ret = 0;
2899
2900#ifdef CONFIG_SCHED_SMT 2992#ifdef CONFIG_SCHED_SMT
2993 if (!rq->nr_running)
2994 return;
2995
2901 spin_lock(&rq->lock); 2996 spin_lock(&rq->lock);
2902 /* 2997 /*
2903 * If an SMT sibling task has been put to sleep for priority 2998 * If an SMT sibling task has been put to sleep for priority
2904 * reasons reschedule the idle task to see if it can now run. 2999 * reasons reschedule the idle task to see if it can now run.
2905 */ 3000 */
2906 if (rq->nr_running) { 3001 if (rq->nr_running)
2907 resched_task(rq->idle); 3002 resched_task(rq->idle);
2908 ret = 1;
2909 }
2910 spin_unlock(&rq->lock); 3003 spin_unlock(&rq->lock);
2911#endif 3004#endif
2912 return ret;
2913} 3005}
2914 3006
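Note: since the domain walk now runs from a softirq on every eligible CPU, domains flagged SD_SERIALIZE (typically the wide NUMA-level domains) are additionally funneled through the global "balancing" spinlock with spin_trylock(): if another CPU is already doing that expensive pass, this one simply skips it rather than spinning. A userspace analogue of that try-or-skip pattern, using pthreads purely for illustration:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t balancing = PTHREAD_MUTEX_INITIALIZER;

    /* Expensive, cross-node rebalance pass: run by at most one thread
     * at a time, everyone else skips instead of blocking. */
    static void rebalance_serialized(int cpu)
    {
        if (pthread_mutex_trylock(&balancing) != 0) {
            printf("cpu %d: someone else is balancing, skip\n", cpu);
            return;
        }
        printf("cpu %d: doing the serialized balance pass\n", cpu);
        pthread_mutex_unlock(&balancing);
    }

    static void *cpu_thread(void *arg)
    {
        rebalance_serialized((int)(long)arg);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        long i;

        for (i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, cpu_thread, (void *)i);
        for (i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        return 0;
    }

Depending on timing, threads that find the lock busy skip the pass instead of queueing behind it, which is exactly the behaviour wanted for the expensive wide domains.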
2915DEFINE_PER_CPU(struct kernel_stat, kstat); 3007DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -2923,7 +3015,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2923static inline void 3015static inline void
2924update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) 3016update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2925{ 3017{
2926 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); 3018 p->sched_time += now - p->last_ran;
3019 p->last_ran = rq->most_recent_timestamp = now;
2927} 3020}
2928 3021
2929/* 3022/*
@@ -2936,8 +3029,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
2936 unsigned long flags; 3029 unsigned long flags;
2937 3030
2938 local_irq_save(flags); 3031 local_irq_save(flags);
2939 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); 3032 ns = p->sched_time + sched_clock() - p->last_ran;
2940 ns = p->sched_time + sched_clock() - ns;
2941 local_irq_restore(flags); 3033 local_irq_restore(flags);
2942 3034
2943 return ns; 3035 return ns;
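Note: task CPU-time accounting moves from the per-runqueue timestamp_last_tick to a per-task last_ran stamp: every tick folds now - p->last_ran into p->sched_time, and current_sched_time() just adds whatever has elapsed since the last stamp. A tiny sketch of that bookkeeping, with sched_clock() faked as a plain counter:

    #include <stdio.h>

    struct task {
        unsigned long long sched_time;  /* total ns accounted so far */
        unsigned long long last_ran;    /* clock value at last accounting */
    };

    static unsigned long long fake_clock;  /* stands in for sched_clock() */

    static void tick(struct task *p)
    {
        unsigned long long now = fake_clock;

        p->sched_time += now - p->last_ran;
        p->last_ran = now;
    }

    static unsigned long long current_sched_time(const struct task *p)
    {
        /* time accounted so far plus whatever ran since the last tick */
        return p->sched_time + (fake_clock - p->last_ran);
    }

    int main(void)
    {
        struct task t = { 0, 0 };

        fake_clock = 1000000; tick(&t);
        fake_clock = 2500000; tick(&t);
        fake_clock = 2900000;   /* partway to the next tick */
        printf("sched_time=%llu current=%llu\n",
               t.sched_time, current_sched_time(&t));
        return 0;
    }

The printed current value exceeds sched_time by exactly the 400000 ns that have elapsed since the last tick.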
@@ -3037,35 +3129,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3037 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3129 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3038} 3130}
3039 3131
3040/* 3132static void task_running_tick(struct rq *rq, struct task_struct *p)
3041 * This function gets called by the timer code, with HZ frequency.
3042 * We call it with interrupts disabled.
3043 *
3044 * It also gets called by the fork code, when changing the parent's
3045 * timeslices.
3046 */
3047void scheduler_tick(void)
3048{ 3133{
3049 unsigned long long now = sched_clock();
3050 struct task_struct *p = current;
3051 int cpu = smp_processor_id();
3052 struct rq *rq = cpu_rq(cpu);
3053
3054 update_cpu_clock(p, rq, now);
3055
3056 rq->timestamp_last_tick = now;
3057
3058 if (p == rq->idle) {
3059 if (wake_priority_sleeper(rq))
3060 goto out;
3061 rebalance_tick(cpu, rq, SCHED_IDLE);
3062 return;
3063 }
3064
3065 /* Task might have expired already, but not scheduled off yet */
3066 if (p->array != rq->active) { 3134 if (p->array != rq->active) {
3135 /* Task has expired but was not scheduled yet */
3067 set_tsk_need_resched(p); 3136 set_tsk_need_resched(p);
3068 goto out; 3137 return;
3069 } 3138 }
3070 spin_lock(&rq->lock); 3139 spin_lock(&rq->lock);
3071 /* 3140 /*
@@ -3133,8 +3202,34 @@ void scheduler_tick(void)
3133 } 3202 }
3134out_unlock: 3203out_unlock:
3135 spin_unlock(&rq->lock); 3204 spin_unlock(&rq->lock);
3136out: 3205}
3137 rebalance_tick(cpu, rq, NOT_IDLE); 3206
3207/*
3208 * This function gets called by the timer code, with HZ frequency.
3209 * We call it with interrupts disabled.
3210 *
3211 * It also gets called by the fork code, when changing the parent's
3212 * timeslices.
3213 */
3214void scheduler_tick(void)
3215{
3216 unsigned long long now = sched_clock();
3217 struct task_struct *p = current;
3218 int cpu = smp_processor_id();
3219 struct rq *rq = cpu_rq(cpu);
3220
3221 update_cpu_clock(p, rq, now);
3222
3223 if (p == rq->idle)
3224 /* Task on the idle queue */
3225 wake_priority_sleeper(rq);
3226 else
3227 task_running_tick(rq, p);
3228#ifdef CONFIG_SMP
3229 update_load(rq);
3230 if (time_after_eq(jiffies, rq->next_balance))
3231 raise_softirq(SCHED_SOFTIRQ);
3232#endif
3138} 3233}
3139 3234
3140#ifdef CONFIG_SCHED_SMT 3235#ifdef CONFIG_SCHED_SMT
@@ -3280,7 +3375,8 @@ void fastcall add_preempt_count(int val)
3280 /* 3375 /*
3281 * Spinlock count overflowing soon? 3376 * Spinlock count overflowing soon?
3282 */ 3377 */
3283 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3378 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3379 PREEMPT_MASK - 10);
3284} 3380}
3285EXPORT_SYMBOL(add_preempt_count); 3381EXPORT_SYMBOL(add_preempt_count);
3286 3382
@@ -3333,6 +3429,9 @@ asmlinkage void __sched schedule(void)
3333 printk(KERN_ERR "BUG: scheduling while atomic: " 3429 printk(KERN_ERR "BUG: scheduling while atomic: "
3334 "%s/0x%08x/%d\n", 3430 "%s/0x%08x/%d\n",
3335 current->comm, preempt_count(), current->pid); 3431 current->comm, preempt_count(), current->pid);
3432 debug_show_held_locks(current);
3433 if (irqs_disabled())
3434 print_irqtrace_events(current);
3336 dump_stack(); 3435 dump_stack();
3337 } 3436 }
3338 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3437 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4518,15 +4617,6 @@ asmlinkage long sys_sched_yield(void)
4518 return 0; 4617 return 0;
4519} 4618}
4520 4619
4521static inline int __resched_legal(int expected_preempt_count)
4522{
4523 if (unlikely(preempt_count() != expected_preempt_count))
4524 return 0;
4525 if (unlikely(system_state != SYSTEM_RUNNING))
4526 return 0;
4527 return 1;
4528}
4529
4530static void __cond_resched(void) 4620static void __cond_resched(void)
4531{ 4621{
4532#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4622#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -4546,7 +4636,8 @@ static void __cond_resched(void)
4546 4636
4547int __sched cond_resched(void) 4637int __sched cond_resched(void)
4548{ 4638{
4549 if (need_resched() && __resched_legal(0)) { 4639 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4640 system_state == SYSTEM_RUNNING) {
4550 __cond_resched(); 4641 __cond_resched();
4551 return 1; 4642 return 1;
4552 } 4643 }
@@ -4572,7 +4663,7 @@ int cond_resched_lock(spinlock_t *lock)
4572 ret = 1; 4663 ret = 1;
4573 spin_lock(lock); 4664 spin_lock(lock);
4574 } 4665 }
4575 if (need_resched() && __resched_legal(1)) { 4666 if (need_resched() && system_state == SYSTEM_RUNNING) {
4576 spin_release(&lock->dep_map, 1, _THIS_IP_); 4667 spin_release(&lock->dep_map, 1, _THIS_IP_);
4577 _raw_spin_unlock(lock); 4668 _raw_spin_unlock(lock);
4578 preempt_enable_no_resched(); 4669 preempt_enable_no_resched();
@@ -4588,7 +4679,7 @@ int __sched cond_resched_softirq(void)
4588{ 4679{
4589 BUG_ON(!in_softirq()); 4680 BUG_ON(!in_softirq());
4590 4681
4591 if (need_resched() && __resched_legal(0)) { 4682 if (need_resched() && system_state == SYSTEM_RUNNING) {
4592 raw_local_irq_disable(); 4683 raw_local_irq_disable();
4593 _local_bh_enable(); 4684 _local_bh_enable();
4594 raw_local_irq_enable(); 4685 raw_local_irq_enable();
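Note: __resched_legal() is removed and its checks are folded into the callers: cond_resched() now also refuses to reschedule while PREEMPT_ACTIVE is set, and the lock/softirq variants keep only the system_state check. A compact sketch of the combined test; the constant values below are illustrative, not the kernel's real layout:

    #include <stdio.h>

    /* Illustrative values only. */
    #define PREEMPT_ACTIVE   0x10000000u
    #define SYSTEM_RUNNING   1

    static int system_state = SYSTEM_RUNNING;

    static int resched_allowed(unsigned int preempt_count, int need_resched)
    {
        /* mirrors: need_resched() && !(preempt_count() & PREEMPT_ACTIVE)
         *          && system_state == SYSTEM_RUNNING */
        return need_resched &&
               !(preempt_count & PREEMPT_ACTIVE) &&
               system_state == SYSTEM_RUNNING;
    }

    int main(void)
    {
        printf("%d\n", resched_allowed(0, 1));               /* 1: ok */
        printf("%d\n", resched_allowed(PREEMPT_ACTIVE, 1));  /* 0: mid-preempt */
        printf("%d\n", resched_allowed(0, 0));               /* 0: nothing to do */
        return 0;
    }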
@@ -4804,18 +4895,18 @@ static void show_task(struct task_struct *p)
4804 show_stack(p, NULL); 4895 show_stack(p, NULL);
4805} 4896}
4806 4897
4807void show_state(void) 4898void show_state_filter(unsigned long state_filter)
4808{ 4899{
4809 struct task_struct *g, *p; 4900 struct task_struct *g, *p;
4810 4901
4811#if (BITS_PER_LONG == 32) 4902#if (BITS_PER_LONG == 32)
4812 printk("\n" 4903 printk("\n"
4813 " sibling\n"); 4904 " free sibling\n");
4814 printk(" task PC pid father child younger older\n"); 4905 printk(" task PC stack pid father child younger older\n");
4815#else 4906#else
4816 printk("\n" 4907 printk("\n"
4817 " sibling\n"); 4908 " free sibling\n");
4818 printk(" task PC pid father child younger older\n"); 4909 printk(" task PC stack pid father child younger older\n");
4819#endif 4910#endif
4820 read_lock(&tasklist_lock); 4911 read_lock(&tasklist_lock);
4821 do_each_thread(g, p) { 4912 do_each_thread(g, p) {
@@ -4824,11 +4915,16 @@ void show_state(void)
 4824 * console might take a lot of time: 4915
4825 */ 4916 */
4826 touch_nmi_watchdog(); 4917 touch_nmi_watchdog();
4827 show_task(p); 4918 if (p->state & state_filter)
4919 show_task(p);
4828 } while_each_thread(g, p); 4920 } while_each_thread(g, p);
4829 4921
4830 read_unlock(&tasklist_lock); 4922 read_unlock(&tasklist_lock);
4831 debug_show_all_locks(); 4923 /*
4924 * Only show locks if all tasks are dumped:
4925 */
4926 if (state_filter == -1)
4927 debug_show_all_locks();
4832} 4928}
4833 4929
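Note: show_state() gives way to show_state_filter(): callers pass a task-state mask, only tasks whose ->state intersects it are printed, and the held-lock dump is reserved for a full dump (state_filter == -1). The fragments below show how a caller might use it, assuming <linux/sched.h> context; the helper names are hypothetical, not taken from this diff:

    /* Hypothetical callers, for illustration only. */
    void dump_blocked_tasks(void)
    {
        /* only tasks in uninterruptible (D) sleep */
        show_state_filter(TASK_UNINTERRUPTIBLE);
    }

    void dump_everything(void)
    {
        /* any set task-state bit matches; -1 also enables the
         * debug_show_all_locks() dump at the end */
        show_state_filter(-1UL);
    }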
4834/** 4930/**
@@ -4973,8 +5069,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4973 * afterwards, and pretending it was a local activate. 5069 * afterwards, and pretending it was a local activate.
4974 * This way is cleaner and logically correct. 5070 * This way is cleaner and logically correct.
4975 */ 5071 */
4976 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 5072 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
4977 + rq_dest->timestamp_last_tick; 5073 + rq_dest->most_recent_timestamp;
4978 deactivate_task(p, rq_src); 5074 deactivate_task(p, rq_src);
4979 __activate_task(p, rq_dest); 5075 __activate_task(p, rq_dest);
4980 if (TASK_PREEMPTS_CURR(p, rq_dest)) 5076 if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5050,7 +5146,10 @@ wait_to_die:
5050} 5146}
5051 5147
5052#ifdef CONFIG_HOTPLUG_CPU 5148#ifdef CONFIG_HOTPLUG_CPU
5053/* Figure out where task on dead CPU should go, use force if necessary. */ 5149/*
 5150 * Figure out where task on dead CPU should go, use force if necessary.
5151 * NOTE: interrupts should be disabled by the caller
5152 */
5054static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5153static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5055{ 5154{
5056 unsigned long flags; 5155 unsigned long flags;
@@ -5170,6 +5269,7 @@ void idle_task_exit(void)
5170 mmdrop(mm); 5269 mmdrop(mm);
5171} 5270}
5172 5271
5272/* called under rq->lock with disabled interrupts */
5173static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5273static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5174{ 5274{
5175 struct rq *rq = cpu_rq(dead_cpu); 5275 struct rq *rq = cpu_rq(dead_cpu);
@@ -5186,10 +5286,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5186 * Drop lock around migration; if someone else moves it, 5286 * Drop lock around migration; if someone else moves it,
5187 * that's OK. No task can be added to this CPU, so iteration is 5287 * that's OK. No task can be added to this CPU, so iteration is
5188 * fine. 5288 * fine.
5289 * NOTE: interrupts should be left disabled --dev@
5189 */ 5290 */
5190 spin_unlock_irq(&rq->lock); 5291 spin_unlock(&rq->lock);
5191 move_task_off_dead_cpu(dead_cpu, p); 5292 move_task_off_dead_cpu(dead_cpu, p);
5192 spin_lock_irq(&rq->lock); 5293 spin_lock(&rq->lock);
5193 5294
5194 put_task_struct(p); 5295 put_task_struct(p);
5195} 5296}
@@ -5342,16 +5443,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5342 if (!(sd->flags & SD_LOAD_BALANCE)) { 5443 if (!(sd->flags & SD_LOAD_BALANCE)) {
5343 printk("does not load-balance\n"); 5444 printk("does not load-balance\n");
5344 if (sd->parent) 5445 if (sd->parent)
5345 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 5446 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5447 " has parent");
5346 break; 5448 break;
5347 } 5449 }
5348 5450
5349 printk("span %s\n", str); 5451 printk("span %s\n", str);
5350 5452
5351 if (!cpu_isset(cpu, sd->span)) 5453 if (!cpu_isset(cpu, sd->span))
5352 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 5454 printk(KERN_ERR "ERROR: domain->span does not contain "
5455 "CPU%d\n", cpu);
5353 if (!cpu_isset(cpu, group->cpumask)) 5456 if (!cpu_isset(cpu, group->cpumask))
5354 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 5457 printk(KERN_ERR "ERROR: domain->groups does not contain"
5458 " CPU%d\n", cpu);
5355 5459
5356 printk(KERN_DEBUG); 5460 printk(KERN_DEBUG);
5357 for (i = 0; i < level + 2; i++) 5461 for (i = 0; i < level + 2; i++)
@@ -5366,7 +5470,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5366 5470
5367 if (!group->cpu_power) { 5471 if (!group->cpu_power) {
5368 printk("\n"); 5472 printk("\n");
5369 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 5473 printk(KERN_ERR "ERROR: domain->cpu_power not "
5474 "set\n");
5370 } 5475 }
5371 5476
5372 if (!cpus_weight(group->cpumask)) { 5477 if (!cpus_weight(group->cpumask)) {
@@ -5389,15 +5494,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5389 printk("\n"); 5494 printk("\n");
5390 5495
5391 if (!cpus_equal(sd->span, groupmask)) 5496 if (!cpus_equal(sd->span, groupmask))
5392 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5497 printk(KERN_ERR "ERROR: groups don't span "
5498 "domain->span\n");
5393 5499
5394 level++; 5500 level++;
5395 sd = sd->parent; 5501 sd = sd->parent;
5502 if (!sd)
5503 continue;
5396 5504
5397 if (sd) { 5505 if (!cpus_subset(groupmask, sd->span))
5398 if (!cpus_subset(groupmask, sd->span)) 5506 printk(KERN_ERR "ERROR: parent span is not a superset "
5399 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 5507 "of domain->span\n");
5400 }
5401 5508
5402 } while (sd); 5509 } while (sd);
5403} 5510}
@@ -5493,7 +5600,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5493} 5600}
5494 5601
5495/* cpus with isolated domains */ 5602/* cpus with isolated domains */
5496static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE; 5603static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5497 5604
5498/* Setup the mask of cpus configured for isolated domains */ 5605/* Setup the mask of cpus configured for isolated domains */
5499static int __init isolated_cpu_setup(char *str) 5606static int __init isolated_cpu_setup(char *str)
@@ -5511,28 +5618,27 @@ static int __init isolated_cpu_setup(char *str)
5511__setup ("isolcpus=", isolated_cpu_setup); 5618__setup ("isolcpus=", isolated_cpu_setup);
5512 5619
5513/* 5620/*
5514 * init_sched_build_groups takes an array of groups, the cpumask we wish 5621 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5515 to span, and a pointer to a function which identifies what group a CPU 5622 to a function which identifies what group (along with sched group) a CPU
5516 belongs to. The return value of group_fn must be a valid index into the 5623 belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5517 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 5624 * (due to the fact that we keep track of groups covered with a cpumask_t).
5518 * keep track of groups covered with a cpumask_t).
5519 * 5625 *
5520 * init_sched_build_groups will build a circular linked list of the groups 5626 * init_sched_build_groups will build a circular linked list of the groups
5521 * covered by the given span, and will set each group's ->cpumask correctly, 5627 * covered by the given span, and will set each group's ->cpumask correctly,
5522 * and ->cpu_power to 0. 5628 * and ->cpu_power to 0.
5523 */ 5629 */
5524static void 5630static void
5525init_sched_build_groups(struct sched_group groups[], cpumask_t span, 5631init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5526 const cpumask_t *cpu_map, 5632 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5527 int (*group_fn)(int cpu, const cpumask_t *cpu_map)) 5633 struct sched_group **sg))
5528{ 5634{
5529 struct sched_group *first = NULL, *last = NULL; 5635 struct sched_group *first = NULL, *last = NULL;
5530 cpumask_t covered = CPU_MASK_NONE; 5636 cpumask_t covered = CPU_MASK_NONE;
5531 int i; 5637 int i;
5532 5638
5533 for_each_cpu_mask(i, span) { 5639 for_each_cpu_mask(i, span) {
5534 int group = group_fn(i, cpu_map); 5640 struct sched_group *sg;
5535 struct sched_group *sg = &groups[group]; 5641 int group = group_fn(i, cpu_map, &sg);
5536 int j; 5642 int j;
5537 5643
5538 if (cpu_isset(i, covered)) 5644 if (cpu_isset(i, covered))
@@ -5542,7 +5648,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5542 sg->cpu_power = 0; 5648 sg->cpu_power = 0;
5543 5649
5544 for_each_cpu_mask(j, span) { 5650 for_each_cpu_mask(j, span) {
5545 if (group_fn(j, cpu_map) != group) 5651 if (group_fn(j, cpu_map, NULL) != group)
5546 continue; 5652 continue;
5547 5653
5548 cpu_set(j, covered); 5654 cpu_set(j, covered);
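Note: init_sched_build_groups() no longer receives a preallocated groups[] array; the group_fn callback now returns the group index and, when the caller passes a non-NULL sg, also hands back the per-CPU sched_group object to link into the list. A standalone sketch of a callback in the new shape, with made-up stand-ins for the kernel's per-CPU sched_group storage:

    #include <stdio.h>

    /* Illustrative only: NCPU and groups[] stand in for DEFINE_PER_CPU
     * sched_group objects. */
    #define NCPU 8

    struct sched_group_stub {
        int id;
    };

    static struct sched_group_stub groups[NCPU];

    /* New-style group_fn: returns the group index for 'cpu' and, if the
     * caller wants it, the group object itself via *sg. */
    static int cpu_to_example_group(int cpu, const unsigned long *cpu_map,
                                    struct sched_group_stub **sg)
    {
        int group = cpu / 2;    /* pretend CPUs pair up, e.g. SMT siblings */

        (void)cpu_map;          /* a real callback would mask against this */
        if (sg)
            *sg = &groups[group];
        return group;
    }

    int main(void)
    {
        struct sched_group_stub *sg;
        int g = cpu_to_example_group(5, NULL, &sg);

        printf("cpu 5 -> group %d (object %p)\n", g, (void *)sg);
        return 0;
    }

Passing sg == NULL, as the covering loop above does, turns the callback into a pure index lookup.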
@@ -5716,8 +5822,9 @@ __setup("max_cache_size=", setup_max_cache_size);
5716 */ 5822 */
5717static void touch_cache(void *__cache, unsigned long __size) 5823static void touch_cache(void *__cache, unsigned long __size)
5718{ 5824{
5719 unsigned long size = __size/sizeof(long), chunk1 = size/3, 5825 unsigned long size = __size / sizeof(long);
5720 chunk2 = 2*size/3; 5826 unsigned long chunk1 = size / 3;
5827 unsigned long chunk2 = 2 * size / 3;
5721 unsigned long *cache = __cache; 5828 unsigned long *cache = __cache;
5722 int i; 5829 int i;
5723 5830
@@ -5826,11 +5933,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5826 */ 5933 */
5827 measure_one(cache, size, cpu1, cpu2); 5934 measure_one(cache, size, cpu1, cpu2);
5828 for (i = 0; i < ITERATIONS; i++) 5935 for (i = 0; i < ITERATIONS; i++)
5829 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); 5936 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
5830 5937
5831 measure_one(cache, size, cpu2, cpu1); 5938 measure_one(cache, size, cpu2, cpu1);
5832 for (i = 0; i < ITERATIONS; i++) 5939 for (i = 0; i < ITERATIONS; i++)
5833 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); 5940 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
5834 5941
5835 /* 5942 /*
5836 * (We measure the non-migrating [cached] cost on both 5943 * (We measure the non-migrating [cached] cost on both
@@ -5840,17 +5947,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5840 5947
5841 measure_one(cache, size, cpu1, cpu1); 5948 measure_one(cache, size, cpu1, cpu1);
5842 for (i = 0; i < ITERATIONS; i++) 5949 for (i = 0; i < ITERATIONS; i++)
5843 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); 5950 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
5844 5951
5845 measure_one(cache, size, cpu2, cpu2); 5952 measure_one(cache, size, cpu2, cpu2);
5846 for (i = 0; i < ITERATIONS; i++) 5953 for (i = 0; i < ITERATIONS; i++)
5847 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); 5954 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
5848 5955
5849 /* 5956 /*
5850 * Get the per-iteration migration cost: 5957 * Get the per-iteration migration cost:
5851 */ 5958 */
5852 do_div(cost1, 2*ITERATIONS); 5959 do_div(cost1, 2 * ITERATIONS);
5853 do_div(cost2, 2*ITERATIONS); 5960 do_div(cost2, 2 * ITERATIONS);
5854 5961
5855 return cost1 - cost2; 5962 return cost1 - cost2;
5856} 5963}
@@ -5888,7 +5995,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5888 */ 5995 */
5889 cache = vmalloc(max_size); 5996 cache = vmalloc(max_size);
5890 if (!cache) { 5997 if (!cache) {
5891 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 5998 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
5892 return 1000000; /* return 1 msec on very small boxen */ 5999 return 1000000; /* return 1 msec on very small boxen */
5893 } 6000 }
5894 6001
@@ -5913,7 +6020,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5913 avg_fluct = (avg_fluct + fluct)/2; 6020 avg_fluct = (avg_fluct + fluct)/2;
5914 6021
5915 if (migration_debug) 6022 if (migration_debug)
5916 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", 6023 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6024 "(%8Ld %8Ld)\n",
5917 cpu1, cpu2, size, 6025 cpu1, cpu2, size,
5918 (long)cost / 1000000, 6026 (long)cost / 1000000,
5919 ((long)cost / 100000) % 10, 6027 ((long)cost / 100000) % 10,
@@ -6008,20 +6116,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
6008 -1 6116 -1
6009#endif 6117#endif
6010 ); 6118 );
6011 if (system_state == SYSTEM_BOOTING) { 6119 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6012 if (num_online_cpus() > 1) { 6120 printk("migration_cost=");
6013 printk("migration_cost="); 6121 for (distance = 0; distance <= max_distance; distance++) {
6014 for (distance = 0; distance <= max_distance; distance++) { 6122 if (distance)
6015 if (distance) 6123 printk(",");
6016 printk(","); 6124 printk("%ld", (long)migration_cost[distance] / 1000);
6017 printk("%ld", (long)migration_cost[distance] / 1000);
6018 }
6019 printk("\n");
6020 } 6125 }
6126 printk("\n");
6021 } 6127 }
6022 j1 = jiffies; 6128 j1 = jiffies;
6023 if (migration_debug) 6129 if (migration_debug)
6024 printk("migration: %ld seconds\n", (j1-j0)/HZ); 6130 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6025 6131
6026 /* 6132 /*
6027 * Move back to the original CPU. NUMA-Q gets confused 6133 * Move back to the original CPU. NUMA-Q gets confused
@@ -6118,10 +6224,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6118 */ 6224 */
6119#ifdef CONFIG_SCHED_SMT 6225#ifdef CONFIG_SCHED_SMT
6120static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6226static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6121static struct sched_group sched_group_cpus[NR_CPUS]; 6227static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6122 6228
6123static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) 6229static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
6230 struct sched_group **sg)
6124{ 6231{
6232 if (sg)
6233 *sg = &per_cpu(sched_group_cpus, cpu);
6125 return cpu; 6234 return cpu;
6126} 6235}
6127#endif 6236#endif
@@ -6131,39 +6240,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
6131 */ 6240 */
6132#ifdef CONFIG_SCHED_MC 6241#ifdef CONFIG_SCHED_MC
6133static DEFINE_PER_CPU(struct sched_domain, core_domains); 6242static DEFINE_PER_CPU(struct sched_domain, core_domains);
6134static struct sched_group sched_group_core[NR_CPUS]; 6243static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6135#endif 6244#endif
6136 6245
6137#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6246#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6138static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6247static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6248 struct sched_group **sg)
6139{ 6249{
6250 int group;
6140 cpumask_t mask = cpu_sibling_map[cpu]; 6251 cpumask_t mask = cpu_sibling_map[cpu];
6141 cpus_and(mask, mask, *cpu_map); 6252 cpus_and(mask, mask, *cpu_map);
6142 return first_cpu(mask); 6253 group = first_cpu(mask);
6254 if (sg)
6255 *sg = &per_cpu(sched_group_core, group);
6256 return group;
6143} 6257}
6144#elif defined(CONFIG_SCHED_MC) 6258#elif defined(CONFIG_SCHED_MC)
6145static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6259static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6260 struct sched_group **sg)
6146{ 6261{
6262 if (sg)
6263 *sg = &per_cpu(sched_group_core, cpu);
6147 return cpu; 6264 return cpu;
6148} 6265}
6149#endif 6266#endif
6150 6267
6151static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6268static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6152static struct sched_group sched_group_phys[NR_CPUS]; 6269static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6153 6270
6154static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) 6271static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
6272 struct sched_group **sg)
6155{ 6273{
6274 int group;
6156#ifdef CONFIG_SCHED_MC 6275#ifdef CONFIG_SCHED_MC
6157 cpumask_t mask = cpu_coregroup_map(cpu); 6276 cpumask_t mask = cpu_coregroup_map(cpu);
6158 cpus_and(mask, mask, *cpu_map); 6277 cpus_and(mask, mask, *cpu_map);
6159 return first_cpu(mask); 6278 group = first_cpu(mask);
6160#elif defined(CONFIG_SCHED_SMT) 6279#elif defined(CONFIG_SCHED_SMT)
6161 cpumask_t mask = cpu_sibling_map[cpu]; 6280 cpumask_t mask = cpu_sibling_map[cpu];
6162 cpus_and(mask, mask, *cpu_map); 6281 cpus_and(mask, mask, *cpu_map);
6163 return first_cpu(mask); 6282 group = first_cpu(mask);
6164#else 6283#else
6165 return cpu; 6284 group = cpu;
6166#endif 6285#endif
6286 if (sg)
6287 *sg = &per_cpu(sched_group_phys, group);
6288 return group;
6167} 6289}
6168 6290
6169#ifdef CONFIG_NUMA 6291#ifdef CONFIG_NUMA
@@ -6176,12 +6298,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
6176static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6298static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6177 6299
6178static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6300static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6179static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 6301static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6180 6302
6181static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) 6303static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6304 struct sched_group **sg)
6182{ 6305{
6183 return cpu_to_node(cpu); 6306 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6307 int group;
6308
6309 cpus_and(nodemask, nodemask, *cpu_map);
6310 group = first_cpu(nodemask);
6311
6312 if (sg)
6313 *sg = &per_cpu(sched_group_allnodes, group);
6314 return group;
6184} 6315}
6316
6185static void init_numa_sched_groups_power(struct sched_group *group_head) 6317static void init_numa_sched_groups_power(struct sched_group *group_head)
6186{ 6318{
6187 struct sched_group *sg = group_head; 6319 struct sched_group *sg = group_head;
@@ -6217,16 +6349,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6217 int cpu, i; 6349 int cpu, i;
6218 6350
6219 for_each_cpu_mask(cpu, *cpu_map) { 6351 for_each_cpu_mask(cpu, *cpu_map) {
6220 struct sched_group *sched_group_allnodes
6221 = sched_group_allnodes_bycpu[cpu];
6222 struct sched_group **sched_group_nodes 6352 struct sched_group **sched_group_nodes
6223 = sched_group_nodes_bycpu[cpu]; 6353 = sched_group_nodes_bycpu[cpu];
6224 6354
6225 if (sched_group_allnodes) {
6226 kfree(sched_group_allnodes);
6227 sched_group_allnodes_bycpu[cpu] = NULL;
6228 }
6229
6230 if (!sched_group_nodes) 6355 if (!sched_group_nodes)
6231 continue; 6356 continue;
6232 6357
@@ -6320,7 +6445,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6320 struct sched_domain *sd; 6445 struct sched_domain *sd;
6321#ifdef CONFIG_NUMA 6446#ifdef CONFIG_NUMA
6322 struct sched_group **sched_group_nodes = NULL; 6447 struct sched_group **sched_group_nodes = NULL;
6323 struct sched_group *sched_group_allnodes = NULL; 6448 int sd_allnodes = 0;
6324 6449
6325 /* 6450 /*
6326 * Allocate the per-node list of sched groups 6451 * Allocate the per-node list of sched groups
@@ -6338,7 +6463,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6338 * Set up domains for cpus specified by the cpu_map. 6463 * Set up domains for cpus specified by the cpu_map.
6339 */ 6464 */
6340 for_each_cpu_mask(i, *cpu_map) { 6465 for_each_cpu_mask(i, *cpu_map) {
6341 int group;
6342 struct sched_domain *sd = NULL, *p; 6466 struct sched_domain *sd = NULL, *p;
6343 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6467 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6344 6468
@@ -6347,26 +6471,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6347#ifdef CONFIG_NUMA 6471#ifdef CONFIG_NUMA
6348 if (cpus_weight(*cpu_map) 6472 if (cpus_weight(*cpu_map)
6349 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6473 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6350 if (!sched_group_allnodes) {
6351 sched_group_allnodes
6352 = kmalloc_node(sizeof(struct sched_group)
6353 * MAX_NUMNODES,
6354 GFP_KERNEL,
6355 cpu_to_node(i));
6356 if (!sched_group_allnodes) {
6357 printk(KERN_WARNING
6358 "Can not alloc allnodes sched group\n");
6359 goto error;
6360 }
6361 sched_group_allnodes_bycpu[i]
6362 = sched_group_allnodes;
6363 }
6364 sd = &per_cpu(allnodes_domains, i); 6474 sd = &per_cpu(allnodes_domains, i);
6365 *sd = SD_ALLNODES_INIT; 6475 *sd = SD_ALLNODES_INIT;
6366 sd->span = *cpu_map; 6476 sd->span = *cpu_map;
6367 group = cpu_to_allnodes_group(i, cpu_map); 6477 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6368 sd->groups = &sched_group_allnodes[group];
6369 p = sd; 6478 p = sd;
6479 sd_allnodes = 1;
6370 } else 6480 } else
6371 p = NULL; 6481 p = NULL;
6372 6482
@@ -6381,36 +6491,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6381 6491
6382 p = sd; 6492 p = sd;
6383 sd = &per_cpu(phys_domains, i); 6493 sd = &per_cpu(phys_domains, i);
6384 group = cpu_to_phys_group(i, cpu_map);
6385 *sd = SD_CPU_INIT; 6494 *sd = SD_CPU_INIT;
6386 sd->span = nodemask; 6495 sd->span = nodemask;
6387 sd->parent = p; 6496 sd->parent = p;
6388 if (p) 6497 if (p)
6389 p->child = sd; 6498 p->child = sd;
6390 sd->groups = &sched_group_phys[group]; 6499 cpu_to_phys_group(i, cpu_map, &sd->groups);
6391 6500
6392#ifdef CONFIG_SCHED_MC 6501#ifdef CONFIG_SCHED_MC
6393 p = sd; 6502 p = sd;
6394 sd = &per_cpu(core_domains, i); 6503 sd = &per_cpu(core_domains, i);
6395 group = cpu_to_core_group(i, cpu_map);
6396 *sd = SD_MC_INIT; 6504 *sd = SD_MC_INIT;
6397 sd->span = cpu_coregroup_map(i); 6505 sd->span = cpu_coregroup_map(i);
6398 cpus_and(sd->span, sd->span, *cpu_map); 6506 cpus_and(sd->span, sd->span, *cpu_map);
6399 sd->parent = p; 6507 sd->parent = p;
6400 p->child = sd; 6508 p->child = sd;
6401 sd->groups = &sched_group_core[group]; 6509 cpu_to_core_group(i, cpu_map, &sd->groups);
6402#endif 6510#endif
6403 6511
6404#ifdef CONFIG_SCHED_SMT 6512#ifdef CONFIG_SCHED_SMT
6405 p = sd; 6513 p = sd;
6406 sd = &per_cpu(cpu_domains, i); 6514 sd = &per_cpu(cpu_domains, i);
6407 group = cpu_to_cpu_group(i, cpu_map);
6408 *sd = SD_SIBLING_INIT; 6515 *sd = SD_SIBLING_INIT;
6409 sd->span = cpu_sibling_map[i]; 6516 sd->span = cpu_sibling_map[i];
6410 cpus_and(sd->span, sd->span, *cpu_map); 6517 cpus_and(sd->span, sd->span, *cpu_map);
6411 sd->parent = p; 6518 sd->parent = p;
6412 p->child = sd; 6519 p->child = sd;
6413 sd->groups = &sched_group_cpus[group]; 6520 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6414#endif 6521#endif
6415 } 6522 }
6416 6523
@@ -6422,8 +6529,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6422 if (i != first_cpu(this_sibling_map)) 6529 if (i != first_cpu(this_sibling_map))
6423 continue; 6530 continue;
6424 6531
6425 init_sched_build_groups(sched_group_cpus, this_sibling_map, 6532 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6426 cpu_map, &cpu_to_cpu_group);
6427 } 6533 }
6428#endif 6534#endif
6429 6535
@@ -6434,8 +6540,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6434 cpus_and(this_core_map, this_core_map, *cpu_map); 6540 cpus_and(this_core_map, this_core_map, *cpu_map);
6435 if (i != first_cpu(this_core_map)) 6541 if (i != first_cpu(this_core_map))
6436 continue; 6542 continue;
6437 init_sched_build_groups(sched_group_core, this_core_map, 6543 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6438 cpu_map, &cpu_to_core_group);
6439 } 6544 }
6440#endif 6545#endif
6441 6546
@@ -6448,15 +6553,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6448 if (cpus_empty(nodemask)) 6553 if (cpus_empty(nodemask))
6449 continue; 6554 continue;
6450 6555
6451 init_sched_build_groups(sched_group_phys, nodemask, 6556 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6452 cpu_map, &cpu_to_phys_group);
6453 } 6557 }
6454 6558
6455#ifdef CONFIG_NUMA 6559#ifdef CONFIG_NUMA
6456 /* Set up node groups */ 6560 /* Set up node groups */
6457 if (sched_group_allnodes) 6561 if (sd_allnodes)
6458 init_sched_build_groups(sched_group_allnodes, *cpu_map, 6562 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6459 cpu_map, &cpu_to_allnodes_group);
6460 6563
6461 for (i = 0; i < MAX_NUMNODES; i++) { 6564 for (i = 0; i < MAX_NUMNODES; i++) {
6462 /* Set up node groups */ 6565 /* Set up node groups */
@@ -6548,10 +6651,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6548 for (i = 0; i < MAX_NUMNODES; i++) 6651 for (i = 0; i < MAX_NUMNODES; i++)
6549 init_numa_sched_groups_power(sched_group_nodes[i]); 6652 init_numa_sched_groups_power(sched_group_nodes[i]);
6550 6653
6551 if (sched_group_allnodes) { 6654 if (sd_allnodes) {
6552 int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); 6655 struct sched_group *sg;
6553 struct sched_group *sg = &sched_group_allnodes[group];
6554 6656
6657 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6555 init_numa_sched_groups_power(sg); 6658 init_numa_sched_groups_power(sg);
6556 } 6659 }
6557#endif 6660#endif
@@ -6723,8 +6826,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6723 sched_smt_power_savings_store); 6826 sched_smt_power_savings_store);
6724#endif 6827#endif
6725 6828
6726
6727#ifdef CONFIG_HOTPLUG_CPU
6728/* 6829/*
6729 * Force a reinitialization of the sched domains hierarchy. The domains 6830 * Force a reinitialization of the sched domains hierarchy. The domains
6730 * and groups cannot be updated in place without racing with the balancing 6831 * and groups cannot be updated in place without racing with the balancing
@@ -6757,7 +6858,6 @@ static int update_sched_domains(struct notifier_block *nfb,
6757 6858
6758 return NOTIFY_OK; 6859 return NOTIFY_OK;
6759} 6860}
6760#endif
6761 6861
6762void __init sched_init_smp(void) 6862void __init sched_init_smp(void)
6763{ 6863{
@@ -6765,7 +6865,7 @@ void __init sched_init_smp(void)
6765 6865
6766 lock_cpu_hotplug(); 6866 lock_cpu_hotplug();
6767 arch_init_sched_domains(&cpu_online_map); 6867 arch_init_sched_domains(&cpu_online_map);
6768 cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map); 6868 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6769 if (cpus_empty(non_isolated_cpus)) 6869 if (cpus_empty(non_isolated_cpus))
6770 cpu_set(smp_processor_id(), non_isolated_cpus); 6870 cpu_set(smp_processor_id(), non_isolated_cpus);
6771 unlock_cpu_hotplug(); 6871 unlock_cpu_hotplug();
@@ -6833,6 +6933,10 @@ void __init sched_init(void)
6833 6933
6834 set_load_weight(&init_task); 6934 set_load_weight(&init_task);
6835 6935
6936#ifdef CONFIG_SMP
6937 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6938#endif
6939
6836#ifdef CONFIG_RT_MUTEXES 6940#ifdef CONFIG_RT_MUTEXES
6837 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6941 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6838#endif 6942#endif
@@ -6867,6 +6971,9 @@ void __might_sleep(char *file, int line)
6867 " context at %s:%d\n", file, line); 6971 " context at %s:%d\n", file, line);
6868 printk("in_atomic():%d, irqs_disabled():%d\n", 6972 printk("in_atomic():%d, irqs_disabled():%d\n",
6869 in_atomic(), irqs_disabled()); 6973 in_atomic(), irqs_disabled());
6974 debug_show_held_locks(current);
6975 if (irqs_disabled())
6976 print_irqtrace_events(current);
6870 dump_stack(); 6977 dump_stack();
6871 } 6978 }
6872#endif 6979#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index df18c167a2a7..5630255d2e2a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,10 @@
23#include <linux/ptrace.h> 23#include <linux/ptrace.h>
24#include <linux/signal.h> 24#include <linux/signal.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/freezer.h>
27#include <linux/pid_namespace.h>
28#include <linux/nsproxy.h>
29
26#include <asm/param.h> 30#include <asm/param.h>
27#include <asm/uaccess.h> 31#include <asm/uaccess.h>
28#include <asm/unistd.h> 32#include <asm/unistd.h>
@@ -33,7 +37,7 @@
33 * SLAB caches for signal bits. 37 * SLAB caches for signal bits.
34 */ 38 */
35 39
36static kmem_cache_t *sigqueue_cachep; 40static struct kmem_cache *sigqueue_cachep;
37 41
38/* 42/*
39 * In POSIX a signal is sent either to a specific thread (Linux task) 43 * In POSIX a signal is sent either to a specific thread (Linux task)
@@ -582,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
582 error = -EPERM; 586 error = -EPERM;
583 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 587 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
584 && ((sig != SIGCONT) || 588 && ((sig != SIGCONT) ||
585 (current->signal->session != t->signal->session)) 589 (process_session(current) != process_session(t)))
586 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 590 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
587 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 591 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
588 && !capable(CAP_KILL)) 592 && !capable(CAP_KILL))
@@ -1133,8 +1137,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1133 return error; 1137 return error;
1134} 1138}
1135 1139
1136int 1140static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1137kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1138{ 1141{
1139 int error; 1142 int error;
1140 rcu_read_lock(); 1143 rcu_read_lock();
@@ -1702,7 +1705,9 @@ finish_stop(int stop_count)
1702 read_unlock(&tasklist_lock); 1705 read_unlock(&tasklist_lock);
1703 } 1706 }
1704 1707
1705 schedule(); 1708 do {
1709 schedule();
1710 } while (try_to_freeze());
1706 /* 1711 /*
1707 * Now we don't run again until continued. 1712 * Now we don't run again until continued.
1708 */ 1713 */
@@ -1877,8 +1882,12 @@ relock:
1877 if (sig_kernel_ignore(signr)) /* Default is nothing. */ 1882 if (sig_kernel_ignore(signr)) /* Default is nothing. */
1878 continue; 1883 continue;
1879 1884
1880 /* Init gets no signals it doesn't want. */ 1885 /*
1881 if (current == child_reaper) 1886 * Init of a pid space gets no signals it doesn't want from
1887 * within that pid space. It can of course get signals from
1888 * its parent pid space.
1889 */
1890 if (current == child_reaper(current))
1882 continue; 1891 continue;
1883 1892
1884 if (sig_kernel_stop(signr)) { 1893 if (sig_kernel_stop(signr)) {
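Note: the signal.c changes (and the matching sys.c hunks below) replace direct reads of ->signal->session with a process_session() accessor and the global child_reaper with child_reaper(current), groundwork for PID namespaces where both "init" and "session" are only meaningful relative to the caller's namespace. The accessor itself is not part of this diff; presumably it is a one-line inline along these lines, with the exact field name being an assumption:

    /* Sketch only; the real helper lives in a header and the field name
     * inside signal_struct is assumed, not shown in this diff. */
    static inline pid_t process_session(struct task_struct *tsk)
    {
        return tsk->signal->__session;
    }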
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce16..918e52df090e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
574 574
575 switch (action) { 575 switch (action) {
576 case CPU_UP_PREPARE: 576 case CPU_UP_PREPARE:
577 BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
578 BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
579 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 577 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
580 if (IS_ERR(p)) { 578 if (IS_ERR(p)) {
581 printk("ksoftirqd for %i failed\n", hotcpu); 579 printk("ksoftirqd for %i failed\n", hotcpu);
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801b..c7675c1bfdf2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
880 return 0; 880 return 0;
881} 881}
882 882
883static void deferred_cad(void *dummy) 883static void deferred_cad(struct work_struct *dummy)
884{ 884{
885 kernel_restart(NULL); 885 kernel_restart(NULL);
886} 886}
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
892 */ 892 */
893void ctrl_alt_del(void) 893void ctrl_alt_del(void)
894{ 894{
895 static DECLARE_WORK(cad_work, deferred_cad, NULL); 895 static DECLARE_WORK(cad_work, deferred_cad);
896 896
897 if (C_A_D) 897 if (C_A_D)
898 schedule_work(&cad_work); 898 schedule_work(&cad_work);
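Note: deferred_cad() is adjusted for the tree-wide workqueue API change: work functions now take the struct work_struct pointer itself, and DECLARE_WORK() loses its data argument. When the work item is embedded in a larger object, the enclosing structure is recovered with container_of(); a hedged sketch of that new-style usage (the device structure and names are made up for illustration):

    #include <linux/kernel.h>
    #include <linux/workqueue.h>

    /* Illustrative only: a made-up object embedding its work item. */
    struct frob_device {
        int id;
        struct work_struct work;
    };

    static void frob_work_fn(struct work_struct *work)
    {
        /* With the old API the object arrived as the void *data
         * argument; now it is recovered from the work item. */
        struct frob_device *dev =
                container_of(work, struct frob_device, work);

        printk(KERN_INFO "frobbing device %d\n", dev->id);
    }

    static void frob_later(struct frob_device *dev)
    {
        INIT_WORK(&dev->work, frob_work_fn);
        schedule_work(&dev->work);
    }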
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1102asmlinkage long sys_setuid(uid_t uid) 1102asmlinkage long sys_setuid(uid_t uid)
1103{ 1103{
1104 int old_euid = current->euid; 1104 int old_euid = current->euid;
1105 int old_ruid, old_suid, new_ruid, new_suid; 1105 int old_ruid, old_suid, new_suid;
1106 int retval; 1106 int retval;
1107 1107
1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
1109 if (retval) 1109 if (retval)
1110 return retval; 1110 return retval;
1111 1111
1112 old_ruid = new_ruid = current->uid; 1112 old_ruid = current->uid;
1113 old_suid = current->suid; 1113 old_suid = current->suid;
1114 new_suid = old_suid; 1114 new_suid = old_suid;
1115 1115
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1381 1381
1382 if (p->real_parent == group_leader) { 1382 if (p->real_parent == group_leader) {
1383 err = -EPERM; 1383 err = -EPERM;
1384 if (p->signal->session != group_leader->signal->session) 1384 if (process_session(p) != process_session(group_leader))
1385 goto out; 1385 goto out;
1386 err = -EACCES; 1386 err = -EACCES;
1387 if (p->did_exec) 1387 if (p->did_exec)
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1397 goto out; 1397 goto out;
1398 1398
1399 if (pgid != pid) { 1399 if (pgid != pid) {
1400 struct task_struct *p; 1400 struct task_struct *g =
1401 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1401 1402
1402 do_each_task_pid(pgid, PIDTYPE_PGID, p) { 1403 if (!g || process_session(g) != process_session(group_leader))
1403 if (p->signal->session == group_leader->signal->session) 1404 goto out;
1404 goto ok_pgid;
1405 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
1406 goto out;
1407 } 1405 }
1408 1406
1409ok_pgid:
1410 err = security_task_setpgid(p, pgid); 1407 err = security_task_setpgid(p, pgid);
1411 if (err) 1408 if (err)
1412 goto out; 1409 goto out;
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void)
1459asmlinkage long sys_getsid(pid_t pid) 1456asmlinkage long sys_getsid(pid_t pid)
1460{ 1457{
1461 if (!pid) 1458 if (!pid)
1462 return current->signal->session; 1459 return process_session(current);
1463 else { 1460 else {
1464 int retval; 1461 int retval;
1465 struct task_struct *p; 1462 struct task_struct *p;
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid)
1471 if (p) { 1468 if (p) {
1472 retval = security_task_getsid(p); 1469 retval = security_task_getsid(p);
1473 if (!retval) 1470 if (!retval)
1474 retval = p->signal->session; 1471 retval = process_session(p);
1475 } 1472 }
1476 read_unlock(&tasklist_lock); 1473 read_unlock(&tasklist_lock);
1477 return retval; 1474 return retval;
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void)
1484 pid_t session; 1481 pid_t session;
1485 int err = -EPERM; 1482 int err = -EPERM;
1486 1483
1487 mutex_lock(&tty_mutex);
1488 write_lock_irq(&tasklist_lock); 1484 write_lock_irq(&tasklist_lock);
1489 1485
1490 /* Fail if I am already a session leader */ 1486 /* Fail if I am already a session leader */
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void)
1504 1500
1505 group_leader->signal->leader = 1; 1501 group_leader->signal->leader = 1;
1506 __set_special_pids(session, session); 1502 __set_special_pids(session, session);
1503
1504 spin_lock(&group_leader->sighand->siglock);
1507 group_leader->signal->tty = NULL; 1505 group_leader->signal->tty = NULL;
1508 group_leader->signal->tty_old_pgrp = 0; 1506 group_leader->signal->tty_old_pgrp = 0;
1507 spin_unlock(&group_leader->sighand->siglock);
1508
1509 err = process_group(group_leader); 1509 err = process_group(group_leader);
1510out: 1510out:
1511 write_unlock_irq(&tasklist_lock); 1511 write_unlock_irq(&tasklist_lock);
1512 mutex_unlock(&tty_mutex);
1513 return err; 1512 return err;
1514} 1513}
1515 1514
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 09e569f4792b..600b33358ded 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
54 54
55#ifdef CONFIG_X86 55#ifdef CONFIG_X86
56#include <asm/nmi.h> 56#include <asm/nmi.h>
57#include <asm/stacktrace.h>
57#endif 58#endif
58 59
59#if defined(CONFIG_SYSCTL) 60#if defined(CONFIG_SYSCTL)
@@ -64,7 +65,6 @@ extern int sysctl_overcommit_memory;
64extern int sysctl_overcommit_ratio; 65extern int sysctl_overcommit_ratio;
65extern int sysctl_panic_on_oom; 66extern int sysctl_panic_on_oom;
66extern int max_threads; 67extern int max_threads;
67extern int sysrq_enabled;
68extern int core_uses_pid; 68extern int core_uses_pid;
69extern int suid_dumpable; 69extern int suid_dumpable;
70extern char core_pattern[]; 70extern char core_pattern[];
@@ -91,7 +91,9 @@ extern char modprobe_path[];
91extern int sg_big_buff; 91extern int sg_big_buff;
92#endif 92#endif
93#ifdef CONFIG_SYSVIPC 93#ifdef CONFIG_SYSVIPC
94static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 94static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos);
96static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos); 97 void __user *buffer, size_t *lenp, loff_t *ppos);
96#endif 98#endif
97 99
@@ -130,12 +132,22 @@ extern int max_lock_depth;
130 132
131#ifdef CONFIG_SYSCTL_SYSCALL 133#ifdef CONFIG_SYSCTL_SYSCALL
132static int parse_table(int __user *, int, void __user *, size_t __user *, 134static int parse_table(int __user *, int, void __user *, size_t __user *,
133 void __user *, size_t, ctl_table *, void **); 135 void __user *, size_t, ctl_table *);
134#endif 136#endif
135 137
136static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 138static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
137 void __user *buffer, size_t *lenp, loff_t *ppos); 139 void __user *buffer, size_t *lenp, loff_t *ppos);
138 140
141static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
142 void __user *oldval, size_t __user *oldlenp,
143 void __user *newval, size_t newlen);
144
145#ifdef CONFIG_SYSVIPC
146static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
147 void __user *oldval, size_t __user *oldlenp,
148 void __user *newval, size_t newlen);
149#endif
150
139#ifdef CONFIG_PROC_SYSCTL 151#ifdef CONFIG_PROC_SYSCTL
140static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 152static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
141 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -162,6 +174,40 @@ extern ctl_table inotify_table[];
162int sysctl_legacy_va_layout; 174int sysctl_legacy_va_layout;
163#endif 175#endif
164 176
177static void *get_uts(ctl_table *table, int write)
178{
179 char *which = table->data;
180#ifdef CONFIG_UTS_NS
181 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
182 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
183#endif
184 if (!write)
185 down_read(&uts_sem);
186 else
187 down_write(&uts_sem);
188 return which;
189}
190
191static void put_uts(ctl_table *table, int write, void *which)
192{
193 if (!write)
194 up_read(&uts_sem);
195 else
196 up_write(&uts_sem);
197}
198
199#ifdef CONFIG_SYSVIPC
200static void *get_ipc(ctl_table *table, int write)
201{
202 char *which = table->data;
203 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
204 which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
205 return which;
206}
207#else
208#define get_ipc(T,W) ((T)->data)
209#endif
210
165/* /proc declarations: */ 211/* /proc declarations: */
166 212
167#ifdef CONFIG_PROC_SYSCTL 213#ifdef CONFIG_PROC_SYSCTL
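The get_uts() and get_ipc() helpers added above take a pointer that was computed against the initial namespace (the ctl_table's .data points into init_uts_ns or init_ipc_ns) and re-base it onto the current task's namespace by carrying the byte offset across. A standalone sketch of that offset-translation trick, using an invented struct layout rather than the kernel's namespace types:

#include <stdio.h>

/* Hypothetical namespace layout; not the kernel's struct uts_namespace. */
struct ns {
	char sysname[16];
	char nodename[16];
	char release[16];
};

static struct ns init_ns = { "Linux", "inithost", "2.6.20" };

/*
 * 'field' points somewhere inside init_ns.  Return the address of the
 * corresponding field inside 'cur' by re-applying the same byte offset,
 * just as get_uts()/get_ipc() do with table->data.
 */
static void *translate(void *field, struct ns *cur)
{
	return (char *)cur + ((char *)field - (char *)&init_ns);
}

int main(void)
{
	struct ns container_ns = { "Linux", "container", "2.6.20-vs" };
	char *nodename = translate(init_ns.nodename, &container_ns);

	printf("init: %s, translated: %s\n", init_ns.nodename, nodename);
	return 0;
}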
@@ -170,7 +216,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
170static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 216static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
171static int proc_opensys(struct inode *, struct file *); 217static int proc_opensys(struct inode *, struct file *);
172 218
173struct file_operations proc_sys_file_operations = { 219const struct file_operations proc_sys_file_operations = {
174 .open = proc_opensys, 220 .open = proc_opensys,
175 .read = proc_readsys, 221 .read = proc_readsys,
176 .write = proc_writesys, 222 .write = proc_writesys,
@@ -228,7 +274,6 @@ static ctl_table root_table[] = {
228}; 274};
229 275
230static ctl_table kern_table[] = { 276static ctl_table kern_table[] = {
231#ifndef CONFIG_UTS_NS
232 { 277 {
233 .ctl_name = KERN_OSTYPE, 278 .ctl_name = KERN_OSTYPE,
234 .procname = "ostype", 279 .procname = "ostype",
@@ -236,7 +281,7 @@ static ctl_table kern_table[] = {
236 .maxlen = sizeof(init_uts_ns.name.sysname), 281 .maxlen = sizeof(init_uts_ns.name.sysname),
237 .mode = 0444, 282 .mode = 0444,
238 .proc_handler = &proc_do_uts_string, 283 .proc_handler = &proc_do_uts_string,
239 .strategy = &sysctl_string, 284 .strategy = &sysctl_uts_string,
240 }, 285 },
241 { 286 {
242 .ctl_name = KERN_OSRELEASE, 287 .ctl_name = KERN_OSRELEASE,
@@ -245,7 +290,7 @@ static ctl_table kern_table[] = {
245 .maxlen = sizeof(init_uts_ns.name.release), 290 .maxlen = sizeof(init_uts_ns.name.release),
246 .mode = 0444, 291 .mode = 0444,
247 .proc_handler = &proc_do_uts_string, 292 .proc_handler = &proc_do_uts_string,
248 .strategy = &sysctl_string, 293 .strategy = &sysctl_uts_string,
249 }, 294 },
250 { 295 {
251 .ctl_name = KERN_VERSION, 296 .ctl_name = KERN_VERSION,
@@ -254,7 +299,7 @@ static ctl_table kern_table[] = {
254 .maxlen = sizeof(init_uts_ns.name.version), 299 .maxlen = sizeof(init_uts_ns.name.version),
255 .mode = 0444, 300 .mode = 0444,
256 .proc_handler = &proc_do_uts_string, 301 .proc_handler = &proc_do_uts_string,
257 .strategy = &sysctl_string, 302 .strategy = &sysctl_uts_string,
258 }, 303 },
259 { 304 {
260 .ctl_name = KERN_NODENAME, 305 .ctl_name = KERN_NODENAME,
@@ -263,7 +308,7 @@ static ctl_table kern_table[] = {
263 .maxlen = sizeof(init_uts_ns.name.nodename), 308 .maxlen = sizeof(init_uts_ns.name.nodename),
264 .mode = 0644, 309 .mode = 0644,
265 .proc_handler = &proc_do_uts_string, 310 .proc_handler = &proc_do_uts_string,
266 .strategy = &sysctl_string, 311 .strategy = &sysctl_uts_string,
267 }, 312 },
268 { 313 {
269 .ctl_name = KERN_DOMAINNAME, 314 .ctl_name = KERN_DOMAINNAME,
@@ -272,57 +317,9 @@ static ctl_table kern_table[] = {
272 .maxlen = sizeof(init_uts_ns.name.domainname), 317 .maxlen = sizeof(init_uts_ns.name.domainname),
273 .mode = 0644, 318 .mode = 0644,
274 .proc_handler = &proc_do_uts_string, 319 .proc_handler = &proc_do_uts_string,
275 .strategy = &sysctl_string, 320 .strategy = &sysctl_uts_string,
276 },
277#else /* !CONFIG_UTS_NS */
278 {
279 .ctl_name = KERN_OSTYPE,
280 .procname = "ostype",
281 .data = NULL,
282 /* could maybe use __NEW_UTS_LEN here? */
283 .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
284 .mode = 0444,
285 .proc_handler = &proc_do_uts_string,
286 .strategy = &sysctl_string,
287 }, 321 },
288 { 322 {
289 .ctl_name = KERN_OSRELEASE,
290 .procname = "osrelease",
291 .data = NULL,
292 .maxlen = FIELD_SIZEOF(struct new_utsname, release),
293 .mode = 0444,
294 .proc_handler = &proc_do_uts_string,
295 .strategy = &sysctl_string,
296 },
297 {
298 .ctl_name = KERN_VERSION,
299 .procname = "version",
300 .data = NULL,
301 .maxlen = FIELD_SIZEOF(struct new_utsname, version),
302 .mode = 0444,
303 .proc_handler = &proc_do_uts_string,
304 .strategy = &sysctl_string,
305 },
306 {
307 .ctl_name = KERN_NODENAME,
308 .procname = "hostname",
309 .data = NULL,
310 .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
311 .mode = 0644,
312 .proc_handler = &proc_do_uts_string,
313 .strategy = &sysctl_string,
314 },
315 {
316 .ctl_name = KERN_DOMAINNAME,
317 .procname = "domainname",
318 .data = NULL,
319 .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
320 .mode = 0644,
321 .proc_handler = &proc_do_uts_string,
322 .strategy = &sysctl_string,
323 },
324#endif /* !CONFIG_UTS_NS */
325 {
326 .ctl_name = KERN_PANIC, 323 .ctl_name = KERN_PANIC,
327 .procname = "panic", 324 .procname = "panic",
328 .data = &panic_timeout, 325 .data = &panic_timeout,
@@ -480,65 +477,72 @@ static ctl_table kern_table[] = {
480 { 477 {
481 .ctl_name = KERN_SHMMAX, 478 .ctl_name = KERN_SHMMAX,
482 .procname = "shmmax", 479 .procname = "shmmax",
483 .data = NULL, 480 .data = &init_ipc_ns.shm_ctlmax,
484 .maxlen = sizeof (size_t), 481 .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
485 .mode = 0644, 482 .mode = 0644,
486 .proc_handler = &proc_do_ipc_string, 483 .proc_handler = &proc_ipc_doulongvec_minmax,
484 .strategy = sysctl_ipc_data,
487 }, 485 },
488 { 486 {
489 .ctl_name = KERN_SHMALL, 487 .ctl_name = KERN_SHMALL,
490 .procname = "shmall", 488 .procname = "shmall",
491 .data = NULL, 489 .data = &init_ipc_ns.shm_ctlall,
492 .maxlen = sizeof (size_t), 490 .maxlen = sizeof (init_ipc_ns.shm_ctlall),
493 .mode = 0644, 491 .mode = 0644,
494 .proc_handler = &proc_do_ipc_string, 492 .proc_handler = &proc_ipc_doulongvec_minmax,
493 .strategy = sysctl_ipc_data,
495 }, 494 },
496 { 495 {
497 .ctl_name = KERN_SHMMNI, 496 .ctl_name = KERN_SHMMNI,
498 .procname = "shmmni", 497 .procname = "shmmni",
499 .data = NULL, 498 .data = &init_ipc_ns.shm_ctlmni,
500 .maxlen = sizeof (int), 499 .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
501 .mode = 0644, 500 .mode = 0644,
502 .proc_handler = &proc_do_ipc_string, 501 .proc_handler = &proc_ipc_dointvec,
502 .strategy = sysctl_ipc_data,
503 }, 503 },
504 { 504 {
505 .ctl_name = KERN_MSGMAX, 505 .ctl_name = KERN_MSGMAX,
506 .procname = "msgmax", 506 .procname = "msgmax",
507 .data = NULL, 507 .data = &init_ipc_ns.msg_ctlmax,
508 .maxlen = sizeof (int), 508 .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
509 .mode = 0644, 509 .mode = 0644,
510 .proc_handler = &proc_do_ipc_string, 510 .proc_handler = &proc_ipc_dointvec,
511 .strategy = sysctl_ipc_data,
511 }, 512 },
512 { 513 {
513 .ctl_name = KERN_MSGMNI, 514 .ctl_name = KERN_MSGMNI,
514 .procname = "msgmni", 515 .procname = "msgmni",
515 .data = NULL, 516 .data = &init_ipc_ns.msg_ctlmni,
516 .maxlen = sizeof (int), 517 .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
517 .mode = 0644, 518 .mode = 0644,
518 .proc_handler = &proc_do_ipc_string, 519 .proc_handler = &proc_ipc_dointvec,
520 .strategy = sysctl_ipc_data,
519 }, 521 },
520 { 522 {
521 .ctl_name = KERN_MSGMNB, 523 .ctl_name = KERN_MSGMNB,
522 .procname = "msgmnb", 524 .procname = "msgmnb",
523 .data = NULL, 525 .data = &init_ipc_ns.msg_ctlmnb,
524 .maxlen = sizeof (int), 526 .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
525 .mode = 0644, 527 .mode = 0644,
526 .proc_handler = &proc_do_ipc_string, 528 .proc_handler = &proc_ipc_dointvec,
529 .strategy = sysctl_ipc_data,
527 }, 530 },
528 { 531 {
529 .ctl_name = KERN_SEM, 532 .ctl_name = KERN_SEM,
530 .procname = "sem", 533 .procname = "sem",
531 .data = NULL, 534 .data = &init_ipc_ns.sem_ctls,
532 .maxlen = 4*sizeof (int), 535 .maxlen = 4*sizeof (int),
533 .mode = 0644, 536 .mode = 0644,
534 .proc_handler = &proc_do_ipc_string, 537 .proc_handler = &proc_ipc_dointvec,
538 .strategy = sysctl_ipc_data,
535 }, 539 },
536#endif 540#endif
537#ifdef CONFIG_MAGIC_SYSRQ 541#ifdef CONFIG_MAGIC_SYSRQ
538 { 542 {
539 .ctl_name = KERN_SYSRQ, 543 .ctl_name = KERN_SYSRQ,
540 .procname = "sysrq", 544 .procname = "sysrq",
541 .data = &sysrq_enabled, 545 .data = &__sysrq_enabled,
542 .maxlen = sizeof (int), 546 .maxlen = sizeof (int),
543 .mode = 0644, 547 .mode = 0644,
544 .proc_handler = &proc_dointvec, 548 .proc_handler = &proc_dointvec,
@@ -707,6 +711,14 @@ static ctl_table kern_table[] = {
707 .mode = 0444, 711 .mode = 0444,
708 .proc_handler = &proc_dointvec, 712 .proc_handler = &proc_dointvec,
709 }, 713 },
714 {
715 .ctl_name = CTL_UNNUMBERED,
716 .procname = "kstack_depth_to_print",
717 .data = &kstack_depth_to_print,
718 .maxlen = sizeof(int),
719 .mode = 0644,
720 .proc_handler = &proc_dointvec,
721 },
710#endif 722#endif
711#if defined(CONFIG_MMU) 723#if defined(CONFIG_MMU)
712 { 724 {
@@ -977,17 +989,6 @@ static ctl_table vm_table[] = {
977 .extra1 = &zero, 989 .extra1 = &zero,
978 }, 990 },
979#endif 991#endif
980#ifdef CONFIG_SWAP
981 {
982 .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
983 .procname = "swap_token_timeout",
984 .data = &swap_token_default_timeout,
985 .maxlen = sizeof(swap_token_default_timeout),
986 .mode = 0644,
987 .proc_handler = &proc_dointvec_jiffies,
988 .strategy = &sysctl_jiffies,
989 },
990#endif
991#ifdef CONFIG_NUMA 992#ifdef CONFIG_NUMA
992 { 993 {
993 .ctl_name = VM_ZONE_RECLAIM_MODE, 994 .ctl_name = VM_ZONE_RECLAIM_MODE,
@@ -1241,7 +1242,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1241 do { 1242 do {
1242 struct ctl_table_header *head = 1243 struct ctl_table_header *head =
1243 list_entry(tmp, struct ctl_table_header, ctl_entry); 1244 list_entry(tmp, struct ctl_table_header, ctl_entry);
1244 void *context = NULL;
1245 1245
1246 if (!use_table(head)) 1246 if (!use_table(head))
1247 continue; 1247 continue;
@@ -1249,9 +1249,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1249 spin_unlock(&sysctl_lock); 1249 spin_unlock(&sysctl_lock);
1250 1250
1251 error = parse_table(name, nlen, oldval, oldlenp, 1251 error = parse_table(name, nlen, oldval, oldlenp,
1252 newval, newlen, head->ctl_table, 1252 newval, newlen, head->ctl_table);
1253 &context);
1254 kfree(context);
1255 1253
1256 spin_lock(&sysctl_lock); 1254 spin_lock(&sysctl_lock);
1257 unuse_table(head); 1255 unuse_table(head);
@@ -1307,7 +1305,7 @@ static inline int ctl_perm(ctl_table *table, int op)
1307static int parse_table(int __user *name, int nlen, 1305static int parse_table(int __user *name, int nlen,
1308 void __user *oldval, size_t __user *oldlenp, 1306 void __user *oldval, size_t __user *oldlenp,
1309 void __user *newval, size_t newlen, 1307 void __user *newval, size_t newlen,
1310 ctl_table *table, void **context) 1308 ctl_table *table)
1311{ 1309{
1312 int n; 1310 int n;
1313repeat: 1311repeat:
@@ -1327,7 +1325,7 @@ repeat:
1327 error = table->strategy( 1325 error = table->strategy(
1328 table, name, nlen, 1326 table, name, nlen,
1329 oldval, oldlenp, 1327 oldval, oldlenp,
1330 newval, newlen, context); 1328 newval, newlen);
1331 if (error) 1329 if (error)
1332 return error; 1330 return error;
1333 } 1331 }
@@ -1338,7 +1336,7 @@ repeat:
1338 } 1336 }
1339 error = do_sysctl_strategy(table, name, nlen, 1337 error = do_sysctl_strategy(table, name, nlen,
1340 oldval, oldlenp, 1338 oldval, oldlenp,
1341 newval, newlen, context); 1339 newval, newlen);
1342 return error; 1340 return error;
1343 } 1341 }
1344 } 1342 }
@@ -1349,7 +1347,7 @@ repeat:
1349int do_sysctl_strategy (ctl_table *table, 1347int do_sysctl_strategy (ctl_table *table,
1350 int __user *name, int nlen, 1348 int __user *name, int nlen,
1351 void __user *oldval, size_t __user *oldlenp, 1349 void __user *oldval, size_t __user *oldlenp,
1352 void __user *newval, size_t newlen, void **context) 1350 void __user *newval, size_t newlen)
1353{ 1351{
1354 int op = 0, rc; 1352 int op = 0, rc;
1355 size_t len; 1353 size_t len;
@@ -1363,7 +1361,7 @@ int do_sysctl_strategy (ctl_table *table,
1363 1361
1364 if (table->strategy) { 1362 if (table->strategy) {
1365 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1363 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1366 newval, newlen, context); 1364 newval, newlen);
1367 if (rc < 0) 1365 if (rc < 0)
1368 return rc; 1366 return rc;
1369 if (rc > 0) 1367 if (rc > 0)
@@ -1616,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1616 size_t count, loff_t *ppos) 1614 size_t count, loff_t *ppos)
1617{ 1615{
1618 int op; 1616 int op;
1619 struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); 1617 struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
1620 struct ctl_table *table; 1618 struct ctl_table *table;
1621 size_t res; 1619 size_t res;
1622 ssize_t error = -ENOTDIR; 1620 ssize_t error = -ENOTDIR;
@@ -1755,66 +1753,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1755 * Special case of dostring for the UTS structure. This has locks 1753 * Special case of dostring for the UTS structure. This has locks
1756 * to observe. Should this be in kernel/sys.c ???? 1754 * to observe. Should this be in kernel/sys.c ????
1757 */ 1755 */
1758
1759#ifndef CONFIG_UTS_NS
1760static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1761 void __user *buffer, size_t *lenp, loff_t *ppos)
1762{
1763 int r;
1764 1756
1765 if (!write) {
1766 down_read(&uts_sem);
1767 r=proc_dostring(table,0,filp,buffer,lenp, ppos);
1768 up_read(&uts_sem);
1769 } else {
1770 down_write(&uts_sem);
1771 r=proc_dostring(table,1,filp,buffer,lenp, ppos);
1772 up_write(&uts_sem);
1773 }
1774 return r;
1775}
1776#else /* !CONFIG_UTS_NS */
1777static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 1757static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1778 void __user *buffer, size_t *lenp, loff_t *ppos) 1758 void __user *buffer, size_t *lenp, loff_t *ppos)
1779{ 1759{
1780 int r; 1760 int r;
1781 struct uts_namespace* uts_ns = current->nsproxy->uts_ns; 1761 void *which;
1782 char* which; 1762 which = get_uts(table, write);
1783 1763 r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
1784 switch (table->ctl_name) { 1764 put_uts(table, write, which);
1785 case KERN_OSTYPE:
1786 which = uts_ns->name.sysname;
1787 break;
1788 case KERN_NODENAME:
1789 which = uts_ns->name.nodename;
1790 break;
1791 case KERN_OSRELEASE:
1792 which = uts_ns->name.release;
1793 break;
1794 case KERN_VERSION:
1795 which = uts_ns->name.version;
1796 break;
1797 case KERN_DOMAINNAME:
1798 which = uts_ns->name.domainname;
1799 break;
1800 default:
1801 r = -EINVAL;
1802 goto out;
1803 }
1804
1805 if (!write) {
1806 down_read(&uts_sem);
1807 r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
1808 up_read(&uts_sem);
1809 } else {
1810 down_write(&uts_sem);
1811 r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
1812 up_write(&uts_sem);
1813 }
1814 out:
1815 return r; 1765 return r;
1816} 1766}
1817#endif /* !CONFIG_UTS_NS */
1818 1767
1819static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 1768static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1820 int *valp, 1769 int *valp,
@@ -1886,7 +1835,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1886 p = buf; 1835 p = buf;
1887 if (*p == '-' && left > 1) { 1836 if (*p == '-' && left > 1) {
1888 neg = 1; 1837 neg = 1;
1889 left--, p++; 1838 p++;
1890 } 1839 }
1891 if (*p < '0' || *p > '9') 1840 if (*p < '0' || *p > '9')
1892 break; 1841 break;
@@ -1978,9 +1927,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
1978 1927
1979#define OP_SET 0 1928#define OP_SET 0
1980#define OP_AND 1 1929#define OP_AND 1
1981#define OP_OR 2
1982#define OP_MAX 3
1983#define OP_MIN 4
1984 1930
1985static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, 1931static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1986 int *valp, 1932 int *valp,
@@ -1992,13 +1938,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1992 switch(op) { 1938 switch(op) {
1993 case OP_SET: *valp = val; break; 1939 case OP_SET: *valp = val; break;
1994 case OP_AND: *valp &= val; break; 1940 case OP_AND: *valp &= val; break;
1995 case OP_OR: *valp |= val; break;
1996 case OP_MAX: if(*valp < val)
1997 *valp = val;
1998 break;
1999 case OP_MIN: if(*valp > val)
2000 *valp = val;
2001 break;
2002 } 1941 }
2003 } else { 1942 } else {
2004 int val = *valp; 1943 int val = *valp;
@@ -2137,7 +2076,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2137 p = buf; 2076 p = buf;
2138 if (*p == '-' && left > 1) { 2077 if (*p == '-' && left > 1) {
2139 neg = 1; 2078 neg = 1;
2140 left--, p++; 2079 p++;
2141 } 2080 }
2142 if (*p < '0' || *p > '9') 2081 if (*p < '0' || *p > '9')
2143 break; 2082 break;
@@ -2393,46 +2332,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2393} 2332}
2394 2333
2395#ifdef CONFIG_SYSVIPC 2334#ifdef CONFIG_SYSVIPC
2396static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 2335static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2397 void __user *buffer, size_t *lenp, loff_t *ppos) 2336 void __user *buffer, size_t *lenp, loff_t *ppos)
2398{ 2337{
2399 void *data; 2338 void *which;
2400 struct ipc_namespace *ns; 2339 which = get_ipc(table, write);
2401 2340 return __do_proc_dointvec(which, table, write, filp, buffer,
2402 ns = current->nsproxy->ipc_ns;
2403
2404 switch (table->ctl_name) {
2405 case KERN_SHMMAX:
2406 data = &ns->shm_ctlmax;
2407 goto proc_minmax;
2408 case KERN_SHMALL:
2409 data = &ns->shm_ctlall;
2410 goto proc_minmax;
2411 case KERN_SHMMNI:
2412 data = &ns->shm_ctlmni;
2413 break;
2414 case KERN_MSGMAX:
2415 data = &ns->msg_ctlmax;
2416 break;
2417 case KERN_MSGMNI:
2418 data = &ns->msg_ctlmni;
2419 break;
2420 case KERN_MSGMNB:
2421 data = &ns->msg_ctlmnb;
2422 break;
2423 case KERN_SEM:
2424 data = &ns->sem_ctls;
2425 break;
2426 default:
2427 return -EINVAL;
2428 }
2429
2430 return __do_proc_dointvec(data, table, write, filp, buffer,
2431 lenp, ppos, NULL, NULL); 2341 lenp, ppos, NULL, NULL);
2432proc_minmax: 2342}
2433 return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, 2343
2344static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2345 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
2346{
2347 void *which;
2348 which = get_ipc(table, write);
2349 return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
2434 lenp, ppos, 1l, 1l); 2350 lenp, ppos, 1l, 1l);
2435} 2351}
2352
2436#endif 2353#endif
2437 2354
2438static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2355static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -2477,6 +2394,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2477{ 2394{
2478 return -ENOSYS; 2395 return -ENOSYS;
2479} 2396}
2397static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2398 void __user *buffer, size_t *lenp, loff_t *ppos)
2399{
2400 return -ENOSYS;
2401}
2402static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2403 struct file *filp, void __user *buffer,
2404 size_t *lenp, loff_t *ppos)
2405{
2406 return -ENOSYS;
2407}
2480#endif 2408#endif
2481 2409
2482int proc_dointvec(ctl_table *table, int write, struct file *filp, 2410int proc_dointvec(ctl_table *table, int write, struct file *filp,
@@ -2541,7 +2469,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2541/* The generic string strategy routine: */ 2469/* The generic string strategy routine: */
2542int sysctl_string(ctl_table *table, int __user *name, int nlen, 2470int sysctl_string(ctl_table *table, int __user *name, int nlen,
2543 void __user *oldval, size_t __user *oldlenp, 2471 void __user *oldval, size_t __user *oldlenp,
2544 void __user *newval, size_t newlen, void **context) 2472 void __user *newval, size_t newlen)
2545{ 2473{
2546 if (!table->data || !table->maxlen) 2474 if (!table->data || !table->maxlen)
2547 return -ENOTDIR; 2475 return -ENOTDIR;
@@ -2587,7 +2515,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2587 */ 2515 */
2588int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2516int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2589 void __user *oldval, size_t __user *oldlenp, 2517 void __user *oldval, size_t __user *oldlenp,
2590 void __user *newval, size_t newlen, void **context) 2518 void __user *newval, size_t newlen)
2591{ 2519{
2592 2520
2593 if (newval && newlen) { 2521 if (newval && newlen) {
@@ -2623,7 +2551,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2623/* Strategy function to convert jiffies to seconds */ 2551/* Strategy function to convert jiffies to seconds */
2624int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2552int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2625 void __user *oldval, size_t __user *oldlenp, 2553 void __user *oldval, size_t __user *oldlenp,
2626 void __user *newval, size_t newlen, void **context) 2554 void __user *newval, size_t newlen)
2627{ 2555{
2628 if (oldval) { 2556 if (oldval) {
2629 size_t olen; 2557 size_t olen;
@@ -2651,7 +2579,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2651/* Strategy function to convert jiffies to seconds */ 2579/* Strategy function to convert jiffies to seconds */
2652int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2580int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2653 void __user *oldval, size_t __user *oldlenp, 2581 void __user *oldval, size_t __user *oldlenp,
2654 void __user *newval, size_t newlen, void **context) 2582 void __user *newval, size_t newlen)
2655{ 2583{
2656 if (oldval) { 2584 if (oldval) {
2657 size_t olen; 2585 size_t olen;
@@ -2676,6 +2604,64 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2676 return 1; 2604 return 1;
2677} 2605}
2678 2606
2607
2608/* The generic string strategy routine: */
2609static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2610 void __user *oldval, size_t __user *oldlenp,
2611 void __user *newval, size_t newlen)
2612{
2613 struct ctl_table uts_table;
2614 int r, write;
2615 write = newval && newlen;
2616 memcpy(&uts_table, table, sizeof(uts_table));
2617 uts_table.data = get_uts(table, write);
2618 r = sysctl_string(&uts_table, name, nlen,
2619 oldval, oldlenp, newval, newlen);
2620 put_uts(table, write, uts_table.data);
2621 return r;
2622}
2623
2624#ifdef CONFIG_SYSVIPC
2625/* The generic sysctl ipc data routine. */
2626static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2627 void __user *oldval, size_t __user *oldlenp,
2628 void __user *newval, size_t newlen)
2629{
2630 size_t len;
2631 void *data;
2632
 2633	/* Get out if I don't have a variable */
2634 if (!table->data || !table->maxlen)
2635 return -ENOTDIR;
2636
2637 data = get_ipc(table, 1);
2638 if (!data)
2639 return -ENOTDIR;
2640
2641 if (oldval && oldlenp) {
2642 if (get_user(len, oldlenp))
2643 return -EFAULT;
2644 if (len) {
2645 if (len > table->maxlen)
2646 len = table->maxlen;
2647 if (copy_to_user(oldval, data, len))
2648 return -EFAULT;
2649 if (put_user(len, oldlenp))
2650 return -EFAULT;
2651 }
2652 }
2653
2654 if (newval && newlen) {
2655 if (newlen > table->maxlen)
2656 newlen = table->maxlen;
2657
2658 if (copy_from_user(data, newval, newlen))
2659 return -EFAULT;
2660 }
2661 return 1;
2662}
2663#endif
2664
2679#else /* CONFIG_SYSCTL_SYSCALL */ 2665#else /* CONFIG_SYSCTL_SYSCALL */
2680 2666
2681 2667
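sysctl_ipc_data() above implements the binary sysctl(2) "strategy" protocol directly: clamp the user-supplied length to table->maxlen, copy the current value out when oldval/oldlenp are given, and copy a new value in when newval/newlen are. A userspace sketch of the same clamp-and-copy protocol, with memcpy() standing in for copy_to_user()/copy_from_user() and all names invented for illustration:

#include <stdio.h>
#include <string.h>
#include <stddef.h>

/*
 * Read and/or write a bounded value, mirroring the oldval/newval
 * calling convention of a sysctl strategy routine.  Returns 1 when
 * the request was handled, as the kernel routines do.
 */
static int rw_value(void *data, size_t maxlen,
		    void *oldval, size_t *oldlenp,
		    const void *newval, size_t newlen)
{
	if (!data || !maxlen)
		return -1;		/* nothing to operate on */

	if (oldval && oldlenp && *oldlenp) {
		size_t len = *oldlenp;

		if (len > maxlen)	/* never copy past the variable */
			len = maxlen;
		memcpy(oldval, data, len);
		*oldlenp = len;		/* report how much was copied */
	}

	if (newval && newlen) {
		if (newlen > maxlen)
			newlen = maxlen;
		memcpy(data, newval, newlen);
	}
	return 1;
}

int main(void)
{
	int shmmni = 4096, out = 0, in = 128;
	size_t outlen = sizeof(out);

	rw_value(&shmmni, sizeof(shmmni), &out, &outlen, &in, sizeof(in));
	printf("read back %d, variable now %d\n", out, shmmni);
	return 0;
}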
@@ -2714,32 +2700,44 @@ out:
2714 2700
2715int sysctl_string(ctl_table *table, int __user *name, int nlen, 2701int sysctl_string(ctl_table *table, int __user *name, int nlen,
2716 void __user *oldval, size_t __user *oldlenp, 2702 void __user *oldval, size_t __user *oldlenp,
2717 void __user *newval, size_t newlen, void **context) 2703 void __user *newval, size_t newlen)
2718{ 2704{
2719 return -ENOSYS; 2705 return -ENOSYS;
2720} 2706}
2721 2707
2722int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2708int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2723 void __user *oldval, size_t __user *oldlenp, 2709 void __user *oldval, size_t __user *oldlenp,
2724 void __user *newval, size_t newlen, void **context) 2710 void __user *newval, size_t newlen)
2725{ 2711{
2726 return -ENOSYS; 2712 return -ENOSYS;
2727} 2713}
2728 2714
2729int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2715int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2730 void __user *oldval, size_t __user *oldlenp, 2716 void __user *oldval, size_t __user *oldlenp,
2731 void __user *newval, size_t newlen, void **context) 2717 void __user *newval, size_t newlen)
2732{ 2718{
2733 return -ENOSYS; 2719 return -ENOSYS;
2734} 2720}
2735 2721
2736int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2722int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2737 void __user *oldval, size_t __user *oldlenp, 2723 void __user *oldval, size_t __user *oldlenp,
2738 void __user *newval, size_t newlen, void **context) 2724 void __user *newval, size_t newlen)
2739{ 2725{
2740 return -ENOSYS; 2726 return -ENOSYS;
2741} 2727}
2742 2728
2729static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2730 void __user *oldval, size_t __user *oldlenp,
2731 void __user *newval, size_t newlen)
2732{
2733 return -ENOSYS;
2734}
2735static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2736 void __user *oldval, size_t __user *oldlenp,
2737 void __user *newval, size_t newlen)
2738{
2739 return -ENOSYS;
2740}
2743#endif /* CONFIG_SYSCTL_SYSCALL */ 2741#endif /* CONFIG_SYSCTL_SYSCALL */
2744 2742
2745/* 2743/*
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f45c5e70773c..4c3476fa058d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
34 34
35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
36static int family_registered; 36static int family_registered;
37kmem_cache_t *taskstats_cache; 37struct kmem_cache *taskstats_cache;
38 38
39static struct genl_family family = { 39static struct genl_family family = {
40 .id = GENL_ID_GENERATE, 40 .id = GENL_ID_GENERATE,
@@ -69,7 +69,7 @@ enum actions {
69}; 69};
70 70
71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
72 void **replyp, size_t size) 72 size_t size)
73{ 73{
74 struct sk_buff *skb; 74 struct sk_buff *skb;
75 void *reply; 75 void *reply;
@@ -77,8 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
77 /* 77 /*
78 * If new attributes are added, please revisit this allocation 78 * If new attributes are added, please revisit this allocation
79 */ 79 */
80 size = nlmsg_total_size(genlmsg_total_size(size)); 80 skb = genlmsg_new(size, GFP_KERNEL);
81 skb = nlmsg_new(size, GFP_KERNEL);
82 if (!skb) 81 if (!skb)
83 return -ENOMEM; 82 return -ENOMEM;
84 83
@@ -86,20 +85,15 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
86 int seq = get_cpu_var(taskstats_seqnum)++; 85 int seq = get_cpu_var(taskstats_seqnum)++;
87 put_cpu_var(taskstats_seqnum); 86 put_cpu_var(taskstats_seqnum);
88 87
89 reply = genlmsg_put(skb, 0, seq, 88 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
90 family.id, 0, 0,
91 cmd, family.version);
92 } else 89 } else
93 reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, 90 reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
94 family.id, 0, 0,
95 cmd, family.version);
96 if (reply == NULL) { 91 if (reply == NULL) {
97 nlmsg_free(skb); 92 nlmsg_free(skb);
98 return -EINVAL; 93 return -EINVAL;
99 } 94 }
100 95
101 *skbp = skb; 96 *skbp = skb;
102 *replyp = reply;
103 return 0; 97 return 0;
104} 98}
105 99
@@ -124,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
124/* 118/*
125 * Send taskstats data in @skb to listeners registered for @cpu's exit data 119 * Send taskstats data in @skb to listeners registered for @cpu's exit data
126 */ 120 */
127static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 121static void send_cpu_listeners(struct sk_buff *skb,
122 struct listener_list *listeners)
128{ 123{
129 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
130 struct listener_list *listeners;
131 struct listener *s, *tmp; 125 struct listener *s, *tmp;
132 struct sk_buff *skb_next, *skb_cur = skb; 126 struct sk_buff *skb_next, *skb_cur = skb;
133 void *reply = genlmsg_data(genlhdr); 127 void *reply = genlmsg_data(genlhdr);
@@ -140,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
140 } 134 }
141 135
142 rc = 0; 136 rc = 0;
143 listeners = &per_cpu(listener_array, cpu);
144 down_read(&listeners->sem); 137 down_read(&listeners->sem);
145 list_for_each_entry(s, &listeners->list, list) { 138 list_for_each_entry(s, &listeners->list, list) {
146 skb_next = NULL; 139 skb_next = NULL;
@@ -191,6 +184,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
191 } else 184 } else
192 get_task_struct(tsk); 185 get_task_struct(tsk);
193 186
187 memset(stats, 0, sizeof(*stats));
194 /* 188 /*
195 * Each accounting subsystem adds calls to its functions to 189 * Each accounting subsystem adds calls to its functions to
196 * fill in relevant parts of struct taskstats as follows 190
@@ -233,6 +227,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
233 227
234 if (first->signal->stats) 228 if (first->signal->stats)
235 memcpy(stats, first->signal->stats, sizeof(*stats)); 229 memcpy(stats, first->signal->stats, sizeof(*stats));
230 else
231 memset(stats, 0, sizeof(*stats));
236 232
237 tsk = first; 233 tsk = first;
238 do { 234 do {
@@ -349,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask)
349 return ret; 345 return ret;
350} 346}
351 347
348static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
349{
350 struct nlattr *na, *ret;
351 int aggr;
352
353 aggr = (type == TASKSTATS_TYPE_PID)
354 ? TASKSTATS_TYPE_AGGR_PID
355 : TASKSTATS_TYPE_AGGR_TGID;
356
357 na = nla_nest_start(skb, aggr);
358 if (!na)
359 goto err;
360 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
361 goto err;
362 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
363 if (!ret)
364 goto err;
365 nla_nest_end(skb, na);
366
367 return nla_data(ret);
368err:
369 return NULL;
370}
371
352static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 372static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
353{ 373{
354 int rc = 0; 374 int rc = 0;
355 struct sk_buff *rep_skb; 375 struct sk_buff *rep_skb;
356 struct taskstats stats; 376 struct taskstats *stats;
357 void *reply;
358 size_t size; 377 size_t size;
359 struct nlattr *na;
360 cpumask_t mask; 378 cpumask_t mask;
361 379
362 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 380 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
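mk_reply() above reserves room for the struct taskstats inside a nested netlink attribute and hands back a pointer into the message buffer, so fill_pid()/fill_tgid() can write the statistics in place instead of filling a stack copy that is serialized afterwards. A hedged sketch of that reserve-then-fill idea over a toy length-prefixed buffer; the layout and helper names below are invented and are not the netlink attribute API:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Toy TLV message buffer: [u16 type][u16 len][payload]... */
struct msg {
	unsigned char buf[256];
	size_t used;
};

/* Reserve a zeroed payload of 'len' bytes and return a pointer to it. */
static void *reserve(struct msg *m, uint16_t type, uint16_t len)
{
	unsigned char *p = m->buf + m->used;

	if (m->used + 4 + len > sizeof(m->buf))
		return NULL;
	memcpy(p, &type, 2);
	memcpy(p + 2, &len, 2);
	memset(p + 4, 0, len);
	m->used += 4 + len;
	return p + 4;		/* caller fills the payload in place */
}

struct stats { uint64_t cpu_run_real_total; uint32_t nvcsw; };

int main(void)
{
	struct msg m = { .used = 0 };
	/* reserve first, fill later: no intermediate stack copy */
	struct stats *s = reserve(&m, /* hypothetical TYPE_STATS */ 3,
				  sizeof(*s));

	if (!s)
		return 1;
	s->cpu_run_real_total = 123456;
	s->nvcsw = 7;
	printf("message uses %zu bytes\n", m.used);
	return 0;
}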
@@ -377,83 +395,71 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
377 size = nla_total_size(sizeof(u32)) + 395 size = nla_total_size(sizeof(u32)) +
378 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 396 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
379 397
380 memset(&stats, 0, sizeof(stats)); 398 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
381 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
382 if (rc < 0) 399 if (rc < 0)
383 return rc; 400 return rc;
384 401
402 rc = -EINVAL;
385 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 403 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
386 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 404 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
387 rc = fill_pid(pid, NULL, &stats); 405 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
388 if (rc < 0) 406 if (!stats)
389 goto err; 407 goto err;
390 408
391 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 409 rc = fill_pid(pid, NULL, stats);
392 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); 410 if (rc < 0)
393 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 411 goto err;
394 stats);
395 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 412 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
396 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 413 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
397 rc = fill_tgid(tgid, NULL, &stats); 414 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
398 if (rc < 0) 415 if (!stats)
399 goto err; 416 goto err;
400 417
401 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 418 rc = fill_tgid(tgid, NULL, stats);
402 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); 419 if (rc < 0)
403 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 420 goto err;
404 stats); 421 } else
405 } else {
406 rc = -EINVAL;
407 goto err; 422 goto err;
408 }
409
410 nla_nest_end(rep_skb, na);
411 423
412 return send_reply(rep_skb, info->snd_pid); 424 return send_reply(rep_skb, info->snd_pid);
413
414nla_put_failure:
415 rc = genlmsg_cancel(rep_skb, reply);
416err: 425err:
417 nlmsg_free(rep_skb); 426 nlmsg_free(rep_skb);
418 return rc; 427 return rc;
419} 428}
420 429
421void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 430static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
422{ 431{
423 struct listener_list *listeners; 432 struct signal_struct *sig = tsk->signal;
424 struct taskstats *tmp; 433 struct taskstats *stats;
425 /*
426 * This is the cpu on which the task is exiting currently and will
427 * be the one for which the exit event is sent, even if the cpu
428 * on which this function is running changes later.
429 */
430 *mycpu = raw_smp_processor_id();
431 434
432 *ptidstats = NULL; 435 if (sig->stats || thread_group_empty(tsk))
433 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 436 goto ret;
434 if (!tmp)
435 return;
436 437
437 listeners = &per_cpu(listener_array, *mycpu); 438 /* No problem if kmem_cache_zalloc() fails */
438 down_read(&listeners->sem); 439 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
439 if (!list_empty(&listeners->list)) { 440
440 *ptidstats = tmp; 441 spin_lock_irq(&tsk->sighand->siglock);
441 tmp = NULL; 442 if (!sig->stats) {
443 sig->stats = stats;
444 stats = NULL;
442 } 445 }
443 up_read(&listeners->sem); 446 spin_unlock_irq(&tsk->sighand->siglock);
444 kfree(tmp); 447
448 if (stats)
449 kmem_cache_free(taskstats_cache, stats);
450ret:
451 return sig->stats;
445} 452}
446 453
447/* Send pid data out on exit */ 454/* Send pid data out on exit */
448void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 455void taskstats_exit(struct task_struct *tsk, int group_dead)
449 int group_dead, unsigned int mycpu)
450{ 456{
451 int rc; 457 int rc;
458 struct listener_list *listeners;
459 struct taskstats *stats;
452 struct sk_buff *rep_skb; 460 struct sk_buff *rep_skb;
453 void *reply;
454 size_t size; 461 size_t size;
455 int is_thread_group; 462 int is_thread_group;
456 struct nlattr *na;
457 463
458 if (!family_registered) 464 if (!family_registered)
459 return; 465 return;
@@ -464,7 +470,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
464 size = nla_total_size(sizeof(u32)) + 470 size = nla_total_size(sizeof(u32)) +
465 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 471 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
466 472
467 is_thread_group = (tsk->signal->stats != NULL); 473 is_thread_group = !!taskstats_tgid_alloc(tsk);
468 if (is_thread_group) { 474 if (is_thread_group) {
469 /* PID + STATS + TGID + STATS */ 475 /* PID + STATS + TGID + STATS */
470 size = 2 * size; 476 size = 2 * size;
@@ -472,49 +478,39 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
472 fill_tgid_exit(tsk); 478 fill_tgid_exit(tsk);
473 } 479 }
474 480
475 if (!tidstats) 481 listeners = &__raw_get_cpu_var(listener_array);
482 if (list_empty(&listeners->list))
476 return; 483 return;
477 484
478 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); 485 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
479 if (rc < 0)
480 goto ret;
481
482 rc = fill_pid(tsk->pid, tsk, tidstats);
483 if (rc < 0) 486 if (rc < 0)
484 goto err_skb; 487 return;
485 488
486 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 489 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
487 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); 490 if (!stats)
488 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 491 goto err;
489 *tidstats);
490 nla_nest_end(rep_skb, na);
491 492
492 if (!is_thread_group) 493 rc = fill_pid(tsk->pid, tsk, stats);
493 goto send; 494 if (rc < 0)
495 goto err;
494 496
495 /* 497 /*
496 * Doesn't matter if tsk is the leader or the last group member leaving 498 * Doesn't matter if tsk is the leader or the last group member leaving
497 */ 499 */
498 if (!group_dead) 500 if (!is_thread_group || !group_dead)
499 goto send; 501 goto send;
500 502
501 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 503 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
502 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); 504 if (!stats)
503 /* No locking needed for tsk->signal->stats since group is dead */ 505 goto err;
504 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 506
505 *tsk->signal->stats); 507 memcpy(stats, tsk->signal->stats, sizeof(*stats));
506 nla_nest_end(rep_skb, na);
507 508
508send: 509send:
509 send_cpu_listeners(rep_skb, mycpu); 510 send_cpu_listeners(rep_skb, listeners);
510 return; 511 return;
511 512err:
512nla_put_failure:
513 genlmsg_cancel(rep_skb, reply);
514err_skb:
515 nlmsg_free(rep_skb); 513 nlmsg_free(rep_skb);
516ret:
517 return;
518} 514}
519 515
520static struct genl_ops taskstats_ops = { 516static struct genl_ops taskstats_ops = {
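taskstats_tgid_alloc() above allocates the per-group statistics outside the lock, installs the buffer under siglock only if nobody else installed one first, and frees the losing allocation. A sketch of that allocate-then-install pattern with pthreads; the types are hypothetical, not the kernel's signal_struct:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct stats { long counters[8]; };

/* Hypothetical per-group state. */
struct group {
	pthread_mutex_t lock;
	struct stats *stats;	/* lazily allocated, shared by the group */
};

/*
 * Lazily allocate group->stats.  The allocation happens outside the
 * lock; under the lock we only install it if nobody beat us to it,
 * and the losing allocation is freed afterwards, the same pattern as
 * taskstats_tgid_alloc() above.
 */
static struct stats *group_stats(struct group *g)
{
	struct stats *s;

	if (g->stats)
		return g->stats;

	s = calloc(1, sizeof(*s));	/* may fail; that is tolerated */

	pthread_mutex_lock(&g->lock);
	if (!g->stats) {
		g->stats = s;
		s = NULL;
	}
	pthread_mutex_unlock(&g->lock);

	free(s);			/* free(NULL) is a no-op */
	return g->stats;
}

int main(void)
{
	struct group g = { .stats = NULL };

	pthread_mutex_init(&g.lock, NULL);
	printf("stats at %p\n", (void *)group_stats(&g));
	printf("second call returns same: %p\n", (void *)group_stats(&g));
	free(g.stats);
	pthread_mutex_destroy(&g.lock);
	return 0;
}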
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 74eca5939bd9..22504afc0d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c)
156 /* check if clocksource is already registered */ 156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) { 157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. " 158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name); 159 "Already registered!", c->name);
160 ret = -EBUSY; 160 ret = -EBUSY;
161 } else { 161 } else {
162 /* register it */ 162 /* register it */
@@ -186,6 +186,7 @@ void clocksource_reselect(void)
186} 186}
187EXPORT_SYMBOL(clocksource_reselect); 187EXPORT_SYMBOL(clocksource_reselect);
188 188
189#ifdef CONFIG_SYSFS
189/** 190/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource 191 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused 192 * @dev: unused
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
275 * Sysfs setup bits: 276 * Sysfs setup bits:
276 */ 277 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, 278static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource); 279 sysfs_override_clocksource);
279 280
280static SYSDEV_ATTR(available_clocksource, 0600, 281static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL); 282 sysfs_show_available_clocksources, NULL);
282 283
283static struct sysdev_class clocksource_sysclass = { 284static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"), 285 set_kset_name("clocksource"),
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void)
307} 308}
308 309
309device_initcall(init_clocksource_sysfs); 310device_initcall(init_clocksource_sysfs);
311#endif /* CONFIG_SYSFS */
310 312
311/** 313/**
312 * boot_override_clocksource - boot clock override 314 * boot_override_clocksource - boot clock override
diff --git a/kernel/timer.c b/kernel/timer.c
index c1c7fbcffec1..c2a8ccfc2882 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases;
80EXPORT_SYMBOL(boot_tvec_bases); 80EXPORT_SYMBOL(boot_tvec_bases);
81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
82 82
83/**
84 * __round_jiffies - function to round jiffies to a full second
85 * @j: the time in (absolute) jiffies that should be rounded
86 * @cpu: the processor number on which the timeout will happen
87 *
88 * __round_jiffies rounds an absolute time in the future (in jiffies)
89 * up or down to (approximately) full seconds. This is useful for timers
90 * for which the exact time they fire does not matter too much, as long as
91 * they fire approximately every X seconds.
92 *
93 * By rounding these timers to whole seconds, all such timers will fire
94 * at the same time, rather than at various times spread out. The goal
95 * of this is to have the CPU wake up less, which saves power.
96 *
97 * The exact rounding is skewed for each processor to avoid all
98 * processors firing at the exact same time, which could lead
99 * to lock contention or spurious cache line bouncing.
100 *
101 * The return value is the rounded version of the "j" parameter.
102 */
103unsigned long __round_jiffies(unsigned long j, int cpu)
104{
105 int rem;
106 unsigned long original = j;
107
108 /*
109 * We don't want all cpus firing their timers at once hitting the
110 * same lock or cachelines, so we skew each extra cpu with an extra
111 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
112 * already did this.
113 * The skew is done by adding 3*cpunr, then round, then subtract this
114 * extra offset again.
115 */
116 j += cpu * 3;
117
118 rem = j % HZ;
119
120 /*
121 * If the target jiffie is just after a whole second (which can happen
122 * due to delays of the timer irq, long irq off times etc etc) then
123 * we should round down to the whole second, not up. Use 1/4th second
124 * as cutoff for this rounding as an extreme upper bound for this.
125 */
126 if (rem < HZ/4) /* round down */
127 j = j - rem;
128 else /* round up */
129 j = j - rem + HZ;
130
131 /* now that we have rounded, subtract the extra skew again */
132 j -= cpu * 3;
133
134 if (j <= jiffies) /* rounding ate our timeout entirely; */
135 return original;
136 return j;
137}
138EXPORT_SYMBOL_GPL(__round_jiffies);
139
140/**
141 * __round_jiffies_relative - function to round jiffies to a full second
142 * @j: the time in (relative) jiffies that should be rounded
143 * @cpu: the processor number on which the timeout will happen
144 *
145 * __round_jiffies_relative rounds a time delta in the future (in jiffies)
146 * up or down to (approximately) full seconds. This is useful for timers
147 * for which the exact time they fire does not matter too much, as long as
148 * they fire approximately every X seconds.
149 *
150 * By rounding these timers to whole seconds, all such timers will fire
151 * at the same time, rather than at various times spread out. The goal
152 * of this is to have the CPU wake up less, which saves power.
153 *
154 * The exact rounding is skewed for each processor to avoid all
155 * processors firing at the exact same time, which could lead
156 * to lock contention or spurious cache line bouncing.
157 *
158 * The return value is the rounded version of the "j" parameter.
159 */
160unsigned long __round_jiffies_relative(unsigned long j, int cpu)
161{
162 /*
163 * In theory the following code can skip a jiffy in case jiffies
164 * increments right between the addition and the later subtraction.
165 * However since the entire point of this function is to use approximate
166 * timeouts, it's entirely ok to not handle that.
167 */
168 return __round_jiffies(j + jiffies, cpu) - jiffies;
169}
170EXPORT_SYMBOL_GPL(__round_jiffies_relative);
171
172/**
173 * round_jiffies - function to round jiffies to a full second
174 * @j: the time in (absolute) jiffies that should be rounded
175 *
176 * round_jiffies rounds an absolute time in the future (in jiffies)
177 * up or down to (approximately) full seconds. This is useful for timers
178 * for which the exact time they fire does not matter too much, as long as
179 * they fire approximately every X seconds.
180 *
181 * By rounding these timers to whole seconds, all such timers will fire
182 * at the same time, rather than at various times spread out. The goal
183 * of this is to have the CPU wake up less, which saves power.
184 *
185 * The return value is the rounded version of the "j" parameter.
186 */
187unsigned long round_jiffies(unsigned long j)
188{
189 return __round_jiffies(j, raw_smp_processor_id());
190}
191EXPORT_SYMBOL_GPL(round_jiffies);
192
193/**
194 * round_jiffies_relative - function to round jiffies to a full second
195 * @j: the time in (relative) jiffies that should be rounded
196 *
197 * round_jiffies_relative rounds a time delta in the future (in jiffies)
198 * up or down to (approximately) full seconds. This is useful for timers
199 * for which the exact time they fire does not matter too much, as long as
200 * they fire approximately every X seconds.
201 *
202 * By rounding these timers to whole seconds, all such timers will fire
203 * at the same time, rather than at various times spread out. The goal
204 * of this is to have the CPU wake up less, which saves power.
205 *
206 * The return value is the rounded version of the "j" parameter.
207 */
208unsigned long round_jiffies_relative(unsigned long j)
209{
210 return __round_jiffies_relative(j, raw_smp_processor_id());
211}
212EXPORT_SYMBOL_GPL(round_jiffies_relative);
213
214
83static inline void set_running_timer(tvec_base_t *base, 215static inline void set_running_timer(tvec_base_t *base,
84 struct timer_list *timer) 216 struct timer_list *timer)
85{ 217{
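__round_jiffies() above rounds an absolute timeout to a whole second while skewing each CPU by three jiffies so that the rounded timers of different CPUs do not all expire on the same tick. The arithmetic can be walked through in isolation; the sketch below assumes HZ=250 purely for the example:

#include <stdio.h>

#define HZ 250			/* assumed for the example */

static unsigned long now;	/* stand-in for the jiffies counter */

/* Same arithmetic as __round_jiffies(), minus the kernel plumbing. */
static unsigned long round_to_second(unsigned long j, int cpu)
{
	unsigned long original = j;
	int rem;

	j += cpu * 3;		/* per-CPU skew */
	rem = j % HZ;
	if (rem < HZ / 4)	/* just past a second: round down */
		j -= rem;
	else			/* otherwise round up */
		j += HZ - rem;
	j -= cpu * 3;		/* undo the skew */

	return j <= now ? original : j;
}

int main(void)
{
	now = 1000;
	/* absolute jiffy 1350 rounds up to 1500, the next HZ boundary */
	printf("cpu0: 1350 -> %lu\n", round_to_second(1350, 0));
	/* the same timeout on CPU2 lands 6 jiffies earlier, at 1494 */
	printf("cpu2: 1350 -> %lu\n", round_to_second(1350, 2));
	return 0;
}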
@@ -714,7 +846,7 @@ static int change_clocksource(void)
714 clock = new; 846 clock = new;
715 clock->cycle_last = now; 847 clock->cycle_last = now;
716 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 848 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
717 clock->name); 849 clock->name);
718 return 1; 850 return 1;
719 } else if (clock->update_callback) { 851 } else if (clock->update_callback) {
720 return clock->update_callback(); 852 return clock->update_callback();
@@ -722,7 +854,10 @@ static int change_clocksource(void)
722 return 0; 854 return 0;
723} 855}
724#else 856#else
725#define change_clocksource() (0) 857static inline int change_clocksource(void)
858{
859 return 0;
860}
726#endif 861#endif
727 862
728/** 863/**
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device);
820 * If the error is already larger, we look ahead even further 955 * If the error is already larger, we look ahead even further
821 * to compensate for late or lost adjustments. 956 * to compensate for late or lost adjustments.
822 */ 957 */
823static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) 958static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
959 s64 *offset)
824{ 960{
825 s64 tick_error, i; 961 s64 tick_error, i;
826 u32 look_ahead, adj; 962 u32 look_ahead, adj;
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *
844 * Now calculate the error in (1 << look_ahead) ticks, but first 980 * Now calculate the error in (1 << look_ahead) ticks, but first
845 * remove the single look ahead already included in the error. 981 * remove the single look ahead already included in the error.
846 */ 982 */
847 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); 983 tick_error = current_tick_length() >>
984 (TICK_LENGTH_SHIFT - clock->shift + 1);
848 tick_error -= clock->xtime_interval >> 1; 985 tick_error -= clock->xtime_interval >> 1;
849 error = ((error - tick_error) >> look_ahead) + tick_error; 986 error = ((error - tick_error) >> look_ahead) + tick_error;
850 987
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
896 clock->mult += adj; 1033 clock->mult += adj;
897 clock->xtime_interval += interval; 1034 clock->xtime_interval += interval;
898 clock->xtime_nsec -= offset; 1035 clock->xtime_nsec -= offset;
899 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); 1036 clock->error -= (interval - offset) <<
1037 (TICK_LENGTH_SHIFT - clock->shift);
900} 1038}
901 1039
902/** 1040/**
@@ -1008,11 +1146,15 @@ static inline void calc_load(unsigned long ticks)
1008 unsigned long active_tasks; /* fixed-point */ 1146 unsigned long active_tasks; /* fixed-point */
1009 static int count = LOAD_FREQ; 1147 static int count = LOAD_FREQ;
1010 1148
1011 active_tasks = count_active_tasks(); 1149 count -= ticks;
1012 for (count -= ticks; count < 0; count += LOAD_FREQ) { 1150 if (unlikely(count < 0)) {
1013 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 1151 active_tasks = count_active_tasks();
1014 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 1152 do {
1015 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 1153 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1154 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1155 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1156 count += LOAD_FREQ;
1157 } while (count < 0);
1016 } 1158 }
1017} 1159}
1018 1160
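The calc_load() rewrite above only samples count_active_tasks() when at least one LOAD_FREQ interval has actually elapsed, then catches up in a loop. Each CALC_LOAD step is a fixed-point exponential moving average; the sketch below reproduces that step standalone (the FSHIFT/EXP_1 constants are quoted from memory of the kernel headers of this era and should be treated as assumptions):

#include <stdio.h>

/* Fixed-point EMA constants, assumed to match the era's sched.h. */
#define FSHIFT	11
#define FIXED_1	(1 << FSHIFT)	/* 1.0 in fixed point          */
#define EXP_1	1884		/* ~exp(-5s/1min) in fixed point */

/* load = load*exp + n*(1-exp), all in 11-bit fixed point */
#define CALC_LOAD(load, exp, n)				\
	do {						\
		(load) *= (exp);			\
		(load) += (n) * (FIXED_1 - (exp));	\
		(load) >>= FSHIFT;			\
	} while (0)

int main(void)
{
	unsigned long avenrun = 0;		/* 1-minute average */
	unsigned long active = 3 * FIXED_1;	/* 3 runnable tasks  */
	int i;

	/* one CALC_LOAD per 5-second LOAD_FREQ interval */
	for (i = 1; i <= 12; i++) {		/* one simulated minute */
		CALC_LOAD(avenrun, EXP_1, active);
		printf("t=%2ds  load=%lu.%02lu\n", i * 5,
		       avenrun >> FSHIFT,
		       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}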
@@ -1202,11 +1344,10 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1202 * should never happen anyway). You just have the printk() 1344
1203 * that will tell you if something has gone wrong and where. 1345
1204 */ 1346 */
1205 if (timeout < 0) 1347 if (timeout < 0) {
1206 {
1207 printk(KERN_ERR "schedule_timeout: wrong timeout " 1348 printk(KERN_ERR "schedule_timeout: wrong timeout "
1208 "value %lx from %p\n", timeout, 1349 "value %lx\n", timeout);
1209 __builtin_return_address(0)); 1350 dump_stack();
1210 current->state = TASK_RUNNING; 1351 current->state = TASK_RUNNING;
1211 goto out; 1352 goto out;
1212 } 1353 }
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 96f77013d3f0..baacc3691415 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
96 stats->write_char = p->wchar; 96 stats->write_char = p->wchar;
97 stats->read_syscalls = p->syscr; 97 stats->read_syscalls = p->syscr;
98 stats->write_syscalls = p->syscw; 98 stats->write_syscalls = p->syscw;
99#ifdef CONFIG_TASK_IO_ACCOUNTING
100 stats->read_bytes = p->ioac.read_bytes;
101 stats->write_bytes = p->ioac.write_bytes;
102 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
103#else
104 stats->read_bytes = 0;
105 stats->write_bytes = 0;
106 stats->cancelled_write_bytes = 0;
107#endif
99} 108}
100#undef KB 109#undef KB
101#undef MB 110#undef MB
diff --git a/kernel/unwind.c b/kernel/unwind.c
deleted file mode 100644
index ed0a21d4a902..000000000000
--- a/kernel/unwind.c
+++ /dev/null
@@ -1,1182 +0,0 @@
1/*
2 * Copyright (C) 2002-2006 Novell, Inc.
3 * Jan Beulich <jbeulich@novell.com>
4 * This code is released under version 2 of the GNU GPL.
5 *
6 * A simple API for unwinding kernel stacks. This is used for
7 * debugging and error reporting purposes. The kernel doesn't need
8 * full-blown stack unwinding with all the bells and whistles, so there
9 * is not much point in implementing the full Dwarf2 unwind API.
10 */
11
12#include <linux/unwind.h>
13#include <linux/module.h>
14#include <linux/bootmem.h>
15#include <linux/sort.h>
16#include <linux/stop_machine.h>
17#include <asm/sections.h>
18#include <asm/uaccess.h>
19#include <asm/unaligned.h>
20
21extern char __start_unwind[], __end_unwind[];
22extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
23
24#define MAX_STACK_DEPTH 8
25
26#define EXTRA_INFO(f) { \
27 BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
28 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
29 + offsetof(struct unwind_frame_info, f) \
30 / FIELD_SIZEOF(struct unwind_frame_info, f), \
31 FIELD_SIZEOF(struct unwind_frame_info, f) \
32 }
33#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
34
35static const struct {
36 unsigned offs:BITS_PER_LONG / 2;
37 unsigned width:BITS_PER_LONG / 2;
38} reg_info[] = {
39 UNW_REGISTER_INFO
40};
41
42#undef PTREGS_INFO
43#undef EXTRA_INFO
44
45#ifndef REG_INVALID
46#define REG_INVALID(r) (reg_info[r].width == 0)
47#endif
48
49#define DW_CFA_nop 0x00
50#define DW_CFA_set_loc 0x01
51#define DW_CFA_advance_loc1 0x02
52#define DW_CFA_advance_loc2 0x03
53#define DW_CFA_advance_loc4 0x04
54#define DW_CFA_offset_extended 0x05
55#define DW_CFA_restore_extended 0x06
56#define DW_CFA_undefined 0x07
57#define DW_CFA_same_value 0x08
58#define DW_CFA_register 0x09
59#define DW_CFA_remember_state 0x0a
60#define DW_CFA_restore_state 0x0b
61#define DW_CFA_def_cfa 0x0c
62#define DW_CFA_def_cfa_register 0x0d
63#define DW_CFA_def_cfa_offset 0x0e
64#define DW_CFA_def_cfa_expression 0x0f
65#define DW_CFA_expression 0x10
66#define DW_CFA_offset_extended_sf 0x11
67#define DW_CFA_def_cfa_sf 0x12
68#define DW_CFA_def_cfa_offset_sf 0x13
69#define DW_CFA_val_offset 0x14
70#define DW_CFA_val_offset_sf 0x15
71#define DW_CFA_val_expression 0x16
72#define DW_CFA_lo_user 0x1c
73#define DW_CFA_GNU_window_save 0x2d
74#define DW_CFA_GNU_args_size 0x2e
75#define DW_CFA_GNU_negative_offset_extended 0x2f
76#define DW_CFA_hi_user 0x3f
77
78#define DW_EH_PE_FORM 0x07
79#define DW_EH_PE_native 0x00
80#define DW_EH_PE_leb128 0x01
81#define DW_EH_PE_data2 0x02
82#define DW_EH_PE_data4 0x03
83#define DW_EH_PE_data8 0x04
84#define DW_EH_PE_signed 0x08
85#define DW_EH_PE_ADJUST 0x70
86#define DW_EH_PE_abs 0x00
87#define DW_EH_PE_pcrel 0x10
88#define DW_EH_PE_textrel 0x20
89#define DW_EH_PE_datarel 0x30
90#define DW_EH_PE_funcrel 0x40
91#define DW_EH_PE_aligned 0x50
92#define DW_EH_PE_indirect 0x80
93#define DW_EH_PE_omit 0xff
94
95typedef unsigned long uleb128_t;
96typedef signed long sleb128_t;
97
98static struct unwind_table {
99 struct {
100 unsigned long pc;
101 unsigned long range;
102 } core, init;
103 const void *address;
104 unsigned long size;
105 const unsigned char *header;
106 unsigned long hdrsz;
107 struct unwind_table *link;
108 const char *name;
109} root_table;
110
111struct unwind_item {
112 enum item_location {
113 Nowhere,
114 Memory,
115 Register,
116 Value
117 } where;
118 uleb128_t value;
119};
120
121struct unwind_state {
122 uleb128_t loc, org;
123 const u8 *cieStart, *cieEnd;
124 uleb128_t codeAlign;
125 sleb128_t dataAlign;
126 struct cfa {
127 uleb128_t reg, offs;
128 } cfa;
129 struct unwind_item regs[ARRAY_SIZE(reg_info)];
130 unsigned stackDepth:8;
131 unsigned version:8;
132 const u8 *label;
133 const u8 *stack[MAX_STACK_DEPTH];
134};
135
136static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
137
138static struct unwind_table *find_table(unsigned long pc)
139{
140 struct unwind_table *table;
141
142 for (table = &root_table; table; table = table->link)
143 if ((pc >= table->core.pc
144 && pc < table->core.pc + table->core.range)
145 || (pc >= table->init.pc
146 && pc < table->init.pc + table->init.range))
147 break;
148
149 return table;
150}
151
152static unsigned long read_pointer(const u8 **pLoc,
153 const void *end,
154 signed ptrType);
155
156static void init_unwind_table(struct unwind_table *table,
157 const char *name,
158 const void *core_start,
159 unsigned long core_size,
160 const void *init_start,
161 unsigned long init_size,
162 const void *table_start,
163 unsigned long table_size,
164 const u8 *header_start,
165 unsigned long header_size)
166{
167 const u8 *ptr = header_start + 4;
168 const u8 *end = header_start + header_size;
169
170 table->core.pc = (unsigned long)core_start;
171 table->core.range = core_size;
172 table->init.pc = (unsigned long)init_start;
173 table->init.range = init_size;
174 table->address = table_start;
175 table->size = table_size;
176 /* See if the linker provided table looks valid. */
177 if (header_size <= 4
178 || header_start[0] != 1
179 || (void *)read_pointer(&ptr, end, header_start[1]) != table_start
180 || header_start[2] == DW_EH_PE_omit
181 || read_pointer(&ptr, end, header_start[2]) <= 0
182 || header_start[3] == DW_EH_PE_omit)
183 header_start = NULL;
184 table->hdrsz = header_size;
185 smp_wmb();
186 table->header = header_start;
187 table->link = NULL;
188 table->name = name;
189}
190
191void __init unwind_init(void)
192{
193 init_unwind_table(&root_table, "kernel",
194 _text, _end - _text,
195 NULL, 0,
196 __start_unwind, __end_unwind - __start_unwind,
197 __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr);
198}
199
200static const u32 bad_cie, not_fde;
201static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *);
202static signed fde_pointer_type(const u32 *cie);
203
204struct eh_frame_hdr_table_entry {
205 unsigned long start, fde;
206};
207
208static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2)
209{
210 const struct eh_frame_hdr_table_entry *e1 = p1;
211 const struct eh_frame_hdr_table_entry *e2 = p2;
212
213 return (e1->start > e2->start) - (e1->start < e2->start);
214}
215
216static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size)
217{
218 struct eh_frame_hdr_table_entry *e1 = p1;
219 struct eh_frame_hdr_table_entry *e2 = p2;
220 unsigned long v;
221
222 v = e1->start;
223 e1->start = e2->start;
224 e2->start = v;
225 v = e1->fde;
226 e1->fde = e2->fde;
227 e2->fde = v;
228}
229
230static void __init setup_unwind_table(struct unwind_table *table,
231 void *(*alloc)(unsigned long))
232{
233 const u8 *ptr;
234 unsigned long tableSize = table->size, hdrSize;
235 unsigned n;
236 const u32 *fde;
237 struct {
238 u8 version;
239 u8 eh_frame_ptr_enc;
240 u8 fde_count_enc;
241 u8 table_enc;
242 unsigned long eh_frame_ptr;
243 unsigned int fde_count;
244 struct eh_frame_hdr_table_entry table[];
245 } __attribute__((__packed__)) *header;
246
247 if (table->header)
248 return;
249
250 if (table->hdrsz)
251 printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n",
252 table->name);
253
254 if (tableSize & (sizeof(*fde) - 1))
255 return;
256
257 for (fde = table->address, n = 0;
258 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
259 tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
260 const u32 *cie = cie_for_fde(fde, table);
261 signed ptrType;
262
263 if (cie == &not_fde)
264 continue;
265 if (cie == NULL
266 || cie == &bad_cie
267 || (ptrType = fde_pointer_type(cie)) < 0)
268 return;
269 ptr = (const u8 *)(fde + 2);
270 if (!read_pointer(&ptr,
271 (const u8 *)(fde + 1) + *fde,
272 ptrType))
273 return;
274 ++n;
275 }
276
277 if (tableSize || !n)
278 return;
279
280 hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
281 + 2 * n * sizeof(unsigned long);
282 header = alloc(hdrSize);
283 if (!header)
284 return;
285 header->version = 1;
286 header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native;
287 header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4;
288 header->table_enc = DW_EH_PE_abs|DW_EH_PE_native;
289 put_unaligned((unsigned long)table->address, &header->eh_frame_ptr);
290 BUILD_BUG_ON(offsetof(typeof(*header), fde_count)
291 % __alignof(typeof(header->fde_count)));
292 header->fde_count = n;
293
294 BUILD_BUG_ON(offsetof(typeof(*header), table)
295 % __alignof(typeof(*header->table)));
296 for (fde = table->address, tableSize = table->size, n = 0;
297 tableSize;
298 tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
299 const u32 *cie = fde + 1 - fde[1] / sizeof(*fde);
300
301 if (!fde[1])
302 continue; /* this is a CIE */
303 ptr = (const u8 *)(fde + 2);
304 header->table[n].start = read_pointer(&ptr,
305 (const u8 *)(fde + 1) + *fde,
306 fde_pointer_type(cie));
307 header->table[n].fde = (unsigned long)fde;
308 ++n;
309 }
310 WARN_ON(n != header->fde_count);
311
312 sort(header->table,
313 n,
314 sizeof(*header->table),
315 cmp_eh_frame_hdr_table_entries,
316 swap_eh_frame_hdr_table_entries);
317
318 table->hdrsz = hdrSize;
319 smp_wmb();
320 table->header = (const void *)header;
321}
322
323static void *__init balloc(unsigned long sz)
324{
325 return __alloc_bootmem_nopanic(sz,
326 sizeof(unsigned int),
327 __pa(MAX_DMA_ADDRESS));
328}
329
330void __init unwind_setup(void)
331{
332 setup_unwind_table(&root_table, balloc);
333}
334
335#ifdef CONFIG_MODULES
336
337static struct unwind_table *last_table;
338
339/* Must be called with module_mutex held. */
340void *unwind_add_table(struct module *module,
341 const void *table_start,
342 unsigned long table_size)
343{
344 struct unwind_table *table;
345
346 if (table_size <= 0)
347 return NULL;
348
349 table = kmalloc(sizeof(*table), GFP_KERNEL);
350 if (!table)
351 return NULL;
352
353 init_unwind_table(table, module->name,
354 module->module_core, module->core_size,
355 module->module_init, module->init_size,
356 table_start, table_size,
357 NULL, 0);
358
359 if (last_table)
360 last_table->link = table;
361 else
362 root_table.link = table;
363 last_table = table;
364
365 return table;
366}
367
368struct unlink_table_info
369{
370 struct unwind_table *table;
371 int init_only;
372};
373
374static int unlink_table(void *arg)
375{
376 struct unlink_table_info *info = arg;
377 struct unwind_table *table = info->table, *prev;
378
379 for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
380 ;
381
382 if (prev->link) {
383 if (info->init_only) {
384 table->init.pc = 0;
385 table->init.range = 0;
386 info->table = NULL;
387 } else {
388 prev->link = table->link;
389 if (!prev->link)
390 last_table = prev;
391 }
392 } else
393 info->table = NULL;
394
395 return 0;
396}
397
398/* Must be called with module_mutex held. */
399void unwind_remove_table(void *handle, int init_only)
400{
401 struct unwind_table *table = handle;
402 struct unlink_table_info info;
403
404 if (!table || table == &root_table)
405 return;
406
407 if (init_only && table == last_table) {
408 table->init.pc = 0;
409 table->init.range = 0;
410 return;
411 }
412
413 info.table = table;
414 info.init_only = init_only;
415 stop_machine_run(unlink_table, &info, NR_CPUS);
416
417 if (info.table)
418 kfree(table);
419}
420
421#endif /* CONFIG_MODULES */
422
423static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
424{
425 const u8 *cur = *pcur;
426 uleb128_t value;
427 unsigned shift;
428
429 for (shift = 0, value = 0; cur < end; shift += 7) {
430 if (shift + 7 > 8 * sizeof(value)
431 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
432 cur = end + 1;
433 break;
434 }
435 value |= (uleb128_t)(*cur & 0x7f) << shift;
436 if (!(*cur++ & 0x80))
437 break;
438 }
439 *pcur = cur;
440
441 return value;
442}
443
444static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
445{
446 const u8 *cur = *pcur;
447 sleb128_t value;
448 unsigned shift;
449
450 for (shift = 0, value = 0; cur < end; shift += 7) {
451 if (shift + 7 > 8 * sizeof(value)
452 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
453 cur = end + 1;
454 break;
455 }
456 value |= (sleb128_t)(*cur & 0x7f) << shift;
457 if (!(*cur & 0x80)) {
458 value |= -(*cur++ & 0x40) << shift;
459 break;
460 }
461 }
462 *pcur = cur;
463
464 return value;
465}
466
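For reference, get_uleb128() and get_sleb128() are plain LEB128 decoders: seven payload bits per byte, least-significant group first, bit 7 as the continuation flag, with the cursor stepped past end to signal overflow or truncation. A minimal illustrative sketch (userspace, not part of the kernel sources) of the unsigned decode, using the classic example bytes 0xe5 0x8e 0x26:

    /* Illustrative only (userspace): decode the ULEB128 bytes 0xe5 0x8e 0x26. */
    #include <stdio.h>

    int main(void)
    {
            const unsigned char buf[] = { 0xe5, 0x8e, 0x26 };
            unsigned long value = 0;
            unsigned int shift = 0;
            unsigned int i;

            for (i = 0; i < sizeof(buf); i++) {
                    value |= (unsigned long)(buf[i] & 0x7f) << shift;
                    if (!(buf[i] & 0x80))       /* bit 7 clear: last byte */
                            break;
                    shift += 7;
            }
            printf("%lu\n", value);             /* prints 624485 */
            return 0;
    }

That is 0x65 + (0x0e << 7) + (0x26 << 14) = 624485; the in-kernel helpers do the same arithmetic but additionally clamp against the buffer end.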
467static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
468{
469 const u32 *cie;
470
471 if (!*fde || (*fde & (sizeof(*fde) - 1)))
472 return &bad_cie;
473 if (!fde[1])
474 return &not_fde; /* this is a CIE */
475 if ((fde[1] & (sizeof(*fde) - 1))
476 || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address)
477 return NULL; /* this is not a valid FDE */
478 cie = fde + 1 - fde[1] / sizeof(*fde);
479 if (*cie <= sizeof(*cie) + 4
480 || *cie >= fde[1] - sizeof(*fde)
481 || (*cie & (sizeof(*cie) - 1))
482 || cie[1])
483 return NULL; /* this is not a (valid) CIE */
484 return cie;
485}
486
487static unsigned long read_pointer(const u8 **pLoc,
488 const void *end,
489 signed ptrType)
490{
491 unsigned long value = 0;
492 union {
493 const u8 *p8;
494 const u16 *p16u;
495 const s16 *p16s;
496 const u32 *p32u;
497 const s32 *p32s;
498 const unsigned long *pul;
499 } ptr;
500
501 if (ptrType < 0 || ptrType == DW_EH_PE_omit)
502 return 0;
503 ptr.p8 = *pLoc;
504 switch(ptrType & DW_EH_PE_FORM) {
505 case DW_EH_PE_data2:
506 if (end < (const void *)(ptr.p16u + 1))
507 return 0;
508 if(ptrType & DW_EH_PE_signed)
509 value = get_unaligned(ptr.p16s++);
510 else
511 value = get_unaligned(ptr.p16u++);
512 break;
513 case DW_EH_PE_data4:
514#ifdef CONFIG_64BIT
515 if (end < (const void *)(ptr.p32u + 1))
516 return 0;
517 if(ptrType & DW_EH_PE_signed)
518 value = get_unaligned(ptr.p32s++);
519 else
520 value = get_unaligned(ptr.p32u++);
521 break;
522 case DW_EH_PE_data8:
523 BUILD_BUG_ON(sizeof(u64) != sizeof(value));
524#else
525 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
526#endif
527 case DW_EH_PE_native:
528 if (end < (const void *)(ptr.pul + 1))
529 return 0;
530 value = get_unaligned(ptr.pul++);
531 break;
532 case DW_EH_PE_leb128:
533 BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
534 value = ptrType & DW_EH_PE_signed
535 ? get_sleb128(&ptr.p8, end)
536 : get_uleb128(&ptr.p8, end);
537 if ((const void *)ptr.p8 > end)
538 return 0;
539 break;
540 default:
541 return 0;
542 }
543 switch(ptrType & DW_EH_PE_ADJUST) {
544 case DW_EH_PE_abs:
545 break;
546 case DW_EH_PE_pcrel:
547 value += (unsigned long)*pLoc;
548 break;
549 default:
550 return 0;
551 }
552 if ((ptrType & DW_EH_PE_indirect)
553 && __get_user(value, (unsigned long *)value))
554 return 0;
555 *pLoc = ptr.p8;
556
557 return value;
558}
559
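A note on the encodings read_pointer() accepts: the low bits of the encoding byte select the storage form (plus an optional sign bit), the high nibble the base adjustment. For instance 0x1b, a pcrel signed 32-bit form commonly emitted for .eh_frame data, splits as sketched below (userspace, illustrative only):

    /* Illustrative only (userspace): split encoding byte 0x1b as read_pointer() does. */
    #include <stdio.h>

    #define DW_EH_PE_FORM   0x07
    #define DW_EH_PE_signed 0x08
    #define DW_EH_PE_ADJUST 0x70

    int main(void)
    {
            unsigned int enc = 0x1b;    /* pcrel | data4 | signed */

            printf("form   %#x\n", enc & DW_EH_PE_FORM);   /* 0x03: 32-bit field      */
            printf("signed %#x\n", enc & DW_EH_PE_signed); /* 0x08: sign-extend it    */
            printf("adjust %#x\n", enc & DW_EH_PE_ADJUST); /* 0x10: add *pLoc (pcrel) */
            return 0;
    }

So a stored value of -0x40 at address p resolves to p - 0x40, and DW_EH_PE_indirect additionally dereferences the result, which is why read_pointer() ends with the __get_user() step.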
560static signed fde_pointer_type(const u32 *cie)
561{
562 const u8 *ptr = (const u8 *)(cie + 2);
563 unsigned version = *ptr;
564
565 if (version != 1)
566 return -1; /* unsupported */
567 if (*++ptr) {
568 const char *aug;
569 const u8 *end = (const u8 *)(cie + 1) + *cie;
570 uleb128_t len;
571
572 /* check if augmentation size is first (and thus present) */
573 if (*ptr != 'z')
574 return -1;
575 /* check if augmentation string is nul-terminated */
576 if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
577 return -1;
578 ++ptr; /* skip terminator */
579 get_uleb128(&ptr, end); /* skip code alignment */
580 get_sleb128(&ptr, end); /* skip data alignment */
581 /* skip return address column */
582 version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
583 len = get_uleb128(&ptr, end); /* augmentation length */
584 if (ptr + len < ptr || ptr + len > end)
585 return -1;
586 end = ptr + len;
587 while (*++aug) {
588 if (ptr >= end)
589 return -1;
590 switch(*aug) {
591 case 'L':
592 ++ptr;
593 break;
594 case 'P': {
595 signed ptrType = *ptr++;
596
597 if (!read_pointer(&ptr, end, ptrType) || ptr > end)
598 return -1;
599 }
600 break;
601 case 'R':
602 return *ptr;
603 default:
604 return -1;
605 }
606 }
607 }
608 return DW_EH_PE_native|DW_EH_PE_abs;
609}
610
611static int advance_loc(unsigned long delta, struct unwind_state *state)
612{
613 state->loc += delta * state->codeAlign;
614
615 return delta > 0;
616}
617
618static void set_rule(uleb128_t reg,
619 enum item_location where,
620 uleb128_t value,
621 struct unwind_state *state)
622{
623 if (reg < ARRAY_SIZE(state->regs)) {
624 state->regs[reg].where = where;
625 state->regs[reg].value = value;
626 }
627}
628
629static int processCFI(const u8 *start,
630 const u8 *end,
631 unsigned long targetLoc,
632 signed ptrType,
633 struct unwind_state *state)
634{
635 union {
636 const u8 *p8;
637 const u16 *p16;
638 const u32 *p32;
639 } ptr;
640 int result = 1;
641
642 if (start != state->cieStart) {
643 state->loc = state->org;
644 result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
645 if (targetLoc == 0 && state->label == NULL)
646 return result;
647 }
648 for (ptr.p8 = start; result && ptr.p8 < end; ) {
649 switch(*ptr.p8 >> 6) {
650 uleb128_t value;
651
652 case 0:
653 switch(*ptr.p8++) {
654 case DW_CFA_nop:
655 break;
656 case DW_CFA_set_loc:
657 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
658 result = 0;
659 break;
660 case DW_CFA_advance_loc1:
661 result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
662 break;
663 case DW_CFA_advance_loc2:
664 result = ptr.p8 <= end + 2
665 && advance_loc(*ptr.p16++, state);
666 break;
667 case DW_CFA_advance_loc4:
668 result = ptr.p8 <= end + 4
669 && advance_loc(*ptr.p32++, state);
670 break;
671 case DW_CFA_offset_extended:
672 value = get_uleb128(&ptr.p8, end);
673 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
674 break;
675 case DW_CFA_val_offset:
676 value = get_uleb128(&ptr.p8, end);
677 set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
678 break;
679 case DW_CFA_offset_extended_sf:
680 value = get_uleb128(&ptr.p8, end);
681 set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
682 break;
683 case DW_CFA_val_offset_sf:
684 value = get_uleb128(&ptr.p8, end);
685 set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
686 break;
687 case DW_CFA_restore_extended:
688 case DW_CFA_undefined:
689 case DW_CFA_same_value:
690 set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
691 break;
692 case DW_CFA_register:
693 value = get_uleb128(&ptr.p8, end);
694 set_rule(value,
695 Register,
696 get_uleb128(&ptr.p8, end), state);
697 break;
698 case DW_CFA_remember_state:
699 if (ptr.p8 == state->label) {
700 state->label = NULL;
701 return 1;
702 }
703 if (state->stackDepth >= MAX_STACK_DEPTH)
704 return 0;
705 state->stack[state->stackDepth++] = ptr.p8;
706 break;
707 case DW_CFA_restore_state:
708 if (state->stackDepth) {
709 const uleb128_t loc = state->loc;
710 const u8 *label = state->label;
711
712 state->label = state->stack[state->stackDepth - 1];
713 memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
714 memset(state->regs, 0, sizeof(state->regs));
715 state->stackDepth = 0;
716 result = processCFI(start, end, 0, ptrType, state);
717 state->loc = loc;
718 state->label = label;
719 } else
720 return 0;
721 break;
722 case DW_CFA_def_cfa:
723 state->cfa.reg = get_uleb128(&ptr.p8, end);
724 /*nobreak*/
725 case DW_CFA_def_cfa_offset:
726 state->cfa.offs = get_uleb128(&ptr.p8, end);
727 break;
728 case DW_CFA_def_cfa_sf:
729 state->cfa.reg = get_uleb128(&ptr.p8, end);
730 /*nobreak*/
731 case DW_CFA_def_cfa_offset_sf:
732 state->cfa.offs = get_sleb128(&ptr.p8, end)
733 * state->dataAlign;
734 break;
735 case DW_CFA_def_cfa_register:
736 state->cfa.reg = get_uleb128(&ptr.p8, end);
737 break;
738 /*todo case DW_CFA_def_cfa_expression: */
739 /*todo case DW_CFA_expression: */
740 /*todo case DW_CFA_val_expression: */
741 case DW_CFA_GNU_args_size:
742 get_uleb128(&ptr.p8, end);
743 break;
744 case DW_CFA_GNU_negative_offset_extended:
745 value = get_uleb128(&ptr.p8, end);
746 set_rule(value,
747 Memory,
748 (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
749 break;
750 case DW_CFA_GNU_window_save:
751 default:
752 result = 0;
753 break;
754 }
755 break;
756 case 1:
757 result = advance_loc(*ptr.p8++ & 0x3f, state);
758 break;
759 case 2:
760 value = *ptr.p8++ & 0x3f;
761 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
762 break;
763 case 3:
764 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
765 break;
766 }
767 if (ptr.p8 > end)
768 result = 0;
769 if (result && targetLoc != 0 && targetLoc < state->loc)
770 return 1;
771 }
772
773 return result
774 && ptr.p8 == end
775 && (targetLoc == 0
776 || (/*todo While in theory this should apply, gcc in practice omits
777 everything past the function prolog, and hence the location
778 never reaches the end of the function.
779 targetLoc < state->loc &&*/ state->label == NULL));
780}
781
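processCFI() above is a straight interpreter for DWARF call-frame instructions: the top two bits of each opcode select the compact forms (advance_loc, offset, restore), everything else goes through the extended opcodes defined at the top of the file. A hand-written, purely illustrative byte sequence of the kind it consumes (not taken from any real binary):

    /* Illustrative only: a hand-written CFI byte stream and its meaning. */
    static const unsigned char example_cfi[] = {
            0x0c, 0x07, 0x08,  /* DW_CFA_def_cfa: CFA = reg 7 + 8                        */
            0x90, 0x02,        /* 0x80|0x10 = DW_CFA_offset: reg 16 at CFA + 2*dataAlign */
            0x44,              /* 0x40|0x04 = DW_CFA_advance_loc: loc += 4*codeAlign     */
            0x0e, 0x10,        /* DW_CFA_def_cfa_offset: CFA offset becomes 16           */
            0x00,              /* DW_CFA_nop (padding to the FDE length)                 */
    };

processCFI() applies such rules only up to targetLoc; unwind() below then reads the saved registers back relative to the computed CFA.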
782/* Unwind to previous frame. Returns 0 if successful, negative
783 * number in case of an error. */
784int unwind(struct unwind_frame_info *frame)
785{
786#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
787 const u32 *fde = NULL, *cie = NULL;
788 const u8 *ptr = NULL, *end = NULL;
789 unsigned long pc = UNW_PC(frame) - frame->call_frame;
790 unsigned long startLoc = 0, endLoc = 0, cfa;
791 unsigned i;
792 signed ptrType = -1;
793 uleb128_t retAddrReg = 0;
794 const struct unwind_table *table;
795 struct unwind_state state;
796
797 if (UNW_PC(frame) == 0)
798 return -EINVAL;
799 if ((table = find_table(pc)) != NULL
800 && !(table->size & (sizeof(*fde) - 1))) {
801 const u8 *hdr = table->header;
802 unsigned long tableSize;
803
804 smp_rmb();
805 if (hdr && hdr[0] == 1) {
806 switch(hdr[3] & DW_EH_PE_FORM) {
807 case DW_EH_PE_native: tableSize = sizeof(unsigned long); break;
808 case DW_EH_PE_data2: tableSize = 2; break;
809 case DW_EH_PE_data4: tableSize = 4; break;
810 case DW_EH_PE_data8: tableSize = 8; break;
811 default: tableSize = 0; break;
812 }
813 ptr = hdr + 4;
814 end = hdr + table->hdrsz;
815 if (tableSize
816 && read_pointer(&ptr, end, hdr[1])
817 == (unsigned long)table->address
818 && (i = read_pointer(&ptr, end, hdr[2])) > 0
819 && i == (end - ptr) / (2 * tableSize)
820 && !((end - ptr) % (2 * tableSize))) {
821 do {
822 const u8 *cur = ptr + (i / 2) * (2 * tableSize);
823
824 startLoc = read_pointer(&cur,
825 cur + tableSize,
826 hdr[3]);
827 if (pc < startLoc)
828 i /= 2;
829 else {
830 ptr = cur - tableSize;
831 i = (i + 1) / 2;
832 }
833 } while (startLoc && i > 1);
834 if (i == 1
835 && (startLoc = read_pointer(&ptr,
836 ptr + tableSize,
837 hdr[3])) != 0
838 && pc >= startLoc)
839 fde = (void *)read_pointer(&ptr,
840 ptr + tableSize,
841 hdr[3]);
842 }
843 }
844
845 if (fde != NULL) {
846 cie = cie_for_fde(fde, table);
847 ptr = (const u8 *)(fde + 2);
848 if(cie != NULL
849 && cie != &bad_cie
850 && cie != &not_fde
851 && (ptrType = fde_pointer_type(cie)) >= 0
852 && read_pointer(&ptr,
853 (const u8 *)(fde + 1) + *fde,
854 ptrType) == startLoc) {
855 if (!(ptrType & DW_EH_PE_indirect))
856 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
857 endLoc = startLoc
858 + read_pointer(&ptr,
859 (const u8 *)(fde + 1) + *fde,
860 ptrType);
861 if(pc >= endLoc)
862 fde = NULL;
863 } else
864 fde = NULL;
865 }
866 if (fde == NULL) {
867 for (fde = table->address, tableSize = table->size;
868 cie = NULL, tableSize > sizeof(*fde)
869 && tableSize - sizeof(*fde) >= *fde;
870 tableSize -= sizeof(*fde) + *fde,
871 fde += 1 + *fde / sizeof(*fde)) {
872 cie = cie_for_fde(fde, table);
873 if (cie == &bad_cie) {
874 cie = NULL;
875 break;
876 }
877 if (cie == NULL
878 || cie == &not_fde
879 || (ptrType = fde_pointer_type(cie)) < 0)
880 continue;
881 ptr = (const u8 *)(fde + 2);
882 startLoc = read_pointer(&ptr,
883 (const u8 *)(fde + 1) + *fde,
884 ptrType);
885 if (!startLoc)
886 continue;
887 if (!(ptrType & DW_EH_PE_indirect))
888 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
889 endLoc = startLoc
890 + read_pointer(&ptr,
891 (const u8 *)(fde + 1) + *fde,
892 ptrType);
893 if (pc >= startLoc && pc < endLoc)
894 break;
895 }
896 }
897 }
898 if (cie != NULL) {
899 memset(&state, 0, sizeof(state));
900 state.cieEnd = ptr; /* keep here temporarily */
901 ptr = (const u8 *)(cie + 2);
902 end = (const u8 *)(cie + 1) + *cie;
903 frame->call_frame = 1;
904 if ((state.version = *ptr) != 1)
905 cie = NULL; /* unsupported version */
906 else if (*++ptr) {
907 /* check if augmentation size is first (and thus present) */
908 if (*ptr == 'z') {
909 while (++ptr < end && *ptr) {
910 switch(*ptr) {
911 /* check for ignorable (or already handled)
912 * nul-terminated augmentation string */
913 case 'L':
914 case 'P':
915 case 'R':
916 continue;
917 case 'S':
918 frame->call_frame = 0;
919 continue;
920 default:
921 break;
922 }
923 break;
924 }
925 }
926 if (ptr >= end || *ptr)
927 cie = NULL;
928 }
929 ++ptr;
930 }
931 if (cie != NULL) {
932 /* get code alignment factor */
933 state.codeAlign = get_uleb128(&ptr, end);
934 /* get data alignment factor */
935 state.dataAlign = get_sleb128(&ptr, end);
936 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
937 cie = NULL;
938 else {
939 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
940 /* skip augmentation */
941 if (((const char *)(cie + 2))[1] == 'z') {
942 uleb128_t augSize = get_uleb128(&ptr, end);
943
944 ptr += augSize;
945 }
946 if (ptr > end
947 || retAddrReg >= ARRAY_SIZE(reg_info)
948 || REG_INVALID(retAddrReg)
949 || reg_info[retAddrReg].width != sizeof(unsigned long))
950 cie = NULL;
951 }
952 }
953 if (cie != NULL) {
954 state.cieStart = ptr;
955 ptr = state.cieEnd;
956 state.cieEnd = end;
957 end = (const u8 *)(fde + 1) + *fde;
958 /* skip augmentation */
959 if (((const char *)(cie + 2))[1] == 'z') {
960 uleb128_t augSize = get_uleb128(&ptr, end);
961
962 if ((ptr += augSize) > end)
963 fde = NULL;
964 }
965 }
966 if (cie == NULL || fde == NULL) {
967#ifdef CONFIG_FRAME_POINTER
968 unsigned long top, bottom;
969
970 top = STACK_TOP(frame->task);
971 bottom = STACK_BOTTOM(frame->task);
972# if FRAME_RETADDR_OFFSET < 0
973 if (UNW_SP(frame) < top
974 && UNW_FP(frame) <= UNW_SP(frame)
975 && bottom < UNW_FP(frame)
976# else
977 if (UNW_SP(frame) > top
978 && UNW_FP(frame) >= UNW_SP(frame)
979 && bottom > UNW_FP(frame)
980# endif
981 && !((UNW_SP(frame) | UNW_FP(frame))
982 & (sizeof(unsigned long) - 1))) {
983 unsigned long link;
984
985 if (!__get_user(link,
986 (unsigned long *)(UNW_FP(frame)
987 + FRAME_LINK_OFFSET))
988# if FRAME_RETADDR_OFFSET < 0
989 && link > bottom && link < UNW_FP(frame)
990# else
991 && link > UNW_FP(frame) && link < bottom
992# endif
993 && !(link & (sizeof(link) - 1))
994 && !__get_user(UNW_PC(frame),
995 (unsigned long *)(UNW_FP(frame)
996 + FRAME_RETADDR_OFFSET))) {
997 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
998# if FRAME_RETADDR_OFFSET < 0
999 -
1000# else
1001 +
1002# endif
1003 sizeof(UNW_PC(frame));
1004 UNW_FP(frame) = link;
1005 return 0;
1006 }
1007 }
1008#endif
1009 return -ENXIO;
1010 }
1011 state.org = startLoc;
1012 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
1013 /* process instructions */
1014 if (!processCFI(ptr, end, pc, ptrType, &state)
1015 || state.loc > endLoc
1016 || state.regs[retAddrReg].where == Nowhere
1017 || state.cfa.reg >= ARRAY_SIZE(reg_info)
1018 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
1019 || state.cfa.offs % sizeof(unsigned long))
1020 return -EIO;
1021 /* update frame */
1022#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
1023 if(frame->call_frame
1024 && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
1025 frame->call_frame = 0;
1026#endif
1027 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
1028 startLoc = min((unsigned long)UNW_SP(frame), cfa);
1029 endLoc = max((unsigned long)UNW_SP(frame), cfa);
1030 if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
1031 startLoc = min(STACK_LIMIT(cfa), cfa);
1032 endLoc = max(STACK_LIMIT(cfa), cfa);
1033 }
1034#ifndef CONFIG_64BIT
1035# define CASES CASE(8); CASE(16); CASE(32)
1036#else
1037# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
1038#endif
1039 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
1040 if (REG_INVALID(i)) {
1041 if (state.regs[i].where == Nowhere)
1042 continue;
1043 return -EIO;
1044 }
1045 switch(state.regs[i].where) {
1046 default:
1047 break;
1048 case Register:
1049 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
1050 || REG_INVALID(state.regs[i].value)
1051 || reg_info[i].width > reg_info[state.regs[i].value].width)
1052 return -EIO;
1053 switch(reg_info[state.regs[i].value].width) {
1054#define CASE(n) \
1055 case sizeof(u##n): \
1056 state.regs[i].value = FRAME_REG(state.regs[i].value, \
1057 const u##n); \
1058 break
1059 CASES;
1060#undef CASE
1061 default:
1062 return -EIO;
1063 }
1064 break;
1065 }
1066 }
1067 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
1068 if (REG_INVALID(i))
1069 continue;
1070 switch(state.regs[i].where) {
1071 case Nowhere:
1072 if (reg_info[i].width != sizeof(UNW_SP(frame))
1073 || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
1074 != &UNW_SP(frame))
1075 continue;
1076 UNW_SP(frame) = cfa;
1077 break;
1078 case Register:
1079 switch(reg_info[i].width) {
1080#define CASE(n) case sizeof(u##n): \
1081 FRAME_REG(i, u##n) = state.regs[i].value; \
1082 break
1083 CASES;
1084#undef CASE
1085 default:
1086 return -EIO;
1087 }
1088 break;
1089 case Value:
1090 if (reg_info[i].width != sizeof(unsigned long))
1091 return -EIO;
1092 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
1093 * state.dataAlign;
1094 break;
1095 case Memory: {
1096 unsigned long addr = cfa + state.regs[i].value
1097 * state.dataAlign;
1098
1099 if ((state.regs[i].value * state.dataAlign)
1100 % sizeof(unsigned long)
1101 || addr < startLoc
1102 || addr + sizeof(unsigned long) < addr
1103 || addr + sizeof(unsigned long) > endLoc)
1104 return -EIO;
1105 switch(reg_info[i].width) {
1106#define CASE(n) case sizeof(u##n): \
1107 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
1108 break
1109 CASES;
1110#undef CASE
1111 default:
1112 return -EIO;
1113 }
1114 }
1115 break;
1116 }
1117 }
1118
1119 return 0;
1120#undef CASES
1121#undef FRAME_REG
1122}
1123EXPORT_SYMBOL(unwind);
1124
1125int unwind_init_frame_info(struct unwind_frame_info *info,
1126 struct task_struct *tsk,
1127 /*const*/ struct pt_regs *regs)
1128{
1129 info->task = tsk;
1130 info->call_frame = 0;
1131 arch_unw_init_frame_info(info, regs);
1132
1133 return 0;
1134}
1135EXPORT_SYMBOL(unwind_init_frame_info);
1136
1137/*
1138 * Prepare to unwind a blocked task.
1139 */
1140int unwind_init_blocked(struct unwind_frame_info *info,
1141 struct task_struct *tsk)
1142{
1143 info->task = tsk;
1144 info->call_frame = 0;
1145 arch_unw_init_blocked(info);
1146
1147 return 0;
1148}
1149EXPORT_SYMBOL(unwind_init_blocked);
1150
1151/*
1152 * Prepare to unwind the currently running thread.
1153 */
1154int unwind_init_running(struct unwind_frame_info *info,
1155 asmlinkage int (*callback)(struct unwind_frame_info *,
1156 void *arg),
1157 void *arg)
1158{
1159 info->task = current;
1160 info->call_frame = 0;
1161
1162 return arch_unwind_init_running(info, callback, arg);
1163}
1164EXPORT_SYMBOL(unwind_init_running);
1165
1166/*
1167 * Unwind until the return pointer is in user-land (or until an error
1168 * occurs). Returns 0 if successful, negative number in case of
1169 * error.
1170 */
1171int unwind_to_user(struct unwind_frame_info *info)
1172{
1173 while (!arch_unw_user_mode(info)) {
1174 int err = unwind(info);
1175
1176 if (err < 0)
1177 return err;
1178 }
1179
1180 return 0;
1181}
1182EXPORT_SYMBOL(unwind_to_user);
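The exported functions above are the whole public surface of the unwinder: initialise a frame with unwind_init_frame_info(), unwind_init_blocked() or unwind_init_running(), then call unwind() (or unwind_to_user()) repeatedly while reading the architecture accessors from asm/unwind.h. A hedged sketch of a caller; dump_blocked_task() and the depth limit are made up for illustration:

    /* Illustrative only: dump_blocked_task() is a made-up caller. */
    static void dump_blocked_task(struct task_struct *tsk)
    {
            struct unwind_frame_info info;
            int depth = 0;

            if (unwind_init_blocked(&info, tsk) < 0)
                    return;
            while (depth++ < 32) {              /* arbitrary safety limit */
                    printk(KERN_DEBUG "  pc=%lx sp=%lx\n",
                           UNW_PC(&info), UNW_SP(&info));
                    if (unwind(&info) < 0)      /* -ENXIO/-EIO: give up */
                            break;
            }
    }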
diff --git a/kernel/user.c b/kernel/user.c
index 220e586127a0..4869563080e9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
28 28
29static kmem_cache_t *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ]; 30static struct list_head uidhash_table[UIDHASH_SZ];
31 31
32/* 32/*
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
132 if (!up) { 132 if (!up) {
133 struct user_struct *new; 133 struct user_struct *new;
134 134
135 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); 135 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
136 if (!new) 136 if (!new)
137 return NULL; 137 return NULL;
138 new->uid = uid; 138 new->uid = uid;
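The user.c hunks track the tree-wide slab API cleanup: kmem_cache_t becomes struct kmem_cache, and SLAB_KERNEL gives way to GFP_KERNEL at allocation time. A hedged sketch of the resulting idiom with hypothetical names (my_cache, struct my_obj), using the six-argument kmem_cache_create() of this kernel generation:

    /* Illustrative only: my_cache, struct my_obj and the helpers are made up. */
    struct my_obj {
            int id;
    };

    static struct kmem_cache *my_cachep;        /* was: kmem_cache_t */

    static int __init my_cache_init(void)
    {
            my_cachep = kmem_cache_create("my_cache", sizeof(struct my_obj), 0,
                                          SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                          NULL, NULL);
            return 0;
    }

    static struct my_obj *my_obj_alloc(void)
    {
            return kmem_cache_alloc(my_cachep, GFP_KERNEL); /* not SLAB_KERNEL */
    }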
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 17c2f03d2c27..a3da07c5af28 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,9 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h> 30#include <linux/hardirq.h>
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/freezer.h>
33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h>
32 35
33/* 36/*
34 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct {
55 struct task_struct *thread; 58 struct task_struct *thread;
56 59
57 int run_depth; /* Detect run_workqueue() recursion depth */ 60 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
58} ____cacheline_aligned; 63} ____cacheline_aligned;
59 64
60/* 65/*
@@ -80,6 +85,99 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
80 return list_empty(&wq->list); 85 return list_empty(&wq->list);
81} 86}
82 87
88/*
89 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set
91 */
92static inline void set_wq_data(struct work_struct *work, void *wq)
93{
94 unsigned long new;
95
96 BUG_ON(!work_pending(work));
97
98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
99 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
100 atomic_long_set(&work->data, new);
101}
102
103static inline void *get_wq_data(struct work_struct *work)
104{
105 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
106}
107
108static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
109{
110 int ret = 0;
111 unsigned long flags;
112
113 spin_lock_irqsave(&cwq->lock, flags);
114 /*
115 * We need to re-validate the work info after we've gotten
116 * the cpu_workqueue lock. We can run the work now iff:
117 *
118 * - the wq_data still matches the cpu_workqueue_struct
119 * - AND the work is still marked pending
120 * - AND the work is still on a list (which will be this
121 * workqueue_struct list)
122 *
123 * All these conditions are important, because we
124 * need to protect against the work being run right
125 * now on another CPU (all but the last one might be
126 * true if it's currently running and has not been
127 * released yet, for example).
128 */
129 if (get_wq_data(work) == cwq
130 && work_pending(work)
131 && !list_empty(&work->entry)) {
132 work_func_t f = work->func;
133 list_del_init(&work->entry);
134 spin_unlock_irqrestore(&cwq->lock, flags);
135
136 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
137 work_release(work);
138 f(work);
139
140 spin_lock_irqsave(&cwq->lock, flags);
141 cwq->remove_sequence++;
142 wake_up(&cwq->work_done);
143 ret = 1;
144 }
145 spin_unlock_irqrestore(&cwq->lock, flags);
146 return ret;
147}
148
149/**
150 * run_scheduled_work - run scheduled work synchronously
151 * @work: work to run
152 *
153 * This checks if the work was pending, and runs it
154 * synchronously if so. It returns a boolean to indicate
155 * whether it had any scheduled work to run or not.
156 *
157 * NOTE! This _only_ works for normal work_structs. You
158 * CANNOT use this for delayed work, because the wq data
159 * for delayed work will not point properly to the per-
160 * CPU workqueue struct, but will change!
161 */
162int fastcall run_scheduled_work(struct work_struct *work)
163{
164 for (;;) {
165 struct cpu_workqueue_struct *cwq;
166
167 if (!work_pending(work))
168 return 0;
169 if (list_empty(&work->entry))
170 return 0;
171 /* NOTE! This depends intimately on __queue_work! */
172 cwq = get_wq_data(work);
173 if (!cwq)
174 return 0;
175 if (__run_work(cwq, work))
176 return 1;
177 }
178}
179EXPORT_SYMBOL(run_scheduled_work);
180
83/* Preempt must be disabled. */ 181/* Preempt must be disabled. */
84static void __queue_work(struct cpu_workqueue_struct *cwq, 182static void __queue_work(struct cpu_workqueue_struct *cwq,
85 struct work_struct *work) 183 struct work_struct *work)
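The new run_scheduled_work() above lets a caller execute an already-queued, non-delayed work item synchronously instead of waiting for keventd, returning 0 when the item is no longer pending or queued. A hedged usage sketch; my_work and my_handler are hypothetical:

    /* Illustrative only: my_work and my_handler are hypothetical. */
    static void my_handler(struct work_struct *work)
    {
            /* runs either in keventd or in the run_scheduled_work() caller */
    }

    static DECLARE_WORK(my_work, my_handler);

    static void kick_and_maybe_run(void)
    {
            schedule_work(&my_work);
            if (run_scheduled_work(&my_work))
                    pr_debug("ran my_work synchronously\n");
    }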
@@ -87,7 +185,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
87 unsigned long flags; 185 unsigned long flags;
88 186
89 spin_lock_irqsave(&cwq->lock, flags); 187 spin_lock_irqsave(&cwq->lock, flags);
90 work->wq_data = cwq; 188 set_wq_data(work, cwq);
91 list_add_tail(&work->entry, &cwq->worklist); 189 list_add_tail(&work->entry, &cwq->worklist);
92 cwq->insert_sequence++; 190 cwq->insert_sequence++;
93 wake_up(&cwq->more_work); 191 wake_up(&cwq->more_work);
@@ -108,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
108{ 206{
109 int ret = 0, cpu = get_cpu(); 207 int ret = 0, cpu = get_cpu();
110 208
111 if (!test_and_set_bit(0, &work->pending)) { 209 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
112 if (unlikely(is_single_threaded(wq))) 210 if (unlikely(is_single_threaded(wq)))
113 cpu = singlethread_cpu; 211 cpu = singlethread_cpu;
114 BUG_ON(!list_empty(&work->entry)); 212 BUG_ON(!list_empty(&work->entry));
@@ -122,38 +220,42 @@ EXPORT_SYMBOL_GPL(queue_work);
122 220
123static void delayed_work_timer_fn(unsigned long __data) 221static void delayed_work_timer_fn(unsigned long __data)
124{ 222{
125 struct work_struct *work = (struct work_struct *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
126 struct workqueue_struct *wq = work->wq_data; 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
127 int cpu = smp_processor_id(); 225 int cpu = smp_processor_id();
128 226
129 if (unlikely(is_single_threaded(wq))) 227 if (unlikely(is_single_threaded(wq)))
130 cpu = singlethread_cpu; 228 cpu = singlethread_cpu;
131 229
132 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 230 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
133} 231}
134 232
135/** 233/**
136 * queue_delayed_work - queue work on a workqueue after delay 234 * queue_delayed_work - queue work on a workqueue after delay
137 * @wq: workqueue to use 235 * @wq: workqueue to use
138 * @work: work to queue 236 * @dwork: delayable work to queue
139 * @delay: number of jiffies to wait before queueing 237 * @delay: number of jiffies to wait before queueing
140 * 238 *
141 * Returns 0 if @work was already on a queue, non-zero otherwise. 239 * Returns 0 if @work was already on a queue, non-zero otherwise.
142 */ 240 */
143int fastcall queue_delayed_work(struct workqueue_struct *wq, 241int fastcall queue_delayed_work(struct workqueue_struct *wq,
144 struct work_struct *work, unsigned long delay) 242 struct delayed_work *dwork, unsigned long delay)
145{ 243{
146 int ret = 0; 244 int ret = 0;
147 struct timer_list *timer = &work->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work;
148 247
149 if (!test_and_set_bit(0, &work->pending)) { 248 if (delay == 0)
249 return queue_work(wq, work);
250
251 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
150 BUG_ON(timer_pending(timer)); 252 BUG_ON(timer_pending(timer));
151 BUG_ON(!list_empty(&work->entry)); 253 BUG_ON(!list_empty(&work->entry));
152 254
153 /* This stores wq for the moment, for the timer_fn */ 255 /* This stores wq for the moment, for the timer_fn */
154 work->wq_data = wq; 256 set_wq_data(work, wq);
155 timer->expires = jiffies + delay; 257 timer->expires = jiffies + delay;
156 timer->data = (unsigned long)work; 258 timer->data = (unsigned long)dwork;
157 timer->function = delayed_work_timer_fn; 259 timer->function = delayed_work_timer_fn;
158 add_timer(timer); 260 add_timer(timer);
159 ret = 1; 261 ret = 1;
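With work_struct split from delayed_work, the timer now lives in the delayed_work container while the handler still receives only the work_struct pointer. A hedged sketch of the new delayed API, assuming the DECLARE_DELAYED_WORK() initialiser from the reworked header; names are hypothetical:

    /* Illustrative only: my_dwork and my_poll are hypothetical. */
    static void my_poll(struct work_struct *work)
    {
            struct delayed_work *dwork =
                    container_of(work, struct delayed_work, work);

            /* ... do the periodic work ... */
            schedule_delayed_work(dwork, HZ);   /* re-arm for one second later */
    }

    static DECLARE_DELAYED_WORK(my_dwork, my_poll);

    static void start_polling(void)
    {
            schedule_delayed_work(&my_dwork, HZ);
    }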
@@ -166,25 +268,26 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
166 * queue_delayed_work_on - queue work on specific CPU after delay 268 * queue_delayed_work_on - queue work on specific CPU after delay
167 * @cpu: CPU number to execute work on 269 * @cpu: CPU number to execute work on
168 * @wq: workqueue to use 270 * @wq: workqueue to use
169 * @work: work to queue 271 * @dwork: work to queue
170 * @delay: number of jiffies to wait before queueing 272 * @delay: number of jiffies to wait before queueing
171 * 273 *
172 * Returns 0 if @work was already on a queue, non-zero otherwise. 274 * Returns 0 if @work was already on a queue, non-zero otherwise.
173 */ 275 */
174int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 276int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
175 struct work_struct *work, unsigned long delay) 277 struct delayed_work *dwork, unsigned long delay)
176{ 278{
177 int ret = 0; 279 int ret = 0;
178 struct timer_list *timer = &work->timer; 280 struct timer_list *timer = &dwork->timer;
281 struct work_struct *work = &dwork->work;
179 282
180 if (!test_and_set_bit(0, &work->pending)) { 283 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
181 BUG_ON(timer_pending(timer)); 284 BUG_ON(timer_pending(timer));
182 BUG_ON(!list_empty(&work->entry)); 285 BUG_ON(!list_empty(&work->entry));
183 286
184 /* This stores wq for the moment, for the timer_fn */ 287 /* This stores wq for the moment, for the timer_fn */
185 work->wq_data = wq; 288 set_wq_data(work, wq);
186 timer->expires = jiffies + delay; 289 timer->expires = jiffies + delay;
187 timer->data = (unsigned long)work; 290 timer->data = (unsigned long)dwork;
188 timer->function = delayed_work_timer_fn; 291 timer->function = delayed_work_timer_fn;
189 add_timer_on(timer, cpu); 292 add_timer_on(timer, cpu);
190 ret = 1; 293 ret = 1;
@@ -212,15 +315,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
212 while (!list_empty(&cwq->worklist)) { 315 while (!list_empty(&cwq->worklist)) {
213 struct work_struct *work = list_entry(cwq->worklist.next, 316 struct work_struct *work = list_entry(cwq->worklist.next,
214 struct work_struct, entry); 317 struct work_struct, entry);
215 void (*f) (void *) = work->func; 318 work_func_t f = work->func;
216 void *data = work->data;
217 319
218 list_del_init(cwq->worklist.next); 320 list_del_init(cwq->worklist.next);
219 spin_unlock_irqrestore(&cwq->lock, flags); 321 spin_unlock_irqrestore(&cwq->lock, flags);
220 322
221 BUG_ON(work->wq_data != cwq); 323 BUG_ON(get_wq_data(work) != cwq);
222 clear_bit(0, &work->pending); 324 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
223 f(data); 325 work_release(work);
326 f(work);
327
328 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
329 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
330 "%s/0x%08x/%d\n",
331 current->comm, preempt_count(),
332 current->pid);
333 printk(KERN_ERR " last function: ");
334 print_symbol("%s\n", (unsigned long)f);
335 debug_show_held_locks(current);
336 dump_stack();
337 }
224 338
225 spin_lock_irqsave(&cwq->lock, flags); 339 spin_lock_irqsave(&cwq->lock, flags);
226 cwq->remove_sequence++; 340 cwq->remove_sequence++;
@@ -237,7 +351,8 @@ static int worker_thread(void *__cwq)
237 struct k_sigaction sa; 351 struct k_sigaction sa;
238 sigset_t blocked; 352 sigset_t blocked;
239 353
240 current->flags |= PF_NOFREEZE; 354 if (!cwq->freezeable)
355 current->flags |= PF_NOFREEZE;
241 356
242 set_user_nice(current, -5); 357 set_user_nice(current, -5);
243 358
@@ -260,6 +375,9 @@ static int worker_thread(void *__cwq)
260 375
261 set_current_state(TASK_INTERRUPTIBLE); 376 set_current_state(TASK_INTERRUPTIBLE);
262 while (!kthread_should_stop()) { 377 while (!kthread_should_stop()) {
378 if (cwq->freezeable)
379 try_to_freeze();
380
263 add_wait_queue(&cwq->more_work, &wait); 381 add_wait_queue(&cwq->more_work, &wait);
264 if (list_empty(&cwq->worklist)) 382 if (list_empty(&cwq->worklist))
265 schedule(); 383 schedule();
@@ -336,7 +454,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
336EXPORT_SYMBOL_GPL(flush_workqueue); 454EXPORT_SYMBOL_GPL(flush_workqueue);
337 455
338static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 456static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
339 int cpu) 457 int cpu, int freezeable)
340{ 458{
341 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 459 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
342 struct task_struct *p; 460 struct task_struct *p;
@@ -346,6 +464,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
346 cwq->thread = NULL; 464 cwq->thread = NULL;
347 cwq->insert_sequence = 0; 465 cwq->insert_sequence = 0;
348 cwq->remove_sequence = 0; 466 cwq->remove_sequence = 0;
467 cwq->freezeable = freezeable;
349 INIT_LIST_HEAD(&cwq->worklist); 468 INIT_LIST_HEAD(&cwq->worklist);
350 init_waitqueue_head(&cwq->more_work); 469 init_waitqueue_head(&cwq->more_work);
351 init_waitqueue_head(&cwq->work_done); 470 init_waitqueue_head(&cwq->work_done);
@@ -361,7 +480,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
361} 480}
362 481
363struct workqueue_struct *__create_workqueue(const char *name, 482struct workqueue_struct *__create_workqueue(const char *name,
364 int singlethread) 483 int singlethread, int freezeable)
365{ 484{
366 int cpu, destroy = 0; 485 int cpu, destroy = 0;
367 struct workqueue_struct *wq; 486 struct workqueue_struct *wq;
@@ -381,7 +500,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
381 mutex_lock(&workqueue_mutex); 500 mutex_lock(&workqueue_mutex);
382 if (singlethread) { 501 if (singlethread) {
383 INIT_LIST_HEAD(&wq->list); 502 INIT_LIST_HEAD(&wq->list);
384 p = create_workqueue_thread(wq, singlethread_cpu); 503 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
385 if (!p) 504 if (!p)
386 destroy = 1; 505 destroy = 1;
387 else 506 else
@@ -389,7 +508,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
389 } else { 508 } else {
390 list_add(&wq->list, &workqueues); 509 list_add(&wq->list, &workqueues);
391 for_each_online_cpu(cpu) { 510 for_each_online_cpu(cpu) {
392 p = create_workqueue_thread(wq, cpu); 511 p = create_workqueue_thread(wq, cpu, freezeable);
393 if (p) { 512 if (p) {
394 kthread_bind(p, cpu); 513 kthread_bind(p, cpu);
395 wake_up_process(p); 514 wake_up_process(p);
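The freezeable flag threaded through create_workqueue_thread() and __create_workqueue() decides whether the worker threads keep PF_NOFREEZE or call try_to_freeze() during suspend. A hedged sketch of creating such a queue through __create_workqueue() directly; real callers would normally go through a wrapper macro in workqueue.h, which is assumed here rather than shown:

    /* Illustrative only: a multi-threaded, freezeable queue for a made-up driver. */
    static struct workqueue_struct *mydrv_wq;

    static int __init mydrv_init(void)
    {
            mydrv_wq = __create_workqueue("mydrv", 0 /* !singlethread */,
                                          1 /* freezeable */);
            if (!mydrv_wq)
                    return -ENOMEM;
            return 0;
    }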
@@ -468,38 +587,37 @@ EXPORT_SYMBOL(schedule_work);
468 587
469/** 588/**
470 * schedule_delayed_work - put work task in global workqueue after delay 589 * schedule_delayed_work - put work task in global workqueue after delay
471 * @work: job to be done 590 * @dwork: job to be done
472 * @delay: number of jiffies to wait 591 * @delay: number of jiffies to wait or 0 for immediate execution
473 * 592 *
474 * After waiting for a given time this puts a job in the kernel-global 593 * After waiting for a given time this puts a job in the kernel-global
475 * workqueue. 594 * workqueue.
476 */ 595 */
477int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
478{ 597{
479 return queue_delayed_work(keventd_wq, work, delay); 598 return queue_delayed_work(keventd_wq, dwork, delay);
480} 599}
481EXPORT_SYMBOL(schedule_delayed_work); 600EXPORT_SYMBOL(schedule_delayed_work);
482 601
483/** 602/**
484 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 603 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
485 * @cpu: cpu to use 604 * @cpu: cpu to use
486 * @work: job to be done 605 * @dwork: job to be done
487 * @delay: number of jiffies to wait 606 * @delay: number of jiffies to wait
488 * 607 *
489 * After waiting for a given time this puts a job in the kernel-global 608 * After waiting for a given time this puts a job in the kernel-global
490 * workqueue on the specified CPU. 609 * workqueue on the specified CPU.
491 */ 610 */
492int schedule_delayed_work_on(int cpu, 611int schedule_delayed_work_on(int cpu,
493 struct work_struct *work, unsigned long delay) 612 struct delayed_work *dwork, unsigned long delay)
494{ 613{
495 return queue_delayed_work_on(cpu, keventd_wq, work, delay); 614 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
496} 615}
497EXPORT_SYMBOL(schedule_delayed_work_on); 616EXPORT_SYMBOL(schedule_delayed_work_on);
498 617
499/** 618/**
500 * schedule_on_each_cpu - call a function on each online CPU from keventd 619 * schedule_on_each_cpu - call a function on each online CPU from keventd
501 * @func: the function to call 620 * @func: the function to call
502 * @info: a pointer to pass to func()
503 * 621 *
504 * Returns zero on success. 622 * Returns zero on success.
505 * Returns -ve errno on failure. 623 * Returns -ve errno on failure.
@@ -508,7 +626,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
508 * 626 *
509 * schedule_on_each_cpu() is very slow. 627 * schedule_on_each_cpu() is very slow.
510 */ 628 */
511int schedule_on_each_cpu(void (*func)(void *info), void *info) 629int schedule_on_each_cpu(work_func_t func)
512{ 630{
513 int cpu; 631 int cpu;
514 struct work_struct *works; 632 struct work_struct *works;
@@ -519,9 +637,11 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
519 637
520 mutex_lock(&workqueue_mutex); 638 mutex_lock(&workqueue_mutex);
521 for_each_online_cpu(cpu) { 639 for_each_online_cpu(cpu) {
522 INIT_WORK(per_cpu_ptr(works, cpu), func, info); 640 struct work_struct *work = per_cpu_ptr(works, cpu);
523 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 641
524 per_cpu_ptr(works, cpu)); 642 INIT_WORK(work, func);
643 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
644 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
525 } 645 }
526 mutex_unlock(&workqueue_mutex); 646 mutex_unlock(&workqueue_mutex);
527 flush_workqueue(keventd_wq); 647 flush_workqueue(keventd_wq);
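schedule_on_each_cpu() loses its void *info argument here, so any state has to be reachable from per-CPU data or from the work item itself. A hedged sketch with a hypothetical per-CPU counter:

    /* Illustrative only: my_counter and the helpers are hypothetical. */
    static DEFINE_PER_CPU(long, my_counter);

    static void my_drain(struct work_struct *unused)
    {
            __get_cpu_var(my_counter) = 0;   /* keventd runs this once per online CPU */
    }

    static int my_drain_all(void)
    {
            return schedule_on_each_cpu(my_drain);   /* 0 on success */
    }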
@@ -539,12 +659,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
539 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 659 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
540 * work whose handler rearms the delayed work. 660 * work whose handler rearms the delayed work.
541 * @wq: the controlling workqueue structure 661 * @wq: the controlling workqueue structure
542 * @work: the delayed work struct 662 * @dwork: the delayed work struct
543 */ 663 */
544void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, 664void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
545 struct work_struct *work) 665 struct delayed_work *dwork)
546{ 666{
547 while (!cancel_delayed_work(work)) 667 while (!cancel_delayed_work(dwork))
548 flush_workqueue(wq); 668 flush_workqueue(wq);
549} 669}
550EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); 670EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
@@ -552,18 +672,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
552/** 672/**
553 * cancel_rearming_delayed_work - reliably kill off a delayed keventd 673 * cancel_rearming_delayed_work - reliably kill off a delayed keventd
554 * work whose handler rearms the delayed work. 674 * work whose handler rearms the delayed work.
555 * @work: the delayed work struct 675 * @dwork: the delayed work struct
556 */ 676 */
557void cancel_rearming_delayed_work(struct work_struct *work) 677void cancel_rearming_delayed_work(struct delayed_work *dwork)
558{ 678{
559 cancel_rearming_delayed_workqueue(keventd_wq, work); 679 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
560} 680}
561EXPORT_SYMBOL(cancel_rearming_delayed_work); 681EXPORT_SYMBOL(cancel_rearming_delayed_work);
562 682
563/** 683/**
564 * execute_in_process_context - reliably execute the routine with user context 684 * execute_in_process_context - reliably execute the routine with user context
565 * @fn: the function to execute 685 * @fn: the function to execute
566 * @data: data to pass to the function
567 * @ew: guaranteed storage for the execute work structure (must 686 * @ew: guaranteed storage for the execute work structure (must
568 * be available when the work executes) 687 * be available when the work executes)
569 * 688 *
@@ -573,15 +692,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
573 * Returns: 0 - function was executed 692 * Returns: 0 - function was executed
574 * 1 - function was scheduled for execution 693 * 1 - function was scheduled for execution
575 */ 694 */
576int execute_in_process_context(void (*fn)(void *data), void *data, 695int execute_in_process_context(work_func_t fn, struct execute_work *ew)
577 struct execute_work *ew)
578{ 696{
579 if (!in_interrupt()) { 697 if (!in_interrupt()) {
580 fn(data); 698 fn(&ew->work);
581 return 0; 699 return 0;
582 } 700 }
583 701
584 INIT_WORK(&ew->work, fn, data); 702 INIT_WORK(&ew->work, fn);
585 schedule_work(&ew->work); 703 schedule_work(&ew->work);
586 704
587 return 1; 705 return 1;
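Dropping the data argument from execute_in_process_context() (and from INIT_WORK generally) pushes callers toward embedding the execute_work in their own object and recovering it with container_of() in the handler. A hedged sketch; struct my_dev and its helpers are made up:

    /* Illustrative only: struct my_dev and its helpers are made up. */
    struct my_dev {
            struct execute_work ew;
            int id;
    };

    static void my_dev_release(struct work_struct *work)
    {
            struct my_dev *dev = container_of(work, struct my_dev, ew.work);

            kfree(dev);
    }

    static void my_dev_put(struct my_dev *dev)
    {
            /* runs the handler now if possible, otherwise defers it to keventd */
            execute_in_process_context(my_dev_release, &dev->ew);
    }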
@@ -609,7 +727,6 @@ int current_is_keventd(void)
609 727
610} 728}
611 729
612#ifdef CONFIG_HOTPLUG_CPU
613/* Take the work from this (downed) CPU. */ 730/* Take the work from this (downed) CPU. */
614static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 731static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
615{ 732{
@@ -642,7 +759,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
642 mutex_lock(&workqueue_mutex); 759 mutex_lock(&workqueue_mutex);
643 /* Create a new workqueue thread for it. */ 760 /* Create a new workqueue thread for it. */
644 list_for_each_entry(wq, &workqueues, list) { 761 list_for_each_entry(wq, &workqueues, list) {
645 if (!create_workqueue_thread(wq, hotcpu)) { 762 if (!create_workqueue_thread(wq, hotcpu, 0)) {
646 printk("workqueue for %i failed\n", hotcpu); 763 printk("workqueue for %i failed\n", hotcpu);
647 return NOTIFY_BAD; 764 return NOTIFY_BAD;
648 } 765 }
@@ -692,7 +809,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
692 809
693 return NOTIFY_OK; 810 return NOTIFY_OK;
694} 811}
695#endif
696 812
697void init_workqueues(void) 813void init_workqueues(void)
698{ 814{