about summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Kconfig.hz 20
-rw-r--r-- kernel/acct.c 3
-rw-r--r-- kernel/audit.c 1
-rw-r--r-- kernel/auditfilter.c 3
-rw-r--r-- kernel/auditsc.c 6
-rw-r--r-- kernel/configs.c 2
-rw-r--r-- kernel/cpu.c 6
-rw-r--r-- kernel/cpuset.c 22
-rw-r--r-- kernel/delayacct.c 4
-rw-r--r-- kernel/dma.c 2
-rw-r--r-- kernel/exit.c 8
-rw-r--r-- kernel/fork.c 44
-rw-r--r-- kernel/futex.c 45
-rw-r--r-- kernel/irq/handle.c 2
-rw-r--r-- kernel/kallsyms.c 17
-rw-r--r-- kernel/kexec.c 59
-rw-r--r-- kernel/kprobes.c 117
-rw-r--r-- kernel/lockdep.c 52
-rw-r--r-- kernel/lockdep_internals.h 2
-rw-r--r-- kernel/lockdep_proc.c 6
-rw-r--r-- kernel/module.c 2
-rw-r--r-- kernel/mutex-debug.c 3
-rw-r--r-- kernel/pid.c 2
-rw-r--r-- kernel/posix-timers.c 2
-rw-r--r-- kernel/power/Kconfig 2
-rw-r--r-- kernel/power/disk.c 66
-rw-r--r-- kernel/power/main.c 14
-rw-r--r-- kernel/power/power.h 32
-rw-r--r-- kernel/power/process.c 130
-rw-r--r-- kernel/power/snapshot.c 860
-rw-r--r-- kernel/power/swap.c 347
-rw-r--r-- kernel/power/swsusp.c 98
-rw-r--r-- kernel/power/user.c 102
-rw-r--r-- kernel/printk.c 24
-rw-r--r-- kernel/profile.c 47
-rw-r--r-- kernel/rcupdate.c 4
-rw-r--r-- kernel/rcutorture.c 4
-rw-r--r-- kernel/relay.c 2
-rw-r--r-- kernel/resource.c 6
-rw-r--r-- kernel/rtmutex-tester.c 1
-rw-r--r-- kernel/sched.c 39
-rw-r--r-- kernel/signal.c 6
-rw-r--r-- kernel/softirq.c 2
-rw-r--r-- kernel/sys.c 4
-rw-r--r-- kernel/sysctl.c 26
-rw-r--r-- kernel/taskstats.c 169
-rw-r--r-- kernel/unwind.c 203
-rw-r--r-- kernel/user.c 4
-rw-r--r-- kernel/workqueue.c 108
49 files changed, 1885 insertions, 845 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
7 default HZ_250 7 default HZ_250
8 help 8 help
9 Allows the configuration of the timer frequency. It is customary 9 Allows the configuration of the timer frequency. It is customary
10 to have the timer interrupt run at 1000 HZ but 100 HZ may be more 10 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
11 beneficial for servers and NUMA systems that do not need to have 11 beneficial for servers and NUMA systems that do not need to have
12 a fast response for user interaction and that may experience bus 12 a fast response for user interaction and that may experience bus
13 contention and cacheline bounces as a result of timer interrupts. 13 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
19 config HZ_100 19 config HZ_100
20 bool "100 HZ" 20 bool "100 HZ"
21 help 21 help
22 100 HZ is a typical choice for servers, SMP and NUMA systems 22 100 Hz is a typical choice for servers, SMP and NUMA systems
23 with lots of processors that may show reduced performance if 23 with lots of processors that may show reduced performance if
24 too many timer interrupts are occurring. 24 too many timer interrupts are occurring.
25 25
26 config HZ_250 26 config HZ_250
27 bool "250 HZ" 27 bool "250 HZ"
28 help 28 help
29 250 HZ is a good compromise choice allowing server performance 29 250 Hz is a good compromise choice allowing server performance
30 while also showing good interactive responsiveness even 30 while also showing good interactive responsiveness even
31 on SMP and NUMA systems. 31 on SMP and NUMA systems. If you are going to be using NTSC video
32 or multimedia, select 300 Hz instead.
33
34 config HZ_300
35 bool "300 HZ"
36 help
37 300 Hz is a good compromise choice allowing server performance
38 while also showing good interactive responsiveness even
39 on SMP and NUMA systems and exactly dividing by both PAL and
40 NTSC frame rates for video and multimedia work.
32 41
33 config HZ_1000 42 config HZ_1000
34 bool "1000 HZ" 43 bool "1000 HZ"
35 help 44 help
36 1000 HZ is the preferred choice for desktop systems and other 45 1000 Hz is the preferred choice for desktop systems and other
37 systems requiring fast interactive responses to events. 46 systems requiring fast interactive responses to events.
38 47
39endchoice 48endchoice
@@ -42,5 +51,6 @@ config HZ
42 int 51 int
43 default 100 if HZ_100 52 default 100 if HZ_100
44 default 250 if HZ_250 53 default 250 if HZ_250
54 default 300 if HZ_300
45 default 1000 if HZ_1000 55 default 1000 if HZ_1000
46 56
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..dc12db8600e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
89 struct timer_list timer; 89 struct timer_list timer;
90}; 90};
91 91
92static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; 92static struct acct_glbs acct_globals __cacheline_aligned =
93 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
93 94
94/* 95/*
95 * Called whenever the timer says to check the free space. 96 * Called whenever the timer says to check the free space.
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h> 59#include <linux/inotify.h>
60#include <linux/freezer.h>
60 61
61#include "audit.h" 62#include "audit.h"
62 63
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..2e896f8ae29e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
636 struct audit_rule *rule; 636 struct audit_rule *rule;
637 int i; 637 int i;
638 638
639 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 639 rule = kzalloc(sizeof(*rule), GFP_KERNEL);
640 if (unlikely(!rule)) 640 if (unlikely(!rule))
641 return NULL; 641 return NULL;
642 memset(rule, 0, sizeof(*rule));
643 642
644 rule->flags = krule->flags | krule->listnr; 643 rule->flags = krule->flags | krule->listnr;
645 rule->action = krule->action; 644 rule->action = krule->action;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ab97e5101232..40722e26de98 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -731,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context)
731 printk(KERN_ERR "audit: freed %d contexts\n", count); 731 printk(KERN_ERR "audit: freed %d contexts\n", count);
732} 732}
733 733
734static void audit_log_task_context(struct audit_buffer *ab) 734void audit_log_task_context(struct audit_buffer *ab)
735{ 735{
736 char *ctx = NULL; 736 char *ctx = NULL;
737 ssize_t len = 0; 737 ssize_t len = 0;
@@ -760,6 +760,8 @@ error_path:
760 return; 760 return;
761} 761}
762 762
763EXPORT_SYMBOL(audit_log_task_context);
764
763static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 765static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
764{ 766{
765 char name[sizeof(tsk->comm)]; 767 char name[sizeof(tsk->comm)];
@@ -1488,6 +1490,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1488 return ctx ? ctx->loginuid : -1; 1490 return ctx ? ctx->loginuid : -1;
1489} 1491}
1490 1492
1493EXPORT_SYMBOL(audit_get_loginuid);
1494
1491/** 1495/**
1492 * __audit_mq_open - record audit data for a POSIX MQ open 1496 * __audit_mq_open - record audit data for a POSIX MQ open
1493 * @oflag: open flag 1497 * @oflag: open flag
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
75 return count; 75 return count;
76} 76}
77 77
78static struct file_operations ikconfig_file_ops = { 78static const struct file_operations ikconfig_file_ops = {
79 .owner = THIS_MODULE, 79 .owner = THIS_MODULE,
80 .read = ikconfig_read_current, 80 .read = ikconfig_read_current,
81}; 81};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 272254f20d97..9124669f4586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
270 goto out; 270 goto out;
271 } 271 }
272 } 272 }
273 error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); 273
274 if (error) {
275 printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
276 goto out;
277 }
278 /* We take down all of the non-boot CPUs in one shot to avoid races 274 /* We take down all of the non-boot CPUs in one shot to avoid races
279 * with the userspace trying to use the CPU hotplug at the same time 275 * with the userspace trying to use the CPU hotplug at the same time
280 */ 276 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..0a6b4d89f9a0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
729 } 729 }
730 730
731 /* Remaining checks don't apply to root cpuset */ 731 /* Remaining checks don't apply to root cpuset */
732 if ((par = cur->parent) == NULL) 732 if (cur == &top_cpuset)
733 return 0; 733 return 0;
734 734
735 par = cur->parent;
736
735 /* We must be a subset of our parent cpuset */ 737 /* We must be a subset of our parent cpuset */
736 if (!is_cpuset_subset(trial, par)) 738 if (!is_cpuset_subset(trial, par))
737 return -EACCES; 739 return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1060 cpu_exclusive_changed = 1062 cpu_exclusive_changed =
1061 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1063 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
1062 mutex_lock(&callback_mutex); 1064 mutex_lock(&callback_mutex);
1063 if (turning_on) 1065 cs->flags = trialcs.flags;
1064 set_bit(bit, &cs->flags);
1065 else
1066 clear_bit(bit, &cs->flags);
1067 mutex_unlock(&callback_mutex); 1066 mutex_unlock(&callback_mutex);
1068 1067
1069 if (cpu_exclusive_changed) 1068 if (cpu_exclusive_changed)
@@ -1281,7 +1280,8 @@ typedef enum {
1281 FILE_TASKLIST, 1280 FILE_TASKLIST,
1282} cpuset_filetype_t; 1281} cpuset_filetype_t;
1283 1282
1284static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, 1283static ssize_t cpuset_common_file_write(struct file *file,
1284 const char __user *userbuf,
1285 size_t nbytes, loff_t *unused_ppos) 1285 size_t nbytes, loff_t *unused_ppos)
1286{ 1286{
1287 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1287 struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1292,7 +1292,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1292 int retval = 0; 1292 int retval = 0;
1293 1293
1294 /* Crude upper limit on largest legitimate cpulist user might write. */ 1294 /* Crude upper limit on largest legitimate cpulist user might write. */
1295 if (nbytes > 100 + 6 * NR_CPUS) 1295 if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
1296 return -E2BIG; 1296 return -E2BIG;
1297 1297
1298 /* +1 for nul-terminator */ 1298 /* +1 for nul-terminator */
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1533} 1533}
1534 1534
1535static struct file_operations cpuset_file_operations = { 1535static const struct file_operations cpuset_file_operations = {
1536 .read = cpuset_file_read, 1536 .read = cpuset_file_read,
1537 .write = cpuset_file_write, 1537 .write = cpuset_file_write,
1538 .llseek = generic_file_llseek, 1538 .llseek = generic_file_llseek,
@@ -2045,7 +2045,6 @@ out:
2045 return err; 2045 return err;
2046} 2046}
2047 2047
2048#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
2049/* 2048/*
2050 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 2049 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
2051 * or memory nodes, we need to walk over the cpuset hierarchy, 2050 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void)
2109 mutex_unlock(&callback_mutex); 2108 mutex_unlock(&callback_mutex);
2110 mutex_unlock(&manage_mutex); 2109 mutex_unlock(&manage_mutex);
2111} 2110}
2112#endif
2113 2111
2114#ifdef CONFIG_HOTPLUG_CPU
2115/* 2112/*
2116 * The top_cpuset tracks what CPUs and Memory Nodes are online, 2113 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2117 * period. This is necessary in order to make cpusets transparent 2114 * period. This is necessary in order to make cpusets transparent
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
2128 common_cpu_mem_hotplug_unplug(); 2125 common_cpu_mem_hotplug_unplug();
2129 return 0; 2126 return 0;
2130} 2127}
2131#endif
2132 2128
2133#ifdef CONFIG_MEMORY_HOTPLUG 2129#ifdef CONFIG_MEMORY_HOTPLUG
2134/* 2130/*
@@ -2610,7 +2606,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
2610 return single_open(file, proc_cpuset_show, pid); 2606 return single_open(file, proc_cpuset_show, pid);
2611} 2607}
2612 2608
2613struct file_operations proc_cpuset_operations = { 2609const struct file_operations proc_cpuset_operations = {
2614 .open = cpuset_open, 2610 .open = cpuset_open,
2615 .read = seq_read, 2611 .read = seq_read,
2616 .llseek = seq_lseek, 2612 .llseek = seq_lseek,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 66a0ea48751d..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
20#include <linux/delayacct.h> 20#include <linux/delayacct.h>
21 21
22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache; 23struct kmem_cache *delayacct_cache;
24 24
25static int __init delayacct_setup_disable(char *str) 25static int __init delayacct_setup_disable(char *str)
26{ 26{
@@ -41,7 +41,7 @@ void delayacct_init(void)
41 41
42void __delayacct_tsk_init(struct task_struct *tsk) 42void __delayacct_tsk_init(struct task_struct *tsk)
43{ 43{
44 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); 44 tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
45 if (tsk->delays) 45 if (tsk->delays)
46 spin_lock_init(&tsk->delays->lock); 46 spin_lock_init(&tsk->delays->lock);
47} 47}
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
140 return single_open(file, proc_dma_show, NULL); 140 return single_open(file, proc_dma_show, NULL);
141} 141}
142 142
143static struct file_operations proc_dma_operations = { 143static const struct file_operations proc_dma_operations = {
144 .open = proc_dma_open, 144 .open = proc_dma_open,
145 .read = seq_read, 145 .read = seq_read,
146 .llseek = seq_lseek, 146 .llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca3..4e3f919edc48 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -850,9 +850,7 @@ static void exit_notify(struct task_struct *tsk)
850fastcall NORET_TYPE void do_exit(long code) 850fastcall NORET_TYPE void do_exit(long code)
851{ 851{
852 struct task_struct *tsk = current; 852 struct task_struct *tsk = current;
853 struct taskstats *tidstats;
854 int group_dead; 853 int group_dead;
855 unsigned int mycpu;
856 854
857 profile_task_exit(tsk); 855 profile_task_exit(tsk);
858 856
@@ -890,8 +888,6 @@ fastcall NORET_TYPE void do_exit(long code)
890 current->comm, current->pid, 888 current->comm, current->pid,
891 preempt_count()); 889 preempt_count());
892 890
893 taskstats_exit_alloc(&tidstats, &mycpu);
894
895 acct_update_integrals(tsk); 891 acct_update_integrals(tsk);
896 if (tsk->mm) { 892 if (tsk->mm) {
897 update_hiwater_rss(tsk->mm); 893 update_hiwater_rss(tsk->mm);
@@ -911,8 +907,8 @@ fastcall NORET_TYPE void do_exit(long code)
911#endif 907#endif
912 if (unlikely(tsk->audit_context)) 908 if (unlikely(tsk->audit_context))
913 audit_free(tsk); 909 audit_free(tsk);
914 taskstats_exit_send(tsk, tidstats, group_dead, mycpu); 910
915 taskstats_exit_free(tidstats); 911 taskstats_exit(tsk, group_dead);
916 912
917 exit_mm(tsk); 913 exit_mm(tsk);
918 914
diff --git a/kernel/fork.c b/kernel/fork.c
index 8cdd3e72ba55..7f2e31ba33af 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -82,26 +82,26 @@ int nr_processes(void)
82#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 82#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
83# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 83# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
84# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 84# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
85static kmem_cache_t *task_struct_cachep; 85static struct kmem_cache *task_struct_cachep;
86#endif 86#endif
87 87
88/* SLAB cache for signal_struct structures (tsk->signal) */ 88/* SLAB cache for signal_struct structures (tsk->signal) */
89static kmem_cache_t *signal_cachep; 89static struct kmem_cache *signal_cachep;
90 90
91/* SLAB cache for sighand_struct structures (tsk->sighand) */ 91/* SLAB cache for sighand_struct structures (tsk->sighand) */
92kmem_cache_t *sighand_cachep; 92struct kmem_cache *sighand_cachep;
93 93
94/* SLAB cache for files_struct structures (tsk->files) */ 94/* SLAB cache for files_struct structures (tsk->files) */
95kmem_cache_t *files_cachep; 95struct kmem_cache *files_cachep;
96 96
97/* SLAB cache for fs_struct structures (tsk->fs) */ 97/* SLAB cache for fs_struct structures (tsk->fs) */
98kmem_cache_t *fs_cachep; 98struct kmem_cache *fs_cachep;
99 99
100/* SLAB cache for vm_area_struct structures */ 100/* SLAB cache for vm_area_struct structures */
101kmem_cache_t *vm_area_cachep; 101struct kmem_cache *vm_area_cachep;
102 102
103/* SLAB cache for mm_struct structures (tsk->mm) */ 103/* SLAB cache for mm_struct structures (tsk->mm) */
104static kmem_cache_t *mm_cachep; 104static struct kmem_cache *mm_cachep;
105 105
106void free_task(struct task_struct *tsk) 106void free_task(struct task_struct *tsk)
107{ 107{
@@ -237,7 +237,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
237 goto fail_nomem; 237 goto fail_nomem;
238 charge = len; 238 charge = len;
239 } 239 }
240 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 240 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
241 if (!tmp) 241 if (!tmp)
242 goto fail_nomem; 242 goto fail_nomem;
243 *tmp = *mpnt; 243 *tmp = *mpnt;
@@ -319,7 +319,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
319 319
320 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 320 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
321 321
322#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) 322#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
323#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 323#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
324 324
325#include <linux/init_task.h> 325#include <linux/init_task.h>
@@ -448,7 +448,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
448 tsk->vfork_done = NULL; 448 tsk->vfork_done = NULL;
449 complete(vfork_done); 449 complete(vfork_done);
450 } 450 }
451 if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { 451
452 /*
453 * If we're exiting normally, clear a user-space tid field if
454 * requested. We leave this alone when dying by signal, to leave
455 * the value intact in a core dump, and to save the unnecessary
456 * trouble otherwise. Userland only wants this done for a sys_exit.
457 */
458 if (tsk->clear_child_tid
459 && !(tsk->flags & PF_SIGNALED)
460 && atomic_read(&mm->mm_users) > 1) {
452 u32 __user * tidptr = tsk->clear_child_tid; 461 u32 __user * tidptr = tsk->clear_child_tid;
453 tsk->clear_child_tid = NULL; 462 tsk->clear_child_tid = NULL;
454 463
@@ -479,6 +488,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
479 488
480 memcpy(mm, oldmm, sizeof(*mm)); 489 memcpy(mm, oldmm, sizeof(*mm));
481 490
491 /* Initializing for Swap token stuff */
492 mm->token_priority = 0;
493 mm->last_interval = 0;
494
482 if (!mm_init(mm)) 495 if (!mm_init(mm))
483 goto fail_nomem; 496 goto fail_nomem;
484 497
@@ -542,6 +555,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
542 goto fail_nomem; 555 goto fail_nomem;
543 556
544good_mm: 557good_mm:
558 /* Initializing for Swap token stuff */
559 mm->token_priority = 0;
560 mm->last_interval = 0;
561
545 tsk->mm = mm; 562 tsk->mm = mm;
546 tsk->active_mm = mm; 563 tsk->active_mm = mm;
547 return 0; 564 return 0;
@@ -613,7 +630,7 @@ static struct files_struct *alloc_files(void)
613 struct files_struct *newf; 630 struct files_struct *newf;
614 struct fdtable *fdt; 631 struct fdtable *fdt;
615 632
616 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 633 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
617 if (!newf) 634 if (!newf)
618 goto out; 635 goto out;
619 636
@@ -830,7 +847,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
830 if (clone_flags & CLONE_THREAD) { 847 if (clone_flags & CLONE_THREAD) {
831 atomic_inc(&current->signal->count); 848 atomic_inc(&current->signal->count);
832 atomic_inc(&current->signal->live); 849 atomic_inc(&current->signal->live);
833 taskstats_tgid_alloc(current);
834 return 0; 850 return 0;
835 } 851 }
836 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 852 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -1303,7 +1319,7 @@ fork_out:
1303 return ERR_PTR(retval); 1319 return ERR_PTR(retval);
1304} 1320}
1305 1321
1306struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1322noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1307{ 1323{
1308 memset(regs, 0, sizeof(struct pt_regs)); 1324 memset(regs, 0, sizeof(struct pt_regs));
1309 return regs; 1325 return regs;
@@ -1413,7 +1429,7 @@ long do_fork(unsigned long clone_flags,
1413#define ARCH_MIN_MMSTRUCT_ALIGN 0 1429#define ARCH_MIN_MMSTRUCT_ALIGN 0
1414#endif 1430#endif
1415 1431
1416static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 1432static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
1417{ 1433{
1418 struct sighand_struct *sighand = data; 1434 struct sighand_struct *sighand = data;
1419 1435
diff --git a/kernel/futex.c b/kernel/futex.c
index 93ef30ba209f..95989a3b4168 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
282{ 282{
283 int ret; 283 int ret;
284 284
285 inc_preempt_count(); 285 pagefault_disable();
286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
287 dec_preempt_count(); 287 pagefault_enable();
288 288
289 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
290} 290}
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
324 if (likely(current->pi_state_cache)) 324 if (likely(current->pi_state_cache))
325 return 0; 325 return 0;
326 326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); 327 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
328 328
329 if (!pi_state) 329 if (!pi_state)
330 return -ENOMEM; 330 return -ENOMEM;
331 331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list); 332 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */ 333 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL; 334 pi_state->owner = NULL;
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q)
553 * at the end of wake_up_all() does not prevent this store from 552 * at the end of wake_up_all() does not prevent this store from
554 * moving. 553 * moving.
555 */ 554 */
556 wmb(); 555 smp_wmb();
557 q->lock_ptr = NULL; 556 q->lock_ptr = NULL;
558} 557}
559 558
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
585 if (!(uval & FUTEX_OWNER_DIED)) { 584 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid; 585 newval = FUTEX_WAITERS | new_owner->pid;
587 586
588 inc_preempt_count(); 587 pagefault_disable();
589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 588 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
590 dec_preempt_count(); 589 pagefault_enable();
591 if (curval == -EFAULT) 590 if (curval == -EFAULT)
592 return -EFAULT; 591 return -EFAULT;
593 if (curval != uval) 592 if (curval != uval)
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
618 * There is no waiter, so we unlock the futex. The owner died 617 * There is no waiter, so we unlock the futex. The owner died
619 * bit has not to be preserved here. We are the owner: 618 * bit has not to be preserved here. We are the owner:
620 */ 619 */
621 inc_preempt_count(); 620 pagefault_disable();
622 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); 621 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
623 dec_preempt_count(); 622 pagefault_enable();
624 623
625 if (oldval == -EFAULT) 624 if (oldval == -EFAULT)
626 return oldval; 625 return oldval;
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1158 */ 1157 */
1159 newval = current->pid; 1158 newval = current->pid;
1160 1159
1161 inc_preempt_count(); 1160 pagefault_disable();
1162 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); 1161 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1163 dec_preempt_count(); 1162 pagefault_enable();
1164 1163
1165 if (unlikely(curval == -EFAULT)) 1164 if (unlikely(curval == -EFAULT))
1166 goto uaddr_faulted; 1165 goto uaddr_faulted;
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1183 uval = curval; 1182 uval = curval;
1184 newval = uval | FUTEX_WAITERS; 1183 newval = uval | FUTEX_WAITERS;
1185 1184
1186 inc_preempt_count(); 1185 pagefault_disable();
1187 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1186 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1188 dec_preempt_count(); 1187 pagefault_enable();
1189 1188
1190 if (unlikely(curval == -EFAULT)) 1189 if (unlikely(curval == -EFAULT))
1191 goto uaddr_faulted; 1190 goto uaddr_faulted;
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1215 newval = current->pid | 1214 newval = current->pid |
1216 FUTEX_OWNER_DIED | FUTEX_WAITERS; 1215 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1217 1216
1218 inc_preempt_count(); 1217 pagefault_disable();
1219 curval = futex_atomic_cmpxchg_inatomic(uaddr, 1218 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1220 uval, newval); 1219 uval, newval);
1221 dec_preempt_count(); 1220 pagefault_enable();
1222 1221
1223 if (unlikely(curval == -EFAULT)) 1222 if (unlikely(curval == -EFAULT))
1224 goto uaddr_faulted; 1223 goto uaddr_faulted;
@@ -1390,9 +1389,9 @@ retry_locked:
1390 * anyone else up: 1389 * anyone else up:
1391 */ 1390 */
1392 if (!(uval & FUTEX_OWNER_DIED)) { 1391 if (!(uval & FUTEX_OWNER_DIED)) {
1393 inc_preempt_count(); 1392 pagefault_disable();
1394 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1393 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1395 dec_preempt_count(); 1394 pagefault_enable();
1396 } 1395 }
1397 1396
1398 if (unlikely(uval == -EFAULT)) 1397 if (unlikely(uval == -EFAULT))
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
1493 return ret; 1492 return ret;
1494} 1493}
1495 1494
1496static struct file_operations futex_fops = { 1495static const struct file_operations futex_fops = {
1497 .release = futex_close, 1496 .release = futex_close,
1498 .poll = futex_poll, 1497 .poll = futex_poll,
1499}; 1498};
@@ -1858,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
1858 1857
1859static int __init init(void) 1858static int __init init(void)
1860{ 1859{
1861 unsigned int i; 1860 int i = register_filesystem(&futex_fs_type);
1861
1862 if (i)
1863 return i;
1862 1864
1863 register_filesystem(&futex_fs_type);
1864 futex_mnt = kern_mount(&futex_fs_type); 1865 futex_mnt = kern_mount(&futex_fs_type);
1866 if (IS_ERR(futex_mnt)) {
1867 unregister_filesystem(&futex_fs_type);
1868 return PTR_ERR(futex_mnt);
1869 }
1865 1870
1866 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 1871 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1867 INIT_LIST_HEAD(&futex_queues[i].chain); 1872 INIT_LIST_HEAD(&futex_queues[i].chain);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a681912bc89a..aff1f0fabb0d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
54 .chip = &no_irq_chip, 54 .chip = &no_irq_chip,
55 .handle_irq = handle_bad_irq, 55 .handle_irq = handle_bad_irq,
56 .depth = 1, 56 .depth = 1,
57 .lock = SPIN_LOCK_UNLOCKED, 57 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
58#ifdef CONFIG_SMP 58#ifdef CONFIG_SMP
59 .affinity = CPU_MASK_ALL 59 .affinity = CPU_MASK_ALL
60#endif 60#endif
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2b..ab63cfc42992 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h>
23 24
24#include <asm/sections.h> 25#include <asm/sections.h>
25 26
@@ -301,13 +302,6 @@ struct kallsym_iter
301 char name[KSYM_NAME_LEN+1]; 302 char name[KSYM_NAME_LEN+1];
302}; 303};
303 304
304/* Only label it "global" if it is exported. */
305static void upcase_if_global(struct kallsym_iter *iter)
306{
307 if (is_exported(iter->name, iter->owner))
308 iter->type += 'A' - 'a';
309}
310
311static int get_ksymbol_mod(struct kallsym_iter *iter) 305static int get_ksymbol_mod(struct kallsym_iter *iter)
312{ 306{
313 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 307 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
316 if (iter->owner == NULL) 310 if (iter->owner == NULL)
317 return 0; 311 return 0;
318 312
319 upcase_if_global(iter); 313 /* Label it "global" if it is exported, "local" if not exported. */
314 iter->type = is_exported(iter->name, iter->owner)
315 ? toupper(iter->type) : tolower(iter->type);
316
320 return 1; 317 return 1;
321} 318}
322 319
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
401 return 0; 398 return 0;
402} 399}
403 400
404static struct seq_operations kallsyms_op = { 401static const struct seq_operations kallsyms_op = {
405 .start = s_start, 402 .start = s_start,
406 .next = s_next, 403 .next = s_next,
407 .stop = s_stop, 404 .stop = s_stop,
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
436 return seq_release(inode, file); 433 return seq_release(inode, file);
437} 434}
438 435
439static struct file_operations kallsyms_operations = { 436static const struct file_operations kallsyms_operations = {
440 .open = kallsyms_open, 437 .open = kallsyms_open,
441 .read = seq_read, 438 .read = seq_read,
442 .llseek = seq_lseek, 439 .llseek = seq_lseek,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 05aada293592..2a59c8a01ae0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/elf.h>
24#include <linux/elfcore.h>
23 25
24#include <asm/page.h> 26#include <asm/page.h>
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
108 110
109 /* Allocate a controlling structure */ 111 /* Allocate a controlling structure */
110 result = -ENOMEM; 112 result = -ENOMEM;
111 image = kmalloc(sizeof(*image), GFP_KERNEL); 113 image = kzalloc(sizeof(*image), GFP_KERNEL);
112 if (!image) 114 if (!image)
113 goto out; 115 goto out;
114 116
115 memset(image, 0, sizeof(*image));
116 image->head = 0; 117 image->head = 0;
117 image->entry = &image->head; 118 image->entry = &image->head;
118 image->last_entry = &image->head; 119 image->last_entry = &image->head;
@@ -1068,6 +1069,60 @@ void crash_kexec(struct pt_regs *regs)
1068 } 1069 }
1069} 1070}
1070 1071
1072static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1073 size_t data_len)
1074{
1075 struct elf_note note;
1076
1077 note.n_namesz = strlen(name) + 1;
1078 note.n_descsz = data_len;
1079 note.n_type = type;
1080 memcpy(buf, &note, sizeof(note));
1081 buf += (sizeof(note) + 3)/4;
1082 memcpy(buf, name, note.n_namesz);
1083 buf += (note.n_namesz + 3)/4;
1084 memcpy(buf, data, note.n_descsz);
1085 buf += (note.n_descsz + 3)/4;
1086
1087 return buf;
1088}
1089
1090static void final_note(u32 *buf)
1091{
1092 struct elf_note note;
1093
1094 note.n_namesz = 0;
1095 note.n_descsz = 0;
1096 note.n_type = 0;
1097 memcpy(buf, &note, sizeof(note));
1098}
1099
1100void crash_save_cpu(struct pt_regs *regs, int cpu)
1101{
1102 struct elf_prstatus prstatus;
1103 u32 *buf;
1104
1105 if ((cpu < 0) || (cpu >= NR_CPUS))
1106 return;
1107
1108 /* Using ELF notes here is opportunistic.
1109 * I need a well defined structure format
1110 * for the data I pass, and I need tags
1111 * on the data to indicate what information I have
1112 * squirrelled away. ELF notes happen to provide
1113 * all of that, so there is no need to invent something new.
1114 */
1115 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1116 if (!buf)
1117 return;
1118 memset(&prstatus, 0, sizeof(prstatus));
1119 prstatus.pr_pid = current->pid;
1120 elf_core_copy_regs(&prstatus.pr_reg, regs);
1121 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
1122 sizeof(prstatus));
1123 final_note(buf);
1124}
1125
1071static int __init crash_notes_memory_init(void) 1126static int __init crash_notes_memory_init(void)
1072{ 1127{
1073 /* Allocate memory for saving cpu registers. */ 1128 /* Allocate memory for saving cpu registers. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e0..17ec4afb0994 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
41#include <linux/freezer.h>
41#include <asm-generic/sections.h> 42#include <asm-generic/sections.h>
42#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
43#include <asm/errno.h> 44#include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
83 kprobe_opcode_t *insns; /* Page of instruction slots */ 84 kprobe_opcode_t *insns; /* Page of instruction slots */
84 char slot_used[INSNS_PER_PAGE]; 85 char slot_used[INSNS_PER_PAGE];
85 int nused; 86 int nused;
87 int ngarbage;
86}; 88};
87 89
88static struct hlist_head kprobe_insn_pages; 90static struct hlist_head kprobe_insn_pages;
91static int kprobe_garbage_slots;
92static int collect_garbage_slots(void);
93
94static int __kprobes check_safety(void)
95{
96 int ret = 0;
97#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
98 ret = freeze_processes();
99 if (ret == 0) {
100 struct task_struct *p, *q;
101 do_each_thread(p, q) {
102 if (p != current && p->state == TASK_RUNNING &&
103 p->pid != 0) {
104 printk("Check failed: %s is running\n",p->comm);
105 ret = -1;
106 goto loop_end;
107 }
108 } while_each_thread(p, q);
109 }
110loop_end:
111 thaw_processes();
112#else
113 synchronize_sched();
114#endif
115 return ret;
116}
89 117
90/** 118/**
91 * get_insn_slot() - Find a slot on an executable page for an instruction. 119 * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
96 struct kprobe_insn_page *kip; 124 struct kprobe_insn_page *kip;
97 struct hlist_node *pos; 125 struct hlist_node *pos;
98 126
127 retry:
99 hlist_for_each(pos, &kprobe_insn_pages) { 128 hlist_for_each(pos, &kprobe_insn_pages) {
100 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 129 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
101 if (kip->nused < INSNS_PER_PAGE) { 130 if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
112 } 141 }
113 } 142 }
114 143
115 /* All out of space. Need to allocate a new page. Use slot 0.*/ 144 /* If there are any garbage slots, collect it and try again. */
145 if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
146 goto retry;
147 }
148 /* All out of space. Need to allocate a new page. Use slot 0. */
116 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 149 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
117 if (!kip) { 150 if (!kip) {
118 return NULL; 151 return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
133 memset(kip->slot_used, 0, INSNS_PER_PAGE); 166 memset(kip->slot_used, 0, INSNS_PER_PAGE);
134 kip->slot_used[0] = 1; 167 kip->slot_used[0] = 1;
135 kip->nused = 1; 168 kip->nused = 1;
169 kip->ngarbage = 0;
136 return kip->insns; 170 return kip->insns;
137} 171}
138 172
139void __kprobes free_insn_slot(kprobe_opcode_t *slot) 173/* Return 1 if all garbages are collected, otherwise 0. */
174static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
175{
176 kip->slot_used[idx] = 0;
177 kip->nused--;
178 if (kip->nused == 0) {
179 /*
180 * Page is no longer in use. Free it unless
181 * it's the last one. We keep the last one
182 * so as not to have to set it up again the
183 * next time somebody inserts a probe.
184 */
185 hlist_del(&kip->hlist);
186 if (hlist_empty(&kprobe_insn_pages)) {
187 INIT_HLIST_NODE(&kip->hlist);
188 hlist_add_head(&kip->hlist,
189 &kprobe_insn_pages);
190 } else {
191 module_free(NULL, kip->insns);
192 kfree(kip);
193 }
194 return 1;
195 }
196 return 0;
197}
198
199static int __kprobes collect_garbage_slots(void)
200{
201 struct kprobe_insn_page *kip;
202 struct hlist_node *pos, *next;
203
204 /* Ensure no-one is preepmted on the garbages */
205 if (check_safety() != 0)
206 return -EAGAIN;
207
208 hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
209 int i;
210 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
211 if (kip->ngarbage == 0)
212 continue;
213 kip->ngarbage = 0; /* we will collect all garbages */
214 for (i = 0; i < INSNS_PER_PAGE; i++) {
215 if (kip->slot_used[i] == -1 &&
216 collect_one_slot(kip, i))
217 break;
218 }
219 }
220 kprobe_garbage_slots = 0;
221 return 0;
222}
223
224void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
140{ 225{
141 struct kprobe_insn_page *kip; 226 struct kprobe_insn_page *kip;
142 struct hlist_node *pos; 227 struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
146 if (kip->insns <= slot && 231 if (kip->insns <= slot &&
147 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 232 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
148 int i = (slot - kip->insns) / MAX_INSN_SIZE; 233 int i = (slot - kip->insns) / MAX_INSN_SIZE;
149 kip->slot_used[i] = 0; 234 if (dirty) {
150 kip->nused--; 235 kip->slot_used[i] = -1;
151 if (kip->nused == 0) { 236 kip->ngarbage++;
152 /* 237 } else {
153 * Page is no longer in use. Free it unless 238 collect_one_slot(kip, i);
154 * it's the last one. We keep the last one
155 * so as not to have to set it up again the
156 * next time somebody inserts a probe.
157 */
158 hlist_del(&kip->hlist);
159 if (hlist_empty(&kprobe_insn_pages)) {
160 INIT_HLIST_NODE(&kip->hlist);
161 hlist_add_head(&kip->hlist,
162 &kprobe_insn_pages);
163 } else {
164 module_free(NULL, kip->insns);
165 kfree(kip);
166 }
167 } 239 }
168 return; 240 break;
169 } 241 }
170 } 242 }
243 if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
244 collect_garbage_slots();
245 }
171} 246}
172#endif 247#endif
173 248
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c9fefdb1a7db..b02032476dc2 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -140,13 +140,6 @@ void lockdep_on(void)
140 140
141EXPORT_SYMBOL(lockdep_on); 141EXPORT_SYMBOL(lockdep_on);
142 142
143int lockdep_internal(void)
144{
145 return current->lockdep_recursion != 0;
146}
147
148EXPORT_SYMBOL(lockdep_internal);
149
150/* 143/*
151 * Debugging switches: 144 * Debugging switches:
152 */ 145 */
@@ -228,17 +221,15 @@ static int save_trace(struct stack_trace *trace)
228 trace->skip = 3; 221 trace->skip = 3;
229 trace->all_contexts = 0; 222 trace->all_contexts = 0;
230 223
231 /* Make sure to not recurse in case the the unwinder needs to tak
232e locks. */
233 lockdep_off();
234 save_stack_trace(trace, NULL); 224 save_stack_trace(trace, NULL);
235 lockdep_on();
236 225
237 trace->max_entries = trace->nr_entries; 226 trace->max_entries = trace->nr_entries;
238 227
239 nr_stack_trace_entries += trace->nr_entries; 228 nr_stack_trace_entries += trace->nr_entries;
240 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) 229 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) {
230 __raw_spin_unlock(&hash_lock);
241 return 0; 231 return 0;
232 }
242 233
243 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 234 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
244 __raw_spin_unlock(&hash_lock); 235 __raw_spin_unlock(&hash_lock);
@@ -357,7 +348,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
357 348
358static void print_lock_name(struct lock_class *class) 349static void print_lock_name(struct lock_class *class)
359{ 350{
360 char str[128], c1, c2, c3, c4; 351 char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
361 const char *name; 352 const char *name;
362 353
363 get_usage_chars(class, &c1, &c2, &c3, &c4); 354 get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -379,7 +370,7 @@ static void print_lock_name(struct lock_class *class)
379static void print_lockdep_cache(struct lockdep_map *lock) 370static void print_lockdep_cache(struct lockdep_map *lock)
380{ 371{
381 const char *name; 372 const char *name;
382 char str[128]; 373 char str[KSYM_NAME_LEN + 1];
383 374
384 name = lock->name; 375 name = lock->name;
385 if (!name) 376 if (!name)
@@ -449,7 +440,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
449 print_lock_class_header(class, depth); 440 print_lock_class_header(class, depth);
450 441
451 list_for_each_entry(entry, &class->locks_after, entry) { 442 list_for_each_entry(entry, &class->locks_after, entry) {
452 DEBUG_LOCKS_WARN_ON(!entry->class); 443 if (DEBUG_LOCKS_WARN_ON(!entry->class))
444 return;
445
453 print_lock_dependencies(entry->class, depth + 1); 446 print_lock_dependencies(entry->class, depth + 1);
454 447
455 printk("%*s ... acquired at:\n",depth,""); 448 printk("%*s ... acquired at:\n",depth,"");
@@ -474,7 +467,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
474 return 0; 467 return 0;
475 468
476 entry->class = this; 469 entry->class = this;
477 save_trace(&entry->trace); 470 if (!save_trace(&entry->trace))
471 return 0;
478 472
479 /* 473 /*
480 * Since we never remove from the dependency list, the list can 474 * Since we never remove from the dependency list, the list can
@@ -562,8 +556,12 @@ static noinline int print_circular_bug_tail(void)
562 if (debug_locks_silent) 556 if (debug_locks_silent)
563 return 0; 557 return 0;
564 558
559 /* hash_lock unlocked by the header */
560 __raw_spin_lock(&hash_lock);
565 this.class = check_source->class; 561 this.class = check_source->class;
566 save_trace(&this.trace); 562 if (!save_trace(&this.trace))
563 return 0;
564 __raw_spin_unlock(&hash_lock);
567 print_circular_bug_entry(&this, 0); 565 print_circular_bug_entry(&this, 0);
568 566
569 printk("\nother info that might help us debug this:\n\n"); 567 printk("\nother info that might help us debug this:\n\n");
@@ -966,14 +964,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
966 &prev->class->locks_after, next->acquire_ip); 964 &prev->class->locks_after, next->acquire_ip);
967 if (!ret) 965 if (!ret)
968 return 0; 966 return 0;
969 /* 967
970 * Return value of 2 signals 'dependency already added',
971 * in that case we dont have to add the backlink either.
972 */
973 if (ret == 2)
974 return 2;
975 ret = add_lock_to_list(next->class, prev->class, 968 ret = add_lock_to_list(next->class, prev->class,
976 &next->class->locks_before, next->acquire_ip); 969 &next->class->locks_before, next->acquire_ip);
970 if (!ret)
971 return 0;
977 972
978 /* 973 /*
979 * Debugging printouts: 974 * Debugging printouts:
@@ -1025,7 +1020,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1025 * added: 1020 * added:
1026 */ 1021 */
1027 if (hlock->read != 2) { 1022 if (hlock->read != 2) {
1028 check_prev_add(curr, hlock, next); 1023 if (!check_prev_add(curr, hlock, next))
1024 return 0;
1029 /* 1025 /*
1030 * Stop after the first non-trylock entry, 1026 * Stop after the first non-trylock entry,
1031 * as non-trylock entries have added their 1027 * as non-trylock entries have added their
@@ -1182,6 +1178,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1182 struct lockdep_subclass_key *key; 1178 struct lockdep_subclass_key *key;
1183 struct list_head *hash_head; 1179 struct list_head *hash_head;
1184 struct lock_class *class; 1180 struct lock_class *class;
1181 unsigned long flags;
1185 1182
1186 class = look_up_lock_class(lock, subclass); 1183 class = look_up_lock_class(lock, subclass);
1187 if (likely(class)) 1184 if (likely(class))
@@ -1203,6 +1200,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1203 key = lock->key->subkeys + subclass; 1200 key = lock->key->subkeys + subclass;
1204 hash_head = classhashentry(key); 1201 hash_head = classhashentry(key);
1205 1202
1203 raw_local_irq_save(flags);
1206 __raw_spin_lock(&hash_lock); 1204 __raw_spin_lock(&hash_lock);
1207 /* 1205 /*
1208 * We have to do the hash-walk again, to avoid races 1206 * We have to do the hash-walk again, to avoid races
@@ -1217,6 +1215,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1217 */ 1215 */
1218 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { 1216 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1219 __raw_spin_unlock(&hash_lock); 1217 __raw_spin_unlock(&hash_lock);
1218 raw_local_irq_restore(flags);
1220 debug_locks_off(); 1219 debug_locks_off();
1221 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 1220 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1222 printk("turning off the locking correctness validator.\n"); 1221 printk("turning off the locking correctness validator.\n");
@@ -1239,15 +1238,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1239 1238
1240 if (verbose(class)) { 1239 if (verbose(class)) {
1241 __raw_spin_unlock(&hash_lock); 1240 __raw_spin_unlock(&hash_lock);
1241 raw_local_irq_restore(flags);
1242 printk("\nnew class %p: %s", class->key, class->name); 1242 printk("\nnew class %p: %s", class->key, class->name);
1243 if (class->name_version > 1) 1243 if (class->name_version > 1)
1244 printk("#%d", class->name_version); 1244 printk("#%d", class->name_version);
1245 printk("\n"); 1245 printk("\n");
1246 dump_stack(); 1246 dump_stack();
1247 raw_local_irq_save(flags);
1247 __raw_spin_lock(&hash_lock); 1248 __raw_spin_lock(&hash_lock);
1248 } 1249 }
1249out_unlock_set: 1250out_unlock_set:
1250 __raw_spin_unlock(&hash_lock); 1251 __raw_spin_unlock(&hash_lock);
1252 raw_local_irq_restore(flags);
1251 1253
1252 if (!subclass || force) 1254 if (!subclass || force)
1253 lock->class_cache = class; 1255 lock->class_cache = class;
@@ -1728,6 +1730,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1728 debug_atomic_dec(&nr_unused_locks); 1730 debug_atomic_dec(&nr_unused_locks);
1729 break; 1731 break;
1730 default: 1732 default:
1733 __raw_spin_unlock(&hash_lock);
1731 debug_locks_off(); 1734 debug_locks_off();
1732 WARN_ON(1); 1735 WARN_ON(1);
1733 return 0; 1736 return 0;
@@ -2645,6 +2648,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2645 } 2648 }
2646 local_irq_restore(flags); 2649 local_irq_restore(flags);
2647} 2650}
2651EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
2648 2652
2649static void print_held_locks_bug(struct task_struct *curr) 2653static void print_held_locks_bug(struct task_struct *curr)
2650{ 2654{
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb2..8ce09bc4613d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
20#define MAX_LOCKDEP_KEYS_BITS 11 20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) 21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22 22
23#define MAX_LOCKDEP_CHAINS_BITS 13 23#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 25
26/* 26/*
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3fa..b554b40a4aa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
113 return 0; 113 return 0;
114} 114}
115 115
116static struct seq_operations lockdep_ops = { 116static const struct seq_operations lockdep_ops = {
117 .start = l_start, 117 .start = l_start,
118 .next = l_next, 118 .next = l_next,
119 .stop = l_stop, 119 .stop = l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
135 return res; 135 return res;
136} 136}
137 137
138static struct file_operations proc_lockdep_operations = { 138static const struct file_operations proc_lockdep_operations = {
139 .open = lockdep_open, 139 .open = lockdep_open,
140 .read = seq_read, 140 .read = seq_read,
141 .llseek = seq_lseek, 141 .llseek = seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
319 return single_open(file, lockdep_stats_show, NULL); 319 return single_open(file, lockdep_stats_show, NULL);
320} 320}
321 321
322static struct file_operations proc_lockdep_stats_operations = { 322static const struct file_operations proc_lockdep_stats_operations = {
323 .open = lockdep_stats_open, 323 .open = lockdep_stats_open,
324 .read = seq_read, 324 .read = seq_read,
325 .llseek = seq_lseek, 325 .llseek = seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index e2d09d604ca0..d9eae45d0145 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2209,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p)
2209 Where refcount is a number or -, and deps is a comma-separated list 2209 Where refcount is a number or -, and deps is a comma-separated list
2210 of depends or -. 2210 of depends or -.
2211*/ 2211*/
2212struct seq_operations modules_op = { 2212const struct seq_operations modules_op = {
2213 .start = m_start, 2213 .start = m_start,
2214 .next = m_next, 2214 .next = m_next,
2215 .stop = m_stop, 2215 .stop = m_stop,
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 18651641a7b5..841539d72c55 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
77 77
78void debug_mutex_unlock(struct mutex *lock) 78void debug_mutex_unlock(struct mutex *lock)
79{ 79{
80 if (unlikely(!debug_locks))
81 return;
82
80 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
81 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 84 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 85 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f9..a48879b0b921 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -31,7 +31,7 @@
31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
32static struct hlist_head *pid_hash; 32static struct hlist_head *pid_hash;
33static int pidhash_shift; 33static int pidhash_shift;
34static kmem_cache_t *pid_cachep; 34static struct kmem_cache *pid_cachep;
35 35
36int pid_max = PID_MAX_DEFAULT; 36int pid_max = PID_MAX_DEFAULT;
37 37
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06f..5fe87de10ff0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
70/* 70/*
71 * Lets keep our timers in a slab cache :-) 71 * Lets keep our timers in a slab cache :-)
72 */ 72 */
73static kmem_cache_t *posix_timers_cache; 73static struct kmem_cache *posix_timers_cache;
74static struct idr posix_timers_id; 74static struct idr posix_timers_id;
75static DEFINE_SPINLOCK(idr_lock); 75static DEFINE_SPINLOCK(idr_lock);
76 76
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca3479..710ed084e7c5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED
78 78
79config SOFTWARE_SUSPEND 79config SOFTWARE_SUSPEND
80 bool "Software Suspend" 80 bool "Software Suspend"
81 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) 81 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
82 ---help--- 82 ---help---
83 Enable the possibility of suspending the machine. 83 Enable the possibility of suspending the machine.
84 It doesn't need ACPI or APM. 84 It doesn't need ACPI or APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b1fb7866b0b3..0b00f56c2ad0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -27,6 +28,23 @@
27static int noresume = 0; 28static int noresume = 0;
28char resume_file[256] = CONFIG_PM_STD_PARTITION; 29char resume_file[256] = CONFIG_PM_STD_PARTITION;
29dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block;
32
33/**
34 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails
36 */
37
38static inline int platform_prepare(void)
39{
40 int error = 0;
41
42 if (pm_disk_mode == PM_DISK_PLATFORM) {
43 if (pm_ops && pm_ops->prepare)
44 error = pm_ops->prepare(PM_SUSPEND_DISK);
45 }
46 return error;
47}
30 48
31/** 49/**
32 * power_down - Shut machine down for hibernate. 50 * power_down - Shut machine down for hibernate.
@@ -40,12 +58,10 @@ dev_t swsusp_resume_device;
40 58
41static void power_down(suspend_disk_method_t mode) 59static void power_down(suspend_disk_method_t mode)
42{ 60{
43 int error = 0;
44
45 switch(mode) { 61 switch(mode) {
46 case PM_DISK_PLATFORM: 62 case PM_DISK_PLATFORM:
47 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 63 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
48 error = pm_ops->enter(PM_SUSPEND_DISK); 64 pm_ops->enter(PM_SUSPEND_DISK);
49 break; 65 break;
50 case PM_DISK_SHUTDOWN: 66 case PM_DISK_SHUTDOWN:
51 kernel_power_off(); 67 kernel_power_off();
@@ -90,12 +106,18 @@ static int prepare_processes(void)
90 goto thaw; 106 goto thaw;
91 } 107 }
92 108
109 error = platform_prepare();
110 if (error)
111 goto thaw;
112
93 /* Free memory before shutting down devices. */ 113 /* Free memory before shutting down devices. */
94 if (!(error = swsusp_shrink_memory())) 114 if (!(error = swsusp_shrink_memory()))
95 return 0; 115 return 0;
96thaw: 116
117 platform_finish();
118 thaw:
97 thaw_processes(); 119 thaw_processes();
98enable_cpus: 120 enable_cpus:
99 enable_nonboot_cpus(); 121 enable_nonboot_cpus();
100 pm_restore_console(); 122 pm_restore_console();
101 return error; 123 return error;
@@ -127,7 +149,7 @@ int pm_suspend_disk(void)
127 return error; 149 return error;
128 150
129 if (pm_disk_mode == PM_DISK_TESTPROC) 151 if (pm_disk_mode == PM_DISK_TESTPROC)
130 goto Thaw; 152 return 0;
131 153
132 suspend_console(); 154 suspend_console();
133 error = device_suspend(PMSG_FREEZE); 155 error = device_suspend(PMSG_FREEZE);
@@ -189,10 +211,10 @@ static int software_resume(void)
189{ 211{
190 int error; 212 int error;
191 213
192 down(&pm_sem); 214 mutex_lock(&pm_mutex);
193 if (!swsusp_resume_device) { 215 if (!swsusp_resume_device) {
194 if (!strlen(resume_file)) { 216 if (!strlen(resume_file)) {
195 up(&pm_sem); 217 mutex_unlock(&pm_mutex);
196 return -ENOENT; 218 return -ENOENT;
197 } 219 }
198 swsusp_resume_device = name_to_dev_t(resume_file); 220 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -207,7 +229,7 @@ static int software_resume(void)
207 * FIXME: If noresume is specified, we need to find the partition 229 * FIXME: If noresume is specified, we need to find the partition
208 * and reset it back to normal swap space. 230 * and reset it back to normal swap space.
209 */ 231 */
210 up(&pm_sem); 232 mutex_unlock(&pm_mutex);
211 return 0; 233 return 0;
212 } 234 }
213 235
@@ -251,7 +273,7 @@ static int software_resume(void)
251 unprepare_processes(); 273 unprepare_processes();
252 Done: 274 Done:
253 /* For success case, the suspend path will release the lock */ 275 /* For success case, the suspend path will release the lock */
254 up(&pm_sem); 276 mutex_unlock(&pm_mutex);
255 pr_debug("PM: Resume from disk failed.\n"); 277 pr_debug("PM: Resume from disk failed.\n");
256 return 0; 278 return 0;
257} 279}
@@ -312,7 +334,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
312 p = memchr(buf, '\n', n); 334 p = memchr(buf, '\n', n);
313 len = p ? p - buf : n; 335 len = p ? p - buf : n;
314 336
315 down(&pm_sem); 337 mutex_lock(&pm_mutex);
316 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { 338 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
317 if (!strncmp(buf, pm_disk_modes[i], len)) { 339 if (!strncmp(buf, pm_disk_modes[i], len)) {
318 mode = i; 340 mode = i;
@@ -336,7 +358,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
336 358
337 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 359 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
338 pm_disk_modes[mode]); 360 pm_disk_modes[mode]);
339 up(&pm_sem); 361 mutex_unlock(&pm_mutex);
340 return error ? error : n; 362 return error ? error : n;
341} 363}
342 364
@@ -361,14 +383,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
361 if (maj != MAJOR(res) || min != MINOR(res)) 383 if (maj != MAJOR(res) || min != MINOR(res))
362 goto out; 384 goto out;
363 385
364 down(&pm_sem); 386 mutex_lock(&pm_mutex);
365 swsusp_resume_device = res; 387 swsusp_resume_device = res;
366 up(&pm_sem); 388 mutex_unlock(&pm_mutex);
367 printk("Attempting manual resume\n"); 389 printk("Attempting manual resume\n");
368 noresume = 0; 390 noresume = 0;
369 software_resume(); 391 software_resume();
370 ret = n; 392 ret = n;
371out: 393 out:
372 return ret; 394 return ret;
373} 395}
374 396
@@ -423,6 +445,19 @@ static int __init resume_setup(char *str)
423 return 1; 445 return 1;
424} 446}
425 447
448static int __init resume_offset_setup(char *str)
449{
450 unsigned long long offset;
451
452 if (noresume)
453 return 1;
454
455 if (sscanf(str, "%llu", &offset) == 1)
456 swsusp_resume_block = offset;
457
458 return 1;
459}
460
426static int __init noresume_setup(char *str) 461static int __init noresume_setup(char *str)
427{ 462{
428 noresume = 1; 463 noresume = 1;
@@ -430,4 +465,5 @@ static int __init noresume_setup(char *str)
430} 465}
431 466
432__setup("noresume", noresume_setup); 467__setup("noresume", noresume_setup);
468__setup("resume_offset=", resume_offset_setup);
433__setup("resume=", resume_setup); 469__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 873228c71dab..500eb87f643d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
11#include <linux/suspend.h> 12#include <linux/suspend.h>
12#include <linux/kobject.h> 13#include <linux/kobject.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -18,13 +19,14 @@
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/resume-trace.h> 21#include <linux/resume-trace.h>
22#include <linux/freezer.h>
21 23
22#include "power.h" 24#include "power.h"
23 25
24/*This is just an arbitrary number */ 26/*This is just an arbitrary number */
25#define FREE_PAGE_NUMBER (100) 27#define FREE_PAGE_NUMBER (100)
26 28
27DECLARE_MUTEX(pm_sem); 29DEFINE_MUTEX(pm_mutex);
28 30
29struct pm_ops *pm_ops; 31struct pm_ops *pm_ops;
30suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; 32suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
36 38
37void pm_set_ops(struct pm_ops * ops) 39void pm_set_ops(struct pm_ops * ops)
38{ 40{
39 down(&pm_sem); 41 mutex_lock(&pm_mutex);
40 pm_ops = ops; 42 pm_ops = ops;
41 up(&pm_sem); 43 mutex_unlock(&pm_mutex);
42} 44}
43 45
44 46
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state)
182 184
183 if (!valid_state(state)) 185 if (!valid_state(state))
184 return -ENODEV; 186 return -ENODEV;
185 if (down_trylock(&pm_sem)) 187 if (!mutex_trylock(&pm_mutex))
186 return -EBUSY; 188 return -EBUSY;
187 189
188 if (state == PM_SUSPEND_DISK) { 190 if (state == PM_SUSPEND_DISK) {
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state)
200 pr_debug("PM: Finishing wakeup.\n"); 202 pr_debug("PM: Finishing wakeup.\n");
201 suspend_finish(state); 203 suspend_finish(state);
202 Unlock: 204 Unlock:
203 up(&pm_sem); 205 mutex_unlock(&pm_mutex);
204 return error; 206 return error;
205} 207}
206 208
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state)
229 return -EINVAL; 231 return -EINVAL;
230} 232}
231 233
232 234EXPORT_SYMBOL(pm_suspend);
233 235
234decl_subsys(power,NULL,NULL); 236decl_subsys(power,NULL,NULL);
235 237
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b272..eb461b816bf4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
22 return -EPERM; 22 return -EPERM;
23} 23}
24#endif 24#endif
25extern struct semaphore pm_sem; 25
26extern struct mutex pm_mutex;
27
26#define power_attr(_name) \ 28#define power_attr(_name) \
27static struct subsys_attribute _name##_attr = { \ 29static struct subsys_attribute _name##_attr = { \
28 .attr = { \ 30 .attr = { \
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end;
42extern unsigned long image_size; 44extern unsigned long image_size;
43extern int in_suspend; 45extern int in_suspend;
44extern dev_t swsusp_resume_device; 46extern dev_t swsusp_resume_device;
47extern sector_t swsusp_resume_block;
45 48
46extern asmlinkage int swsusp_arch_suspend(void); 49extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 50extern asmlinkage int swsusp_arch_resume(void);
@@ -102,8 +105,18 @@ struct snapshot_handle {
102extern unsigned int snapshot_additional_pages(struct zone *zone); 105extern unsigned int snapshot_additional_pages(struct zone *zone);
103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 106extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 107extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
108extern void snapshot_write_finalize(struct snapshot_handle *handle);
105extern int snapshot_image_loaded(struct snapshot_handle *handle); 109extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle); 110
111/*
112 * This structure is used to pass the values needed for the identification
113 * of the resume swap area from a user space to the kernel via the
114 * SNAPSHOT_SET_SWAP_AREA ioctl
115 */
116struct resume_swap_area {
117 loff_t offset;
118 u_int32_t dev;
119} __attribute__((packed));
107 120
108#define SNAPSHOT_IOC_MAGIC '3' 121#define SNAPSHOT_IOC_MAGIC '3'
109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 122#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
117#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) 130#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
118#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) 131#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
119#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) 132#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
120#define SNAPSHOT_IOC_MAXNR 11 133#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
134#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
135 struct resume_swap_area)
136#define SNAPSHOT_IOC_MAXNR 13
137
138#define PMOPS_PREPARE 1
139#define PMOPS_ENTER 2
140#define PMOPS_FINISH 3
121 141
122/** 142/**
123 * The bitmap is used for tracing allocated swap pages 143 * The bitmap is used for tracing allocated swap pages
@@ -141,7 +161,7 @@ struct bitmap_page {
141 161
142extern void free_bitmap(struct bitmap_page *bitmap); 162extern void free_bitmap(struct bitmap_page *bitmap);
143extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); 163extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
144extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); 164extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
145extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); 165extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
146 166
147extern int swsusp_check(void); 167extern int swsusp_check(void);
@@ -153,3 +173,7 @@ extern int swsusp_read(void);
153extern int swsusp_write(void); 173extern int swsusp_write(void);
154extern void swsusp_close(void); 174extern void swsusp_close(void);
155extern int suspend_enter(suspend_state_t state); 175extern int suspend_enter(suspend_state_t state);
176
177struct timeval;
178extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 unsigned int, char *);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e6..99eeb119b06d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,12 +13,15 @@
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h>
16 17
17/* 18/*
18 * Timeout for stopping processes 19 * Timeout for stopping processes
19 */ 20 */
20#define TIMEOUT (20 * HZ) 21#define TIMEOUT (20 * HZ)
21 22
23#define FREEZER_KERNEL_THREADS 0
24#define FREEZER_USER_SPACE 1
22 25
23static inline int freezeable(struct task_struct * p) 26static inline int freezeable(struct task_struct * p)
24{ 27{
@@ -39,7 +42,6 @@ void refrigerator(void)
39 long save; 42 long save;
40 save = current->state; 43 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 44 pr_debug("%s entered refrigerator\n", current->comm);
42 printk("=");
43 45
44 frozen_process(current); 46 frozen_process(current);
45 spin_lock_irq(&current->sighand->siglock); 47 spin_lock_irq(&current->sighand->siglock);
@@ -79,96 +81,136 @@ static void cancel_freezing(struct task_struct *p)
79 } 81 }
80} 82}
81 83
82/* 0 = success, else # of processes that we failed to stop */ 84static inline int is_user_space(struct task_struct *p)
83int freeze_processes(void) 85{
86 return p->mm && !(p->flags & PF_BORROWED_MM);
87}
88
89static unsigned int try_to_freeze_tasks(int freeze_user_space)
84{ 90{
85 int todo, nr_user, user_frozen;
86 unsigned long start_time;
87 struct task_struct *g, *p; 91 struct task_struct *g, *p;
92 unsigned long end_time;
93 unsigned int todo;
88 94
89 printk( "Stopping tasks: " ); 95 end_time = jiffies + TIMEOUT;
90 start_time = jiffies;
91 user_frozen = 0;
92 do { 96 do {
93 nr_user = todo = 0; 97 todo = 0;
94 read_lock(&tasklist_lock); 98 read_lock(&tasklist_lock);
95 do_each_thread(g, p) { 99 do_each_thread(g, p) {
96 if (!freezeable(p)) 100 if (!freezeable(p))
97 continue; 101 continue;
102
98 if (frozen(p)) 103 if (frozen(p))
99 continue; 104 continue;
100 if (p->state == TASK_TRACED && frozen(p->parent)) { 105
106 if (p->state == TASK_TRACED &&
107 (frozen(p->parent) ||
108 p->parent->state == TASK_STOPPED)) {
101 cancel_freezing(p); 109 cancel_freezing(p);
102 continue; 110 continue;
103 } 111 }
104 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 112 if (is_user_space(p)) {
105 /* The task is a user-space one. 113 if (!freeze_user_space)
106 * Freeze it unless there's a vfork completion 114 continue;
107 * pending 115
116 /* Freeze the task unless there is a vfork
117 * completion pending
108 */ 118 */
109 if (!p->vfork_done) 119 if (!p->vfork_done)
110 freeze_process(p); 120 freeze_process(p);
111 nr_user++;
112 } else { 121 } else {
113 /* Freeze only if the user space is frozen */ 122 if (freeze_user_space)
114 if (user_frozen) 123 continue;
115 freeze_process(p); 124
116 todo++; 125 freeze_process(p);
117 } 126 }
127 todo++;
118 } while_each_thread(g, p); 128 } while_each_thread(g, p);
119 read_unlock(&tasklist_lock); 129 read_unlock(&tasklist_lock);
120 todo += nr_user;
121 if (!user_frozen && !nr_user) {
122 sys_sync();
123 start_time = jiffies;
124 }
125 user_frozen = !nr_user;
126 yield(); /* Yield is okay here */ 130 yield(); /* Yield is okay here */
127 if (todo && time_after(jiffies, start_time + TIMEOUT)) 131 if (todo && time_after(jiffies, end_time))
128 break; 132 break;
129 } while(todo); 133 } while (todo);
130 134
131 /* This does not unfreeze processes that are already frozen
132 * (we have slightly ugly calling convention in that respect,
133 * and caller must call thaw_processes() if something fails),
134 * but it cleans up leftover PF_FREEZE requests.
135 */
136 if (todo) { 135 if (todo) {
137 printk( "\n" ); 136 /* This does not unfreeze processes that are already frozen
138 printk(KERN_ERR " stopping tasks timed out " 137 * (we have slightly ugly calling convention in that respect,
139 "after %d seconds (%d tasks remaining):\n", 138 * and caller must call thaw_processes() if something fails),
140 TIMEOUT / HZ, todo); 139 * but it cleans up leftover PF_FREEZE requests.
140 */
141 printk("\n");
142 printk(KERN_ERR "Stopping %s timed out after %d seconds "
143 "(%d tasks refusing to freeze):\n",
144 freeze_user_space ? "user space processes" :
145 "kernel threads",
146 TIMEOUT / HZ, todo);
141 read_lock(&tasklist_lock); 147 read_lock(&tasklist_lock);
142 do_each_thread(g, p) { 148 do_each_thread(g, p) {
149 if (is_user_space(p) == !freeze_user_space)
150 continue;
151
143 if (freezeable(p) && !frozen(p)) 152 if (freezeable(p) && !frozen(p))
144 printk(KERN_ERR " %s\n", p->comm); 153 printk(KERN_ERR " %s\n", p->comm);
154
145 cancel_freezing(p); 155 cancel_freezing(p);
146 } while_each_thread(g, p); 156 } while_each_thread(g, p);
147 read_unlock(&tasklist_lock); 157 read_unlock(&tasklist_lock);
148 return todo;
149 } 158 }
150 159
151 printk( "|\n" ); 160 return todo;
161}
162
163/**
164 * freeze_processes - tell processes to enter the refrigerator
165 *
166 * Returns 0 on success, or the number of processes that didn't freeze,
167 * although they were told to.
168 */
169int freeze_processes(void)
170{
171 unsigned int nr_unfrozen;
172
173 printk("Stopping tasks ... ");
174 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
175 if (nr_unfrozen)
176 return nr_unfrozen;
177
178 sys_sync();
179 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
180 if (nr_unfrozen)
181 return nr_unfrozen;
182
183 printk("done.\n");
152 BUG_ON(in_atomic()); 184 BUG_ON(in_atomic());
153 return 0; 185 return 0;
154} 186}
155 187
156void thaw_processes(void) 188static void thaw_tasks(int thaw_user_space)
157{ 189{
158 struct task_struct *g, *p; 190 struct task_struct *g, *p;
159 191
160 printk( "Restarting tasks..." );
161 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
162 do_each_thread(g, p) { 193 do_each_thread(g, p) {
163 if (!freezeable(p)) 194 if (!freezeable(p))
164 continue; 195 continue;
196
197 if (is_user_space(p) == !thaw_user_space)
198 continue;
199
165 if (!thaw_process(p)) 200 if (!thaw_process(p))
166 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 201 printk(KERN_WARNING " Strange, %s not stopped\n",
202 p->comm );
167 } while_each_thread(g, p); 203 } while_each_thread(g, p);
168
169 read_unlock(&tasklist_lock); 204 read_unlock(&tasklist_lock);
205}
206
207void thaw_processes(void)
208{
209 printk("Restarting tasks ... ");
210 thaw_tasks(FREEZER_KERNEL_THREADS);
211 thaw_tasks(FREEZER_USER_SPACE);
170 schedule(); 212 schedule();
171 printk( " done\n" ); 213 printk("done.\n");
172} 214}
173 215
174EXPORT_SYMBOL(refrigerator); 216EXPORT_SYMBOL(refrigerator);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d6..c024606221c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
1/* 1/*
2 * linux/kernel/power/snapshot.c 2 * linux/kernel/power/snapshot.c
3 * 3 *
4 * This file provide system snapshot/restore functionality. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 * 8 *
8 * This file is released under the GPLv2, and is based on swsusp.c. 9 * This file is released under the GPLv2.
9 * 10 *
10 */ 11 */
11 12
12
13#include <linux/version.h> 13#include <linux/version.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
@@ -34,137 +34,24 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37/* List of PBEs used for creating and restoring the suspend image */ 37/* List of PBEs needed for restoring the pages that were allocated before
38 * the suspend and included in the suspend image, but have also been
39 * allocated by the "resume" kernel, so their contents cannot be written
40 * directly to their "original" page frames.
41 */
38struct pbe *restore_pblist; 42struct pbe *restore_pblist;
39 43
40static unsigned int nr_copy_pages; 44/* Pointer to an auxiliary buffer (1 page) */
41static unsigned int nr_meta_pages;
42static void *buffer; 45static void *buffer;
43 46
44#ifdef CONFIG_HIGHMEM
45unsigned int count_highmem_pages(void)
46{
47 struct zone *zone;
48 unsigned long zone_pfn;
49 unsigned int n = 0;
50
51 for_each_zone (zone)
52 if (is_highmem(zone)) {
53 mark_free_pages(zone);
54 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
55 struct page *page;
56 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 if (PageReserved(page))
61 continue;
62 if (PageNosaveFree(page))
63 continue;
64 n++;
65 }
66 }
67 return n;
68}
69
70struct highmem_page {
71 char *data;
72 struct page *page;
73 struct highmem_page *next;
74};
75
76static struct highmem_page *highmem_copy;
77
78static int save_highmem_zone(struct zone *zone)
79{
80 unsigned long zone_pfn;
81 mark_free_pages(zone);
82 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
83 struct page *page;
84 struct highmem_page *save;
85 void *kaddr;
86 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
87
88 if (!(pfn%10000))
89 printk(".");
90 if (!pfn_valid(pfn))
91 continue;
92 page = pfn_to_page(pfn);
93 /*
94 * This condition results from rvmalloc() sans vmalloc_32()
95 * and architectural memory reservations. This should be
96 * corrected eventually when the cases giving rise to this
97 * are better understood.
98 */
99 if (PageReserved(page))
100 continue;
101 BUG_ON(PageNosave(page));
102 if (PageNosaveFree(page))
103 continue;
104 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
105 if (!save)
106 return -ENOMEM;
107 save->next = highmem_copy;
108 save->page = page;
109 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
110 if (!save->data) {
111 kfree(save);
112 return -ENOMEM;
113 }
114 kaddr = kmap_atomic(page, KM_USER0);
115 memcpy(save->data, kaddr, PAGE_SIZE);
116 kunmap_atomic(kaddr, KM_USER0);
117 highmem_copy = save;
118 }
119 return 0;
120}
121
122int save_highmem(void)
123{
124 struct zone *zone;
125 int res = 0;
126
127 pr_debug("swsusp: Saving Highmem");
128 drain_local_pages();
129 for_each_zone (zone) {
130 if (is_highmem(zone))
131 res = save_highmem_zone(zone);
132 if (res)
133 return res;
134 }
135 printk("\n");
136 return 0;
137}
138
139int restore_highmem(void)
140{
141 printk("swsusp: Restoring Highmem\n");
142 while (highmem_copy) {
143 struct highmem_page *save = highmem_copy;
144 void *kaddr;
145 highmem_copy = save->next;
146
147 kaddr = kmap_atomic(save->page, KM_USER0);
148 memcpy(kaddr, save->data, PAGE_SIZE);
149 kunmap_atomic(kaddr, KM_USER0);
150 free_page((long) save->data);
151 kfree(save);
152 }
153 return 0;
154}
155#else
156static inline unsigned int count_highmem_pages(void) {return 0;}
157static inline int save_highmem(void) {return 0;}
158static inline int restore_highmem(void) {return 0;}
159#endif
160
161/** 47/**
162 * @safe_needed - on resume, for storing the PBE list and the image, 48 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages 49 * we can only use memory pages that do not conflict with the pages
164 * used before suspend. 50 * used before suspend. The unsafe pages have PageNosaveFree set
51 * and we count them using unsafe_pages.
165 * 52 *
166 * The unsafe pages are marked with the PG_nosave_free flag 53 * Each allocated image page is marked as PageNosave and PageNosaveFree
167 * and we count them using unsafe_pages 54 * so that swsusp_free() can release it.
168 */ 55 */
169 56
170#define PG_ANY 0 57#define PG_ANY 0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
174 61
175static unsigned int allocated_unsafe_pages; 62static unsigned int allocated_unsafe_pages;
176 63
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 64static void *get_image_page(gfp_t gfp_mask, int safe_needed)
178{ 65{
179 void *res; 66 void *res;
180 67
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
195 82
196unsigned long get_safe_page(gfp_t gfp_mask) 83unsigned long get_safe_page(gfp_t gfp_mask)
197{ 84{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); 85 return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
86}
87
88static struct page *alloc_image_page(gfp_t gfp_mask)
89{
90 struct page *page;
91
92 page = alloc_page(gfp_mask);
93 if (page) {
94 SetPageNosave(page);
95 SetPageNosaveFree(page);
96 }
97 return page;
199} 98}
200 99
201/** 100/**
202 * free_image_page - free page represented by @addr, allocated with 101 * free_image_page - free page represented by @addr, allocated with
203 * alloc_image_page (page flags set by it must be cleared) 102 * get_image_page (page flags set by it must be cleared)
204 */ 103 */
205 104
206static inline void free_image_page(void *addr, int clear_nosave_free) 105static inline void free_image_page(void *addr, int clear_nosave_free)
207{ 106{
208 ClearPageNosave(virt_to_page(addr)); 107 struct page *page;
108
109 BUG_ON(!virt_addr_valid(addr));
110
111 page = virt_to_page(addr);
112
113 ClearPageNosave(page);
209 if (clear_nosave_free) 114 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr)); 115 ClearPageNosaveFree(page);
211 free_page((unsigned long)addr); 116
117 __free_page(page);
212} 118}
213 119
214/* struct linked_page is used to build chains of pages */ 120/* struct linked_page is used to build chains of pages */
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 175 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
270 struct linked_page *lp; 176 struct linked_page *lp;
271 177
272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); 178 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
273 if (!lp) 179 if (!lp)
274 return NULL; 180 return NULL;
275 181
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
446 352
447 /* Compute the number of zones */ 353 /* Compute the number of zones */
448 nr = 0; 354 nr = 0;
449 for_each_zone (zone) 355 for_each_zone(zone)
450 if (populated_zone(zone) && !is_highmem(zone)) 356 if (populated_zone(zone))
451 nr++; 357 nr++;
452 358
453 /* Allocate the list of zones bitmap objects */ 359 /* Allocate the list of zones bitmap objects */
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
459 } 365 }
460 366
461 /* Initialize the zone bitmap objects */ 367 /* Initialize the zone bitmap objects */
462 for_each_zone (zone) { 368 for_each_zone(zone) {
463 unsigned long pfn; 369 unsigned long pfn;
464 370
465 if (!populated_zone(zone) || is_highmem(zone)) 371 if (!populated_zone(zone))
466 continue; 372 continue;
467 373
468 zone_bm->start_pfn = zone->zone_start_pfn; 374 zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
481 while (bb) { 387 while (bb) {
482 unsigned long *ptr; 388 unsigned long *ptr;
483 389
484 ptr = alloc_image_page(gfp_mask, safe_needed); 390 ptr = get_image_page(gfp_mask, safe_needed);
485 bb->data = ptr; 391 bb->data = ptr;
486 if (!ptr) 392 if (!ptr)
487 goto Free; 393 goto Free;
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
505 memory_bm_position_reset(bm); 411 memory_bm_position_reset(bm);
506 return 0; 412 return 0;
507 413
508Free: 414 Free:
509 bm->p_list = ca.chain; 415 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR); 416 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM; 417 return -ENOMEM;
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
651 memory_bm_position_reset(bm); 557 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP; 558 return BM_END_OF_MAP;
653 559
654Return_pfn: 560 Return_pfn:
655 bm->cur.chunk = chunk; 561 bm->cur.chunk = chunk;
656 bm->cur.bit = bit; 562 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; 563 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone)
669 575
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 576 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 577 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res; 578 return 2 * res;
579}
580
581#ifdef CONFIG_HIGHMEM
582/**
583 * count_free_highmem_pages - compute the total number of free highmem
584 * pages, system-wide.
585 */
586
587static unsigned int count_free_highmem_pages(void)
588{
589 struct zone *zone;
590 unsigned int cnt = 0;
591
592 for_each_zone(zone)
593 if (populated_zone(zone) && is_highmem(zone))
594 cnt += zone->free_pages;
595
596 return cnt;
597}
598
599/**
600 * saveable_highmem_page - Determine whether a highmem page should be
601 * included in the suspend image.
602 *
603 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
604 * and it isn't a part of a free chunk of pages.
605 */
606
607static struct page *saveable_highmem_page(unsigned long pfn)
608{
609 struct page *page;
610
611 if (!pfn_valid(pfn))
612 return NULL;
613
614 page = pfn_to_page(pfn);
615
616 BUG_ON(!PageHighMem(page));
617
618 if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
619 return NULL;
620
621 return page;
673} 622}
674 623
675/** 624/**
625 * count_highmem_pages - compute the total number of saveable highmem
626 * pages.
627 */
628
629unsigned int count_highmem_pages(void)
630{
631 struct zone *zone;
632 unsigned int n = 0;
633
634 for_each_zone(zone) {
635 unsigned long pfn, max_zone_pfn;
636
637 if (!is_highmem(zone))
638 continue;
639
640 mark_free_pages(zone);
641 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
642 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
643 if (saveable_highmem_page(pfn))
644 n++;
645 }
646 return n;
647}
648#else
649static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
650static inline unsigned int count_highmem_pages(void) { return 0; }
651#endif /* CONFIG_HIGHMEM */
652
653/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section 654 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */ 655 */
678 656
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
684} 662}
685 663
686/** 664/**
687 * saveable - Determine whether a page should be cloned or not. 665 * saveable - Determine whether a non-highmem page should be included in
688 * @pfn: The page 666 * the suspend image.
689 * 667 *
690 * We save a page if it isn't Nosave, and is not in the range of pages 668 * We should save the page if it isn't Nosave, and is not in the range
691 * statically defined as 'unsaveable', and it 669 * of pages statically defined as 'unsaveable', and it isn't a part of
692 * isn't a part of a free chunk of pages. 670 * a free chunk of pages.
693 */ 671 */
694 672
695static struct page *saveable_page(unsigned long pfn) 673static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn)
701 679
702 page = pfn_to_page(pfn); 680 page = pfn_to_page(pfn);
703 681
704 if (PageNosave(page)) 682 BUG_ON(PageHighMem(page));
683
684 if (PageNosave(page) || PageNosaveFree(page))
705 return NULL; 685 return NULL;
686
706 if (PageReserved(page) && pfn_is_nosave(pfn)) 687 if (PageReserved(page) && pfn_is_nosave(pfn))
707 return NULL; 688 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710 689
711 return page; 690 return page;
712} 691}
713 692
693/**
694 * count_data_pages - compute the total number of saveable non-highmem
695 * pages.
696 */
697
714unsigned int count_data_pages(void) 698unsigned int count_data_pages(void)
715{ 699{
716 struct zone *zone; 700 struct zone *zone;
717 unsigned long pfn, max_zone_pfn; 701 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0; 702 unsigned int n = 0;
719 703
720 for_each_zone (zone) { 704 for_each_zone(zone) {
721 if (is_highmem(zone)) 705 if (is_highmem(zone))
722 continue; 706 continue;
707
723 mark_free_pages(zone); 708 mark_free_pages(zone);
724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 709 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 710 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
726 n += !!saveable_page(pfn); 711 if(saveable_page(pfn))
712 n++;
727 } 713 }
728 return n; 714 return n;
729} 715}
730 716
731static inline void copy_data_page(long *dst, long *src) 717/* This is needed, because copy_page and memcpy are not usable for copying
718 * task structs.
719 */
720static inline void do_copy_page(long *dst, long *src)
732{ 721{
733 int n; 722 int n;
734 723
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--) 724 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++; 725 *dst++ = *src++;
738} 726}
739 727
728#ifdef CONFIG_HIGHMEM
729static inline struct page *
730page_is_saveable(struct zone *zone, unsigned long pfn)
731{
732 return is_highmem(zone) ?
733 saveable_highmem_page(pfn) : saveable_page(pfn);
734}
735
736static inline void
737copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
738{
739 struct page *s_page, *d_page;
740 void *src, *dst;
741
742 s_page = pfn_to_page(src_pfn);
743 d_page = pfn_to_page(dst_pfn);
744 if (PageHighMem(s_page)) {
745 src = kmap_atomic(s_page, KM_USER0);
746 dst = kmap_atomic(d_page, KM_USER1);
747 do_copy_page(dst, src);
748 kunmap_atomic(src, KM_USER0);
749 kunmap_atomic(dst, KM_USER1);
750 } else {
751 src = page_address(s_page);
752 if (PageHighMem(d_page)) {
753 /* Page pointed to by src may contain some kernel
754 * data modified by kmap_atomic()
755 */
756 do_copy_page(buffer, src);
757 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
758 memcpy(dst, buffer, PAGE_SIZE);
759 kunmap_atomic(dst, KM_USER0);
760 } else {
761 dst = page_address(d_page);
762 do_copy_page(dst, src);
763 }
764 }
765}
766#else
767#define page_is_saveable(zone, pfn) saveable_page(pfn)
768
769static inline void
770copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
771{
772 do_copy_page(page_address(pfn_to_page(dst_pfn)),
773 page_address(pfn_to_page(src_pfn)));
774}
775#endif /* CONFIG_HIGHMEM */
776
740static void 777static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 778copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{ 779{
743 struct zone *zone; 780 struct zone *zone;
744 unsigned long pfn; 781 unsigned long pfn;
745 782
746 for_each_zone (zone) { 783 for_each_zone(zone) {
747 unsigned long max_zone_pfn; 784 unsigned long max_zone_pfn;
748 785
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone); 786 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn)) 789 if (page_is_saveable(zone, pfn))
756 memory_bm_set_bit(orig_bm, pfn); 790 memory_bm_set_bit(orig_bm, pfn);
757 } 791 }
758 memory_bm_position_reset(orig_bm); 792 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm); 793 memory_bm_position_reset(copy_bm);
760 do { 794 do {
761 pfn = memory_bm_next_pfn(orig_bm); 795 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) { 796 if (likely(pfn != BM_END_OF_MAP))
763 struct page *page; 797 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP); 798 } while (pfn != BM_END_OF_MAP);
772} 799}
773 800
801/* Total number of image pages */
802static unsigned int nr_copy_pages;
803/* Number of pages needed for saving the original pfns of the image pages */
804static unsigned int nr_meta_pages;
805
774/** 806/**
775 * swsusp_free - free pages allocated for the suspend. 807 * swsusp_free - free pages allocated for the suspend.
776 * 808 *
@@ -792,7 +824,7 @@ void swsusp_free(void)
792 if (PageNosave(page) && PageNosaveFree(page)) { 824 if (PageNosave(page) && PageNosaveFree(page)) {
793 ClearPageNosave(page); 825 ClearPageNosave(page);
794 ClearPageNosaveFree(page); 826 ClearPageNosaveFree(page);
795 free_page((long) page_address(page)); 827 __free_page(page);
796 } 828 }
797 } 829 }
798 } 830 }
@@ -802,34 +834,108 @@ void swsusp_free(void)
802 buffer = NULL; 834 buffer = NULL;
803} 835}
804 836
837#ifdef CONFIG_HIGHMEM
838/**
839 * count_pages_for_highmem - compute the number of non-highmem pages
840 * that will be necessary for creating copies of highmem pages.
841 */
842
843static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
844{
845 unsigned int free_highmem = count_free_highmem_pages();
846
847 if (free_highmem >= nr_highmem)
848 nr_highmem = 0;
849 else
850 nr_highmem -= free_highmem;
851
852 return nr_highmem;
853}
854#else
855static unsigned int
856count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
857#endif /* CONFIG_HIGHMEM */
805 858
806/** 859/**
807 * enough_free_mem - Make sure we enough free memory to snapshot. 860 * enough_free_mem - Make sure we have enough free memory for the
808 * 861 * snapshot image.
809 * Returns TRUE or FALSE after checking the number of available
810 * free pages.
811 */ 862 */
812 863
813static int enough_free_mem(unsigned int nr_pages) 864static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
814{ 865{
815 struct zone *zone; 866 struct zone *zone;
816 unsigned int free = 0, meta = 0; 867 unsigned int free = 0, meta = 0;
817 868
818 for_each_zone (zone) 869 for_each_zone(zone) {
819 if (!is_highmem(zone)) { 870 meta += snapshot_additional_pages(zone);
871 if (!is_highmem(zone))
820 free += zone->free_pages; 872 free += zone->free_pages;
821 meta += snapshot_additional_pages(zone); 873 }
822 }
823 874
824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", 875 nr_pages += count_pages_for_highmem(nr_highmem);
876 pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
825 nr_pages, PAGES_FOR_IO, meta, free); 877 nr_pages, PAGES_FOR_IO, meta, free);
826 878
827 return free > nr_pages + PAGES_FOR_IO + meta; 879 return free > nr_pages + PAGES_FOR_IO + meta;
828} 880}
829 881
882#ifdef CONFIG_HIGHMEM
883/**
884 * get_highmem_buffer - if there are some highmem pages in the suspend
885 * image, we may need the buffer to copy them and/or load their data.
886 */
887
888static inline int get_highmem_buffer(int safe_needed)
889{
890 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
891 return buffer ? 0 : -ENOMEM;
892}
893
894/**
895 * alloc_highmem_image_pages - allocate some highmem pages for the image.
896 * Try to allocate as many pages as needed, but if the number of free
897 * highmem pages is lesser than that, allocate them all.
898 */
899
900static inline unsigned int
901alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
902{
903 unsigned int to_alloc = count_free_highmem_pages();
904
905 if (to_alloc > nr_highmem)
906 to_alloc = nr_highmem;
907
908 nr_highmem -= to_alloc;
909 while (to_alloc-- > 0) {
910 struct page *page;
911
912 page = alloc_image_page(__GFP_HIGHMEM);
913 memory_bm_set_bit(bm, page_to_pfn(page));
914 }
915 return nr_highmem;
916}
917#else
918static inline int get_highmem_buffer(int safe_needed) { return 0; }
919
920static inline unsigned int
921alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
922#endif /* CONFIG_HIGHMEM */
923
924/**
925 * swsusp_alloc - allocate memory for the suspend image
926 *
927 * We first try to allocate as many highmem pages as there are
928 * saveable highmem pages in the system. If that fails, we allocate
929 * non-highmem pages for the copies of the remaining highmem ones.
930 *
931 * In this approach it is likely that the copies of highmem pages will
932 * also be located in the high memory, because of the way in which
933 * copy_data_pages() works.
934 */
935
830static int 936static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 937swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages) 938 unsigned int nr_pages, unsigned int nr_highmem)
833{ 939{
834 int error; 940 int error;
835 941
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
841 if (error) 947 if (error)
842 goto Free; 948 goto Free;
843 949
950 if (nr_highmem > 0) {
951 error = get_highmem_buffer(PG_ANY);
952 if (error)
953 goto Free;
954
955 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
956 }
844 while (nr_pages-- > 0) { 957 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); 958 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
959
846 if (!page) 960 if (!page)
847 goto Free; 961 goto Free;
848 962
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 963 memory_bm_set_bit(copy_bm, page_to_pfn(page));
852 } 964 }
853 return 0; 965 return 0;
854 966
855Free: 967 Free:
856 swsusp_free(); 968 swsusp_free();
857 return -ENOMEM; 969 return -ENOMEM;
858} 970}
859 971
860/* Memory bitmap used for marking saveable pages */ 972/* Memory bitmap used for marking saveable pages (during suspend) or the
973 * suspend image pages (during resume)
974 */
861static struct memory_bitmap orig_bm; 975static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies 976/* Memory bitmap used on suspend for marking allocated pages that will contain
863 * of saveable pages 977 * the copies of saveable pages. During resume it is initially used for
978 * marking the suspend image pages, but then its set bits are duplicated in
979 * @orig_bm and it is released. Next, on systems with high memory, it may be
980 * used for marking "safe" highmem pages, but it has to be reinitialized for
981 * this purpose.
864 */ 982 */
865static struct memory_bitmap copy_bm; 983static struct memory_bitmap copy_bm;
866 984
867asmlinkage int swsusp_save(void) 985asmlinkage int swsusp_save(void)
868{ 986{
869 unsigned int nr_pages; 987 unsigned int nr_pages, nr_highmem;
870 988
871 pr_debug("swsusp: critical section: \n"); 989 printk("swsusp: critical section: \n");
872 990
873 drain_local_pages(); 991 drain_local_pages();
874 nr_pages = count_data_pages(); 992 nr_pages = count_data_pages();
875 printk("swsusp: Need to copy %u pages\n", nr_pages); 993 nr_highmem = count_highmem_pages();
994 printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
876 995
877 if (!enough_free_mem(nr_pages)) { 996 if (!enough_free_mem(nr_pages, nr_highmem)) {
878 printk(KERN_ERR "swsusp: Not enough free memory\n"); 997 printk(KERN_ERR "swsusp: Not enough free memory\n");
879 return -ENOMEM; 998 return -ENOMEM;
880 } 999 }
881 1000
882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages)) 1001 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1002 printk(KERN_ERR "swsusp: Memory allocation failed\n");
883 return -ENOMEM; 1003 return -ENOMEM;
1004 }
884 1005
885 /* During allocating of suspend pagedir, new cold pages may appear. 1006 /* During allocating of suspend pagedir, new cold pages may appear.
886 * Kill them. 1007 * Kill them.
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void)
894 * touch swap space! Except we must write out our image of course. 1015 * touch swap space! Except we must write out our image of course.
895 */ 1016 */
896 1017
1018 nr_pages += nr_highmem;
897 nr_copy_pages = nr_pages; 1019 nr_copy_pages = nr_pages;
898 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1020 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
899 1021
900 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1022 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
1023
901 return 0; 1024 return 0;
902} 1025}
903 1026
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
960 1083
961 if (!buffer) { 1084 if (!buffer) {
962 /* This makes the buffer be freed by swsusp_free() */ 1085 /* This makes the buffer be freed by swsusp_free() */
963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1086 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
964 if (!buffer) 1087 if (!buffer)
965 return -ENOMEM; 1088 return -ENOMEM;
966 } 1089 }
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
975 memset(buffer, 0, PAGE_SIZE); 1098 memset(buffer, 0, PAGE_SIZE);
976 pack_pfns(buffer, &orig_bm); 1099 pack_pfns(buffer, &orig_bm);
977 } else { 1100 } else {
978 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1101 struct page *page;
979 1102
980 handle->buffer = page_address(pfn_to_page(pfn)); 1103 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1104 if (PageHighMem(page)) {
1105 /* Highmem pages are copied to the buffer,
1106 * because we can't return with a kmapped
1107 * highmem page (we may not be called again).
1108 */
1109 void *kaddr;
1110
1111 kaddr = kmap_atomic(page, KM_USER0);
1112 memcpy(buffer, kaddr, PAGE_SIZE);
1113 kunmap_atomic(kaddr, KM_USER0);
1114 handle->buffer = buffer;
1115 } else {
1116 handle->buffer = page_address(page);
1117 }
981 } 1118 }
982 handle->prev = handle->cur; 1119 handle->prev = handle->cur;
983 } 1120 }
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1005 unsigned long pfn, max_zone_pfn; 1142 unsigned long pfn, max_zone_pfn;
1006 1143
1007 /* Clear page flags */ 1144 /* Clear page flags */
1008 for_each_zone (zone) { 1145 for_each_zone(zone) {
1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1146 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1147 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1011 if (pfn_valid(pfn)) 1148 if (pfn_valid(pfn))
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1101 } 1238 }
1102} 1239}
1103 1240
1241/* List of "safe" pages that may be used to store data loaded from the suspend
1242 * image
1243 */
1244static struct linked_page *safe_pages_list;
1245
1246#ifdef CONFIG_HIGHMEM
/* A struct highmem_pbe is one entry of the list of highmem pages that
 * should be restored atomically during the resume from disk, because the
 * page frames they occupied before the suspend are in use.
 */
struct highmem_pbe {
	struct page *copy_page;		/* data is here now */
	struct page *orig_page;		/* data was here before the suspend */
	struct highmem_pbe *next;	/* next entry of the list */
};

/* Head of the list of highmem PBEs needed for restoring the highmem pages
 * that were allocated before the suspend and included in the suspend image,
 * but have also been allocated by the "resume" kernel, so their contents
 * cannot be written directly to their "original" page frames.
 */
static struct highmem_pbe *highmem_pblist;
1263
1264/**
1265 * count_highmem_image_pages - compute the number of highmem pages in the
1266 * suspend image. The bits in the memory bitmap @bm that correspond to the
1267 * image pages are assumed to be set.
1268 */
1269
1270static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
1271{
1272 unsigned long pfn;
1273 unsigned int cnt = 0;
1274
1275 memory_bm_position_reset(bm);
1276 pfn = memory_bm_next_pfn(bm);
1277 while (pfn != BM_END_OF_MAP) {
1278 if (PageHighMem(pfn_to_page(pfn)))
1279 cnt++;
1280
1281 pfn = memory_bm_next_pfn(bm);
1282 }
1283 return cnt;
1284}
1285
1286/**
1287 * prepare_highmem_image - try to allocate as many highmem pages as
1288 * there are highmem image pages (@nr_highmem_p points to the variable
1289 * containing the number of highmem image pages). The pages that are
1290 * "safe" (ie. will not be overwritten when the suspend image is
1291 * restored) have the corresponding bits set in @bm (it must be
1292 * unitialized).
1293 *
1294 * NOTE: This function should not be called if there are no highmem
1295 * image pages.
1296 */
1297
1298static unsigned int safe_highmem_pages;
1299
1300static struct memory_bitmap *safe_highmem_bm;
1301
1302static int
1303prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1304{
1305 unsigned int to_alloc;
1306
1307 if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
1308 return -ENOMEM;
1309
1310 if (get_highmem_buffer(PG_SAFE))
1311 return -ENOMEM;
1312
1313 to_alloc = count_free_highmem_pages();
1314 if (to_alloc > *nr_highmem_p)
1315 to_alloc = *nr_highmem_p;
1316 else
1317 *nr_highmem_p = to_alloc;
1318
1319 safe_highmem_pages = 0;
1320 while (to_alloc-- > 0) {
1321 struct page *page;
1322
1323 page = alloc_page(__GFP_HIGHMEM);
1324 if (!PageNosaveFree(page)) {
1325 /* The page is "safe", set its bit the bitmap */
1326 memory_bm_set_bit(bm, page_to_pfn(page));
1327 safe_highmem_pages++;
1328 }
1329 /* Mark the page as allocated */
1330 SetPageNosave(page);
1331 SetPageNosaveFree(page);
1332 }
1333 memory_bm_position_reset(bm);
1334 safe_highmem_bm = bm;
1335 return 0;
1336}
1337
1338/**
1339 * get_highmem_page_buffer - for given highmem image page find the buffer
1340 * that suspend_write_next() should set for its caller to write to.
1341 *
1342 * If the page is to be saved to its "original" page frame or a copy of
1343 * the page is to be made in the highmem, @buffer is returned. Otherwise,
1344 * the copy of the page is to be made in normal memory, so the address of
1345 * the copy is returned.
1346 *
1347 * If @buffer is returned, the caller of suspend_write_next() will write
1348 * the page's contents to @buffer, so they will have to be copied to the
1349 * right location on the next call to suspend_write_next() and it is done
1350 * with the help of copy_last_highmem_page(). For this purpose, if
1351 * @buffer is returned, @last_highmem page is set to the page to which
1352 * the data will have to be copied from @buffer.
1353 */
1354
1355static struct page *last_highmem_page;
1356
1357static void *
1358get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1359{
1360 struct highmem_pbe *pbe;
1361 void *kaddr;
1362
1363 if (PageNosave(page) && PageNosaveFree(page)) {
1364 /* We have allocated the "original" page frame and we can
1365 * use it directly to store the loaded page.
1366 */
1367 last_highmem_page = page;
1368 return buffer;
1369 }
1370 /* The "original" page frame has not been allocated and we have to
1371 * use a "safe" page frame to store the loaded page.
1372 */
1373 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1374 if (!pbe) {
1375 swsusp_free();
1376 return NULL;
1377 }
1378 pbe->orig_page = page;
1379 if (safe_highmem_pages > 0) {
1380 struct page *tmp;
1381
1382 /* Copy of the page will be stored in high memory */
1383 kaddr = buffer;
1384 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
1385 safe_highmem_pages--;
1386 last_highmem_page = tmp;
1387 pbe->copy_page = tmp;
1388 } else {
1389 /* Copy of the page will be stored in normal memory */
1390 kaddr = safe_pages_list;
1391 safe_pages_list = safe_pages_list->next;
1392 pbe->copy_page = virt_to_page(kaddr);
1393 }
1394 pbe->next = highmem_pblist;
1395 highmem_pblist = pbe;
1396 return kaddr;
1397}
1398
1399/**
1400 * copy_last_highmem_page - copy the contents of a highmem image from
1401 * @buffer, where the caller of snapshot_write_next() has place them,
1402 * to the right location represented by @last_highmem_page .
1403 */
1404
1405static void copy_last_highmem_page(void)
1406{
1407 if (last_highmem_page) {
1408 void *dst;
1409
1410 dst = kmap_atomic(last_highmem_page, KM_USER0);
1411 memcpy(dst, buffer, PAGE_SIZE);
1412 kunmap_atomic(dst, KM_USER0);
1413 last_highmem_page = NULL;
1414 }
1415}
1416
1417static inline int last_highmem_page_copied(void)
1418{
1419 return !last_highmem_page;
1420}
1421
1422static inline void free_highmem_data(void)
1423{
1424 if (safe_highmem_bm)
1425 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
1426
1427 if (buffer)
1428 free_image_page(buffer, PG_UNSAFE_CLEAR);
1429}
1430#else
1431static inline int get_safe_write_buffer(void) { return 0; }
1432
1433static unsigned int
1434count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
1435
1436static inline int
1437prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1438{
1439 return 0;
1440}
1441
1442static inline void *
1443get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1444{
1445 return NULL;
1446}
1447
1448static inline void copy_last_highmem_page(void) {}
1449static inline int last_highmem_page_copied(void) { return 1; }
1450static inline void free_highmem_data(void) {}
1451#endif /* CONFIG_HIGHMEM */
1452
1104/** 1453/**
1105 * prepare_image - use the memory bitmap @bm to mark the pages that will 1454 * prepare_image - use the memory bitmap @bm to mark the pages that will
1106 * be overwritten in the process of restoring the system memory state 1455 * be overwritten in the process of restoring the system memory state
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1110 * The idea is to allocate a new memory bitmap first and then allocate 1459 * The idea is to allocate a new memory bitmap first and then allocate
1111 * as many pages as needed for the image data, but not to assign these 1460 * as many pages as needed for the image data, but not to assign these
1112 * pages to specific tasks initially. Instead, we just mark them as 1461 * pages to specific tasks initially. Instead, we just mark them as
1113 * allocated and create a list of "safe" pages that will be used later. 1462 * allocated and create lists of "safe" pages that will be used
1463 * later. On systems with high memory a list of "safe" highmem pages is
1464 * also created.
1114 */ 1465 */
1115 1466
1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) 1467#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
1117 1468
1118static struct linked_page *safe_pages_list;
1119
1120static int 1469static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) 1470prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1122{ 1471{
1123 unsigned int nr_pages; 1472 unsigned int nr_pages, nr_highmem;
1124 struct linked_page *sp_list, *lp; 1473 struct linked_page *sp_list, *lp;
1125 int error; 1474 int error;
1126 1475
1476 /* If there is no highmem, the buffer will not be necessary */
1477 free_image_page(buffer, PG_UNSAFE_CLEAR);
1478 buffer = NULL;
1479
1480 nr_highmem = count_highmem_image_pages(bm);
1127 error = mark_unsafe_pages(bm); 1481 error = mark_unsafe_pages(bm);
1128 if (error) 1482 if (error)
1129 goto Free; 1483 goto Free;
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1134 1488
1135 duplicate_memory_bitmap(new_bm, bm); 1489 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP); 1490 memory_bm_free(bm, PG_UNSAFE_KEEP);
1491 if (nr_highmem > 0) {
1492 error = prepare_highmem_image(bm, &nr_highmem);
1493 if (error)
1494 goto Free;
1495 }
1137 /* Reserve some safe pages for potential later use. 1496 /* Reserve some safe pages for potential later use.
1138 * 1497 *
1139 * NOTE: This way we make sure there will be enough safe pages for the 1498 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1142 */ 1501 */
1143 sp_list = NULL; 1502 sp_list = NULL;
1144 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ 1503 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1504 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 1505 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) { 1506 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); 1507 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) { 1508 if (!lp) {
1150 error = -ENOMEM; 1509 error = -ENOMEM;
1151 goto Free; 1510 goto Free;
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1156 } 1515 }
1157 /* Preallocate memory for the image */ 1516 /* Preallocate memory for the image */
1158 safe_pages_list = NULL; 1517 safe_pages_list = NULL;
1159 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1518 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1160 while (nr_pages > 0) { 1519 while (nr_pages > 0) {
1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 1520 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
1162 if (!lp) { 1521 if (!lp) {
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1181 } 1540 }
1182 return 0; 1541 return 0;
1183 1542
1184Free: 1543 Free:
1185 swsusp_free(); 1544 swsusp_free();
1186 return error; 1545 return error;
1187} 1546}
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1196 struct pbe *pbe; 1555 struct pbe *pbe;
1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1556 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
1198 1557
1558 if (PageHighMem(page))
1559 return get_highmem_page_buffer(page, ca);
1560
1199 if (PageNosave(page) && PageNosaveFree(page)) 1561 if (PageNosave(page) && PageNosaveFree(page))
1200 /* We have allocated the "original" page frame and we can 1562 /* We have allocated the "original" page frame and we can
1201 * use it directly to store the loaded page. 1563 * use it directly to store the loaded page.
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1210 swsusp_free(); 1572 swsusp_free();
1211 return NULL; 1573 return NULL;
1212 } 1574 }
1213 pbe->orig_address = (unsigned long)page_address(page); 1575 pbe->orig_address = page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list; 1576 pbe->address = safe_pages_list;
1215 safe_pages_list = safe_pages_list->next; 1577 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist; 1578 pbe->next = restore_pblist;
1217 restore_pblist = pbe; 1579 restore_pblist = pbe;
1218 return (void *)pbe->address; 1580 return pbe->address;
1219} 1581}
1220 1582
1221/** 1583/**
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1611 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1250 return 0; 1612 return 0;
1251 1613
1252 if (!buffer) { 1614 if (handle->offset == 0) {
1253 /* This makes the buffer be freed by swsusp_free() */ 1615 if (!buffer)
1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1616 /* This makes the buffer be freed by swsusp_free() */
1617 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1618
1255 if (!buffer) 1619 if (!buffer)
1256 return -ENOMEM; 1620 return -ENOMEM;
1257 } 1621
1258 if (!handle->offset)
1259 handle->buffer = buffer; 1622 handle->buffer = buffer;
1623 }
1260 handle->sync_read = 1; 1624 handle->sync_read = 1;
1261 if (handle->prev < handle->cur) { 1625 if (handle->prev < handle->cur) {
1262 if (handle->prev == 0) { 1626 if (handle->prev == 0) {
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1284 return -ENOMEM; 1648 return -ENOMEM;
1285 } 1649 }
1286 } else { 1650 } else {
1651 copy_last_highmem_page();
1287 handle->buffer = get_buffer(&orig_bm, &ca); 1652 handle->buffer = get_buffer(&orig_bm, &ca);
1288 handle->sync_read = 0; 1653 if (handle->buffer != buffer)
1654 handle->sync_read = 0;
1289 } 1655 }
1290 handle->prev = handle->cur; 1656 handle->prev = handle->cur;
1291 } 1657 }
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1301 return count; 1667 return count;
1302} 1668}
1303 1669
1670/**
1671 * snapshot_write_finalize - must be called after the last call to
1672 * snapshot_write_next() in case the last page in the image happens
1673 * to be a highmem page and its contents should be stored in the
1674 * highmem. Additionally, it releases the memory that will not be
1675 * used any more.
1676 */
1677
1678void snapshot_write_finalize(struct snapshot_handle *handle)
1679{
1680 copy_last_highmem_page();
1681 /* Free only if we have loaded the image entirely */
1682 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
1683 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
1684 free_highmem_data();
1685 }
1686}
1687
1304int snapshot_image_loaded(struct snapshot_handle *handle) 1688int snapshot_image_loaded(struct snapshot_handle *handle)
1305{ 1689{
1306 return !(!nr_copy_pages || 1690 return !(!nr_copy_pages || !last_highmem_page_copied() ||
1307 handle->cur <= nr_meta_pages + nr_copy_pages); 1691 handle->cur <= nr_meta_pages + nr_copy_pages);
1308} 1692}
1309 1693
1310void snapshot_free_unused_memory(struct snapshot_handle *handle) 1694#ifdef CONFIG_HIGHMEM
1695/* Assumes that @buf is ready and points to a "safe" page */
1696static inline void
1697swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
1311{ 1698{
1312 /* Free only if we have loaded the image entirely */ 1699 void *kaddr1, *kaddr2;
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1700
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 1701 kaddr1 = kmap_atomic(p1, KM_USER0);
1702 kaddr2 = kmap_atomic(p2, KM_USER1);
1703 memcpy(buf, kaddr1, PAGE_SIZE);
1704 memcpy(kaddr1, kaddr2, PAGE_SIZE);
1705 memcpy(kaddr2, buf, PAGE_SIZE);
1706 kunmap_atomic(kaddr1, KM_USER0);
1707 kunmap_atomic(kaddr2, KM_USER1);
1708}
1709
1710/**
1711 * restore_highmem - for each highmem page that was allocated before
1712 * the suspend and included in the suspend image, and also has been
1713 * allocated by the "resume" kernel swap its current (ie. "before
1714 * resume") contents with the previous (ie. "before suspend") one.
1715 *
1716 * If the resume eventually fails, we can call this function once
1717 * again and restore the "before resume" highmem state.
1718 */
1719
1720int restore_highmem(void)
1721{
1722 struct highmem_pbe *pbe = highmem_pblist;
1723 void *buf;
1724
1725 if (!pbe)
1726 return 0;
1727
1728 buf = get_image_page(GFP_ATOMIC, PG_SAFE);
1729 if (!buf)
1730 return -ENOMEM;
1731
1732 while (pbe) {
1733 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
1734 pbe = pbe->next;
1735 }
1736 free_image_page(buf, PG_UNSAFE_CLEAR);
1737 return 0;
1315} 1738}
1739#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd2c3fc..f133d4a6d817 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,34 +34,123 @@ extern char resume_file[];
34#define SWSUSP_SIG "S1SUSPEND" 34#define SWSUSP_SIG "S1SUSPEND"
35 35
36static struct swsusp_header { 36static struct swsusp_header {
37 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
38 swp_entry_t image; 38 sector_t image;
39 char orig_sig[10]; 39 char orig_sig[10];
40 char sig[10]; 40 char sig[10];
41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
42 42
43/* 43/*
44 * Saving part... 44 * General things
45 */ 45 */
46 46
47static unsigned short root_swap = 0xffff; 47static unsigned short root_swap = 0xffff;
48static struct block_device *resume_bdev;
49
50/**
51 * submit - submit BIO request.
52 * @rw: READ or WRITE.
53 * @off physical offset of page.
54 * @page: page we're reading or writing.
55 * @bio_chain: list of pending biod (for async reading)
56 *
57 * Straight from the textbook - allocate and initialize the bio.
58 * If we're reading, make sure the page is marked as dirty.
59 * Then submit it and, if @bio_chain == NULL, wait.
60 */
61static int submit(int rw, pgoff_t page_off, struct page *page,
62 struct bio **bio_chain)
63{
64 struct bio *bio;
65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read;
72
73 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
74 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
75 bio_put(bio);
76 return -EFAULT;
77 }
78
79 lock_page(page);
80 bio_get(bio);
81
82 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
84 wait_on_page_locked(page);
85 if (rw == READ)
86 bio_set_pages_dirty(bio);
87 bio_put(bio);
88 } else {
89 if (rw == READ)
90 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain;
92 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
94 }
95 return 0;
96}
97
98static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
99{
100 return submit(READ, page_off, virt_to_page(addr), bio_chain);
101}
102
103static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
104{
105 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
106}
107
108static int wait_on_bio_chain(struct bio **bio_chain)
109{
110 struct bio *bio;
111 struct bio *next_bio;
112 int ret = 0;
113
114 if (bio_chain == NULL)
115 return 0;
116
117 bio = *bio_chain;
118 if (bio == NULL)
119 return 0;
120 while (bio) {
121 struct page *page;
122
123 next_bio = bio->bi_private;
124 page = bio->bi_io_vec[0].bv_page;
125 wait_on_page_locked(page);
126 if (!PageUptodate(page) || PageError(page))
127 ret = -EIO;
128 put_page(page);
129 bio_put(bio);
130 bio = next_bio;
131 }
132 *bio_chain = NULL;
133 return ret;
134}
135
136/*
137 * Saving part
138 */
48 139
49static int mark_swapfiles(swp_entry_t start) 140static int mark_swapfiles(sector_t start)
50{ 141{
51 int error; 142 int error;
52 143
53 rw_swap_page_sync(READ, swp_entry(root_swap, 0), 144 bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
54 virt_to_page((unsigned long)&swsusp_header), NULL);
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 145 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 146 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 147 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 148 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 149 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), 150 error = bio_write_page(swsusp_resume_block,
61 virt_to_page((unsigned long)&swsusp_header), 151 &swsusp_header, NULL);
62 NULL);
63 } else { 152 } else {
64 pr_debug("swsusp: Partition is not swap space.\n"); 153 printk(KERN_ERR "swsusp: Swap header not found!\n");
65 error = -ENODEV; 154 error = -ENODEV;
66 } 155 }
67 return error; 156 return error;
@@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start)
74 163
75static int swsusp_swap_check(void) /* This is called before saving image */ 164static int swsusp_swap_check(void) /* This is called before saving image */
76{ 165{
77 int res = swap_type_of(swsusp_resume_device); 166 int res;
167
168 res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
169 if (res < 0)
170 return res;
171
172 root_swap = res;
173 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
174 if (IS_ERR(resume_bdev))
175 return PTR_ERR(resume_bdev);
176
177 res = set_blocksize(resume_bdev, PAGE_SIZE);
178 if (res < 0)
179 blkdev_put(resume_bdev);
78 180
79 if (res >= 0) {
80 root_swap = res;
81 return 0;
82 }
83 return res; 181 return res;
84} 182}
85 183
@@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
90 * @bio_chain: Link the next write BIO here 188 * @bio_chain: Link the next write BIO here
91 */ 189 */
92 190
93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) 191static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
94{ 192{
95 swp_entry_t entry; 193 void *src;
96 int error = -ENOSPC; 194
97 195 if (!offset)
98 if (offset) { 196 return -ENOSPC;
99 struct page *page = virt_to_page(buf); 197
100 198 if (bio_chain) {
101 if (bio_chain) { 199 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
102 /* 200 if (src) {
103 * Whether or not we successfully allocated a copy page, 201 memcpy(src, buf, PAGE_SIZE);
104 * we take a ref on the page here. It gets undone in 202 } else {
105 * wait_on_bio_chain(). 203 WARN_ON_ONCE(1);
106 */ 204 bio_chain = NULL; /* Go synchronous */
107 struct page *page_copy; 205 src = buf;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 } 206 }
119 entry = swp_entry(root_swap, offset); 207 } else {
120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain); 208 src = buf;
121 } 209 }
122 return error; 210 return bio_write_page(offset, src, bio_chain);
123} 211}
124 212
125/* 213/*
@@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
137 * at a time. 225 * at a time.
138 */ 226 */
139 227
140#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) 228#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
141 229
142struct swap_map_page { 230struct swap_map_page {
143 unsigned long entries[MAP_PAGE_ENTRIES]; 231 sector_t entries[MAP_PAGE_ENTRIES];
144 unsigned long next_swap; 232 sector_t next_swap;
145}; 233};
146 234
147/** 235/**
@@ -151,7 +239,7 @@ struct swap_map_page {
151 239
152struct swap_map_handle { 240struct swap_map_handle {
153 struct swap_map_page *cur; 241 struct swap_map_page *cur;
154 unsigned long cur_swap; 242 sector_t cur_swap;
155 struct bitmap_page *bitmap; 243 struct bitmap_page *bitmap;
156 unsigned int k; 244 unsigned int k;
157}; 245};
@@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
166 handle->bitmap = NULL; 254 handle->bitmap = NULL;
167} 255}
168 256
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
189static int get_swap_writer(struct swap_map_handle *handle) 257static int get_swap_writer(struct swap_map_handle *handle)
190{ 258{
191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 259 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
196 release_swap_writer(handle); 264 release_swap_writer(handle);
197 return -ENOMEM; 265 return -ENOMEM;
198 } 266 }
199 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); 267 handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
200 if (!handle->cur_swap) { 268 if (!handle->cur_swap) {
201 release_swap_writer(handle); 269 release_swap_writer(handle);
202 return -ENOSPC; 270 return -ENOSPC;
@@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle)
205 return 0; 273 return 0;
206} 274}
207 275
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf, 276static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain) 277 struct bio **bio_chain)
238{ 278{
239 int error = 0; 279 int error = 0;
240 unsigned long offset; 280 sector_t offset;
241 281
242 if (!handle->cur) 282 if (!handle->cur)
243 return -EINVAL; 283 return -EINVAL;
244 offset = alloc_swap_page(root_swap, handle->bitmap); 284 offset = alloc_swapdev_block(root_swap, handle->bitmap);
245 error = write_page(buf, offset, bio_chain); 285 error = write_page(buf, offset, bio_chain);
246 if (error) 286 if (error)
247 return error; 287 return error;
@@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
250 error = wait_on_bio_chain(bio_chain); 290 error = wait_on_bio_chain(bio_chain);
251 if (error) 291 if (error)
252 goto out; 292 goto out;
253 offset = alloc_swap_page(root_swap, handle->bitmap); 293 offset = alloc_swapdev_block(root_swap, handle->bitmap);
254 if (!offset) 294 if (!offset)
255 return -ENOSPC; 295 return -ENOSPC;
256 handle->cur->next_swap = offset; 296 handle->cur->next_swap = offset;
@@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
261 handle->cur_swap = offset; 301 handle->cur_swap = offset;
262 handle->k = 0; 302 handle->k = 0;
263 } 303 }
264out: 304 out:
265 return error; 305 return error;
266} 306}
267 307
@@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
315 error = err2; 355 error = err2;
316 if (!error) 356 if (!error)
317 printk("\b\b\b\bdone\n"); 357 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote"); 358 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
319 return error; 359 return error;
320} 360}
321 361
@@ -350,100 +390,50 @@ int swsusp_write(void)
350 struct swsusp_info *header; 390 struct swsusp_info *header;
351 int error; 391 int error;
352 392
353 if ((error = swsusp_swap_check())) { 393 error = swsusp_swap_check();
394 if (error) {
354 printk(KERN_ERR "swsusp: Cannot find swap device, try " 395 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n"); 396 "swapon -a.\n");
356 return error; 397 return error;
357 } 398 }
358 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 399 memset(&snapshot, 0, sizeof(struct snapshot_handle));
359 error = snapshot_read_next(&snapshot, PAGE_SIZE); 400 error = snapshot_read_next(&snapshot, PAGE_SIZE);
360 if (error < PAGE_SIZE) 401 if (error < PAGE_SIZE) {
361 return error < 0 ? error : -EFAULT; 402 if (error >= 0)
403 error = -EFAULT;
404
405 goto out;
406 }
362 header = (struct swsusp_info *)data_of(snapshot); 407 header = (struct swsusp_info *)data_of(snapshot);
363 if (!enough_swap(header->pages)) { 408 if (!enough_swap(header->pages)) {
364 printk(KERN_ERR "swsusp: Not enough free swap\n"); 409 printk(KERN_ERR "swsusp: Not enough free swap\n");
365 return -ENOSPC; 410 error = -ENOSPC;
411 goto out;
366 } 412 }
367 error = get_swap_writer(&handle); 413 error = get_swap_writer(&handle);
368 if (!error) { 414 if (!error) {
369 unsigned long start = handle.cur_swap; 415 sector_t start = handle.cur_swap;
416
370 error = swap_write_page(&handle, header, NULL); 417 error = swap_write_page(&handle, header, NULL);
371 if (!error) 418 if (!error)
372 error = save_image(&handle, &snapshot, 419 error = save_image(&handle, &snapshot,
373 header->pages - 1); 420 header->pages - 1);
421
374 if (!error) { 422 if (!error) {
375 flush_swap_writer(&handle); 423 flush_swap_writer(&handle);
376 printk("S"); 424 printk("S");
377 error = mark_swapfiles(swp_entry(root_swap, start)); 425 error = mark_swapfiles(start);
378 printk("|\n"); 426 printk("|\n");
379 } 427 }
380 } 428 }
381 if (error) 429 if (error)
382 free_all_swap_pages(root_swap, handle.bitmap); 430 free_all_swap_pages(root_swap, handle.bitmap);
383 release_swap_writer(&handle); 431 release_swap_writer(&handle);
432 out:
433 swsusp_close();
384 return error; 434 return error;
385} 435}
386 436
387static struct block_device *resume_bdev;
388
389/**
390 * submit - submit BIO request.
391 * @rw: READ or WRITE.
392 * @off physical offset of page.
393 * @page: page we're reading or writing.
394 * @bio_chain: list of pending biod (for async reading)
395 *
396 * Straight from the textbook - allocate and initialize the bio.
397 * If we're reading, make sure the page is marked as dirty.
398 * Then submit it and, if @bio_chain == NULL, wait.
399 */
400static int submit(int rw, pgoff_t page_off, struct page *page,
401 struct bio **bio_chain)
402{
403 struct bio *bio;
404
405 bio = bio_alloc(GFP_ATOMIC, 1);
406 if (!bio)
407 return -ENOMEM;
408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
409 bio->bi_bdev = resume_bdev;
410 bio->bi_end_io = end_swap_bio_read;
411
412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
414 bio_put(bio);
415 return -EFAULT;
416 }
417
418 lock_page(page);
419 bio_get(bio);
420
421 if (bio_chain == NULL) {
422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
423 wait_on_page_locked(page);
424 if (rw == READ)
425 bio_set_pages_dirty(bio);
426 bio_put(bio);
427 } else {
428 if (rw == READ)
429 get_page(page); /* These pages are freed later */
430 bio->bi_private = *bio_chain;
431 *bio_chain = bio;
432 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
433 }
434 return 0;
435}
436
437static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
438{
439 return submit(READ, page_off, virt_to_page(addr), bio_chain);
440}
441
442static int bio_write_page(pgoff_t page_off, void *addr)
443{
444 return submit(WRITE, page_off, virt_to_page(addr), NULL);
445}
446
447/** 437/**
448 * The following functions allow us to read data using a swap map 438 * The following functions allow us to read data using a swap map
449 * in a file-alike way 439 * in a file-alike way
@@ -456,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
456 handle->cur = NULL; 446 handle->cur = NULL;
457} 447}
458 448
459static int get_swap_reader(struct swap_map_handle *handle, 449static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
460 swp_entry_t start)
461{ 450{
462 int error; 451 int error;
463 452
464 if (!swp_offset(start)) 453 if (!start)
465 return -EINVAL; 454 return -EINVAL;
466 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 455
456 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
467 if (!handle->cur) 457 if (!handle->cur)
468 return -ENOMEM; 458 return -ENOMEM;
469 error = bio_read_page(swp_offset(start), handle->cur, NULL); 459
460 error = bio_read_page(start, handle->cur, NULL);
470 if (error) { 461 if (error) {
471 release_swap_reader(handle); 462 release_swap_reader(handle);
472 return error; 463 return error;
@@ -478,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
478static int swap_read_page(struct swap_map_handle *handle, void *buf, 469static int swap_read_page(struct swap_map_handle *handle, void *buf,
479 struct bio **bio_chain) 470 struct bio **bio_chain)
480{ 471{
481 unsigned long offset; 472 sector_t offset;
482 int error; 473 int error;
483 474
484 if (!handle->cur) 475 if (!handle->cur)
@@ -547,11 +538,11 @@ static int load_image(struct swap_map_handle *handle,
547 error = err2; 538 error = err2;
548 if (!error) { 539 if (!error) {
549 printk("\b\b\b\bdone\n"); 540 printk("\b\b\b\bdone\n");
550 snapshot_free_unused_memory(snapshot); 541 snapshot_write_finalize(snapshot);
551 if (!snapshot_image_loaded(snapshot)) 542 if (!snapshot_image_loaded(snapshot))
552 error = -ENODATA; 543 error = -ENODATA;
553 } 544 }
554 show_speed(&start, &stop, nr_to_read, "Read"); 545 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
555 return error; 546 return error;
556} 547}
557 548
@@ -600,12 +591,16 @@ int swsusp_check(void)
600 if (!IS_ERR(resume_bdev)) { 591 if (!IS_ERR(resume_bdev)) {
601 set_blocksize(resume_bdev, PAGE_SIZE); 592 set_blocksize(resume_bdev, PAGE_SIZE);
602 memset(&swsusp_header, 0, sizeof(swsusp_header)); 593 memset(&swsusp_header, 0, sizeof(swsusp_header));
603 if ((error = bio_read_page(0, &swsusp_header, NULL))) 594 error = bio_read_page(swsusp_resume_block,
595 &swsusp_header, NULL);
596 if (error)
604 return error; 597 return error;
598
605 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 599 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
606 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 600 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
607 /* Reset swap signature now */ 601 /* Reset swap signature now */
608 error = bio_write_page(0, &swsusp_header); 602 error = bio_write_page(swsusp_resume_block,
603 &swsusp_header, NULL);
609 } else { 604 } else {
610 return -EINVAL; 605 return -EINVAL;
611 } 606 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc516..31aa0390c777 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
49#include <linux/bootmem.h> 49#include <linux/bootmem.h>
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h>
52 53
53#include "power.h" 54#include "power.h"
54 55
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0;
64 65
65#ifdef CONFIG_HIGHMEM 66#ifdef CONFIG_HIGHMEM
66unsigned int count_highmem_pages(void); 67unsigned int count_highmem_pages(void);
67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static inline int save_highmem(void) { return 0; }
71static inline int restore_highmem(void) { return 0; } 70static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; } 71static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 72#endif
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
134 return 0; 133 return 0;
135} 134}
136 135
137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) 136sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
138{ 137{
139 unsigned long offset; 138 unsigned long offset;
140 139
141 offset = swp_offset(get_swap_page_of_type(swap)); 140 offset = swp_offset(get_swap_page_of_type(swap));
142 if (offset) { 141 if (offset) {
143 if (bitmap_set(bitmap, offset)) { 142 if (bitmap_set(bitmap, offset))
144 swap_free(swp_entry(swap, offset)); 143 swap_free(swp_entry(swap, offset));
145 offset = 0; 144 else
146 } 145 return swapdev_block(swap, offset);
147 } 146 }
148 return offset; 147 return 0;
149} 148}
150 149
151void free_all_swap_pages(int swap, struct bitmap_page *bitmap) 150void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
166} 165}
167 166
168/** 167/**
168 * swsusp_show_speed - print the time elapsed between two events represented by
169 * @start and @stop
170 *
171 * @nr_pages - number of pages processed between @start and @stop
172 * @msg - introductory message to print
173 */
174
175void swsusp_show_speed(struct timeval *start, struct timeval *stop,
176 unsigned nr_pages, char *msg)
177{
178 s64 elapsed_centisecs64;
179 int centisecs;
180 int k;
181 int kps;
182
183 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
184 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
185 centisecs = elapsed_centisecs64;
186 if (centisecs == 0)
187 centisecs = 1; /* avoid div-by-zero */
188 k = nr_pages * (PAGE_SIZE / 1024);
189 kps = (k * 100) / centisecs;
190 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
191 centisecs / 100, centisecs % 100,
192 kps / 1000, (kps % 1000) / 10);
193}
194
195/**
169 * swsusp_shrink_memory - Try to free as much memory as needed 196 * swsusp_shrink_memory - Try to free as much memory as needed
170 * 197 *
171 * ... but do not OOM-kill anyone 198 * ... but do not OOM-kill anyone
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp)
184 211
185int swsusp_shrink_memory(void) 212int swsusp_shrink_memory(void)
186{ 213{
187 long size, tmp; 214 long tmp;
188 struct zone *zone; 215 struct zone *zone;
189 unsigned long pages = 0; 216 unsigned long pages = 0;
190 unsigned int i = 0; 217 unsigned int i = 0;
191 char *p = "-\\|/"; 218 char *p = "-\\|/";
219 struct timeval start, stop;
192 220
193 printk("Shrinking memory... "); 221 printk("Shrinking memory... ");
222 do_gettimeofday(&start);
194 do { 223 do {
195 size = 2 * count_highmem_pages(); 224 long size, highmem_size;
196 size += size / 50 + count_data_pages() + PAGES_FOR_IO; 225
226 highmem_size = count_highmem_pages();
227 size = count_data_pages() + PAGES_FOR_IO;
197 tmp = size; 228 tmp = size;
229 size += highmem_size;
198 for_each_zone (zone) 230 for_each_zone (zone)
199 if (!is_highmem(zone) && populated_zone(zone)) { 231 if (populated_zone(zone)) {
200 tmp -= zone->free_pages; 232 if (is_highmem(zone)) {
201 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 233 highmem_size -= zone->free_pages;
202 tmp += snapshot_additional_pages(zone); 234 } else {
235 tmp -= zone->free_pages;
236 tmp += zone->lowmem_reserve[ZONE_NORMAL];
237 tmp += snapshot_additional_pages(zone);
238 }
203 } 239 }
240
241 if (highmem_size < 0)
242 highmem_size = 0;
243
244 tmp += highmem_size;
204 if (tmp > 0) { 245 if (tmp > 0) {
205 tmp = __shrink_memory(tmp); 246 tmp = __shrink_memory(tmp);
206 if (!tmp) 247 if (!tmp)
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void)
212 } 253 }
213 printk("\b%c", p[i++%4]); 254 printk("\b%c", p[i++%4]);
214 } while (tmp > 0); 255 } while (tmp > 0);
256 do_gettimeofday(&stop);
215 printk("\bdone (%lu pages freed)\n", pages); 257 printk("\bdone (%lu pages freed)\n", pages);
258 swsusp_show_speed(&start, &stop, pages, "Freed");
216 259
217 return 0; 260 return 0;
218} 261}
@@ -223,6 +266,7 @@ int swsusp_suspend(void)
223 266
224 if ((error = arch_prepare_suspend())) 267 if ((error = arch_prepare_suspend()))
225 return error; 268 return error;
269
226 local_irq_disable(); 270 local_irq_disable();
227 /* At this point, device_suspend() has been called, but *not* 271 /* At this point, device_suspend() has been called, but *not*
228 * device_power_down(). We *must* device_power_down() now. 272 * device_power_down(). We *must* device_power_down() now.
@@ -235,23 +279,16 @@ int swsusp_suspend(void)
235 goto Enable_irqs; 279 goto Enable_irqs;
236 } 280 }
237 281
238 if ((error = save_highmem())) {
239 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
240 goto Restore_highmem;
241 }
242
243 save_processor_state(); 282 save_processor_state();
244 if ((error = swsusp_arch_suspend())) 283 if ((error = swsusp_arch_suspend()))
245 printk(KERN_ERR "Error %d suspending\n", error); 284 printk(KERN_ERR "Error %d suspending\n", error);
246 /* Restore control flow magically appears here */ 285 /* Restore control flow magically appears here */
247 restore_processor_state(); 286 restore_processor_state();
248Restore_highmem:
249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices 287 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup. 288 * that suspended with irqs off ... no overall powerup.
252 */ 289 */
253 device_power_up(); 290 device_power_up();
254Enable_irqs: 291 Enable_irqs:
255 local_irq_enable(); 292 local_irq_enable();
256 return error; 293 return error;
257} 294}
@@ -268,18 +305,23 @@ int swsusp_resume(void)
268 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 305 printk(KERN_ERR "Some devices failed to power down, very bad\n");
269 /* We'll ignore saved state, but this gets preempt count (etc) right */ 306 /* We'll ignore saved state, but this gets preempt count (etc) right */
270 save_processor_state(); 307 save_processor_state();
271 error = swsusp_arch_resume(); 308 error = restore_highmem();
272 /* Code below is only ever reached in case of failure. Otherwise 309 if (!error) {
273 * execution continues at place where swsusp_arch_suspend was called 310 error = swsusp_arch_resume();
274 */ 311 /* The code below is only ever reached in case of a failure.
275 BUG_ON(!error); 312 * Otherwise execution continues at place where
313 * swsusp_arch_suspend() was called
314 */
315 BUG_ON(!error);
317 /* This call to restore_highmem() undoes the previous one */
317 restore_highmem();
318 }
276 /* The only reason why swsusp_arch_resume() can fail is memory being 319 /* The only reason why swsusp_arch_resume() can fail is memory being
277 * very tight, so we have to free it as soon as we can to avoid 320 * very tight, so we have to free it as soon as we can to avoid
278 * subsequent failures 321 * subsequent failures
279 */ 322 */
280 swsusp_free(); 323 swsusp_free();
281 restore_processor_state(); 324 restore_processor_state();
282 restore_highmem();
283 touch_softlockup_watchdog(); 325 touch_softlockup_watchdog();
284 device_power_up(); 326 device_power_up();
285 local_irq_enable(); 327 local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a4..89443b85163b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/device.h> 16#include <linux/device.h>
16#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -21,6 +22,7 @@
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/console.h> 23#include <linux/console.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h>
24 26
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
26 28
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
54 filp->private_data = data; 56 filp->private_data = data;
55 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 57 memset(&data->handle, 0, sizeof(struct snapshot_handle));
56 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 58 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
57 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; 59 data->swap = swsusp_resume_device ?
60 swap_type_of(swsusp_resume_device, 0) : -1;
58 data->mode = O_RDONLY; 61 data->mode = O_RDONLY;
59 } else { 62 } else {
60 data->swap = -1; 63 data->swap = -1;
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
76 free_all_swap_pages(data->swap, data->bitmap); 79 free_all_swap_pages(data->swap, data->bitmap);
77 free_bitmap(data->bitmap); 80 free_bitmap(data->bitmap);
78 if (data->frozen) { 81 if (data->frozen) {
79 down(&pm_sem); 82 mutex_lock(&pm_mutex);
80 thaw_processes(); 83 thaw_processes();
81 enable_nonboot_cpus(); 84 enable_nonboot_cpus();
82 up(&pm_sem); 85 mutex_unlock(&pm_mutex);
83 } 86 }
84 atomic_inc(&device_available); 87 atomic_inc(&device_available);
85 return 0; 88 return 0;
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
124{ 127{
125 int error = 0; 128 int error = 0;
126 struct snapshot_data *data; 129 struct snapshot_data *data;
127 loff_t offset, avail; 130 loff_t avail;
131 sector_t offset;
128 132
129 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) 133 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
130 return -ENOTTY; 134 return -ENOTTY;
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
140 case SNAPSHOT_FREEZE: 144 case SNAPSHOT_FREEZE:
141 if (data->frozen) 145 if (data->frozen)
142 break; 146 break;
143 down(&pm_sem); 147 mutex_lock(&pm_mutex);
144 error = disable_nonboot_cpus(); 148 error = disable_nonboot_cpus();
145 if (!error) { 149 if (!error) {
146 error = freeze_processes(); 150 error = freeze_processes();
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
150 error = -EBUSY; 154 error = -EBUSY;
151 } 155 }
152 } 156 }
153 up(&pm_sem); 157 mutex_unlock(&pm_mutex);
154 if (!error) 158 if (!error)
155 data->frozen = 1; 159 data->frozen = 1;
156 break; 160 break;
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
158 case SNAPSHOT_UNFREEZE: 162 case SNAPSHOT_UNFREEZE:
159 if (!data->frozen) 163 if (!data->frozen)
160 break; 164 break;
161 down(&pm_sem); 165 mutex_lock(&pm_mutex);
162 thaw_processes(); 166 thaw_processes();
163 enable_nonboot_cpus(); 167 enable_nonboot_cpus();
164 up(&pm_sem); 168 mutex_unlock(&pm_mutex);
165 data->frozen = 0; 169 data->frozen = 0;
166 break; 170 break;
167 171
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
170 error = -EPERM; 174 error = -EPERM;
171 break; 175 break;
172 } 176 }
173 down(&pm_sem); 177 mutex_lock(&pm_mutex);
174 /* Free memory before shutting down devices. */ 178 /* Free memory before shutting down devices. */
175 error = swsusp_shrink_memory(); 179 error = swsusp_shrink_memory();
176 if (!error) { 180 if (!error) {
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
183 } 187 }
184 resume_console(); 188 resume_console();
185 } 189 }
186 up(&pm_sem); 190 mutex_unlock(&pm_mutex);
187 if (!error) 191 if (!error)
188 error = put_user(in_suspend, (unsigned int __user *)arg); 192 error = put_user(in_suspend, (unsigned int __user *)arg);
189 if (!error) 193 if (!error)
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
191 break; 195 break;
192 196
193 case SNAPSHOT_ATOMIC_RESTORE: 197 case SNAPSHOT_ATOMIC_RESTORE:
198 snapshot_write_finalize(&data->handle);
194 if (data->mode != O_WRONLY || !data->frozen || 199 if (data->mode != O_WRONLY || !data->frozen ||
195 !snapshot_image_loaded(&data->handle)) { 200 !snapshot_image_loaded(&data->handle)) {
196 error = -EPERM; 201 error = -EPERM;
197 break; 202 break;
198 } 203 }
199 snapshot_free_unused_memory(&data->handle); 204 mutex_lock(&pm_mutex);
200 down(&pm_sem);
201 pm_prepare_console(); 205 pm_prepare_console();
202 suspend_console(); 206 suspend_console();
203 error = device_suspend(PMSG_PRETHAW); 207 error = device_suspend(PMSG_PRETHAW);
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
207 } 211 }
208 resume_console(); 212 resume_console();
209 pm_restore_console(); 213 pm_restore_console();
210 up(&pm_sem); 214 mutex_unlock(&pm_mutex);
211 break; 215 break;
212 216
213 case SNAPSHOT_FREE: 217 case SNAPSHOT_FREE:
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
238 break; 242 break;
239 } 243 }
240 } 244 }
241 offset = alloc_swap_page(data->swap, data->bitmap); 245 offset = alloc_swapdev_block(data->swap, data->bitmap);
242 if (offset) { 246 if (offset) {
243 offset <<= PAGE_SHIFT; 247 offset <<= PAGE_SHIFT;
244 error = put_user(offset, (loff_t __user *)arg); 248 error = put_user(offset, (sector_t __user *)arg);
245 } else { 249 } else {
246 error = -ENOSPC; 250 error = -ENOSPC;
247 } 251 }
@@ -264,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
264 * so we need to recode them 268 * so we need to recode them
265 */ 269 */
266 if (old_decode_dev(arg)) { 270 if (old_decode_dev(arg)) {
267 data->swap = swap_type_of(old_decode_dev(arg)); 271 data->swap = swap_type_of(old_decode_dev(arg), 0);
268 if (data->swap < 0) 272 if (data->swap < 0)
269 error = -ENODEV; 273 error = -ENODEV;
270 } else { 274 } else {
@@ -282,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
282 break; 286 break;
283 } 287 }
284 288
285 if (down_trylock(&pm_sem)) { 289 if (!mutex_trylock(&pm_mutex)) {
286 error = -EBUSY; 290 error = -EBUSY;
287 break; 291 break;
288 } 292 }
@@ -309,8 +313,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
309 if (pm_ops->finish) 313 if (pm_ops->finish)
310 pm_ops->finish(PM_SUSPEND_MEM); 314 pm_ops->finish(PM_SUSPEND_MEM);
311 315
312OutS3: 316 OutS3:
313 up(&pm_sem); 317 mutex_unlock(&pm_mutex);
318 break;
319
320 case SNAPSHOT_PMOPS:
321 switch (arg) {
322
323 case PMOPS_PREPARE:
324 if (pm_ops->prepare) {
325 error = pm_ops->prepare(PM_SUSPEND_DISK);
326 }
327 break;
328
329 case PMOPS_ENTER:
330 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
331 error = pm_ops->enter(PM_SUSPEND_DISK);
332 break;
333
334 case PMOPS_FINISH:
335 if (pm_ops && pm_ops->finish) {
336 pm_ops->finish(PM_SUSPEND_DISK);
337 }
338 break;
339
340 default:
341 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
342 error = -EINVAL;
343
344 }
345 break;
346
347 case SNAPSHOT_SET_SWAP_AREA:
348 if (data->bitmap) {
349 error = -EPERM;
350 } else {
351 struct resume_swap_area swap_area;
352 dev_t swdev;
353
354 error = copy_from_user(&swap_area, (void __user *)arg,
355 sizeof(struct resume_swap_area));
356 if (error) {
357 error = -EFAULT;
358 break;
359 }
360
361 /*
362 * User space encodes device types as two-byte values,
363 * so we need to recode them
364 */
365 swdev = old_decode_dev(swap_area.dev);
366 if (swdev) {
367 offset = swap_area.offset;
368 data->swap = swap_type_of(swdev, offset);
369 if (data->swap < 0)
370 error = -ENODEV;
371 } else {
372 data->swap = -1;
373 error = -EINVAL;
374 }
375 }
314 break; 376 break;
315 377
316 default: 378 default:
@@ -321,7 +383,7 @@ OutS3:
321 return error; 383 return error;
322} 384}
323 385
324static struct file_operations snapshot_fops = { 386static const struct file_operations snapshot_fops = {
325 .open = snapshot_open, 387 .open = snapshot_open,
326 .release = snapshot_release, 388 .release = snapshot_release,
327 .read = snapshot_read, 389 .read = snapshot_read,
diff --git a/kernel/printk.c b/kernel/printk.c
index 66426552fbfe..185bb45eacf7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,8 +53,6 @@ int console_printk[4] = {
53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
54}; 54};
55 55
56EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
57
58/* 56/*
59 * Low level drivers may need that to know if they can schedule in 57 * Low level drivers may need that to know if they can schedule in
60 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
@@ -335,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
335 } 333 }
336} 334}
337 335
336static int __read_mostly ignore_loglevel;
337
338int __init ignore_loglevel_setup(char *str)
339{
340 ignore_loglevel = 1;
341 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
342
343 return 1;
344}
345
346__setup("ignore_loglevel", ignore_loglevel_setup);
347
338/* 348/*
339 * Write out chars from start to end - 1 inclusive 349 * Write out chars from start to end - 1 inclusive
340 */ 350 */
341static void _call_console_drivers(unsigned long start, 351static void _call_console_drivers(unsigned long start,
342 unsigned long end, int msg_log_level) 352 unsigned long end, int msg_log_level)
343{ 353{
344 if (msg_log_level < console_loglevel && 354 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
345 console_drivers && start != end) { 355 console_drivers && start != end) {
346 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 356 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
347 /* wrapped write */ 357 /* wrapped write */
@@ -631,12 +641,7 @@ EXPORT_SYMBOL(vprintk);
631 641
632asmlinkage long sys_syslog(int type, char __user *buf, int len) 642asmlinkage long sys_syslog(int type, char __user *buf, int len)
633{ 643{
634 return 0; 644 return -ENOSYS;
635}
636
637int do_syslog(int type, char __user *buf, int len)
638{
639 return 0;
640} 645}
641 646
642static void call_console_drivers(unsigned long start, unsigned long end) 647static void call_console_drivers(unsigned long start, unsigned long end)
@@ -777,7 +782,6 @@ int is_console_locked(void)
777{ 782{
778 return console_locked; 783 return console_locked;
779} 784}
780EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
781 785
782/** 786/**
783 * release_console_sem - unlock the console system 787 * release_console_sem - unlock the console system
diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec9..fb5e03d57e9d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
40 40
41static atomic_t *prof_buffer; 41static atomic_t *prof_buffer;
42static unsigned long prof_len, prof_shift; 42static unsigned long prof_len, prof_shift;
43static int prof_on __read_mostly; 43int prof_on __read_mostly;
44static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 44static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
51static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
52{ 52{
53 static char __initdata schedstr[] = "schedule"; 53 static char __initdata schedstr[] = "schedule";
54 static char __initdata sleepstr[] = "sleep";
54 int par; 55 int par;
55 56
56 if (!strncmp(str, schedstr, strlen(schedstr))) { 57 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
58 prof_on = SLEEP_PROFILING;
59 if (str[strlen(sleepstr)] == ',')
60 str += strlen(sleepstr) + 1;
61 if (get_option(&str, &par))
62 prof_shift = par;
63 printk(KERN_INFO
64 "kernel sleep profiling enabled (shift: %ld)\n",
65 prof_shift);
66 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
57 prof_on = SCHED_PROFILING; 67 prof_on = SCHED_PROFILING;
58 if (str[strlen(schedstr)] == ',') 68 if (str[strlen(schedstr)] == ',')
59 str += strlen(schedstr) + 1; 69 str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
204 * positions to which hits are accounted during short intervals (e.g. 214 * positions to which hits are accounted during short intervals (e.g.
205 * several seconds) is usually very small. Exclusion from buffer 215 * several seconds) is usually very small. Exclusion from buffer
206 * flipping is provided by interrupt disablement (note that for 216 * flipping is provided by interrupt disablement (note that for
207 * SCHED_PROFILING profile_hit() may be called from process context). 217 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
218 * process context).
208 * The hash function is meant to be lightweight as opposed to strong, 219 * The hash function is meant to be lightweight as opposed to strong,
209 * and was vaguely inspired by ppc64 firmware-supported inverted 220 * and was vaguely inspired by ppc64 firmware-supported inverted
210 * pagetable hash functions, but uses a full hashtable full of finite 221 * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
257 mutex_unlock(&profile_flip_mutex); 268 mutex_unlock(&profile_flip_mutex);
258} 269}
259 270
260void profile_hit(int type, void *__pc) 271void profile_hits(int type, void *__pc, unsigned int nr_hits)
261{ 272{
262 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 273 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
263 int i, j, cpu; 274 int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
274 put_cpu(); 285 put_cpu();
275 return; 286 return;
276 } 287 }
288 /*
289 * We buffer the global profiler buffer into a per-CPU
290 * queue and thus reduce the number of global (and possibly
291 * NUMA-alien) accesses. The write-queue is self-coalescing:
292 */
277 local_irq_save(flags); 293 local_irq_save(flags);
278 do { 294 do {
279 for (j = 0; j < PROFILE_GRPSZ; ++j) { 295 for (j = 0; j < PROFILE_GRPSZ; ++j) {
280 if (hits[i + j].pc == pc) { 296 if (hits[i + j].pc == pc) {
281 hits[i + j].hits++; 297 hits[i + j].hits += nr_hits;
282 goto out; 298 goto out;
283 } else if (!hits[i + j].hits) { 299 } else if (!hits[i + j].hits) {
284 hits[i + j].pc = pc; 300 hits[i + j].pc = pc;
285 hits[i + j].hits = 1; 301 hits[i + j].hits = nr_hits;
286 goto out; 302 goto out;
287 } 303 }
288 } 304 }
289 i = (i + secondary) & (NR_PROFILE_HIT - 1); 305 i = (i + secondary) & (NR_PROFILE_HIT - 1);
290 } while (i != primary); 306 } while (i != primary);
291 atomic_inc(&prof_buffer[pc]); 307
308 /*
309 * Add the current hit(s) and flush the write-queue out
310 * to the global buffer:
311 */
312 atomic_add(nr_hits, &prof_buffer[pc]);
292 for (i = 0; i < NR_PROFILE_HIT; ++i) { 313 for (i = 0; i < NR_PROFILE_HIT; ++i) {
293 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); 314 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
294 hits[i].pc = hits[i].hits = 0; 315 hits[i].pc = hits[i].hits = 0;
@@ -298,7 +319,6 @@ out:
298 put_cpu(); 319 put_cpu();
299} 320}
300 321
301#ifdef CONFIG_HOTPLUG_CPU
302static int __devinit profile_cpu_callback(struct notifier_block *info, 322static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 323 unsigned long action, void *__cpu)
304{ 324{
@@ -351,19 +371,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
351 } 371 }
352 return NOTIFY_OK; 372 return NOTIFY_OK;
353} 373}
354#endif /* CONFIG_HOTPLUG_CPU */
355#else /* !CONFIG_SMP */ 374#else /* !CONFIG_SMP */
356#define profile_flip_buffers() do { } while (0) 375#define profile_flip_buffers() do { } while (0)
357#define profile_discard_flip_buffers() do { } while (0) 376#define profile_discard_flip_buffers() do { } while (0)
377#define profile_cpu_callback NULL
358 378
359void profile_hit(int type, void *__pc) 379void profile_hits(int type, void *__pc, unsigned int nr_hits)
360{ 380{
361 unsigned long pc; 381 unsigned long pc;
362 382
363 if (prof_on != type || !prof_buffer) 383 if (prof_on != type || !prof_buffer)
364 return; 384 return;
365 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 385 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
366 atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); 386 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
367} 387}
368#endif /* !CONFIG_SMP */ 388#endif /* !CONFIG_SMP */
369 389
@@ -442,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
442 read = 0; 462 read = 0;
443 463
444 while (p < sizeof(unsigned int) && count > 0) { 464 while (p < sizeof(unsigned int) && count > 0) {
445 put_user(*((char *)(&sample_step)+p),buf); 465 if (put_user(*((char *)(&sample_step)+p),buf))
466 return -EFAULT;
446 buf++; p++; count--; read++; 467 buf++; p++; count--; read++;
447 } 468 }
448 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 469 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
@@ -480,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
480 return count; 501 return count;
481} 502}
482 503
483static struct file_operations proc_profile_operations = { 504static const struct file_operations proc_profile_operations = {
484 .read = read_profile, 505 .read = read_profile,
485 .write = write_profile, 506 .write = write_profile,
486}; 507};
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef1..3554b76da84c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
235 235
236 list = rdp->donelist; 236 list = rdp->donelist;
237 while (list) { 237 while (list) {
238 next = rdp->donelist = list->next; 238 next = list->next;
239 prefetch(next);
239 list->func(list); 240 list->func(list);
240 list = next; 241 list = next;
241 if (++count >= rdp->blimit) 242 if (++count >= rdp->blimit)
242 break; 243 break;
243 } 244 }
245 rdp->donelist = list;
244 246
245 local_irq_disable(); 247 local_irq_disable();
246 rdp->qlen -= count; 248 rdp->qlen -= count;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f42..c52f981ea008 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
401 cleanup_srcu_struct(&srcu_ctl); 401 cleanup_srcu_struct(&srcu_ctl);
402} 402}
403 403
404static int srcu_torture_read_lock(void) 404static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
405{ 405{
406 return srcu_read_lock(&srcu_ctl); 406 return srcu_read_lock(&srcu_ctl);
407} 407}
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
419 schedule_timeout_interruptible(longdelay); 419 schedule_timeout_interruptible(longdelay);
420} 420}
421 421
422static void srcu_torture_read_unlock(int idx) 422static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
423{ 423{
424 srcu_read_unlock(&srcu_ctl, idx); 424 srcu_read_unlock(&srcu_ctl, idx);
425} 425}
diff --git a/kernel/relay.c b/kernel/relay.c
index 2b92e8ece85b..75a3a9a7efc2 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1013,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
1013 actor, &desc); 1013 actor, &desc);
1014} 1014}
1015 1015
1016struct file_operations relay_file_operations = { 1016const struct file_operations relay_file_operations = {
1017 .open = relay_file_open, 1017 .open = relay_file_open,
1018 .poll = relay_file_poll, 1018 .poll = relay_file_poll,
1019 .mmap = relay_file_mmap, 1019 .mmap = relay_file_mmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143e..7b9a497419d9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
88 return 0; 88 return 0;
89} 89}
90 90
91static struct seq_operations resource_op = { 91static const struct seq_operations resource_op = {
92 .start = r_start, 92 .start = r_start,
93 .next = r_next, 93 .next = r_next,
94 .stop = r_stop, 94 .stop = r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
115 return res; 115 return res;
116} 116}
117 117
118static struct file_operations proc_ioports_operations = { 118static const struct file_operations proc_ioports_operations = {
119 .open = ioports_open, 119 .open = ioports_open,
120 .read = seq_read, 120 .read = seq_read,
121 .llseek = seq_lseek, 121 .llseek = seq_lseek,
122 .release = seq_release, 122 .release = seq_release,
123}; 123};
124 124
125static struct file_operations proc_iomem_operations = { 125static const struct file_operations proc_iomem_operations = {
126 .open = iomem_open, 126 .open = iomem_open,
127 .read = seq_read, 127 .read = seq_read,
128 .llseek = seq_lseek, 128 .llseek = seq_lseek,
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c94..015fc633c96c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/sysdev.h> 14#include <linux/sysdev.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h>
16 17
17#include "rtmutex.h" 18#include "rtmutex.h"
18 19
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701c680e..f385eff4682d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/profile.h> 36#include <linux/profile.h>
37#include <linux/suspend.h> 37#include <linux/freezer.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
@@ -505,7 +505,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
505 return res; 505 return res;
506} 506}
507 507
508struct file_operations proc_schedstat_operations = { 508const struct file_operations proc_schedstat_operations = {
509 .open = schedstat_open, 509 .open = schedstat_open,
510 .read = seq_read, 510 .read = seq_read,
511 .llseek = seq_lseek, 511 .llseek = seq_lseek,
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
948 } 948 }
949#endif 949#endif
950 950
951 /*
952 * Sleep time is in units of nanosecs, so shift by 20 to get a
953 * milliseconds-range estimation of the amount of time that the task
954 * spent sleeping:
955 */
956 if (unlikely(prof_on == SLEEP_PROFILING)) {
957 if (p->state == TASK_UNINTERRUPTIBLE)
958 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
959 (now - p->timestamp) >> 20);
960 }
961
951 if (!rt_task(p)) 962 if (!rt_task(p))
952 p->prio = recalc_task_prio(p, now); 963 p->prio = recalc_task_prio(p, now);
953 964
@@ -3333,6 +3344,7 @@ asmlinkage void __sched schedule(void)
3333 printk(KERN_ERR "BUG: scheduling while atomic: " 3344 printk(KERN_ERR "BUG: scheduling while atomic: "
3334 "%s/0x%08x/%d\n", 3345 "%s/0x%08x/%d\n",
3335 current->comm, preempt_count(), current->pid); 3346 current->comm, preempt_count(), current->pid);
3347 debug_show_held_locks(current);
3336 dump_stack(); 3348 dump_stack();
3337 } 3349 }
3338 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3350 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4804,18 +4816,18 @@ static void show_task(struct task_struct *p)
4804 show_stack(p, NULL); 4816 show_stack(p, NULL);
4805} 4817}
4806 4818
4807void show_state(void) 4819void show_state_filter(unsigned long state_filter)
4808{ 4820{
4809 struct task_struct *g, *p; 4821 struct task_struct *g, *p;
4810 4822
4811#if (BITS_PER_LONG == 32) 4823#if (BITS_PER_LONG == 32)
4812 printk("\n" 4824 printk("\n"
4813 " sibling\n"); 4825 " free sibling\n");
4814 printk(" task PC pid father child younger older\n"); 4826 printk(" task PC stack pid father child younger older\n");
4815#else 4827#else
4816 printk("\n" 4828 printk("\n"
4817 " sibling\n"); 4829 " free sibling\n");
4818 printk(" task PC pid father child younger older\n"); 4830 printk(" task PC stack pid father child younger older\n");
4819#endif 4831#endif
4820 read_lock(&tasklist_lock); 4832 read_lock(&tasklist_lock);
4821 do_each_thread(g, p) { 4833 do_each_thread(g, p) {
@@ -4824,11 +4836,16 @@ void show_state(void)
4824 * console might take alot of time: 4836 * console might take alot of time:
4825 */ 4837 */
4826 touch_nmi_watchdog(); 4838 touch_nmi_watchdog();
4827 show_task(p); 4839 if (p->state & state_filter)
4840 show_task(p);
4828 } while_each_thread(g, p); 4841 } while_each_thread(g, p);
4829 4842
4830 read_unlock(&tasklist_lock); 4843 read_unlock(&tasklist_lock);
4831 debug_show_all_locks(); 4844 /*
4845 * Only show locks if all tasks are dumped:
4846 */
4847 if (state_filter == -1)
4848 debug_show_all_locks();
4832} 4849}
4833 4850
4834/** 4851/**
@@ -6723,8 +6740,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6723 sched_smt_power_savings_store); 6740 sched_smt_power_savings_store);
6724#endif 6741#endif
6725 6742
6726
6727#ifdef CONFIG_HOTPLUG_CPU
6728/* 6743/*
6729 * Force a reinitialization of the sched domains hierarchy. The domains 6744 * Force a reinitialization of the sched domains hierarchy. The domains
6730 * and groups cannot be updated in place without racing with the balancing 6745 * and groups cannot be updated in place without racing with the balancing
@@ -6757,7 +6772,6 @@ static int update_sched_domains(struct notifier_block *nfb,
6757 6772
6758 return NOTIFY_OK; 6773 return NOTIFY_OK;
6759} 6774}
6760#endif
6761 6775
6762void __init sched_init_smp(void) 6776void __init sched_init_smp(void)
6763{ 6777{
@@ -6867,6 +6881,7 @@ void __might_sleep(char *file, int line)
6867 " context at %s:%d\n", file, line); 6881 " context at %s:%d\n", file, line);
6868 printk("in_atomic():%d, irqs_disabled():%d\n", 6882 printk("in_atomic():%d, irqs_disabled():%d\n",
6869 in_atomic(), irqs_disabled()); 6883 in_atomic(), irqs_disabled());
6884 debug_show_held_locks(current);
6870 dump_stack(); 6885 dump_stack();
6871 } 6886 }
6872#endif 6887#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index df18c167a2a7..ec81defde339 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
23#include <linux/ptrace.h> 23#include <linux/ptrace.h>
24#include <linux/signal.h> 24#include <linux/signal.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/freezer.h>
26#include <asm/param.h> 27#include <asm/param.h>
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28#include <asm/unistd.h> 29#include <asm/unistd.h>
@@ -33,7 +34,7 @@
33 * SLAB caches for signal bits. 34 * SLAB caches for signal bits.
34 */ 35 */
35 36
36static kmem_cache_t *sigqueue_cachep; 37static struct kmem_cache *sigqueue_cachep;
37 38
38/* 39/*
39 * In POSIX a signal is sent either to a specific thread (Linux task) 40 * In POSIX a signal is sent either to a specific thread (Linux task)
@@ -1133,8 +1134,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1133 return error; 1134 return error;
1134} 1135}
1135 1136
1136int 1137static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1137kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1138{ 1138{
1139 int error; 1139 int error;
1140 rcu_read_lock(); 1140 rcu_read_lock();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce16..918e52df090e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
574 574
575 switch (action) { 575 switch (action) {
576 case CPU_UP_PREPARE: 576 case CPU_UP_PREPARE:
577 BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
578 BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
579 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 577 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
580 if (IS_ERR(p)) { 578 if (IS_ERR(p)) {
581 printk("ksoftirqd for %i failed\n", hotcpu); 579 printk("ksoftirqd for %i failed\n", hotcpu);
diff --git a/kernel/sys.c b/kernel/sys.c
index c87b461de38d..a0c1a29a507f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1102asmlinkage long sys_setuid(uid_t uid) 1102asmlinkage long sys_setuid(uid_t uid)
1103{ 1103{
1104 int old_euid = current->euid; 1104 int old_euid = current->euid;
1105 int old_ruid, old_suid, new_ruid, new_suid; 1105 int old_ruid, old_suid, new_suid;
1106 int retval; 1106 int retval;
1107 1107
1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
1109 if (retval) 1109 if (retval)
1110 return retval; 1110 return retval;
1111 1111
1112 old_ruid = new_ruid = current->uid; 1112 old_ruid = current->uid;
1113 old_suid = current->suid; 1113 old_suid = current->suid;
1114 new_suid = old_suid; 1114 new_suid = old_suid;
1115 1115
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 09e569f4792b..8e9f00fd6d18 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
54 54
55#ifdef CONFIG_X86 55#ifdef CONFIG_X86
56#include <asm/nmi.h> 56#include <asm/nmi.h>
57#include <asm/stacktrace.h>
57#endif 58#endif
58 59
59#if defined(CONFIG_SYSCTL) 60#if defined(CONFIG_SYSCTL)
@@ -170,7 +171,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
170static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 171static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
171static int proc_opensys(struct inode *, struct file *); 172static int proc_opensys(struct inode *, struct file *);
172 173
173struct file_operations proc_sys_file_operations = { 174const struct file_operations proc_sys_file_operations = {
174 .open = proc_opensys, 175 .open = proc_opensys,
175 .read = proc_readsys, 176 .read = proc_readsys,
176 .write = proc_writesys, 177 .write = proc_writesys,
@@ -707,6 +708,14 @@ static ctl_table kern_table[] = {
707 .mode = 0444, 708 .mode = 0444,
708 .proc_handler = &proc_dointvec, 709 .proc_handler = &proc_dointvec,
709 }, 710 },
711 {
712 .ctl_name = CTL_UNNUMBERED,
713 .procname = "kstack_depth_to_print",
714 .data = &kstack_depth_to_print,
715 .maxlen = sizeof(int),
716 .mode = 0644,
717 .proc_handler = &proc_dointvec,
718 },
710#endif 719#endif
711#if defined(CONFIG_MMU) 720#if defined(CONFIG_MMU)
712 { 721 {
@@ -977,17 +986,6 @@ static ctl_table vm_table[] = {
977 .extra1 = &zero, 986 .extra1 = &zero,
978 }, 987 },
979#endif 988#endif
980#ifdef CONFIG_SWAP
981 {
982 .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
983 .procname = "swap_token_timeout",
984 .data = &swap_token_default_timeout,
985 .maxlen = sizeof(swap_token_default_timeout),
986 .mode = 0644,
987 .proc_handler = &proc_dointvec_jiffies,
988 .strategy = &sysctl_jiffies,
989 },
990#endif
991#ifdef CONFIG_NUMA 989#ifdef CONFIG_NUMA
992 { 990 {
993 .ctl_name = VM_ZONE_RECLAIM_MODE, 991 .ctl_name = VM_ZONE_RECLAIM_MODE,
@@ -1886,7 +1884,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1886 p = buf; 1884 p = buf;
1887 if (*p == '-' && left > 1) { 1885 if (*p == '-' && left > 1) {
1888 neg = 1; 1886 neg = 1;
1889 left--, p++; 1887 p++;
1890 } 1888 }
1891 if (*p < '0' || *p > '9') 1889 if (*p < '0' || *p > '9')
1892 break; 1890 break;
@@ -2137,7 +2135,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2137 p = buf; 2135 p = buf;
2138 if (*p == '-' && left > 1) { 2136 if (*p == '-' && left > 1) {
2139 neg = 1; 2137 neg = 1;
2140 left--, p++; 2138 p++;
2141 } 2139 }
2142 if (*p < '0' || *p > '9') 2140 if (*p < '0' || *p > '9')
2143 break; 2141 break;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d3d28919d4b4..4c3476fa058d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
34 34
35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
36static int family_registered; 36static int family_registered;
37kmem_cache_t *taskstats_cache; 37struct kmem_cache *taskstats_cache;
38 38
39static struct genl_family family = { 39static struct genl_family family = {
40 .id = GENL_ID_GENERATE, 40 .id = GENL_ID_GENERATE,
@@ -69,7 +69,7 @@ enum actions {
69}; 69};
70 70
71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
72 void **replyp, size_t size) 72 size_t size)
73{ 73{
74 struct sk_buff *skb; 74 struct sk_buff *skb;
75 void *reply; 75 void *reply;
@@ -94,7 +94,6 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
94 } 94 }
95 95
96 *skbp = skb; 96 *skbp = skb;
97 *replyp = reply;
98 return 0; 97 return 0;
99} 98}
100 99
@@ -119,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
119/* 118/*
120 * Send taskstats data in @skb to listeners registered for @cpu's exit data 119 * Send taskstats data in @skb to listeners registered for @cpu's exit data
121 */ 120 */
122static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 121static void send_cpu_listeners(struct sk_buff *skb,
122 struct listener_list *listeners)
123{ 123{
124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
125 struct listener_list *listeners;
126 struct listener *s, *tmp; 125 struct listener *s, *tmp;
127 struct sk_buff *skb_next, *skb_cur = skb; 126 struct sk_buff *skb_next, *skb_cur = skb;
128 void *reply = genlmsg_data(genlhdr); 127 void *reply = genlmsg_data(genlhdr);
@@ -135,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
135 } 134 }
136 135
137 rc = 0; 136 rc = 0;
138 listeners = &per_cpu(listener_array, cpu);
139 down_read(&listeners->sem); 137 down_read(&listeners->sem);
140 list_for_each_entry(s, &listeners->list, list) { 138 list_for_each_entry(s, &listeners->list, list) {
141 skb_next = NULL; 139 skb_next = NULL;
@@ -186,6 +184,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
186 } else 184 } else
187 get_task_struct(tsk); 185 get_task_struct(tsk);
188 186
187 memset(stats, 0, sizeof(*stats));
189 /* 188 /*
190 * Each accounting subsystem adds calls to its functions to 189 * Each accounting subsystem adds calls to its functions to
191 * fill in relevant parts of struct taskstsats as follows 190 * fill in relevant parts of struct taskstsats as follows
@@ -228,6 +227,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 227
229 if (first->signal->stats) 228 if (first->signal->stats)
230 memcpy(stats, first->signal->stats, sizeof(*stats)); 229 memcpy(stats, first->signal->stats, sizeof(*stats));
230 else
231 memset(stats, 0, sizeof(*stats));
231 232
232 tsk = first; 233 tsk = first;
233 do { 234 do {
@@ -344,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask)
344 return ret; 345 return ret;
345} 346}
346 347
348static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
349{
350 struct nlattr *na, *ret;
351 int aggr;
352
353 aggr = (type == TASKSTATS_TYPE_PID)
354 ? TASKSTATS_TYPE_AGGR_PID
355 : TASKSTATS_TYPE_AGGR_TGID;
356
357 na = nla_nest_start(skb, aggr);
358 if (!na)
359 goto err;
360 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
361 goto err;
362 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
363 if (!ret)
364 goto err;
365 nla_nest_end(skb, na);
366
367 return nla_data(ret);
368err:
369 return NULL;
370}
371
347static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 372static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
348{ 373{
349 int rc = 0; 374 int rc = 0;
350 struct sk_buff *rep_skb; 375 struct sk_buff *rep_skb;
351 struct taskstats stats; 376 struct taskstats *stats;
352 void *reply;
353 size_t size; 377 size_t size;
354 struct nlattr *na;
355 cpumask_t mask; 378 cpumask_t mask;
356 379
357 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 380 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
@@ -372,83 +395,71 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
372 size = nla_total_size(sizeof(u32)) + 395 size = nla_total_size(sizeof(u32)) +
373 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 396 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
374 397
375 memset(&stats, 0, sizeof(stats)); 398 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
376 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
377 if (rc < 0) 399 if (rc < 0)
378 return rc; 400 return rc;
379 401
402 rc = -EINVAL;
380 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 403 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
381 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 404 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
382 rc = fill_pid(pid, NULL, &stats); 405 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
383 if (rc < 0) 406 if (!stats)
384 goto err; 407 goto err;
385 408
386 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 409 rc = fill_pid(pid, NULL, stats);
387 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); 410 if (rc < 0)
388 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 411 goto err;
389 stats);
390 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 412 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
391 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 413 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
392 rc = fill_tgid(tgid, NULL, &stats); 414 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
393 if (rc < 0) 415 if (!stats)
394 goto err; 416 goto err;
395 417
396 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 418 rc = fill_tgid(tgid, NULL, stats);
397 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); 419 if (rc < 0)
398 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 420 goto err;
399 stats); 421 } else
400 } else {
401 rc = -EINVAL;
402 goto err; 422 goto err;
403 }
404
405 nla_nest_end(rep_skb, na);
406 423
407 return send_reply(rep_skb, info->snd_pid); 424 return send_reply(rep_skb, info->snd_pid);
408
409nla_put_failure:
410 rc = genlmsg_cancel(rep_skb, reply);
411err: 425err:
412 nlmsg_free(rep_skb); 426 nlmsg_free(rep_skb);
413 return rc; 427 return rc;
414} 428}
415 429
416void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 430static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
417{ 431{
418 struct listener_list *listeners; 432 struct signal_struct *sig = tsk->signal;
419 struct taskstats *tmp; 433 struct taskstats *stats;
420 /*
421 * This is the cpu on which the task is exiting currently and will
422 * be the one for which the exit event is sent, even if the cpu
423 * on which this function is running changes later.
424 */
425 *mycpu = raw_smp_processor_id();
426 434
427 *ptidstats = NULL; 435 if (sig->stats || thread_group_empty(tsk))
428 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 436 goto ret;
429 if (!tmp)
430 return;
431 437
432 listeners = &per_cpu(listener_array, *mycpu); 438 /* No problem if kmem_cache_zalloc() fails */
433 down_read(&listeners->sem); 439 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
434 if (!list_empty(&listeners->list)) { 440
435 *ptidstats = tmp; 441 spin_lock_irq(&tsk->sighand->siglock);
436 tmp = NULL; 442 if (!sig->stats) {
443 sig->stats = stats;
444 stats = NULL;
437 } 445 }
438 up_read(&listeners->sem); 446 spin_unlock_irq(&tsk->sighand->siglock);
439 kfree(tmp); 447
448 if (stats)
449 kmem_cache_free(taskstats_cache, stats);
450ret:
451 return sig->stats;
440} 452}
441 453
442/* Send pid data out on exit */ 454/* Send pid data out on exit */
443void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 455void taskstats_exit(struct task_struct *tsk, int group_dead)
444 int group_dead, unsigned int mycpu)
445{ 456{
446 int rc; 457 int rc;
458 struct listener_list *listeners;
459 struct taskstats *stats;
447 struct sk_buff *rep_skb; 460 struct sk_buff *rep_skb;
448 void *reply;
449 size_t size; 461 size_t size;
450 int is_thread_group; 462 int is_thread_group;
451 struct nlattr *na;
452 463
453 if (!family_registered) 464 if (!family_registered)
454 return; 465 return;
@@ -459,7 +470,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
459 size = nla_total_size(sizeof(u32)) + 470 size = nla_total_size(sizeof(u32)) +
460 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 471 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
461 472
462 is_thread_group = (tsk->signal->stats != NULL); 473 is_thread_group = !!taskstats_tgid_alloc(tsk);
463 if (is_thread_group) { 474 if (is_thread_group) {
464 /* PID + STATS + TGID + STATS */ 475 /* PID + STATS + TGID + STATS */
465 size = 2 * size; 476 size = 2 * size;
@@ -467,49 +478,39 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
467 fill_tgid_exit(tsk); 478 fill_tgid_exit(tsk);
468 } 479 }
469 480
470 if (!tidstats) 481 listeners = &__raw_get_cpu_var(listener_array);
482 if (list_empty(&listeners->list))
471 return; 483 return;
472 484
473 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); 485 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
474 if (rc < 0)
475 goto ret;
476
477 rc = fill_pid(tsk->pid, tsk, tidstats);
478 if (rc < 0) 486 if (rc < 0)
479 goto err_skb; 487 return;
480 488
481 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 489 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
482 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); 490 if (!stats)
483 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 491 goto err;
484 *tidstats);
485 nla_nest_end(rep_skb, na);
486 492
487 if (!is_thread_group) 493 rc = fill_pid(tsk->pid, tsk, stats);
488 goto send; 494 if (rc < 0)
495 goto err;
489 496
490 /* 497 /*
491 * Doesn't matter if tsk is the leader or the last group member leaving 498 * Doesn't matter if tsk is the leader or the last group member leaving
492 */ 499 */
493 if (!group_dead) 500 if (!is_thread_group || !group_dead)
494 goto send; 501 goto send;
495 502
496 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 503 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
497 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); 504 if (!stats)
498 /* No locking needed for tsk->signal->stats since group is dead */ 505 goto err;
499 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 506
500 *tsk->signal->stats); 507 memcpy(stats, tsk->signal->stats, sizeof(*stats));
501 nla_nest_end(rep_skb, na);
502 508
503send: 509send:
504 send_cpu_listeners(rep_skb, mycpu); 510 send_cpu_listeners(rep_skb, listeners);
505 return; 511 return;
506 512err:
507nla_put_failure:
508 genlmsg_cancel(rep_skb, reply);
509err_skb:
510 nlmsg_free(rep_skb); 513 nlmsg_free(rep_skb);
511ret:
512 return;
513} 514}
514 515
515static struct genl_ops taskstats_ops = { 516static struct genl_ops taskstats_ops = {
diff --git a/kernel/unwind.c b/kernel/unwind.c
index ed0a21d4a902..09c261329249 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -14,11 +14,12 @@
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/sort.h> 15#include <linux/sort.h>
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
17#include <linux/uaccess.h>
17#include <asm/sections.h> 18#include <asm/sections.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include <asm/unaligned.h> 20#include <asm/unaligned.h>
20 21
21extern char __start_unwind[], __end_unwind[]; 22extern const char __start_unwind[], __end_unwind[];
22extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; 23extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
23 24
24#define MAX_STACK_DEPTH 8 25#define MAX_STACK_DEPTH 8
@@ -94,6 +95,7 @@ static const struct {
94 95
95typedef unsigned long uleb128_t; 96typedef unsigned long uleb128_t;
96typedef signed long sleb128_t; 97typedef signed long sleb128_t;
98#define sleb128abs __builtin_labs
97 99
98static struct unwind_table { 100static struct unwind_table {
99 struct { 101 struct {
@@ -135,6 +137,17 @@ struct unwind_state {
135 137
136static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; 138static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
137 139
140static unsigned unwind_debug;
141static int __init unwind_debug_setup(char *s)
142{
143 unwind_debug = simple_strtoul(s, NULL, 0);
144 return 1;
145}
146__setup("unwind_debug=", unwind_debug_setup);
147#define dprintk(lvl, fmt, args...) \
148 ((void)(lvl > unwind_debug \
149 || printk(KERN_DEBUG "unwind: " fmt "\n", ##args)))
150
138static struct unwind_table *find_table(unsigned long pc) 151static struct unwind_table *find_table(unsigned long pc)
139{ 152{
140 struct unwind_table *table; 153 struct unwind_table *table;
@@ -151,7 +164,9 @@ static struct unwind_table *find_table(unsigned long pc)
151 164
152static unsigned long read_pointer(const u8 **pLoc, 165static unsigned long read_pointer(const u8 **pLoc,
153 const void *end, 166 const void *end,
154 signed ptrType); 167 signed ptrType,
168 unsigned long text_base,
169 unsigned long data_base);
155 170
156static void init_unwind_table(struct unwind_table *table, 171static void init_unwind_table(struct unwind_table *table,
157 const char *name, 172 const char *name,
@@ -176,10 +191,13 @@ static void init_unwind_table(struct unwind_table *table,
176 /* See if the linker provided table looks valid. */ 191 /* See if the linker provided table looks valid. */
177 if (header_size <= 4 192 if (header_size <= 4
178 || header_start[0] != 1 193 || header_start[0] != 1
179 || (void *)read_pointer(&ptr, end, header_start[1]) != table_start 194 || (void *)read_pointer(&ptr, end, header_start[1], 0, 0)
180 || header_start[2] == DW_EH_PE_omit 195 != table_start
181 || read_pointer(&ptr, end, header_start[2]) <= 0 196 || !read_pointer(&ptr, end, header_start[2], 0, 0)
182 || header_start[3] == DW_EH_PE_omit) 197 || !read_pointer(&ptr, end, header_start[3], 0,
198 (unsigned long)header_start)
199 || !read_pointer(&ptr, end, header_start[3], 0,
200 (unsigned long)header_start))
183 header_start = NULL; 201 header_start = NULL;
184 table->hdrsz = header_size; 202 table->hdrsz = header_size;
185 smp_wmb(); 203 smp_wmb();
@@ -269,7 +287,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
269 ptr = (const u8 *)(fde + 2); 287 ptr = (const u8 *)(fde + 2);
270 if (!read_pointer(&ptr, 288 if (!read_pointer(&ptr,
271 (const u8 *)(fde + 1) + *fde, 289 (const u8 *)(fde + 1) + *fde,
272 ptrType)) 290 ptrType, 0, 0))
273 return; 291 return;
274 ++n; 292 ++n;
275 } 293 }
@@ -279,6 +297,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
279 297
280 hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) 298 hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
281 + 2 * n * sizeof(unsigned long); 299 + 2 * n * sizeof(unsigned long);
300 dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize);
282 header = alloc(hdrSize); 301 header = alloc(hdrSize);
283 if (!header) 302 if (!header)
284 return; 303 return;
@@ -303,7 +322,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
303 ptr = (const u8 *)(fde + 2); 322 ptr = (const u8 *)(fde + 2);
304 header->table[n].start = read_pointer(&ptr, 323 header->table[n].start = read_pointer(&ptr,
305 (const u8 *)(fde + 1) + *fde, 324 (const u8 *)(fde + 1) + *fde,
306 fde_pointer_type(cie)); 325 fde_pointer_type(cie), 0, 0);
307 header->table[n].fde = (unsigned long)fde; 326 header->table[n].fde = (unsigned long)fde;
308 ++n; 327 ++n;
309 } 328 }
@@ -486,7 +505,9 @@ static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
486 505
487static unsigned long read_pointer(const u8 **pLoc, 506static unsigned long read_pointer(const u8 **pLoc,
488 const void *end, 507 const void *end,
489 signed ptrType) 508 signed ptrType,
509 unsigned long text_base,
510 unsigned long data_base)
490{ 511{
491 unsigned long value = 0; 512 unsigned long value = 0;
492 union { 513 union {
@@ -498,13 +519,17 @@ static unsigned long read_pointer(const u8 **pLoc,
498 const unsigned long *pul; 519 const unsigned long *pul;
499 } ptr; 520 } ptr;
500 521
501 if (ptrType < 0 || ptrType == DW_EH_PE_omit) 522 if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
523 dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end);
502 return 0; 524 return 0;
525 }
503 ptr.p8 = *pLoc; 526 ptr.p8 = *pLoc;
504 switch(ptrType & DW_EH_PE_FORM) { 527 switch(ptrType & DW_EH_PE_FORM) {
505 case DW_EH_PE_data2: 528 case DW_EH_PE_data2:
506 if (end < (const void *)(ptr.p16u + 1)) 529 if (end < (const void *)(ptr.p16u + 1)) {
530 dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end);
507 return 0; 531 return 0;
532 }
508 if(ptrType & DW_EH_PE_signed) 533 if(ptrType & DW_EH_PE_signed)
509 value = get_unaligned(ptr.p16s++); 534 value = get_unaligned(ptr.p16s++);
510 else 535 else
@@ -512,8 +537,10 @@ static unsigned long read_pointer(const u8 **pLoc,
512 break; 537 break;
513 case DW_EH_PE_data4: 538 case DW_EH_PE_data4:
514#ifdef CONFIG_64BIT 539#ifdef CONFIG_64BIT
515 if (end < (const void *)(ptr.p32u + 1)) 540 if (end < (const void *)(ptr.p32u + 1)) {
541 dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end);
516 return 0; 542 return 0;
543 }
517 if(ptrType & DW_EH_PE_signed) 544 if(ptrType & DW_EH_PE_signed)
518 value = get_unaligned(ptr.p32s++); 545 value = get_unaligned(ptr.p32s++);
519 else 546 else
@@ -525,8 +552,10 @@ static unsigned long read_pointer(const u8 **pLoc,
525 BUILD_BUG_ON(sizeof(u32) != sizeof(value)); 552 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
526#endif 553#endif
527 case DW_EH_PE_native: 554 case DW_EH_PE_native:
528 if (end < (const void *)(ptr.pul + 1)) 555 if (end < (const void *)(ptr.pul + 1)) {
556 dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end);
529 return 0; 557 return 0;
558 }
530 value = get_unaligned(ptr.pul++); 559 value = get_unaligned(ptr.pul++);
531 break; 560 break;
532 case DW_EH_PE_leb128: 561 case DW_EH_PE_leb128:
@@ -534,10 +563,14 @@ static unsigned long read_pointer(const u8 **pLoc,
534 value = ptrType & DW_EH_PE_signed 563 value = ptrType & DW_EH_PE_signed
535 ? get_sleb128(&ptr.p8, end) 564 ? get_sleb128(&ptr.p8, end)
536 : get_uleb128(&ptr.p8, end); 565 : get_uleb128(&ptr.p8, end);
537 if ((const void *)ptr.p8 > end) 566 if ((const void *)ptr.p8 > end) {
567 dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end);
538 return 0; 568 return 0;
569 }
539 break; 570 break;
540 default: 571 default:
572 dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
573 ptrType, ptr.p8, end);
541 return 0; 574 return 0;
542 } 575 }
543 switch(ptrType & DW_EH_PE_ADJUST) { 576 switch(ptrType & DW_EH_PE_ADJUST) {
@@ -546,12 +579,33 @@ static unsigned long read_pointer(const u8 **pLoc,
546 case DW_EH_PE_pcrel: 579 case DW_EH_PE_pcrel:
547 value += (unsigned long)*pLoc; 580 value += (unsigned long)*pLoc;
548 break; 581 break;
582 case DW_EH_PE_textrel:
583 if (likely(text_base)) {
584 value += text_base;
585 break;
586 }
587 dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
588 ptrType, *pLoc, end);
589 return 0;
590 case DW_EH_PE_datarel:
591 if (likely(data_base)) {
592 value += data_base;
593 break;
594 }
595 dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
596 ptrType, *pLoc, end);
597 return 0;
549 default: 598 default:
599 dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
600 ptrType, *pLoc, end);
550 return 0; 601 return 0;
551 } 602 }
552 if ((ptrType & DW_EH_PE_indirect) 603 if ((ptrType & DW_EH_PE_indirect)
553 && __get_user(value, (unsigned long *)value)) 604 && probe_kernel_address((unsigned long *)value, value)) {
605 dprintk(1, "Cannot read indirect value %lx (%p,%p).",
606 value, *pLoc, end);
554 return 0; 607 return 0;
608 }
555 *pLoc = ptr.p8; 609 *pLoc = ptr.p8;
556 610
557 return value; 611 return value;
@@ -594,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie)
594 case 'P': { 648 case 'P': {
595 signed ptrType = *ptr++; 649 signed ptrType = *ptr++;
596 650
597 if (!read_pointer(&ptr, end, ptrType) || ptr > end) 651 if (!read_pointer(&ptr, end, ptrType, 0, 0)
652 || ptr > end)
598 return -1; 653 return -1;
599 } 654 }
600 break; 655 break;
@@ -654,7 +709,8 @@ static int processCFI(const u8 *start,
654 case DW_CFA_nop: 709 case DW_CFA_nop:
655 break; 710 break;
656 case DW_CFA_set_loc: 711 case DW_CFA_set_loc:
657 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) 712 state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0);
713 if (state->loc == 0)
658 result = 0; 714 result = 0;
659 break; 715 break;
660 case DW_CFA_advance_loc1: 716 case DW_CFA_advance_loc1:
@@ -700,8 +756,10 @@ static int processCFI(const u8 *start,
700 state->label = NULL; 756 state->label = NULL;
701 return 1; 757 return 1;
702 } 758 }
703 if (state->stackDepth >= MAX_STACK_DEPTH) 759 if (state->stackDepth >= MAX_STACK_DEPTH) {
760 dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end);
704 return 0; 761 return 0;
762 }
705 state->stack[state->stackDepth++] = ptr.p8; 763 state->stack[state->stackDepth++] = ptr.p8;
706 break; 764 break;
707 case DW_CFA_restore_state: 765 case DW_CFA_restore_state:
@@ -716,8 +774,10 @@ static int processCFI(const u8 *start,
716 result = processCFI(start, end, 0, ptrType, state); 774 result = processCFI(start, end, 0, ptrType, state);
717 state->loc = loc; 775 state->loc = loc;
718 state->label = label; 776 state->label = label;
719 } else 777 } else {
778 dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end);
720 return 0; 779 return 0;
780 }
721 break; 781 break;
722 case DW_CFA_def_cfa: 782 case DW_CFA_def_cfa:
723 state->cfa.reg = get_uleb128(&ptr.p8, end); 783 state->cfa.reg = get_uleb128(&ptr.p8, end);
@@ -749,6 +809,7 @@ static int processCFI(const u8 *start,
749 break; 809 break;
750 case DW_CFA_GNU_window_save: 810 case DW_CFA_GNU_window_save:
751 default: 811 default:
812 dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end);
752 result = 0; 813 result = 0;
753 break; 814 break;
754 } 815 }
@@ -764,12 +825,17 @@ static int processCFI(const u8 *start,
764 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); 825 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
765 break; 826 break;
766 } 827 }
767 if (ptr.p8 > end) 828 if (ptr.p8 > end) {
829 dprintk(1, "Data overrun (%p,%p).", ptr.p8, end);
768 result = 0; 830 result = 0;
831 }
769 if (result && targetLoc != 0 && targetLoc < state->loc) 832 if (result && targetLoc != 0 && targetLoc < state->loc)
770 return 1; 833 return 1;
771 } 834 }
772 835
836 if (result && ptr.p8 < end)
837 dprintk(1, "Data underrun (%p,%p).", ptr.p8, end);
838
773 return result 839 return result
774 && ptr.p8 == end 840 && ptr.p8 == end
775 && (targetLoc == 0 841 && (targetLoc == 0
@@ -786,7 +852,7 @@ int unwind(struct unwind_frame_info *frame)
786#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) 852#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
787 const u32 *fde = NULL, *cie = NULL; 853 const u32 *fde = NULL, *cie = NULL;
788 const u8 *ptr = NULL, *end = NULL; 854 const u8 *ptr = NULL, *end = NULL;
789 unsigned long pc = UNW_PC(frame) - frame->call_frame; 855 unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
790 unsigned long startLoc = 0, endLoc = 0, cfa; 856 unsigned long startLoc = 0, endLoc = 0, cfa;
791 unsigned i; 857 unsigned i;
792 signed ptrType = -1; 858 signed ptrType = -1;
@@ -813,9 +879,9 @@ int unwind(struct unwind_frame_info *frame)
813 ptr = hdr + 4; 879 ptr = hdr + 4;
814 end = hdr + table->hdrsz; 880 end = hdr + table->hdrsz;
815 if (tableSize 881 if (tableSize
816 && read_pointer(&ptr, end, hdr[1]) 882 && read_pointer(&ptr, end, hdr[1], 0, 0)
817 == (unsigned long)table->address 883 == (unsigned long)table->address
818 && (i = read_pointer(&ptr, end, hdr[2])) > 0 884 && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0
819 && i == (end - ptr) / (2 * tableSize) 885 && i == (end - ptr) / (2 * tableSize)
820 && !((end - ptr) % (2 * tableSize))) { 886 && !((end - ptr) % (2 * tableSize))) {
821 do { 887 do {
@@ -823,7 +889,8 @@ int unwind(struct unwind_frame_info *frame)
823 889
824 startLoc = read_pointer(&cur, 890 startLoc = read_pointer(&cur,
825 cur + tableSize, 891 cur + tableSize,
826 hdr[3]); 892 hdr[3], 0,
893 (unsigned long)hdr);
827 if (pc < startLoc) 894 if (pc < startLoc)
828 i /= 2; 895 i /= 2;
829 else { 896 else {
@@ -834,13 +901,17 @@ int unwind(struct unwind_frame_info *frame)
834 if (i == 1 901 if (i == 1
835 && (startLoc = read_pointer(&ptr, 902 && (startLoc = read_pointer(&ptr,
836 ptr + tableSize, 903 ptr + tableSize,
837 hdr[3])) != 0 904 hdr[3], 0,
905 (unsigned long)hdr)) != 0
838 && pc >= startLoc) 906 && pc >= startLoc)
839 fde = (void *)read_pointer(&ptr, 907 fde = (void *)read_pointer(&ptr,
840 ptr + tableSize, 908 ptr + tableSize,
841 hdr[3]); 909 hdr[3], 0,
910 (unsigned long)hdr);
842 } 911 }
843 } 912 }
913 if(hdr && !fde)
914 dprintk(3, "Binary lookup for %lx failed.", pc);
844 915
845 if (fde != NULL) { 916 if (fde != NULL) {
846 cie = cie_for_fde(fde, table); 917 cie = cie_for_fde(fde, table);
@@ -851,17 +922,19 @@ int unwind(struct unwind_frame_info *frame)
851 && (ptrType = fde_pointer_type(cie)) >= 0 922 && (ptrType = fde_pointer_type(cie)) >= 0
852 && read_pointer(&ptr, 923 && read_pointer(&ptr,
853 (const u8 *)(fde + 1) + *fde, 924 (const u8 *)(fde + 1) + *fde,
854 ptrType) == startLoc) { 925 ptrType, 0, 0) == startLoc) {
855 if (!(ptrType & DW_EH_PE_indirect)) 926 if (!(ptrType & DW_EH_PE_indirect))
856 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; 927 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
857 endLoc = startLoc 928 endLoc = startLoc
858 + read_pointer(&ptr, 929 + read_pointer(&ptr,
859 (const u8 *)(fde + 1) + *fde, 930 (const u8 *)(fde + 1) + *fde,
860 ptrType); 931 ptrType, 0, 0);
861 if(pc >= endLoc) 932 if(pc >= endLoc)
862 fde = NULL; 933 fde = NULL;
863 } else 934 } else
864 fde = NULL; 935 fde = NULL;
936 if(!fde)
937 dprintk(1, "Binary lookup result for %lx discarded.", pc);
865 } 938 }
866 if (fde == NULL) { 939 if (fde == NULL) {
867 for (fde = table->address, tableSize = table->size; 940 for (fde = table->address, tableSize = table->size;
@@ -881,7 +954,7 @@ int unwind(struct unwind_frame_info *frame)
881 ptr = (const u8 *)(fde + 2); 954 ptr = (const u8 *)(fde + 2);
882 startLoc = read_pointer(&ptr, 955 startLoc = read_pointer(&ptr,
883 (const u8 *)(fde + 1) + *fde, 956 (const u8 *)(fde + 1) + *fde,
884 ptrType); 957 ptrType, 0, 0);
885 if (!startLoc) 958 if (!startLoc)
886 continue; 959 continue;
887 if (!(ptrType & DW_EH_PE_indirect)) 960 if (!(ptrType & DW_EH_PE_indirect))
@@ -889,10 +962,12 @@ int unwind(struct unwind_frame_info *frame)
889 endLoc = startLoc 962 endLoc = startLoc
890 + read_pointer(&ptr, 963 + read_pointer(&ptr,
891 (const u8 *)(fde + 1) + *fde, 964 (const u8 *)(fde + 1) + *fde,
892 ptrType); 965 ptrType, 0, 0);
893 if (pc >= startLoc && pc < endLoc) 966 if (pc >= startLoc && pc < endLoc)
894 break; 967 break;
895 } 968 }
969 if(!fde)
970 dprintk(3, "Linear lookup for %lx failed.", pc);
896 } 971 }
897 } 972 }
898 if (cie != NULL) { 973 if (cie != NULL) {
@@ -926,6 +1001,8 @@ int unwind(struct unwind_frame_info *frame)
926 if (ptr >= end || *ptr) 1001 if (ptr >= end || *ptr)
927 cie = NULL; 1002 cie = NULL;
928 } 1003 }
1004 if(!cie)
1005 dprintk(1, "CIE unusable (%p,%p).", ptr, end);
929 ++ptr; 1006 ++ptr;
930 } 1007 }
931 if (cie != NULL) { 1008 if (cie != NULL) {
@@ -935,7 +1012,12 @@ int unwind(struct unwind_frame_info *frame)
935 state.dataAlign = get_sleb128(&ptr, end); 1012 state.dataAlign = get_sleb128(&ptr, end);
936 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) 1013 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
937 cie = NULL; 1014 cie = NULL;
938 else { 1015 else if (UNW_PC(frame) % state.codeAlign
1016 || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
1017 dprintk(1, "Input pointer(s) misaligned (%lx,%lx).",
1018 UNW_PC(frame), UNW_SP(frame));
1019 return -EPERM;
1020 } else {
939 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); 1021 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
940 /* skip augmentation */ 1022 /* skip augmentation */
941 if (((const char *)(cie + 2))[1] == 'z') { 1023 if (((const char *)(cie + 2))[1] == 'z') {
@@ -949,6 +1031,8 @@ int unwind(struct unwind_frame_info *frame)
949 || reg_info[retAddrReg].width != sizeof(unsigned long)) 1031 || reg_info[retAddrReg].width != sizeof(unsigned long))
950 cie = NULL; 1032 cie = NULL;
951 } 1033 }
1034 if(!cie)
1035 dprintk(1, "CIE validation failed (%p,%p).", ptr, end);
952 } 1036 }
953 if (cie != NULL) { 1037 if (cie != NULL) {
954 state.cieStart = ptr; 1038 state.cieStart = ptr;
@@ -962,11 +1046,15 @@ int unwind(struct unwind_frame_info *frame)
962 if ((ptr += augSize) > end) 1046 if ((ptr += augSize) > end)
963 fde = NULL; 1047 fde = NULL;
964 } 1048 }
1049 if(!fde)
1050 dprintk(1, "FDE validation failed (%p,%p).", ptr, end);
965 } 1051 }
966 if (cie == NULL || fde == NULL) { 1052 if (cie == NULL || fde == NULL) {
967#ifdef CONFIG_FRAME_POINTER 1053#ifdef CONFIG_FRAME_POINTER
968 unsigned long top, bottom; 1054 unsigned long top, bottom;
969 1055
1056 if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long))
1057 return -EPERM;
970 top = STACK_TOP(frame->task); 1058 top = STACK_TOP(frame->task);
971 bottom = STACK_BOTTOM(frame->task); 1059 bottom = STACK_BOTTOM(frame->task);
972# if FRAME_RETADDR_OFFSET < 0 1060# if FRAME_RETADDR_OFFSET < 0
@@ -982,18 +1070,19 @@ int unwind(struct unwind_frame_info *frame)
982 & (sizeof(unsigned long) - 1))) { 1070 & (sizeof(unsigned long) - 1))) {
983 unsigned long link; 1071 unsigned long link;
984 1072
985 if (!__get_user(link, 1073 if (!probe_kernel_address(
986 (unsigned long *)(UNW_FP(frame) 1074 (unsigned long *)(UNW_FP(frame)
987 + FRAME_LINK_OFFSET)) 1075 + FRAME_LINK_OFFSET),
1076 link)
988# if FRAME_RETADDR_OFFSET < 0 1077# if FRAME_RETADDR_OFFSET < 0
989 && link > bottom && link < UNW_FP(frame) 1078 && link > bottom && link < UNW_FP(frame)
990# else 1079# else
991 && link > UNW_FP(frame) && link < bottom 1080 && link > UNW_FP(frame) && link < bottom
992# endif 1081# endif
993 && !(link & (sizeof(link) - 1)) 1082 && !(link & (sizeof(link) - 1))
994 && !__get_user(UNW_PC(frame), 1083 && !probe_kernel_address(
995 (unsigned long *)(UNW_FP(frame) 1084 (unsigned long *)(UNW_FP(frame)
996 + FRAME_RETADDR_OFFSET))) { 1085 + FRAME_RETADDR_OFFSET), UNW_PC(frame))) {
997 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET 1086 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
998# if FRAME_RETADDR_OFFSET < 0 1087# if FRAME_RETADDR_OFFSET < 0
999 - 1088 -
@@ -1016,8 +1105,11 @@ int unwind(struct unwind_frame_info *frame)
1016 || state.regs[retAddrReg].where == Nowhere 1105 || state.regs[retAddrReg].where == Nowhere
1017 || state.cfa.reg >= ARRAY_SIZE(reg_info) 1106 || state.cfa.reg >= ARRAY_SIZE(reg_info)
1018 || reg_info[state.cfa.reg].width != sizeof(unsigned long) 1107 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
1019 || state.cfa.offs % sizeof(unsigned long)) 1108 || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
1109 || state.cfa.offs % sizeof(unsigned long)) {
1110 dprintk(1, "Unusable unwind info (%p,%p).", ptr, end);
1020 return -EIO; 1111 return -EIO;
1112 }
1021 /* update frame */ 1113 /* update frame */
1022#ifndef CONFIG_AS_CFI_SIGNAL_FRAME 1114#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
1023 if(frame->call_frame 1115 if(frame->call_frame
@@ -1036,10 +1128,14 @@ int unwind(struct unwind_frame_info *frame)
1036#else 1128#else
1037# define CASES CASE(8); CASE(16); CASE(32); CASE(64) 1129# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
1038#endif 1130#endif
1131 pc = UNW_PC(frame);
1132 sp = UNW_SP(frame);
1039 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { 1133 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
1040 if (REG_INVALID(i)) { 1134 if (REG_INVALID(i)) {
1041 if (state.regs[i].where == Nowhere) 1135 if (state.regs[i].where == Nowhere)
1042 continue; 1136 continue;
1137 dprintk(1, "Cannot restore register %u (%d).",
1138 i, state.regs[i].where);
1043 return -EIO; 1139 return -EIO;
1044 } 1140 }
1045 switch(state.regs[i].where) { 1141 switch(state.regs[i].where) {
@@ -1048,8 +1144,11 @@ int unwind(struct unwind_frame_info *frame)
1048 case Register: 1144 case Register:
1049 if (state.regs[i].value >= ARRAY_SIZE(reg_info) 1145 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
1050 || REG_INVALID(state.regs[i].value) 1146 || REG_INVALID(state.regs[i].value)
1051 || reg_info[i].width > reg_info[state.regs[i].value].width) 1147 || reg_info[i].width > reg_info[state.regs[i].value].width) {
1148 dprintk(1, "Cannot restore register %u from register %lu.",
1149 i, state.regs[i].value);
1052 return -EIO; 1150 return -EIO;
1151 }
1053 switch(reg_info[state.regs[i].value].width) { 1152 switch(reg_info[state.regs[i].value].width) {
1054#define CASE(n) \ 1153#define CASE(n) \
1055 case sizeof(u##n): \ 1154 case sizeof(u##n): \
@@ -1059,6 +1158,9 @@ int unwind(struct unwind_frame_info *frame)
1059 CASES; 1158 CASES;
1060#undef CASE 1159#undef CASE
1061 default: 1160 default:
1161 dprintk(1, "Unsupported register size %u (%lu).",
1162 reg_info[state.regs[i].value].width,
1163 state.regs[i].value);
1062 return -EIO; 1164 return -EIO;
1063 } 1165 }
1064 break; 1166 break;
@@ -1083,12 +1185,17 @@ int unwind(struct unwind_frame_info *frame)
1083 CASES; 1185 CASES;
1084#undef CASE 1186#undef CASE
1085 default: 1187 default:
1188 dprintk(1, "Unsupported register size %u (%u).",
1189 reg_info[i].width, i);
1086 return -EIO; 1190 return -EIO;
1087 } 1191 }
1088 break; 1192 break;
1089 case Value: 1193 case Value:
1090 if (reg_info[i].width != sizeof(unsigned long)) 1194 if (reg_info[i].width != sizeof(unsigned long)) {
1195 dprintk(1, "Unsupported value size %u (%u).",
1196 reg_info[i].width, i);
1091 return -EIO; 1197 return -EIO;
1198 }
1092 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value 1199 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
1093 * state.dataAlign; 1200 * state.dataAlign;
1094 break; 1201 break;
@@ -1100,15 +1207,20 @@ int unwind(struct unwind_frame_info *frame)
1100 % sizeof(unsigned long) 1207 % sizeof(unsigned long)
1101 || addr < startLoc 1208 || addr < startLoc
1102 || addr + sizeof(unsigned long) < addr 1209 || addr + sizeof(unsigned long) < addr
1103 || addr + sizeof(unsigned long) > endLoc) 1210 || addr + sizeof(unsigned long) > endLoc) {
1211 dprintk(1, "Bad memory location %lx (%lx).",
1212 addr, state.regs[i].value);
1104 return -EIO; 1213 return -EIO;
1214 }
1105 switch(reg_info[i].width) { 1215 switch(reg_info[i].width) {
1106#define CASE(n) case sizeof(u##n): \ 1216#define CASE(n) case sizeof(u##n): \
1107 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ 1217 probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
1108 break 1218 break
1109 CASES; 1219 CASES;
1110#undef CASE 1220#undef CASE
1111 default: 1221 default:
1222 dprintk(1, "Unsupported memory size %u (%u).",
1223 reg_info[i].width, i);
1112 return -EIO; 1224 return -EIO;
1113 } 1225 }
1114 } 1226 }
@@ -1116,6 +1228,17 @@ int unwind(struct unwind_frame_info *frame)
1116 } 1228 }
1117 } 1229 }
1118 1230
1231 if (UNW_PC(frame) % state.codeAlign
1232 || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
1233 dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
1234 UNW_PC(frame), UNW_SP(frame));
1235 return -EIO;
1236 }
1237 if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) {
1238 dprintk(1, "No progress (%lx,%lx).", pc, sp);
1239 return -EIO;
1240 }
1241
1119 return 0; 1242 return 0;
1120#undef CASES 1243#undef CASES
1121#undef FRAME_REG 1244#undef FRAME_REG
diff --git a/kernel/user.c b/kernel/user.c
index 220e586127a0..4869563080e9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
28 28
29static kmem_cache_t *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ]; 30static struct list_head uidhash_table[UIDHASH_SZ];
31 31
32/* 32/*
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
132 if (!up) { 132 if (!up) {
133 struct user_struct *new; 133 struct user_struct *new;
134 134
135 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); 135 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
136 if (!new) 136 if (!new)
137 return NULL; 137 return NULL;
138 new->uid = uid; 138 new->uid = uid;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8d1e7cb8a51a..6b186750e9be 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,9 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h> 30#include <linux/hardirq.h>
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/freezer.h>
33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h>
32 35
33/* 36/*
34 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct {
55 struct task_struct *thread; 58 struct task_struct *thread;
56 59
57 int run_depth; /* Detect run_workqueue() recursion depth */ 60 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
58} ____cacheline_aligned; 63} ____cacheline_aligned;
59 64
60/* 65/*
@@ -103,6 +108,79 @@ static inline void *get_wq_data(struct work_struct *work)
103 return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK); 108 return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
104} 109}
105 110
111static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
112{
113 int ret = 0;
114 unsigned long flags;
115
116 spin_lock_irqsave(&cwq->lock, flags);
117 /*
118 * We need to re-validate the work info after we've gotten
119 * the cpu_workqueue lock. We can run the work now iff:
120 *
121 * - the wq_data still matches the cpu_workqueue_struct
122 * - AND the work is still marked pending
123 * - AND the work is still on a list (which will be this
124 * workqueue_struct list)
125 *
126 * All these conditions are important, because we
127 * need to protect against the work being run right
128 * now on another CPU (all but the last one might be
129 * true if it's currently running and has not been
130 * released yet, for example).
131 */
132 if (get_wq_data(work) == cwq
133 && work_pending(work)
134 && !list_empty(&work->entry)) {
135 work_func_t f = work->func;
136 list_del_init(&work->entry);
137 spin_unlock_irqrestore(&cwq->lock, flags);
138
139 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
140 work_release(work);
141 f(work);
142
143 spin_lock_irqsave(&cwq->lock, flags);
144 cwq->remove_sequence++;
145 wake_up(&cwq->work_done);
146 ret = 1;
147 }
148 spin_unlock_irqrestore(&cwq->lock, flags);
149 return ret;
150}
151
152/**
153 * run_scheduled_work - run scheduled work synchronously
154 * @work: work to run
155 *
156 * This checks if the work was pending, and runs it
157 * synchronously if so. It returns a boolean to indicate
158 * whether it had any scheduled work to run or not.
159 *
160 * NOTE! This _only_ works for normal work_structs. You
161 * CANNOT use this for delayed work, because the wq data
162 * for delayed work will not point properly to the per-
163 * CPU workqueue struct, but will change!
164 */
165int fastcall run_scheduled_work(struct work_struct *work)
166{
167 for (;;) {
168 struct cpu_workqueue_struct *cwq;
169
170 if (!work_pending(work))
171 return 0;
172 if (list_empty(&work->entry))
173 return 0;
174 /* NOTE! This depends intimately on __queue_work! */
175 cwq = get_wq_data(work);
176 if (!cwq)
177 return 0;
178 if (__run_work(cwq, work))
179 return 1;
180 }
181}
182EXPORT_SYMBOL(run_scheduled_work);
183
106/* Preempt must be disabled. */ 184/* Preempt must be disabled. */
107static void __queue_work(struct cpu_workqueue_struct *cwq, 185static void __queue_work(struct cpu_workqueue_struct *cwq,
108 struct work_struct *work) 186 struct work_struct *work)
@@ -250,6 +328,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
250 work_release(work); 328 work_release(work);
251 f(work); 329 f(work);
252 330
331 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
332 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
333 "%s/0x%08x/%d\n",
334 current->comm, preempt_count(),
335 current->pid);
336 printk(KERN_ERR " last function: ");
337 print_symbol("%s\n", (unsigned long)f);
338 debug_show_held_locks(current);
339 dump_stack();
340 }
341
253 spin_lock_irqsave(&cwq->lock, flags); 342 spin_lock_irqsave(&cwq->lock, flags);
254 cwq->remove_sequence++; 343 cwq->remove_sequence++;
255 wake_up(&cwq->work_done); 344 wake_up(&cwq->work_done);
@@ -265,7 +354,8 @@ static int worker_thread(void *__cwq)
265 struct k_sigaction sa; 354 struct k_sigaction sa;
266 sigset_t blocked; 355 sigset_t blocked;
267 356
268 current->flags |= PF_NOFREEZE; 357 if (!cwq->freezeable)
358 current->flags |= PF_NOFREEZE;
269 359
270 set_user_nice(current, -5); 360 set_user_nice(current, -5);
271 361
@@ -288,6 +378,9 @@ static int worker_thread(void *__cwq)
288 378
289 set_current_state(TASK_INTERRUPTIBLE); 379 set_current_state(TASK_INTERRUPTIBLE);
290 while (!kthread_should_stop()) { 380 while (!kthread_should_stop()) {
381 if (cwq->freezeable)
382 try_to_freeze();
383
291 add_wait_queue(&cwq->more_work, &wait); 384 add_wait_queue(&cwq->more_work, &wait);
292 if (list_empty(&cwq->worklist)) 385 if (list_empty(&cwq->worklist))
293 schedule(); 386 schedule();
@@ -364,7 +457,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
364EXPORT_SYMBOL_GPL(flush_workqueue); 457EXPORT_SYMBOL_GPL(flush_workqueue);
365 458
366static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 459static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
367 int cpu) 460 int cpu, int freezeable)
368{ 461{
369 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 462 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
370 struct task_struct *p; 463 struct task_struct *p;
@@ -374,6 +467,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
374 cwq->thread = NULL; 467 cwq->thread = NULL;
375 cwq->insert_sequence = 0; 468 cwq->insert_sequence = 0;
376 cwq->remove_sequence = 0; 469 cwq->remove_sequence = 0;
470 cwq->freezeable = freezeable;
377 INIT_LIST_HEAD(&cwq->worklist); 471 INIT_LIST_HEAD(&cwq->worklist);
378 init_waitqueue_head(&cwq->more_work); 472 init_waitqueue_head(&cwq->more_work);
379 init_waitqueue_head(&cwq->work_done); 473 init_waitqueue_head(&cwq->work_done);
@@ -389,7 +483,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
389} 483}
390 484
391struct workqueue_struct *__create_workqueue(const char *name, 485struct workqueue_struct *__create_workqueue(const char *name,
392 int singlethread) 486 int singlethread, int freezeable)
393{ 487{
394 int cpu, destroy = 0; 488 int cpu, destroy = 0;
395 struct workqueue_struct *wq; 489 struct workqueue_struct *wq;
@@ -409,7 +503,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
409 mutex_lock(&workqueue_mutex); 503 mutex_lock(&workqueue_mutex);
410 if (singlethread) { 504 if (singlethread) {
411 INIT_LIST_HEAD(&wq->list); 505 INIT_LIST_HEAD(&wq->list);
412 p = create_workqueue_thread(wq, singlethread_cpu); 506 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
413 if (!p) 507 if (!p)
414 destroy = 1; 508 destroy = 1;
415 else 509 else
@@ -417,7 +511,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
417 } else { 511 } else {
418 list_add(&wq->list, &workqueues); 512 list_add(&wq->list, &workqueues);
419 for_each_online_cpu(cpu) { 513 for_each_online_cpu(cpu) {
420 p = create_workqueue_thread(wq, cpu); 514 p = create_workqueue_thread(wq, cpu, freezeable);
421 if (p) { 515 if (p) {
422 kthread_bind(p, cpu); 516 kthread_bind(p, cpu);
423 wake_up_process(p); 517 wake_up_process(p);
@@ -634,7 +728,6 @@ int current_is_keventd(void)
634 728
635} 729}
636 730
637#ifdef CONFIG_HOTPLUG_CPU
638/* Take the work from this (downed) CPU. */ 731/* Take the work from this (downed) CPU. */
639static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 732static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
640{ 733{
@@ -667,7 +760,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
667 mutex_lock(&workqueue_mutex); 760 mutex_lock(&workqueue_mutex);
668 /* Create a new workqueue thread for it. */ 761 /* Create a new workqueue thread for it. */
669 list_for_each_entry(wq, &workqueues, list) { 762 list_for_each_entry(wq, &workqueues, list) {
670 if (!create_workqueue_thread(wq, hotcpu)) { 763 if (!create_workqueue_thread(wq, hotcpu, 0)) {
671 printk("workqueue for %i failed\n", hotcpu); 764 printk("workqueue for %i failed\n", hotcpu);
672 return NOTIFY_BAD; 765 return NOTIFY_BAD;
673 } 766 }
@@ -717,7 +810,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
717 810
718 return NOTIFY_OK; 811 return NOTIFY_OK;
719} 812}
720#endif
721 813
722void init_workqueues(void) 814void init_workqueues(void)
723{ 815{